1 /* Speed measuring program.
2
3 Copyright 1999-2003, 2005, 2006, 2008-2019 Free Software Foundation, Inc.
4
5 This file is part of the GNU MP Library.
6
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9
10 * the GNU Lesser General Public License as published by the Free
11 Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13
14 or
15
16 * the GNU General Public License as published by the Free Software
17 Foundation; either version 2 of the License, or (at your option) any
18 later version.
19
20 or both in parallel, as here.
21
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 for more details.
26
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library. If not,
29 see https://www.gnu.org/licenses/. */
30
31 /* Usage message is in the code below, run with no arguments to print it.
32 See README for interesting applications.
33
34 To add a new routine foo(), create a speed_foo() function in the style of
35 the existing ones and add an entry in the routine[] array. Put FLAG_R if
36 speed_foo() wants an "r" parameter.
37
38 The routines don't have help messages or descriptions, but most have
39 suggestive names. See the source code for full details. */
40
41 #include "config.h"
42
43 #include <limits.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47
48 #if HAVE_UNISTD_H
49 #include <unistd.h> /* for getpid, R_OK */
50 #endif
51
52 #if TIME_WITH_SYS_TIME
53 # include <sys/time.h> /* for struct timeval */
54 # include <time.h>
55 #else
56 # if HAVE_SYS_TIME_H
57 # include <sys/time.h>
58 # else
59 # include <time.h>
60 # endif
61 #endif
62
63 #if HAVE_SYS_RESOURCE_H
64 #include <sys/resource.h> /* for getrusage() */
65 #endif
66
67
68 #include "gmp-impl.h"
69 #include "longlong.h" /* for the benefit of speed-many.c */
70 #include "tests.h"
71 #include "speed.h"
72
73
74 #if !HAVE_DECL_OPTARG
75 extern char *optarg;
76 extern int optind, opterr;
77 #endif
78
79 #if !HAVE_STRTOUL
80 #define strtoul(p,e,b) (unsigned long) strtol(p,e,b)
81 #endif
82
83 #ifdef SPEED_EXTRA_PROTOS
84 SPEED_EXTRA_PROTOS
85 #endif
86 #ifdef SPEED_EXTRA_PROTOS2
87 SPEED_EXTRA_PROTOS2
88 #endif
89
90
91 #if GMP_LIMB_BITS == 32
92 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
93 #endif
94 #if GMP_LIMB_BITS == 64
95 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
96 #endif
97
98
99 #define CMP_ABSOLUTE 1
100 #define CMP_RATIO 2
101 #define CMP_DIFFERENCE 3
102 #define CMP_DIFFPREV 4
103 int option_cmp = CMP_ABSOLUTE;
104
105 #define UNIT_SECONDS 1
106 #define UNIT_CYCLES 2
107 #define UNIT_CYCLESPERLIMB 3
108 int option_unit = UNIT_SECONDS;
109
110 #define DATA_RANDOM 1
111 #define DATA_RANDOM2 2
112 #define DATA_ZEROS 3
113 #define DATA_AAS 4
114 #define DATA_FFS 5
115 #define DATA_2FD 6
116 int option_data = DATA_RANDOM;
117
118 int option_square = 0;
119 double option_factor = 0.0;
120 mp_size_t option_step = 1;
121 int option_gnuplot = 0;
122 char *option_gnuplot_basename;
123 struct size_array_t {
124 mp_size_t start, end;
125 } *size_array = NULL;
126 mp_size_t size_num = 0;
127 mp_size_t size_allocnum = 0;
128 int option_resource_usage = 0;
129 long option_seed = 123456789;
130
131 struct speed_params sp;
132
133 #define COLUMN_WIDTH 13 /* for the free-form output */
134
135 #define FLAG_R (1<<0) /* require ".r" */
136 #define FLAG_R_OPTIONAL (1<<1) /* optional ".r" */
137 #define FLAG_RSIZE (1<<2)
138 #define FLAG_NODATA (1<<3) /* don't alloc xp, yp */
139
140 const struct routine_t {
141 /* constants */
142 const char *name;
143 speed_function_t fun;
144 int flag;
145 } routine[] = {
146
147 { "noop", speed_noop },
148 { "noop_wxs", speed_noop_wxs },
149 { "noop_wxys", speed_noop_wxys },
150
151 { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL },
152 { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL },
153 { "mpn_add_1", speed_mpn_add_1, FLAG_R },
154 { "mpn_add_1_inplace", speed_mpn_add_1_inplace, FLAG_R },
155 { "mpn_sub_1", speed_mpn_sub_1, FLAG_R },
156 { "mpn_sub_1_inplace", speed_mpn_sub_1_inplace, FLAG_R },
157
158 { "mpn_add_err1_n", speed_mpn_add_err1_n },
159 { "mpn_add_err2_n", speed_mpn_add_err2_n },
160 { "mpn_add_err3_n", speed_mpn_add_err3_n },
161 { "mpn_sub_err1_n", speed_mpn_sub_err1_n },
162 { "mpn_sub_err2_n", speed_mpn_sub_err2_n },
163 { "mpn_sub_err3_n", speed_mpn_sub_err3_n },
164
165 #if HAVE_NATIVE_mpn_add_n_sub_n
166 { "mpn_add_n_sub_n", speed_mpn_add_n_sub_n, FLAG_R_OPTIONAL },
167 #endif
168
169 { "mpn_addmul_1", speed_mpn_addmul_1, FLAG_R },
170 { "mpn_submul_1", speed_mpn_submul_1, FLAG_R },
171 #if HAVE_NATIVE_mpn_addmul_2
172 { "mpn_addmul_2", speed_mpn_addmul_2, FLAG_R_OPTIONAL },
173 #endif
174 #if HAVE_NATIVE_mpn_addmul_3
175 { "mpn_addmul_3", speed_mpn_addmul_3, FLAG_R_OPTIONAL },
176 #endif
177 #if HAVE_NATIVE_mpn_addmul_4
178 { "mpn_addmul_4", speed_mpn_addmul_4, FLAG_R_OPTIONAL },
179 #endif
180 #if HAVE_NATIVE_mpn_addmul_5
181 { "mpn_addmul_5", speed_mpn_addmul_5, FLAG_R_OPTIONAL },
182 #endif
183 #if HAVE_NATIVE_mpn_addmul_6
184 { "mpn_addmul_6", speed_mpn_addmul_6, FLAG_R_OPTIONAL },
185 #endif
186 #if HAVE_NATIVE_mpn_addmul_7
187 { "mpn_addmul_7", speed_mpn_addmul_7, FLAG_R_OPTIONAL },
188 #endif
189 #if HAVE_NATIVE_mpn_addmul_8
190 { "mpn_addmul_8", speed_mpn_addmul_8, FLAG_R_OPTIONAL },
191 #endif
192 { "mpn_mul_1", speed_mpn_mul_1, FLAG_R },
193 { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
194 #if HAVE_NATIVE_mpn_mul_2
195 { "mpn_mul_2", speed_mpn_mul_2, FLAG_R_OPTIONAL },
196 #endif
197 #if HAVE_NATIVE_mpn_mul_3
198 { "mpn_mul_3", speed_mpn_mul_3, FLAG_R_OPTIONAL },
199 #endif
200 #if HAVE_NATIVE_mpn_mul_4
201 { "mpn_mul_4", speed_mpn_mul_4, FLAG_R_OPTIONAL },
202 #endif
203 #if HAVE_NATIVE_mpn_mul_5
204 { "mpn_mul_5", speed_mpn_mul_5, FLAG_R_OPTIONAL },
205 #endif
206 #if HAVE_NATIVE_mpn_mul_6
207 { "mpn_mul_6", speed_mpn_mul_6, FLAG_R_OPTIONAL },
208 #endif
209
210 { "mpn_divrem_1", speed_mpn_divrem_1, FLAG_R },
211 { "mpn_divrem_1f", speed_mpn_divrem_1f, FLAG_R },
212 #if HAVE_NATIVE_mpn_divrem_1c
213 { "mpn_divrem_1c", speed_mpn_divrem_1c, FLAG_R },
214 { "mpn_divrem_1cf", speed_mpn_divrem_1cf,FLAG_R },
215 #endif
216 { "mpn_mod_1", speed_mpn_mod_1, FLAG_R },
217 #if HAVE_NATIVE_mpn_mod_1c
218 { "mpn_mod_1c", speed_mpn_mod_1c, FLAG_R },
219 #endif
220 { "mpn_preinv_divrem_1", speed_mpn_preinv_divrem_1, FLAG_R },
221 { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
222 { "mpn_preinv_mod_1", speed_mpn_preinv_mod_1, FLAG_R },
223
224 { "mpn_mod_1_1", speed_mpn_mod_1_1, FLAG_R },
225 { "mpn_mod_1_1_1", speed_mpn_mod_1_1_1, FLAG_R },
226 { "mpn_mod_1_1_2", speed_mpn_mod_1_1_2, FLAG_R },
227 { "mpn_mod_1s_2", speed_mpn_mod_1_2, FLAG_R },
228 { "mpn_mod_1s_3", speed_mpn_mod_1_3, FLAG_R },
229 { "mpn_mod_1s_4", speed_mpn_mod_1_4, FLAG_R },
230
231 { "mpn_divrem_1_div", speed_mpn_divrem_1_div, FLAG_R },
232 { "mpn_divrem_1_inv", speed_mpn_divrem_1_inv, FLAG_R },
233 { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
234 { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
235 { "mpn_mod_1_div", speed_mpn_mod_1_div, FLAG_R },
236 { "mpn_mod_1_inv", speed_mpn_mod_1_inv, FLAG_R },
237
238 { "mpn_divrem_2", speed_mpn_divrem_2, },
239 { "mpn_divrem_2_div", speed_mpn_divrem_2_div, },
240 { "mpn_divrem_2_inv", speed_mpn_divrem_2_inv, },
241
242 { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R },
243 { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R },
244 { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R },
245 { "mpn_div_qr_1", speed_mpn_div_qr_1, FLAG_R },
246
247 { "mpn_div_qr_2n", speed_mpn_div_qr_2n, },
248 { "mpn_div_qr_2u", speed_mpn_div_qr_2u, },
249
250 { "mpn_divexact_1", speed_mpn_divexact_1, FLAG_R },
251 { "mpn_divexact_by3", speed_mpn_divexact_by3 },
252
253 { "mpn_bdiv_q_1", speed_mpn_bdiv_q_1, FLAG_R },
254 { "mpn_pi1_bdiv_q_1", speed_mpn_pi1_bdiv_q_1, FLAG_R_OPTIONAL },
255 { "mpn_bdiv_dbm1c", speed_mpn_bdiv_dbm1c, FLAG_R_OPTIONAL },
256
257 #if HAVE_NATIVE_mpn_modexact_1_odd
258 { "mpn_modexact_1_odd", speed_mpn_modexact_1_odd, FLAG_R },
259 #endif
260 { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },
261
262 #if GMP_NUMB_BITS % 4 == 0
263 { "mpn_mod_34lsub1", speed_mpn_mod_34lsub1 },
264 #endif
265
266 { "mpn_lshift", speed_mpn_lshift, FLAG_R },
267 { "mpn_lshiftc", speed_mpn_lshiftc, FLAG_R },
268 { "mpn_rshift", speed_mpn_rshift, FLAG_R },
269
270 { "mpn_and_n", speed_mpn_and_n, FLAG_R_OPTIONAL },
271 { "mpn_andn_n", speed_mpn_andn_n, FLAG_R_OPTIONAL },
272 { "mpn_nand_n", speed_mpn_nand_n, FLAG_R_OPTIONAL },
273 { "mpn_ior_n", speed_mpn_ior_n, FLAG_R_OPTIONAL },
274 { "mpn_iorn_n", speed_mpn_iorn_n, FLAG_R_OPTIONAL },
275 { "mpn_nior_n", speed_mpn_nior_n, FLAG_R_OPTIONAL },
276 { "mpn_xor_n", speed_mpn_xor_n, FLAG_R_OPTIONAL },
277 { "mpn_xnor_n", speed_mpn_xnor_n, FLAG_R_OPTIONAL },
278 { "mpn_com", speed_mpn_com },
279 { "mpn_neg", speed_mpn_neg },
280
281 { "mpn_popcount", speed_mpn_popcount },
282 { "mpn_hamdist", speed_mpn_hamdist },
283
284 { "mpn_matrix22_mul", speed_mpn_matrix22_mul },
285
286 { "mpn_hgcd2", speed_mpn_hgcd2, FLAG_NODATA },
287 { "mpn_hgcd2_1", speed_mpn_hgcd2_1, FLAG_NODATA },
288 { "mpn_hgcd2_2", speed_mpn_hgcd2_2, FLAG_NODATA },
289 { "mpn_hgcd2_3", speed_mpn_hgcd2_3, FLAG_NODATA },
290 { "mpn_hgcd2_4", speed_mpn_hgcd2_4, FLAG_NODATA },
291 { "mpn_hgcd2_5", speed_mpn_hgcd2_5, FLAG_NODATA },
292 { "mpn_hgcd", speed_mpn_hgcd },
293 { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer },
294 { "mpn_hgcd_appr", speed_mpn_hgcd_appr },
295 { "mpn_hgcd_appr_lehmer", speed_mpn_hgcd_appr_lehmer },
296
297 { "mpn_hgcd_reduce", speed_mpn_hgcd_reduce },
298 { "mpn_hgcd_reduce_1", speed_mpn_hgcd_reduce_1 },
299 { "mpn_hgcd_reduce_2", speed_mpn_hgcd_reduce_2 },
300
301 { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
302 { "mpn_gcd_11", speed_mpn_gcd_11, FLAG_R_OPTIONAL },
303 { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
304 { "mpn_gcd_22", speed_mpn_gcd_22, FLAG_R_OPTIONAL },
305
306 { "mpn_gcd", speed_mpn_gcd },
307
308 { "mpn_gcdext", speed_mpn_gcdext },
309 { "mpn_gcdext_single", speed_mpn_gcdext_single },
310 { "mpn_gcdext_double", speed_mpn_gcdext_double },
311 { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
312 { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
313 #if 0
314 { "mpn_gcdext_lehmer", speed_mpn_gcdext_lehmer },
315 #endif
316
317 { "mpz_nextprime", speed_mpz_nextprime },
318
319 { "mpz_jacobi", speed_mpz_jacobi },
320 { "mpn_jacobi_base", speed_mpn_jacobi_base },
321 { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1 },
322 { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 },
323 { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 },
324 { "mpn_jacobi_base_4", speed_mpn_jacobi_base_4 },
325
326 { "mpn_mul", speed_mpn_mul, FLAG_R_OPTIONAL },
327 { "mpn_mul_basecase", speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
328 { "mpn_sqr_basecase", speed_mpn_sqr_basecase },
329 #if HAVE_NATIVE_mpn_sqr_diagonal
330 { "mpn_sqr_diagonal", speed_mpn_sqr_diagonal },
331 #endif
332 #if HAVE_NATIVE_mpn_sqr_diag_addlsh1
333 { "mpn_sqr_diag_addlsh1", speed_mpn_sqr_diag_addlsh1 },
334 #endif
335
336 { "mpn_mul_n", speed_mpn_mul_n },
337 { "mpn_sqr", speed_mpn_sqr },
338
339 { "mpn_toom2_sqr", speed_mpn_toom2_sqr },
340 { "mpn_toom3_sqr", speed_mpn_toom3_sqr },
341 { "mpn_toom4_sqr", speed_mpn_toom4_sqr },
342 { "mpn_toom6_sqr", speed_mpn_toom6_sqr },
343 { "mpn_toom8_sqr", speed_mpn_toom8_sqr },
344 { "mpn_toom22_mul", speed_mpn_toom22_mul },
345 { "mpn_toom33_mul", speed_mpn_toom33_mul },
346 { "mpn_toom44_mul", speed_mpn_toom44_mul },
347 { "mpn_toom6h_mul", speed_mpn_toom6h_mul },
348 { "mpn_toom8h_mul", speed_mpn_toom8h_mul },
349 { "mpn_toom32_mul", speed_mpn_toom32_mul },
350 { "mpn_toom42_mul", speed_mpn_toom42_mul },
351 { "mpn_toom43_mul", speed_mpn_toom43_mul },
352 { "mpn_toom63_mul", speed_mpn_toom63_mul },
353 { "mpn_nussbaumer_mul", speed_mpn_nussbaumer_mul },
354 { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
355 #if WANT_OLD_FFT_FULL
356 { "mpn_mul_fft_full", speed_mpn_mul_fft_full },
357 { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr },
358 #endif
359 { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL },
360 { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
361
362 { "mpn_sqrlo", speed_mpn_sqrlo },
363 { "mpn_sqrlo_basecase", speed_mpn_sqrlo_basecase },
364 { "mpn_mullo_n", speed_mpn_mullo_n },
365 { "mpn_mullo_basecase", speed_mpn_mullo_basecase },
366
367 { "mpn_mulmid_basecase", speed_mpn_mulmid_basecase, FLAG_R_OPTIONAL },
368 { "mpn_toom42_mulmid", speed_mpn_toom42_mulmid },
369 { "mpn_mulmid_n", speed_mpn_mulmid_n },
370 { "mpn_mulmid", speed_mpn_mulmid, FLAG_R_OPTIONAL },
371
372 { "mpn_bc_mulmod_bnm1", speed_mpn_bc_mulmod_bnm1 },
373 { "mpn_mulmod_bnm1", speed_mpn_mulmod_bnm1 },
374 { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
375 { "mpn_sqrmod_bnm1", speed_mpn_sqrmod_bnm1 },
376
377 { "mpn_invert", speed_mpn_invert },
378 { "mpn_invertappr", speed_mpn_invertappr },
379 { "mpn_ni_invertappr", speed_mpn_ni_invertappr },
380 { "mpn_binvert", speed_mpn_binvert },
381 { "mpn_sec_invert", speed_mpn_sec_invert },
382
383 { "mpn_sbpi1_div_qr", speed_mpn_sbpi1_div_qr, FLAG_R_OPTIONAL},
384 { "mpn_dcpi1_div_qr", speed_mpn_dcpi1_div_qr, FLAG_R_OPTIONAL},
385 { "mpn_mu_div_qr", speed_mpn_mu_div_qr, FLAG_R_OPTIONAL},
386 { "mpn_mupi_div_qr", speed_mpn_mupi_div_qr, FLAG_R_OPTIONAL},
387 { "mpn_sbpi1_divappr_q", speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
388 { "mpn_dcpi1_divappr_q", speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},
389
390 { "mpn_sbpi1_bdiv_qr", speed_mpn_sbpi1_bdiv_qr },
391 { "mpn_dcpi1_bdiv_qr", speed_mpn_dcpi1_bdiv_qr },
392 { "mpn_sbpi1_bdiv_q", speed_mpn_sbpi1_bdiv_q },
393 { "mpn_dcpi1_bdiv_q", speed_mpn_dcpi1_bdiv_q },
394 { "mpn_sbpi1_bdiv_r", speed_mpn_sbpi1_bdiv_r },
395
396 { "mpn_broot", speed_mpn_broot, FLAG_R },
397 { "mpn_broot_invm1", speed_mpn_broot_invm1, FLAG_R },
398 { "mpn_brootinv", speed_mpn_brootinv, FLAG_R },
399
400 { "mpn_get_str", speed_mpn_get_str, FLAG_R_OPTIONAL },
401 { "mpn_set_str", speed_mpn_set_str, FLAG_R_OPTIONAL },
402 { "mpn_set_str_basecase", speed_mpn_bc_set_str, FLAG_R_OPTIONAL },
403
404 { "mpn_sqrtrem", speed_mpn_sqrtrem },
405 { "mpn_rootrem", speed_mpn_rootrem, FLAG_R },
406 { "mpn_sqrt", speed_mpn_sqrt },
407 { "mpn_root", speed_mpn_root, FLAG_R },
408
409 { "mpn_perfect_power_p", speed_mpn_perfect_power_p, },
410 { "mpn_perfect_square_p", speed_mpn_perfect_square_p, },
411
412 { "mpn_fib2_ui", speed_mpn_fib2_ui, FLAG_NODATA },
413 { "mpz_fib_ui", speed_mpz_fib_ui, FLAG_NODATA },
414 { "mpz_fib2_ui", speed_mpz_fib2_ui, FLAG_NODATA },
415 { "mpz_lucnum_ui", speed_mpz_lucnum_ui, FLAG_NODATA },
416 { "mpz_lucnum2_ui", speed_mpz_lucnum2_ui, FLAG_NODATA },
417
418 { "mpz_add", speed_mpz_add },
419 { "mpz_invert", speed_mpz_invert, FLAG_R_OPTIONAL },
420 { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
421 { "mpz_bin_ui", speed_mpz_bin_ui, FLAG_NODATA | FLAG_R_OPTIONAL },
422 { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA },
423 { "mpz_2fac_ui", speed_mpz_2fac_ui, FLAG_NODATA },
424 { "mpz_mfac_uiui", speed_mpz_mfac_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
425 { "mpz_primorial_ui", speed_mpz_primorial_ui, FLAG_NODATA },
426 { "mpz_powm", speed_mpz_powm, FLAG_R_OPTIONAL },
427 { "mpz_powm_mod", speed_mpz_powm_mod },
428 { "mpz_powm_redc", speed_mpz_powm_redc },
429 { "mpz_powm_sec", speed_mpz_powm_sec },
430 { "mpz_powm_ui", speed_mpz_powm_ui, FLAG_R_OPTIONAL },
431
432 { "mpz_mod", speed_mpz_mod },
433 { "mpn_redc_1", speed_mpn_redc_1 },
434 { "mpn_redc_2", speed_mpn_redc_2 },
435 { "mpn_redc_n", speed_mpn_redc_n },
436
437 { "MPN_COPY", speed_MPN_COPY },
438 { "MPN_COPY_INCR", speed_MPN_COPY_INCR },
439 { "MPN_COPY_DECR", speed_MPN_COPY_DECR },
440 { "memcpy", speed_memcpy },
441 #if HAVE_NATIVE_mpn_copyi
442 { "mpn_copyi", speed_mpn_copyi },
443 #endif
444 #if HAVE_NATIVE_mpn_copyd
445 { "mpn_copyd", speed_mpn_copyd },
446 #endif
447 { "mpn_sec_tabselect", speed_mpn_sec_tabselect, FLAG_R_OPTIONAL },
448 #if HAVE_NATIVE_mpn_addlsh1_n == 1
449 { "mpn_addlsh1_n", speed_mpn_addlsh1_n, FLAG_R_OPTIONAL },
450 #endif
451 #if HAVE_NATIVE_mpn_sublsh1_n == 1
452 { "mpn_sublsh1_n", speed_mpn_sublsh1_n, FLAG_R_OPTIONAL },
453 #endif
454 #if HAVE_NATIVE_mpn_addlsh1_n_ip1
455 { "mpn_addlsh1_n_ip1", speed_mpn_addlsh1_n_ip1 },
456 #endif
457 #if HAVE_NATIVE_mpn_addlsh1_n_ip2
458 { "mpn_addlsh1_n_ip2", speed_mpn_addlsh1_n_ip2 },
459 #endif
460 #if HAVE_NATIVE_mpn_sublsh1_n_ip1
461 { "mpn_sublsh1_n_ip1", speed_mpn_sublsh1_n_ip1 },
462 #endif
463 #if HAVE_NATIVE_mpn_rsblsh1_n == 1
464 { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n, FLAG_R_OPTIONAL },
465 #endif
466 #if HAVE_NATIVE_mpn_addlsh2_n == 1
467 { "mpn_addlsh2_n", speed_mpn_addlsh2_n, FLAG_R_OPTIONAL },
468 #endif
469 #if HAVE_NATIVE_mpn_sublsh2_n == 1
470 { "mpn_sublsh2_n", speed_mpn_sublsh2_n, FLAG_R_OPTIONAL },
471 #endif
472 #if HAVE_NATIVE_mpn_addlsh2_n_ip1
473 { "mpn_addlsh2_n_ip1", speed_mpn_addlsh2_n_ip1 },
474 #endif
475 #if HAVE_NATIVE_mpn_addlsh2_n_ip2
476 { "mpn_addlsh2_n_ip2", speed_mpn_addlsh2_n_ip2 },
477 #endif
478 #if HAVE_NATIVE_mpn_sublsh2_n_ip1
479 { "mpn_sublsh2_n_ip1", speed_mpn_sublsh2_n_ip1 },
480 #endif
481 #if HAVE_NATIVE_mpn_rsblsh2_n == 1
482 { "mpn_rsblsh2_n", speed_mpn_rsblsh2_n, FLAG_R_OPTIONAL },
483 #endif
484 #if HAVE_NATIVE_mpn_addlsh_n
485 { "mpn_addlsh_n", speed_mpn_addlsh_n, FLAG_R_OPTIONAL },
486 #endif
487 #if HAVE_NATIVE_mpn_sublsh_n
488 { "mpn_sublsh_n", speed_mpn_sublsh_n, FLAG_R_OPTIONAL },
489 #endif
490 #if HAVE_NATIVE_mpn_addlsh_n_ip1
491 { "mpn_addlsh_n_ip1", speed_mpn_addlsh_n_ip1 },
492 #endif
493 #if HAVE_NATIVE_mpn_addlsh_n_ip2
494 { "mpn_addlsh_n_ip2", speed_mpn_addlsh_n_ip2 },
495 #endif
496 #if HAVE_NATIVE_mpn_sublsh_n_ip1
497 { "mpn_sublsh_n_ip1", speed_mpn_sublsh_n_ip1 },
498 #endif
499 #if HAVE_NATIVE_mpn_rsblsh_n
500 { "mpn_rsblsh_n", speed_mpn_rsblsh_n, FLAG_R_OPTIONAL },
501 #endif
502 #if HAVE_NATIVE_mpn_rsh1add_n
503 { "mpn_rsh1add_n", speed_mpn_rsh1add_n, FLAG_R_OPTIONAL },
504 #endif
505 #if HAVE_NATIVE_mpn_rsh1sub_n
506 { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n, FLAG_R_OPTIONAL },
507 #endif
508
509 { "mpn_cnd_add_n", speed_mpn_cnd_add_n, FLAG_R_OPTIONAL },
510 { "mpn_cnd_sub_n", speed_mpn_cnd_sub_n, FLAG_R_OPTIONAL },
511
512 { "MPN_ZERO", speed_MPN_ZERO },
513
514 { "binvert_limb", speed_binvert_limb, FLAG_NODATA },
515 { "binvert_limb_mul1", speed_binvert_limb_mul1, FLAG_NODATA },
516 { "binvert_limb_loop", speed_binvert_limb_loop, FLAG_NODATA },
517 { "binvert_limb_cond", speed_binvert_limb_cond, FLAG_NODATA },
518 { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
519
520 { "malloc_free", speed_malloc_free },
521 { "malloc_realloc_free", speed_malloc_realloc_free },
522 { "gmp_allocate_free", speed_gmp_allocate_free },
523 { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
524 { "mpz_init_clear", speed_mpz_init_clear },
525 { "mpq_init_clear", speed_mpq_init_clear },
526 { "mpf_init_clear", speed_mpf_init_clear },
527 { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear },
528
529 { "umul_ppmm", speed_umul_ppmm, FLAG_R_OPTIONAL },
530 #if HAVE_NATIVE_mpn_umul_ppmm
531 { "mpn_umul_ppmm", speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
532 #endif
533 #if HAVE_NATIVE_mpn_umul_ppmm_r
534 { "mpn_umul_ppmm_r", speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
535 #endif
536
537 { "count_leading_zeros", speed_count_leading_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
538 { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
539
540 { "udiv_qrnnd", speed_udiv_qrnnd, FLAG_R_OPTIONAL },
541 { "udiv_qrnnd_c", speed_udiv_qrnnd_c, FLAG_R_OPTIONAL },
542 #if HAVE_NATIVE_mpn_udiv_qrnnd
543 { "mpn_udiv_qrnnd", speed_mpn_udiv_qrnnd, FLAG_R_OPTIONAL },
544 #endif
545 #if HAVE_NATIVE_mpn_udiv_qrnnd_r
546 { "mpn_udiv_qrnnd_r", speed_mpn_udiv_qrnnd_r, FLAG_R_OPTIONAL },
547 #endif
548 { "invert_limb", speed_invert_limb, FLAG_R_OPTIONAL },
549
550 { "operator_div", speed_operator_div, FLAG_R_OPTIONAL },
551 { "operator_mod", speed_operator_mod, FLAG_R_OPTIONAL },
552
553 { "gmp_randseed", speed_gmp_randseed, FLAG_R_OPTIONAL },
554 { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
555 { "mpz_urandomb", speed_mpz_urandomb, FLAG_R_OPTIONAL | FLAG_NODATA },
556
557 #ifdef SPEED_EXTRA_ROUTINES
558 SPEED_EXTRA_ROUTINES
559 #endif
560 #ifdef SPEED_EXTRA_ROUTINES2
561 SPEED_EXTRA_ROUTINES2
562 #endif
563 };
564
565
566 struct choice_t {
567 const struct routine_t *p;
568 mp_limb_t r;
569 double scale;
570 double time;
571 int no_time;
572 double prev_time;
573 const char *name;
574 };
575 struct choice_t *choice;
576 int num_choices = 0;
577
578
579 void
data_fill(mp_ptr ptr,mp_size_t size)580 data_fill (mp_ptr ptr, mp_size_t size)
581 {
582 switch (option_data) {
583 case DATA_RANDOM:
584 mpn_random (ptr, size);
585 break;
586 case DATA_RANDOM2:
587 mpn_random2 (ptr, size);
588 break;
589 case DATA_ZEROS:
590 MPN_ZERO (ptr, size);
591 break;
592 case DATA_AAS:
593 MPN_FILL (ptr, size, GMP_NUMB_0xAA);
594 break;
595 case DATA_FFS:
596 MPN_FILL (ptr, size, GMP_NUMB_MAX);
597 break;
598 case DATA_2FD:
599 MPN_FILL (ptr, size, GMP_NUMB_MAX);
600 ptr[0] -= 2;
601 break;
602 default:
603 abort();
604 /*NOTREACHED*/
605 }
606 }
607
608 /* The code here handling the various combinations of output options isn't
609 too attractive, but it works and is fairly clean. */
610
/* Divisor for converting a time at size N into a time per unit of work,
   according to option_square: N*N for 1, N*(N+1)/2 for 2, otherwise N.

   The value is calculated as a double, which is how it's used in
   run_one().  Doing the products in the integer type of N would overflow
   for large sizes (above 46340 when that type is 32 bits).  N is evaluated
   more than once.  */
#define SIZE_TO_DIVISOR(n)                                              \
  (option_square == 1 ? (double) (n) * (double) (n)                     \
   : option_square == 2 ? (double) (n) * ((double) (n) + 1.0) / 2.0     \
   : (double) (n))
615
616 void
run_one(FILE * fp,struct speed_params * s,mp_size_t prev_size)617 run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
618 {
619 const char *first_open_fastest, *first_open_notfastest, *first_close;
620 int i, fastest, want_data;
621 double fastest_time;
622 TMP_DECL;
623
624 TMP_MARK;
625
626 /* allocate data, unless all routines are NODATA */
627 want_data = 0;
628 for (i = 0; i < num_choices; i++)
629 want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
630
631 if (want_data)
632 {
633 SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
634 SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
635
636 data_fill (s->xp, s->size);
637 data_fill (s->yp, s->size);
638 }
639 else
640 {
641 sp.xp = NULL;
642 sp.yp = NULL;
643 }
644
645 if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
646 {
647 first_open_fastest = "(#";
648 first_open_notfastest = " (";
649 first_close = ")";
650 }
651 else
652 {
653 first_open_fastest = "#";
654 first_open_notfastest = " ";
655 first_close = "";
656 }
657
658 fastest = -1;
659 fastest_time = -1.0;
660 for (i = 0; i < num_choices; i++)
661 {
662 s->r = choice[i].r;
663 choice[i].time = speed_measure (choice[i].p->fun, s);
664 choice[i].no_time = (choice[i].time == -1.0);
665 if (! choice[i].no_time)
666 choice[i].time *= choice[i].scale;
667
668 /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
669 is before any differences. */
670 {
671 double t;
672 t = choice[i].time;
673 if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
674 {
675 if (choice[i].prev_time == -1.0)
676 choice[i].no_time = 1;
677 else
678 choice[i].time = choice[i].time - choice[i].prev_time;
679 }
680 choice[i].prev_time = t;
681 }
682
683 if (choice[i].no_time)
684 continue;
685
686 /* Look for the fastest after CMP_DIFFPREV has been applied, but
687 before CMP_RATIO or CMP_DIFFERENCE. There's only a fastest shown
688 if there's more than one routine. */
689 if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
690 {
691 fastest = i;
692 fastest_time = choice[i].time;
693 }
694
695 if (option_cmp == CMP_DIFFPREV)
696 {
697 /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
698 if (option_unit == UNIT_CYCLES)
699 choice[i].time /= speed_cycletime;
700 else if (option_unit == UNIT_CYCLESPERLIMB)
701 {
702 if (prev_size == -1)
703 choice[i].time /= speed_cycletime;
704 else
705 choice[i].time /= (speed_cycletime
706 * (SIZE_TO_DIVISOR(s->size)
707 - SIZE_TO_DIVISOR(prev_size)));
708 }
709 }
710 else
711 {
712 if (option_unit == UNIT_CYCLES)
713 choice[i].time /= speed_cycletime;
714 else if (option_unit == UNIT_CYCLESPERLIMB)
715 choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
716
717 if (option_cmp == CMP_RATIO && i > 0)
718 {
719 /* A ratio isn't affected by the units chosen. */
720 if (choice[0].no_time || choice[0].time == 0.0)
721 choice[i].no_time = 1;
722 else
723 choice[i].time /= choice[0].time;
724 }
725 else if (option_cmp == CMP_DIFFERENCE && i > 0)
726 {
727 if (choice[0].no_time)
728 {
729 choice[i].no_time = 1;
730 continue;
731 }
732 choice[i].time -= choice[0].time;
733 }
734 }
735 }
736
737 if (option_gnuplot)
738 {
739 /* In CMP_DIFFPREV, don't print anything for the first size, start
740 with the second where an actual difference is available.
741
742 In CMP_RATIO, print the first column as 1.0.
743
744 The 9 decimals printed is much more than the expected precision of
745 the measurements actually. */
746
747 if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
748 {
749 fprintf (fp, "%-6ld ", s->size);
750 for (i = 0; i < num_choices; i++)
751 fprintf (fp, " %.9e",
752 choice[i].no_time ? 0.0
753 : (option_cmp == CMP_RATIO && i == 0) ? 1.0
754 : choice[i].time);
755 fprintf (fp, "\n");
756 }
757 }
758 else
759 {
760 fprintf (fp, "%-6ld ", s->size);
761 for (i = 0; i < num_choices; i++)
762 {
763 char buf[128];
764 int decimals;
765
766 if (choice[i].no_time)
767 {
768 fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
769 }
770 else
771 {if (option_unit == UNIT_CYCLESPERLIMB
772 || (option_cmp == CMP_RATIO && i > 0))
773 decimals = 4;
774 else if (option_unit == UNIT_CYCLES)
775 decimals = 2;
776 else
777 decimals = 9;
778
779 sprintf (buf, "%s%.*f%s",
780 i == fastest ? first_open_fastest : first_open_notfastest,
781 decimals, choice[i].time, first_close);
782 fprintf (fp, " %*s", COLUMN_WIDTH, buf);
783 }
784 }
785 fprintf (fp, "\n");
786 }
787
788 TMP_FREE;
789 }
790
791 void
run_all(FILE * fp)792 run_all (FILE *fp)
793 {
794 mp_size_t prev_size;
795 int i;
796 TMP_DECL;
797
798 TMP_MARK;
799 SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
800 SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
801
802 data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
803 data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
804
805 for (i = 0; i < size_num; i++)
806 {
807 sp.size = size_array[i].start;
808 prev_size = -1;
809 for (;;)
810 {
811 mp_size_t step;
812
813 if (option_data == DATA_2FD && sp.size >= 2)
814 sp.xp[sp.size-1] = 2;
815
816 run_one (fp, &sp, prev_size);
817 prev_size = sp.size;
818
819 if (option_data == DATA_2FD && sp.size >= 2)
820 sp.xp[sp.size-1] = MP_LIMB_T_MAX;
821
822 if (option_factor != 0.0)
823 {
824 step = (mp_size_t) (sp.size * option_factor - sp.size);
825 if (step < 1)
826 step = 1;
827 }
828 else
829 step = 1;
830 if (step < option_step)
831 step = option_step;
832
833 sp.size += step;
834 if (sp.size > size_array[i].end)
835 break;
836 }
837 }
838
839 TMP_FREE;
840 }
841
842
/* Open FILENAME for writing and return the stream.  If it can't be opened,
   print a message to stderr and exit with status 1, so the return value is
   never NULL.  */
FILE *
fopen_for_write (const char *filename)
{
  FILE  *stream = fopen (filename, "w");

  if (stream != NULL)
    return stream;

  fprintf (stderr, "Cannot create %s\n", filename);
  exit (1);
}
854
/* Close FP, a stream which has been written to.  If the stream's error
   indicator is set, or the close itself fails, print a message naming
   FILENAME to stderr and exit with status 1.  */
void
fclose_written (FILE *fp, const char *filename)
{
  /* the error indicator must be read before the stream is closed */
  int  write_failed = ferror (fp);
  int  close_failed = fclose (fp);

  if (write_failed != 0 || close_failed != 0)
    {
      fprintf (stderr, "Error writing %s\n", filename);
      exit (1);
    }
}
869
870
871 void
run_gnuplot(int argc,char * argv[])872 run_gnuplot (int argc, char *argv[])
873 {
874 char *plot_filename;
875 char *data_filename;
876 FILE *fp;
877 int i;
878
879 plot_filename = (char *) (*__gmp_allocate_func)
880 (strlen (option_gnuplot_basename) + 20);
881 data_filename = (char *) (*__gmp_allocate_func)
882 (strlen (option_gnuplot_basename) + 20);
883
884 sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
885 sprintf (data_filename, "%s.data", option_gnuplot_basename);
886
887 fp = fopen_for_write (plot_filename);
888
889 fprintf (fp, "# Generated with:\n");
890 fprintf (fp, "#");
891 for (i = 0; i < argc; i++)
892 fprintf (fp, " %s", argv[i]);
893 fprintf (fp, "\n");
894 fprintf (fp, "\n");
895
896 fprintf (fp, "reset\n");
897
898 /* Putting the key at the top left is usually good, and you can change it
899 interactively if it's not. */
900 fprintf (fp, "set key left\n");
901
902 /* write underscores, not subscripts */
903 fprintf (fp, "set termoption noenhanced\n");
904
905 /* designed to make it possible to see crossovers easily */
906 fprintf (fp, "set style data lines\n");
907
908 fprintf (fp, "plot ");
909 for (i = 0; i < num_choices; i++)
910 {
911 fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
912 fprintf (fp, " title \"%s\"", choice[i].name);
913
914 if (i != num_choices-1)
915 fprintf (fp, ", \\");
916 fprintf (fp, "\n");
917 }
918
919 fprintf (fp, "load \"-\"\n");
920 fclose_written (fp, plot_filename);
921
922 fp = fopen_for_write (data_filename);
923
924 /* Unbuffered so you can see where the program was up to if it crashes or
925 you kill it. */
926 setbuf (fp, NULL);
927
928 run_all (fp);
929 fclose_written (fp, data_filename);
930 }
931
932
/* A limb whose N least significant bits are ones and the rest zeros, for
   0 <= N <= GMP_LIMB_BITS.  The two end values are handled separately so
   the shift count is always below GMP_LIMB_BITS.  N is evaluated more than
   once.  */

#define LIMB_ONES(n)                            \
  ((n) == 0 ? CNST_LIMB(0)                      \
   : (n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX       \
   : (CNST_LIMB(1) << (n)) - 1)
939
940 mp_limb_t
r_string(const char * s)941 r_string (const char *s)
942 {
943 const char *s_orig = s;
944 long n;
945
946 if (strcmp (s, "aas") == 0)
947 return GMP_NUMB_0xAA;
948
949 {
950 mpz_t z;
951 mp_limb_t l;
952 int set, siz;
953
954 mpz_init (z);
955 set = mpz_set_str (z, s, 0);
956 siz = SIZ(z);
957 l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
958 mpz_clear (z);
959 if (set == 0)
960 {
961 if (siz > 1 || siz < -1)
962 printf ("Warning, r parameter %s truncated to %d bits\n",
963 s_orig, GMP_LIMB_BITS);
964 return l;
965 }
966 }
967
968 if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
969 n = strtoul (s+2, (char **) &s, 16);
970 else
971 n = strtol (s, (char **) &s, 10);
972
973 if (strcmp (s, "bits") == 0)
974 {
975 mp_limb_t l;
976 if (n > GMP_LIMB_BITS)
977 {
978 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
979 n, GMP_LIMB_BITS);
980 exit (1);
981 }
982 mpn_random (&l, 1);
983 return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
984 }
985 else if (strcmp (s, "ones") == 0)
986 {
987 if (n > GMP_LIMB_BITS)
988 {
989 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
990 n, GMP_LIMB_BITS);
991 exit (1);
992 }
993 return LIMB_ONES (n);
994 }
995 else if (*s != '\0')
996 {
997 fprintf (stderr, "invalid r parameter: %s\n", s_orig);
998 exit (1);
999 }
1000
1001 return n;
1002 }
1003
1004
1005 void
routine_find(struct choice_t * c,const char * s_orig)1006 routine_find (struct choice_t *c, const char *s_orig)
1007 {
1008 const char *s;
1009 int i;
1010 size_t nlen;
1011
1012 c->name = s_orig;
1013 s = strchr (s_orig, '*');
1014 if (s != NULL)
1015 {
1016 c->scale = atof(s_orig);
1017 s++;
1018 }
1019 else
1020 {
1021 c->scale = 1.0;
1022 s = s_orig;
1023 }
1024
1025 for (i = 0; i < numberof (routine); i++)
1026 {
1027 nlen = strlen (routine[i].name);
1028 if (memcmp (s, routine[i].name, nlen) != 0)
1029 continue;
1030
1031 if (s[nlen] == '.')
1032 {
1033 /* match, with a .r parameter */
1034
1035 if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
1036 {
1037 fprintf (stderr,
1038 "Choice %s bad: doesn't take a \".<r>\" parameter\n",
1039 s_orig);
1040 exit (1);
1041 }
1042
1043 c->p = &routine[i];
1044 c->r = r_string (s + nlen + 1);
1045 return;
1046 }
1047
1048 if (s[nlen] == '\0')
1049 {
1050 /* match, with no parameter */
1051
1052 if (routine[i].flag & FLAG_R)
1053 {
1054 fprintf (stderr,
1055 "Choice %s bad: needs a \".<r>\" parameter\n",
1056 s_orig);
1057 exit (1);
1058 }
1059
1060 c->p = &routine[i];
1061 c->r = 0;
1062 return;
1063 }
1064 }
1065
1066 fprintf (stderr, "Choice %s unrecognised\n", s_orig);
1067 exit (1);
1068 }
1069
1070
1071 void
usage(void)1072 usage (void)
1073 {
1074 int i;
1075
1076 speed_time_init ();
1077
1078 printf ("Usage: speed [-options] -s size <routine>...\n");
1079 printf ("Measure the speed of some routines.\n");
1080 printf ("Times are in seconds, accuracy is shown.\n");
1081 printf ("\n");
1082 printf (" -p num set precision as number of time units each routine must run\n");
1083 printf (" -s size[-end][,size[-end]]... sizes to measure\n");
1084 printf (" single sizes or ranges, sep with comma or use multiple -s\n");
1085 printf (" -t step step through sizes by given amount\n");
1086 printf (" -f factor step through sizes by given factor (eg. 1.05)\n");
1087 printf (" -r show times as ratios of the first routine\n");
1088 printf (" -d show times as difference from the first routine\n");
1089 printf (" -D show times as difference from previous size shown\n");
1090 printf (" -c show times in CPU cycles\n");
1091 printf (" -C show times in cycles per limb\n");
1092 printf (" -u print resource usage (memory) at end\n");
1093 printf (" -P name output plot files \"name.gnuplot\" and \"name.data\"\n");
1094 printf (" -a <type> use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
1095 printf (" -x, -y, -w, -W <align> specify data alignments, sources and dests\n");
1096 printf (" -o addrs print addresses of data blocks\n");
1097 printf ("\n");
1098 printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
1099 printf ("is greater.\n");
1100 printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
1101 printf ("size and the previous size.\n");
1102 printf ("\n");
1103 printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
1104 printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
1105 printf ("a log/log plot).\n");
1106 printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
1107 printf ("when viewing more than one routine, it means same axis scales for all data).\n");
1108 printf ("\n");
1109 printf ("The available routines are as follows.\n");
1110 printf ("\n");
1111
1112 for (i = 0; i < numberof (routine); i++)
1113 {
1114 if (routine[i].flag & FLAG_R)
1115 printf ("\t%s.r\n", routine[i].name);
1116 else if (routine[i].flag & FLAG_R_OPTIONAL)
1117 printf ("\t%s (optional .r)\n", routine[i].name);
1118 else
1119 printf ("\t%s\n", routine[i].name);
1120 }
1121 printf ("\n");
1122 printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
1123 printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
1124 printf ("\n");
1125 printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
1126 printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
1127 printf ("\n");
1128 printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
1129 printf ("The fastest routine at each size is marked with a # (free form output only).\n");
1130 printf ("\n");
1131 printf ("%s", speed_time_string);
1132 printf ("\n");
1133 printf ("Gnuplot home page http://www.gnuplot.info/\n");
1134 printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
1135 }
1136
1137 void
check_align_option(const char * name,mp_size_t align)1138 check_align_option (const char *name, mp_size_t align)
1139 {
1140 if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
1141 {
1142 fprintf (stderr, "Alignment request out of range: %s %ld\n",
1143 name, (long) align);
1144 fprintf (stderr, " should be 0 to %d (limbs), inclusive\n",
1145 SPEED_TMP_ALLOC_ADJUST_MASK);
1146 exit (1);
1147 }
1148 }
1149
1150 int
main(int argc,char * argv[])1151 main (int argc, char *argv[])
1152 {
1153 int i;
1154 int opt;
1155
1156 /* Unbuffered so output goes straight out when directed to a pipe or file
1157 and isn't lost on killing the program half way. */
1158 setbuf (stdout, NULL);
1159
1160 for (;;)
1161 {
1162 opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
1163 if (opt == EOF)
1164 break;
1165
1166 switch (opt) {
1167 case 'a':
1168 if (strcmp (optarg, "random") == 0) option_data = DATA_RANDOM;
1169 else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
1170 else if (strcmp (optarg, "zeros") == 0) option_data = DATA_ZEROS;
1171 else if (strcmp (optarg, "aas") == 0) option_data = DATA_AAS;
1172 else if (strcmp (optarg, "ffs") == 0) option_data = DATA_FFS;
1173 else if (strcmp (optarg, "2fd") == 0) option_data = DATA_2FD;
1174 else
1175 {
1176 fprintf (stderr, "unrecognised data option: %s\n", optarg);
1177 exit (1);
1178 }
1179 break;
1180 case 'C':
1181 if (option_unit != UNIT_SECONDS) goto bad_unit;
1182 option_unit = UNIT_CYCLESPERLIMB;
1183 break;
1184 case 'c':
1185 if (option_unit != UNIT_SECONDS)
1186 {
1187 bad_unit:
1188 fprintf (stderr, "cannot use more than one of -c, -C\n");
1189 exit (1);
1190 }
1191 option_unit = UNIT_CYCLES;
1192 break;
1193 case 'D':
1194 if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
1195 option_cmp = CMP_DIFFPREV;
1196 break;
1197 case 'd':
1198 if (option_cmp != CMP_ABSOLUTE)
1199 {
1200 bad_cmp:
1201 fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
1202 exit (1);
1203 }
1204 option_cmp = CMP_DIFFERENCE;
1205 break;
1206 case 'E':
1207 option_square = 1;
1208 break;
1209 case 'F':
1210 option_square = 2;
1211 break;
1212 case 'f':
1213 option_factor = atof (optarg);
1214 if (option_factor <= 1.0)
1215 {
1216 fprintf (stderr, "-f factor must be > 1.0\n");
1217 exit (1);
1218 }
1219 break;
1220 case 'o':
1221 speed_option_set (optarg);
1222 break;
1223 case 'P':
1224 option_gnuplot = 1;
1225 option_gnuplot_basename = optarg;
1226 break;
1227 case 'p':
1228 speed_precision = atoi (optarg);
1229 break;
1230 case 'R':
1231 option_seed = time (NULL);
1232 break;
1233 case 'r':
1234 if (option_cmp != CMP_ABSOLUTE)
1235 goto bad_cmp;
1236 option_cmp = CMP_RATIO;
1237 break;
1238 case 's':
1239 {
1240 char *s;
1241 for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
1242 {
1243 if (size_num == size_allocnum)
1244 {
1245 size_array = (struct size_array_t *)
1246 __gmp_allocate_or_reallocate
1247 (size_array,
1248 size_allocnum * sizeof(size_array[0]),
1249 (size_allocnum+10) * sizeof(size_array[0]));
1250 size_allocnum += 10;
1251 }
1252 if (sscanf (s, "%ld-%ld",
1253 &size_array[size_num].start,
1254 &size_array[size_num].end) != 2)
1255 {
1256 size_array[size_num].start = size_array[size_num].end
1257 = atol (s);
1258 }
1259
1260 if (size_array[size_num].start < 0
1261 || size_array[size_num].end < 0
1262 || size_array[size_num].start > size_array[size_num].end)
1263 {
1264 fprintf (stderr, "invalid size parameter: %s\n", s);
1265 exit (1);
1266 }
1267
1268 size_num++;
1269 }
1270 }
1271 break;
1272 case 't':
1273 option_step = atol (optarg);
1274 if (option_step < 1)
1275 {
1276 fprintf (stderr, "-t step must be >= 1\n");
1277 exit (1);
1278 }
1279 break;
1280 case 'u':
1281 option_resource_usage = 1;
1282 break;
1283 case 'z':
1284 sp.cache = 1;
1285 break;
1286 case 'x':
1287 sp.align_xp = atol (optarg);
1288 check_align_option ("-x", sp.align_xp);
1289 break;
1290 case 'y':
1291 sp.align_yp = atol (optarg);
1292 check_align_option ("-y", sp.align_yp);
1293 break;
1294 case 'w':
1295 sp.align_wp = atol (optarg);
1296 check_align_option ("-w", sp.align_wp);
1297 break;
1298 case 'W':
1299 sp.align_wp2 = atol (optarg);
1300 check_align_option ("-W", sp.align_wp2);
1301 break;
1302 case '?':
1303 exit(1);
1304 }
1305 }
1306
1307 if (optind >= argc)
1308 {
1309 usage ();
1310 exit (1);
1311 }
1312
1313 if (size_num == 0)
1314 {
1315 fprintf (stderr, "-s <size> must be specified\n");
1316 exit (1);
1317 }
1318
1319 gmp_randinit_default (__gmp_rands);
1320 __gmp_rands_initialized = 1;
1321 gmp_randseed_ui (__gmp_rands, option_seed);
1322
1323 choice = (struct choice_t *) (*__gmp_allocate_func)
1324 ((argc - optind) * sizeof(choice[0]));
1325 for ( ; optind < argc; optind++)
1326 {
1327 struct choice_t c;
1328 routine_find (&c, argv[optind]);
1329 choice[num_choices] = c;
1330 num_choices++;
1331 }
1332
1333 if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
1334 num_choices < 2)
1335 {
1336 fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
1337 }
1338
1339 speed_time_init ();
1340 if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
1341 speed_cycletime_need_cycles ();
1342 else
1343 speed_cycletime_need_seconds ();
1344
1345 if (option_gnuplot)
1346 {
1347 run_gnuplot (argc, argv);
1348 }
1349 else
1350 {
1351 if (option_unit == UNIT_SECONDS)
1352 printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
1353 else
1354 printf ("overhead %.2f cycles",
1355 speed_measure (speed_noop, NULL) / speed_cycletime);
1356 printf (", precision %d units of %.2e secs",
1357 speed_precision, speed_unittime);
1358
1359 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1360 printf (", CPU freq unknown\n");
1361 else
1362 printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1363
1364 printf (" ");
1365 for (i = 0; i < num_choices; i++)
1366 printf (" %*s", COLUMN_WIDTH, choice[i].name);
1367 printf ("\n");
1368
1369 run_all (stdout);
1370 }
1371
1372 if (option_resource_usage)
1373 {
1374 #if HAVE_GETRUSAGE
1375 {
1376 /* This doesn't give data sizes on linux 2.0.x, only utime. */
1377 struct rusage r;
1378 if (getrusage (RUSAGE_SELF, &r) != 0)
1379 perror ("getrusage");
1380 else
1381 printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
1382 (long) r.ru_utime.tv_sec, (long) r.ru_utime.tv_usec,
1383 r.ru_idrss, r.ru_isrss, r.ru_ixrss);
1384 }
1385 #else
1386 printf ("getrusage() not available\n");
1387 #endif
1388
1389 /* Linux kernel. */
1390 {
1391 char buf[128];
1392 sprintf (buf, "/proc/%d/status", getpid());
1393 if (access (buf, R_OK) == 0)
1394 {
1395 sprintf (buf, "cat /proc/%d/status", getpid());
1396 system (buf);
1397 }
1398
1399 }
1400 }
1401
1402 return 0;
1403 }
1404