xref: /netbsd-src/external/lgpl3/gmp/dist/tune/speed.h (revision e5d758f832e07a177fa24707c434b7ce26d0f762)
1 /* Header for speed and threshold things.
2 
3 Copyright 1999-2003, 2005, 2006, 2008-2017, 2019 Free Software
4 Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10 
11   * the GNU Lesser General Public License as published by the Free
12     Software Foundation; either version 3 of the License, or (at your
13     option) any later version.
14 
15 or
16 
17   * the GNU General Public License as published by the Free Software
18     Foundation; either version 2 of the License, or (at your option) any
19     later version.
20 
21 or both in parallel, as here.
22 
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26 for more details.
27 
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library.  If not,
30 see https://www.gnu.org/licenses/.  */
31 
32 #ifndef __SPEED_H__
33 #define __SPEED_H__
34 
35 
/* Zero-pad {ptr,oldsize} at its most significant end so that it becomes
   newsize limbs long.  newsize must be at least oldsize.  The arguments
   can be evaluated more than once, so they should have no side effects.  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)                          \
  do                                                                    \
    {                                                                   \
      ASSERT ((newsize) >= (oldsize));                                  \
      MPN_ZERO ((ptr) + (oldsize), (newsize) - (oldsize));              \
    }                                                                   \
  while (0)
43 
/* A limb with only its n least significant bits set.  n is expected to be
   in the range 0 to GMP_LIMB_BITS inclusive.  The full-width case gets its
   own branch because a shift by GMP_LIMB_BITS can't be relied on to give
   zero (1<<32 doesn't on x86 family CPUs, for instance).  */
#define MP_LIMB_T_LOWBITMASK(n)                                         \
  ((n) != GMP_LIMB_BITS                                                 \
   ? ((mp_limb_t) 1 << (n)) - 1                                         \
   : MP_LIMB_T_MAX)
48 
49 
/* TMP_ALLOC a block with align-1 bytes of slack and hand it to
   align_pointer together with align.  align must be a power of 2 here,
   usually CACHE_LINE_SIZE is a good choice.  */
#define TMP_ALLOC_ALIGNED(bytes, align)                                 \
  align_pointer (TMP_ALLOC ((bytes) + (align) - 1), (align))

/* The same, but with the size given in limbs and the result cast to
   mp_ptr.  align is still in bytes.  */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)                           \
  ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs) * sizeof (mp_limb_t), align))
56 
/* CACHE_LINE_SIZE is the default alignment for speed operands, and it also
   bounds the off-alignment that s->align_xp etc can request.  Maybe this
   should be an option of some sort, but in any case here are some line
   sizes,

       bytes
         32   pentium
         64   athlon
         64   itanium-2 L1
        128   itanium-2 L2
*/
#define CACHE_LINE_SIZE  64  /* bytes */

/* Limbs per cache line, less one.  SPEED_TMP_ALLOC_LIMBS uses this both as
   the amount of slack to allocate and as a bit mask on limb offsets, so
   the number of limbs per line is expected to be a power of 2.  */
#define SPEED_TMP_ALLOC_ADJUST_MASK  ((CACHE_LINE_SIZE / GMP_LIMB_BYTES) - 1)
71 
/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment.  That is, the address of ptr counted in limbs is made
   congruent to align modulo the number of limbs in a cache line.

   SPEED_TMP_ALLOC_ADJUST_MASK extra limbs are allocated so the start can
   be moved forward by up to that many limbs.  The current position of the
   raw block is found by converting its address to an integer; subtracting
   a null pointer from it would be undefined behaviour, since the two
   pointers do not point into the same array.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)                        \
  do {                                                                  \
    mp_ptr     __ptr;                                                   \
    mp_size_t  __ptr_align, __ptr_add;                                  \
                                                                        \
    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);                   \
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);    \
    /* limb offset of the raw block within its cache line */            \
    __ptr_align = (mp_size_t) (((size_t) __ptr / sizeof (mp_limb_t))    \
                               & SPEED_TMP_ALLOC_ADJUST_MASK);          \
    /* limbs to skip to reach the requested offset */                   \
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;  \
    (ptr) = __ptr + __ptr_add;                                          \
  } while (0)
85 
86 
/* This is the size for s->xp_block and s->yp_block, used in certain
   routines that want to run across many different data values and use
   s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.

   With 4-byte limbs 512 means 2kbytes of data for each of xp_block and
   yp_block, making 4k total (twice that with 8-byte limbs), which should
   fit easily in any L1 data cache. */

#define SPEED_BLOCK_SIZE  512  /* limbs */
95 
96 
/* Timing state, defined elsewhere.  */
extern double speed_unittime;
extern double speed_cycletime;
extern int    speed_precision;
extern char   speed_time_string[];

/* Timing setup.  */
void   speed_time_init (void);
void   speed_cycletime_fail (const char *str);
void   speed_cycletime_init (void);
void   speed_cycletime_need_cycles (void);
void   speed_cycletime_need_seconds (void);

/* The measuring macros below call speed_starttime just before the code
   being timed and take the value of speed_endtime just after it.  */
void   speed_starttime (void);
double speed_endtime (void);
108 
109 
110 struct speed_params {
111   unsigned   reps;	/* how many times to run the routine */
112   mp_ptr     xp;	/* first argument */
113   mp_ptr     yp;	/* second argument */
114   mp_size_t  size;	/* size of both arguments */
115   mp_limb_t  r;		/* user supplied parameter */
116   mp_size_t  align_xp;	/* alignment of xp */
117   mp_size_t  align_yp;	/* alignment of yp */
118   mp_size_t  align_wp;	/* intended alignment of wp */
119   mp_size_t  align_wp2; /* intended alignment of wp2 */
120   mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
121   mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
122 
123   double     time_divisor; /* optionally set by the speed routine */
124 
125   /* used by the cache priming things */
126   int	     cache;
127   unsigned   src_num, dst_num;
128   struct {
129     mp_ptr    ptr;
130     mp_size_t size;
131   } src[5], dst[4];
132 };
133 
/* A speed routine takes the parameters and returns a double.  The routines
   built from the macros below return the measured time, or -1.0 from
   SPEED_RESTRICT_COND when the parameters are not acceptable.  */
typedef double (*speed_function_t) (struct speed_params *s);

double speed_measure (speed_function_t fun, struct speed_params *s);
137 
/* Prototypes for speed measuring routines.  Each one has the
   speed_function_t signature.  */

double speed_back_to_back (struct speed_params *s);
double speed_count_leading_zeros (struct speed_params *s);
double speed_count_trailing_zeros (struct speed_params *s);
double speed_find_a (struct speed_params *s);
double speed_gmp_allocate_free (struct speed_params *s);
double speed_gmp_allocate_reallocate_free (struct speed_params *s);
double speed_invert_limb (struct speed_params *s);
double speed_malloc_free (struct speed_params *s);
double speed_malloc_realloc_free (struct speed_params *s);
double speed_memcpy (struct speed_params *s);
double speed_binvert_limb (struct speed_params *s);
double speed_binvert_limb_mul1 (struct speed_params *s);
double speed_binvert_limb_loop (struct speed_params *s);
double speed_binvert_limb_cond (struct speed_params *s);
double speed_binvert_limb_arith (struct speed_params *s);

double speed_mpf_init_clear (struct speed_params *s);
157 
/* mpn speed routines, first part.  */
double speed_mpn_add_n (struct speed_params *s);
double speed_mpn_add_1 (struct speed_params *s);
double speed_mpn_add_1_inplace (struct speed_params *s);
double speed_mpn_add_err1_n (struct speed_params *s);
double speed_mpn_add_err2_n (struct speed_params *s);
double speed_mpn_add_err3_n (struct speed_params *s);
double speed_mpn_addlsh_n (struct speed_params *s);
double speed_mpn_addlsh1_n (struct speed_params *s);
double speed_mpn_addlsh2_n (struct speed_params *s);
double speed_mpn_addlsh_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh1_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh2_n_ip1 (struct speed_params *s);
double speed_mpn_addlsh_n_ip2 (struct speed_params *s);
double speed_mpn_addlsh1_n_ip2 (struct speed_params *s);
double speed_mpn_addlsh2_n_ip2 (struct speed_params *s);
double speed_mpn_add_n_sub_n (struct speed_params *s);
double speed_mpn_and_n (struct speed_params *s);
double speed_mpn_andn_n (struct speed_params *s);
double speed_mpn_addmul_1 (struct speed_params *s);
double speed_mpn_addmul_2 (struct speed_params *s);
double speed_mpn_addmul_3 (struct speed_params *s);
double speed_mpn_addmul_4 (struct speed_params *s);
double speed_mpn_addmul_5 (struct speed_params *s);
double speed_mpn_addmul_6 (struct speed_params *s);
double speed_mpn_addmul_7 (struct speed_params *s);
double speed_mpn_addmul_8 (struct speed_params *s);
double speed_mpn_cnd_add_n (struct speed_params *s);
double speed_mpn_cnd_sub_n (struct speed_params *s);
double speed_mpn_com (struct speed_params *s);
double speed_mpn_neg (struct speed_params *s);
double speed_mpn_copyd (struct speed_params *s);
double speed_mpn_copyi (struct speed_params *s);
double speed_MPN_COPY (struct speed_params *s);
double speed_MPN_COPY_DECR (struct speed_params *s);
double speed_MPN_COPY_INCR (struct speed_params *s);
double speed_mpn_sec_tabselect (struct speed_params *s);
double speed_mpn_divexact_1 (struct speed_params *s);
double speed_mpn_divexact_by3 (struct speed_params *s);
double speed_mpn_bdiv_q_1 (struct speed_params *s);
double speed_mpn_pi1_bdiv_q_1 (struct speed_params *s);
double speed_mpn_bdiv_dbm1c (struct speed_params *s);
double speed_mpn_divrem_1 (struct speed_params *s);
double speed_mpn_divrem_1f (struct speed_params *s);
double speed_mpn_divrem_1c (struct speed_params *s);
double speed_mpn_divrem_1cf (struct speed_params *s);
double speed_mpn_divrem_1_div (struct speed_params *s);
double speed_mpn_divrem_1f_div (struct speed_params *s);
double speed_mpn_divrem_1_inv (struct speed_params *s);
double speed_mpn_divrem_1f_inv (struct speed_params *s);
double speed_mpn_divrem_2 (struct speed_params *s);
double speed_mpn_divrem_2_div (struct speed_params *s);
double speed_mpn_divrem_2_inv (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1 (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s);
double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s);
double speed_mpn_div_qr_1 (struct speed_params *s);
double speed_mpn_div_qr_2n (struct speed_params *s);
double speed_mpn_div_qr_2u (struct speed_params *s);
double speed_mpn_fib2_ui (struct speed_params *s);
double speed_mpn_matrix22_mul (struct speed_params *s);
double speed_mpn_hgcd2 (struct speed_params *s);
double speed_mpn_hgcd2_1 (struct speed_params *s);
double speed_mpn_hgcd2_2 (struct speed_params *s);
double speed_mpn_hgcd2_3 (struct speed_params *s);
double speed_mpn_hgcd2_4 (struct speed_params *s);
double speed_mpn_hgcd2_5 (struct speed_params *s);
double speed_mpn_hgcd (struct speed_params *s);
double speed_mpn_hgcd_lehmer (struct speed_params *s);
double speed_mpn_hgcd_appr (struct speed_params *s);
double speed_mpn_hgcd_appr_lehmer (struct speed_params *s);
double speed_mpn_hgcd_reduce (struct speed_params *s);
double speed_mpn_hgcd_reduce_1 (struct speed_params *s);
double speed_mpn_hgcd_reduce_2 (struct speed_params *s);
double speed_mpn_gcd (struct speed_params *s);
double speed_mpn_gcd_1 (struct speed_params *s);
double speed_mpn_gcd_11 (struct speed_params *s);
double speed_mpn_gcd_1N (struct speed_params *s);
double speed_mpn_gcd_22 (struct speed_params *s);
double speed_mpn_gcdext (struct speed_params *s);
double speed_mpn_gcdext_double (struct speed_params *s);
double speed_mpn_gcdext_one_double (struct speed_params *s);
double speed_mpn_gcdext_one_single (struct speed_params *s);
double speed_mpn_gcdext_single (struct speed_params *s);
double speed_mpn_get_str (struct speed_params *s);
double speed_mpn_hamdist (struct speed_params *s);
double speed_mpn_ior_n (struct speed_params *s);
double speed_mpn_iorn_n (struct speed_params *s);
double speed_mpn_jacobi_base (struct speed_params *s);
double speed_mpn_jacobi_base_1 (struct speed_params *s);
double speed_mpn_jacobi_base_2 (struct speed_params *s);
double speed_mpn_jacobi_base_3 (struct speed_params *s);
double speed_mpn_jacobi_base_4 (struct speed_params *s);
double speed_mpn_lshift (struct speed_params *s);
double speed_mpn_lshiftc (struct speed_params *s);
double speed_mpn_mod_1 (struct speed_params *s);
double speed_mpn_mod_1c (struct speed_params *s);
double speed_mpn_mod_1_div (struct speed_params *s);
double speed_mpn_mod_1_inv (struct speed_params *s);
double speed_mpn_mod_1_1 (struct speed_params *s);
double speed_mpn_mod_1_1_1 (struct speed_params *s);
double speed_mpn_mod_1_1_2 (struct speed_params *s);
double speed_mpn_mod_1_2 (struct speed_params *s);
double speed_mpn_mod_1_3 (struct speed_params *s);
double speed_mpn_mod_1_4 (struct speed_params *s);
double speed_mpn_mod_34lsub1 (struct speed_params *s);
double speed_mpn_modexact_1_odd (struct speed_params *s);
double speed_mpn_modexact_1c_odd (struct speed_params *s);
double speed_mpn_mul_1 (struct speed_params *s);
double speed_mpn_mul_1_inplace (struct speed_params *s);
double speed_mpn_mul_2 (struct speed_params *s);
double speed_mpn_mul_3 (struct speed_params *s);
double speed_mpn_mul_4 (struct speed_params *s);
double speed_mpn_mul_5 (struct speed_params *s);
double speed_mpn_mul_6 (struct speed_params *s);
double speed_mpn_mul (struct speed_params *s);
double speed_mpn_mul_basecase (struct speed_params *s);
double speed_mpn_mulmid (struct speed_params *s);
double speed_mpn_mulmid_basecase (struct speed_params *s);
double speed_mpn_mul_fft (struct speed_params *s);
double speed_mpn_mul_fft_sqr (struct speed_params *s);
double speed_mpn_fft_mul (struct speed_params *s);
double speed_mpn_fft_sqr (struct speed_params *s);
#if WANT_OLD_FFT_FULL
double speed_mpn_mul_fft_full (struct speed_params *s);
double speed_mpn_mul_fft_full_sqr (struct speed_params *s);
#endif
/* mpn speed routines, second part.  */
double speed_mpn_nussbaumer_mul (struct speed_params *s);
double speed_mpn_nussbaumer_mul_sqr (struct speed_params *s);
double speed_mpn_mul_n (struct speed_params *s);
double speed_mpn_mul_n_sqr (struct speed_params *s);
double speed_mpn_mulmid_n (struct speed_params *s);
double speed_mpn_sqrlo (struct speed_params *s);
double speed_mpn_sqrlo_basecase (struct speed_params *s);
double speed_mpn_mullo_n (struct speed_params *s);
double speed_mpn_mullo_basecase (struct speed_params *s);
double speed_mpn_nand_n (struct speed_params *s);
double speed_mpn_nior_n (struct speed_params *s);
double speed_mpn_popcount (struct speed_params *s);
double speed_mpn_preinv_divrem_1 (struct speed_params *s);
double speed_mpn_preinv_divrem_1f (struct speed_params *s);
double speed_mpn_preinv_mod_1 (struct speed_params *s);
double speed_mpn_sbpi1_div_qr (struct speed_params *s);
double speed_mpn_dcpi1_div_qr (struct speed_params *s);
double speed_mpn_sbpi1_divappr_q (struct speed_params *s);
double speed_mpn_dcpi1_divappr_q (struct speed_params *s);
double speed_mpn_mu_div_qr (struct speed_params *s);
double speed_mpn_mu_divappr_q (struct speed_params *s);
double speed_mpn_mupi_div_qr (struct speed_params *s);
double speed_mpn_mu_div_q (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_qr (struct speed_params *s);
double speed_mpn_dcpi1_bdiv_qr (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_q (struct speed_params *s);
double speed_mpn_dcpi1_bdiv_q (struct speed_params *s);
double speed_mpn_sbpi1_bdiv_r (struct speed_params *s);
double speed_mpn_mu_bdiv_q (struct speed_params *s);
double speed_mpn_mu_bdiv_qr (struct speed_params *s);
double speed_mpn_broot (struct speed_params *s);
double speed_mpn_broot_invm1 (struct speed_params *s);
double speed_mpn_brootinv (struct speed_params *s);
double speed_mpn_invert (struct speed_params *s);
double speed_mpn_invertappr (struct speed_params *s);
double speed_mpn_ni_invertappr (struct speed_params *s);
double speed_mpn_sec_invert (struct speed_params *s);
double speed_mpn_binvert (struct speed_params *s);
double speed_mpn_redc_1 (struct speed_params *s);
double speed_mpn_redc_2 (struct speed_params *s);
double speed_mpn_redc_n (struct speed_params *s);
double speed_mpn_rsblsh_n (struct speed_params *s);
double speed_mpn_rsblsh1_n (struct speed_params *s);
double speed_mpn_rsblsh2_n (struct speed_params *s);
double speed_mpn_rsh1add_n (struct speed_params *s);
double speed_mpn_rsh1sub_n (struct speed_params *s);
double speed_mpn_rshift (struct speed_params *s);
double speed_mpn_sb_divrem_m3 (struct speed_params *s);
double speed_mpn_sb_divrem_m3_div (struct speed_params *s);
double speed_mpn_sb_divrem_m3_inv (struct speed_params *s);
double speed_mpn_set_str (struct speed_params *s);
double speed_mpn_bc_set_str (struct speed_params *s);
double speed_mpn_dc_set_str (struct speed_params *s);
double speed_mpn_set_str_pre (struct speed_params *s);
double speed_mpn_sqr_basecase (struct speed_params *s);
double speed_mpn_sqr_diag_addlsh1 (struct speed_params *s);
double speed_mpn_sqr_diagonal (struct speed_params *s);
double speed_mpn_sqr (struct speed_params *s);
double speed_mpn_sqrtrem (struct speed_params *s);
double speed_mpn_rootrem (struct speed_params *s);
double speed_mpn_sqrt (struct speed_params *s);
double speed_mpn_root (struct speed_params *s);
double speed_mpn_perfect_power_p (struct speed_params *s);
double speed_mpn_perfect_square_p (struct speed_params *s);
double speed_mpn_sub_n (struct speed_params *s);
double speed_mpn_sub_1 (struct speed_params *s);
double speed_mpn_sub_1_inplace (struct speed_params *s);
double speed_mpn_sub_err1_n (struct speed_params *s);
double speed_mpn_sub_err2_n (struct speed_params *s);
double speed_mpn_sub_err3_n (struct speed_params *s);
double speed_mpn_sublsh_n (struct speed_params *s);
double speed_mpn_sublsh1_n (struct speed_params *s);
double speed_mpn_sublsh2_n (struct speed_params *s);
double speed_mpn_sublsh_n_ip1 (struct speed_params *s);
double speed_mpn_sublsh1_n_ip1 (struct speed_params *s);
double speed_mpn_sublsh2_n_ip1 (struct speed_params *s);
double speed_mpn_submul_1 (struct speed_params *s);
double speed_mpn_toom2_sqr (struct speed_params *s);
double speed_mpn_toom3_sqr (struct speed_params *s);
double speed_mpn_toom4_sqr (struct speed_params *s);
double speed_mpn_toom6_sqr (struct speed_params *s);
double speed_mpn_toom8_sqr (struct speed_params *s);
double speed_mpn_toom22_mul (struct speed_params *s);
double speed_mpn_toom33_mul (struct speed_params *s);
double speed_mpn_toom44_mul (struct speed_params *s);
double speed_mpn_toom6h_mul (struct speed_params *s);
double speed_mpn_toom8h_mul (struct speed_params *s);
double speed_mpn_toom32_mul (struct speed_params *s);
double speed_mpn_toom42_mul (struct speed_params *s);
double speed_mpn_toom43_mul (struct speed_params *s);
double speed_mpn_toom63_mul (struct speed_params *s);
double speed_mpn_toom32_for_toom43_mul (struct speed_params *s);
double speed_mpn_toom43_for_toom32_mul (struct speed_params *s);
double speed_mpn_toom32_for_toom53_mul (struct speed_params *s);
double speed_mpn_toom53_for_toom32_mul (struct speed_params *s);
double speed_mpn_toom42_for_toom53_mul (struct speed_params *s);
double speed_mpn_toom53_for_toom42_mul (struct speed_params *s);
double speed_mpn_toom43_for_toom54_mul (struct speed_params *s);
double speed_mpn_toom54_for_toom43_mul (struct speed_params *s);
double speed_mpn_toom42_mulmid (struct speed_params *s);
double speed_mpn_mulmod_bnm1 (struct speed_params *s);
double speed_mpn_bc_mulmod_bnm1 (struct speed_params *s);
double speed_mpn_mulmod_bnm1_rounded (struct speed_params *s);
double speed_mpn_sqrmod_bnm1 (struct speed_params *s);
double speed_mpn_udiv_qrnnd (struct speed_params *s);
double speed_mpn_udiv_qrnnd_r (struct speed_params *s);
double speed_mpn_umul_ppmm (struct speed_params *s);
double speed_mpn_umul_ppmm_r (struct speed_params *s);
double speed_mpn_xnor_n (struct speed_params *s);
double speed_mpn_xor_n (struct speed_params *s);
double speed_MPN_ZERO (struct speed_params *s);
395 
/* mpq speed routines.  */
double speed_mpq_init_clear (struct speed_params *s);

/* mpz speed routines.  */
double speed_mpz_add (struct speed_params *s);
double speed_mpz_invert (struct speed_params *s);
double speed_mpz_bin_uiui (struct speed_params *s);
double speed_mpz_bin_ui (struct speed_params *s);
double speed_mpz_fac_ui (struct speed_params *s);
double speed_mpz_2fac_ui (struct speed_params *s);
double speed_mpz_mfac_uiui (struct speed_params *s);
double speed_mpz_primorial_ui (struct speed_params *s);
double speed_mpz_fib_ui (struct speed_params *s);
double speed_mpz_fib2_ui (struct speed_params *s);
double speed_mpz_init_clear (struct speed_params *s);
double speed_mpz_init_realloc_clear (struct speed_params *s);
double speed_mpz_nextprime (struct speed_params *s);
double speed_mpz_jacobi (struct speed_params *s);
double speed_mpz_lucnum_ui (struct speed_params *s);
double speed_mpz_lucnum2_ui (struct speed_params *s);
double speed_mpz_mod (struct speed_params *s);
double speed_mpz_powm (struct speed_params *s);
double speed_mpz_powm_mod (struct speed_params *s);
double speed_mpz_powm_redc (struct speed_params *s);
double speed_mpz_powm_sec (struct speed_params *s);
double speed_mpz_powm_ui (struct speed_params *s);
double speed_mpz_urandomb (struct speed_params *s);

/* Random state speed routines.  */
double speed_gmp_randseed (struct speed_params *s);
double speed_gmp_randseed_ui (struct speed_params *s);

/* No-op speed routines.  */
double speed_noop (struct speed_params *s);
double speed_noop_wxs (struct speed_params *s);
double speed_noop_wxys (struct speed_params *s);

/* Operator speed routines.  */
double speed_operator_div (struct speed_params *s);
double speed_operator_mod (struct speed_params *s);

/* Limb arithmetic speed routines.  */
double speed_udiv_qrnnd (struct speed_params *s);
double speed_udiv_qrnnd_preinv1 (struct speed_params *s);
double speed_udiv_qrnnd_preinv2 (struct speed_params *s);
double speed_udiv_qrnnd_preinv3 (struct speed_params *s);
double speed_udiv_qrnnd_c (struct speed_params *s);
double speed_umul_ppmm (struct speed_params *s);
438 
439 /* Prototypes for other routines */
440 
441 #if defined (__cplusplus)
442 extern "C" {
443 #endif
444 
445 /* low 32-bits in p[0], high 32-bits in p[1] */
446 void speed_cyclecounter (unsigned p[2]);
447 
448 #if defined (__cplusplus)
449 }
450 #endif
451 
452 void mftb_function (unsigned p[2]);
453 
454 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
455 int gettimeofday_microseconds_p (void);
456 int getrusage_microseconds_p (void);
457 int cycles_works_p (void);
458 long clk_tck (void);
459 double freq_measure (const char *, double (*)(void));
460 
461 int double_cmp_ptr (const double *, const double *);
462 void pentium_wbinvd (void);
463 typedef int (*qsort_function_t) (const void *, const void *);
464 
465 void noop (void);
466 void noop_1 (mp_limb_t);
467 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
468 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
469 void mpn_cache_fill (mp_srcptr, mp_size_t);
470 void mpn_cache_fill_dummy (mp_limb_t);
471 void speed_cache_fill (struct speed_params *);
472 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
473 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
474 
475 extern int  speed_option_addrs;
476 extern int  speed_option_verbose;
477 extern int  speed_option_cycles_broken;
478 void speed_option_set (const char *);
479 
480 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
481 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
482 
483 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
484 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
485 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
486 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
487 
488 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
489 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
490 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
491 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
492 
493 int mpn_hgcd2_1 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
494 int mpn_hgcd2_2 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
495 int mpn_hgcd2_3 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
496 int mpn_hgcd2_4 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
497 int mpn_hgcd2_5 (mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t, struct hgcd_matrix1*);
498 
499 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
500 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
501 
502 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
503 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
504 
505 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
506 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
507 
508 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
509 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
510 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
511 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
512 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
513 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
514 
515 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
516 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
517 
518 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
519 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
520 
521 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
522 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
523 
524 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
525 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
526 
527 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
528 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
529 
530 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
531 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
532 
533 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
534 
535 
/* Measure the cycle counter against a clock and return, from the enclosing
   function, the time per cycle in seconds.

     name     string naming the clock in the verbose message
     type     type of one clock sample
     get(t)   store the current clock reading in sample t
     getc(c)  store the current cycle count in unsigned c[2]
     sec(t)   seconds part of sample t
     usec(t)  microseconds part of sample t

   "get" is called repeatedly until it ticks over, just in case on a fast
   processor it takes less than a microsecond, though this is probably
   unlikely if it's a system call.

   The cycle counter is sampled on the same side of the "get" for the start
   and end measurements.  It doesn't matter how long it takes from the "get"
   sample to the cycles sample, since that period will cancel out in the
   difference calculation (assuming it's the same each time).

   Letting the test run for more than a process time slice is probably only
   going to reduce accuracy, especially for getrusage when the cycle counter
   is real time, or for gettimeofday if the cycle counter is in fact process
   time.  Use CLK_TCK/2 as a reasonable stop.

   It'd be desirable to be quite accurate here.  The default speed_precision
   for a cycle counter is 10000 cycles, so to mix that with getrusage or
   gettimeofday the frequency should be at least that accurate.  But running
   measurements for 10000 microseconds (or more) is too long.  Be satisfied
   with just a half clock tick (5000 microseconds usually).  */

#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)              \
  do {                                                                  \
    type      t_prev, t_start, t_end;                                   \
    unsigned  c_start[2], c_end[2];                                     \
    long      elapsed_us, half_tick_us;                                 \
    double    cycles, period;                                           \
                                                                        \
    half_tick_us = (1000000L / clk_tck()) / 2;                          \
                                                                        \
    /* start sample: wait for the clock to tick, then read cycles */    \
    get (t_prev);                                                       \
    do {                                                                \
      get (t_start);                                                    \
    } while (usec(t_start) == usec(t_prev)                              \
             && sec(t_start) == sec(t_prev));                           \
                                                                        \
    getc (c_start);                                                     \
                                                                        \
    /* end samples, repeated until half a clock tick has gone by */     \
    do                                                                  \
      {                                                                 \
        get (t_prev);                                                   \
        do {                                                            \
          get (t_end);                                                  \
        } while (usec(t_end) == usec(t_prev)                            \
                 && sec(t_end) == sec(t_prev));                         \
                                                                        \
        getc (c_end);                                                   \
                                                                        \
        cycles = speed_cyclecounter_diff (c_end, c_start);              \
                                                                        \
        /* allow secs to cancel before multiplying */                   \
        elapsed_us = sec(t_end) - sec(t_start);                         \
        elapsed_us = elapsed_us * 1000000L                              \
          + (usec(t_end) - usec(t_start));                              \
      }                                                                 \
    while (elapsed_us < half_tick_us);                                  \
                                                                        \
    period = elapsed_us * 1e-6 / cycles;                                \
                                                                        \
    if (speed_option_verbose >= 2)                                      \
      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",        \
              name, cycles, elapsed_us, period);                        \
                                                                        \
    return period;                                                      \
                                                                        \
  } while (0)
600 
601 
602 
603 
604 /* The measuring routines use these big macros to save duplication for
605    similar forms.  They also get used for some automatically generated
606    measuring of new implementations of functions.
607 
608    Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
609    function pointer is considered undesirable since it's not the way a
610    normal application will be calling, and some processors might do
611    different things with an indirect call, like not branch predicting, or
612    doing a full pipe flush.  At least some of the "functions" measured are
613    actually macros too.
614 
615    The net effect is to bloat the object code, possibly in a big way, but
616    only what's being measured is being run, so that doesn't matter.
617 
618    The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
619    ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
620    function pointer doesn't work in gcc 3.2.  Using an actual non-pure
621    function pointer variable works, but stands a real risk of a
622    non-optimizing compiler generating unnecessary overheads in the call.
623    Currently the best idea is not to use those attributes for a timing
624    program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
625    gmp-impl.h to omit them from routines there.  */
626 
/* Return -1.0 from the enclosing speed routine unless cond holds.  The
   expansion is a complete "if" statement with its own semicolon, so the
   usual invocation followed by ";" adds an empty statement, and the macro
   should not be the unbraced body of an "if" that has an "else".  */
#define SPEED_RESTRICT_COND(cond)                                       \
  if (!(cond))                                                          \
    return -1.0;
628 
/* For mpn_copy or similar.

   Expands to the body of a speed routine whose parameter is named s.
   "call" is run s->reps times between speed_starttime and speed_endtime
   and the value of speed_endtime is returned.  "call" can refer to wp, a
   block of s->size limbs allocated here at alignment s->align_wp.  The
   repeat count is only tested after each call, so s->reps must be at
   least 1.  A negative s->size gives -1.0.  */
#define SPEED_ROUTINE_MPN_COPY_CALL(call)                               \
  {                                                                     \
    unsigned  i;                                                        \
    mp_ptr    wp;                                                       \
    double    elapsed;                                                  \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
                                                                        \
    /* record the operands before the cache fill */                     \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        call;                                                           \
      }                                                                 \
    while (--i != 0);                                                   \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }

/* As above, for a function called as function (wp, s->xp, s->size).  */
#define SPEED_ROUTINE_MPN_COPY(function)                                \
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))
658 
/* Speed routine body for a table select function, called as
   function (wp, xp, s->size, s->r, s->r / 2).  The enclosing function's
   parameter must be named s.

   xp is a block of s->size * s->r limbs and wp one of s->size limbs, both
   allocated here.  s->r == 0 is replaced (in *s) by s->size.  The time
   returned is the total divided by s->r.

   A negative s->size gives -1.0, and so does an s->r that is still zero
   after defaulting (s->size == 0 with no r given): the function would be
   called with no table entries and the time divided by zero.  s->reps
   must be at least 1, since the count is only tested after each call.  */
#define SPEED_ROUTINE_MPN_TABSELECT(function)                           \
  {                                                                     \
    mp_ptr    xp, wp;                                                   \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    if (s->r == 0)                                                      \
      s->r = s->size;   /* default to a quadratic shape */              \
                                                                        \
    /* still zero means an empty table, nothing to measure */           \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);            \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
                                                                        \
    speed_operand_src (s, xp, s->size * s->r);                          \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (wp, xp, s->size, s->r, (s->r) / 2);                     \
    while (--i != 0);                                                   \
    t = speed_endtime () / s->r;                                        \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
689 
690 
/* Time function (wp, s->xp, s->size, 0): a copy-style routine taking one
   extra trailing argument, always passed as 0.  wp is a freshly allocated
   destination of s->size limbs.  Returns the speed_endtime result for
   s->reps calls, or -1.0 when s->size is negative.  */
691 #define SPEED_ROUTINE_MPN_COPYC(function)				\
692   {									\
693     mp_ptr    wp;							\
694     unsigned  i;							\
695     double    t;							\
696     TMP_DECL;								\
697 									\
698     SPEED_RESTRICT_COND (s->size >= 0);					\
699 									\
700     TMP_MARK;								\
701     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
702 									\
703     speed_operand_src (s, s->xp, s->size);				\
704     speed_operand_dst (s, wp, s->size);					\
705     speed_cache_fill (s);						\
706 									\
707     speed_starttime ();							\
708     i = s->reps;							\
709     do									\
710       function (wp, s->xp, s->size, 0);					\
711     while (--i != 0);							\
712     t = speed_endtime ();						\
713 									\
714     TMP_FREE;								\
715     return t;								\
716   }
717 
718 /* s->size is still in limbs, and it's limbs which are copied, but
719    "function" takes a size in bytes not limbs, so it is called as
   function (wp, s->xp, s->size * GMP_LIMB_BYTES).  Returns the
   speed_endtime result for s->reps calls, or -1.0 when s->size is
   negative.  */
720 #define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
721   {									\
722     mp_ptr    wp;							\
723     unsigned  i;							\
724     double    t;							\
725     TMP_DECL;								\
726 									\
727     SPEED_RESTRICT_COND (s->size >= 0);					\
728 									\
729     TMP_MARK;								\
730     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
731 									\
732     speed_operand_src (s, s->xp, s->size);				\
733     speed_operand_dst (s, wp, s->size);					\
734     speed_cache_fill (s);						\
735 									\
736     speed_starttime ();							\
737     i = s->reps;							\
738     do									\
739       function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
740     while (--i != 0);							\
741     t = speed_endtime ();						\
742 									\
743     TMP_FREE;								\
744     return t;								\
745   }
746 
747 
748 /* For mpn_add_n, mpn_sub_n, or similar.  "call" may refer to the locals
   wp (destination), xp and yp (sources) and to s.  s->r selects operand
   overlap: 0 none, 1 xp==wp, 2 yp==wp, 3 xp==yp==wp, 4 yp==xp; any other
   value makes the routine return -1.0.  When a source aliases wp, wp is
   first filled from s->xp.  Returns the speed_endtime result for s->reps
   calls, or -1.0 when s->size < 1.  */
749 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
750   {									\
751     mp_ptr     wp;							\
752     mp_ptr     xp, yp;							\
753     unsigned   i;							\
754     double     t;							\
755     TMP_DECL;								\
756 									\
757     SPEED_RESTRICT_COND (s->size >= 1);					\
758 									\
759     TMP_MARK;								\
760     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
761 									\
762     xp = s->xp;								\
763     yp = s->yp;								\
764 									\
765     if (s->r == 0)	;						\
766     else if (s->r == 1) { xp = wp;	    }				\
767     else if (s->r == 2) {	   yp = wp; }				\
768     else if (s->r == 3) { xp = wp; yp = wp; }				\
769     else if (s->r == 4) {     yp = xp;	    }				\
770     else		{						\
771       TMP_FREE;								\
772       return -1.0;							\
773     }									\
774 									\
775     /* initialize wp if operand overlap */				\
776     if (xp == wp || yp == wp)						\
777       MPN_COPY (wp, s->xp, s->size);					\
778 									\
779     speed_operand_src (s, xp, s->size);					\
780     speed_operand_src (s, yp, s->size);					\
781     speed_operand_dst (s, wp, s->size);					\
782     speed_cache_fill (s);						\
783 									\
784     speed_starttime ();							\
785     i = s->reps;							\
786     do									\
787       call;								\
788     while (--i != 0);							\
789     t = speed_endtime ();						\
790 									\
791     TMP_FREE;								\
792     return t;								\
793   }
794 
795 
796 /* For mpn_aors_errK_n, where 1 <= K <= 3.  "call" may refer to the locals
   wp (destination), xp and yp (sources), zp[0..K-1] (K extra vectors of
   s->size limbs allocated here), ep (a local array of 2*K limbs) and to s.
   s->r selects operand overlap: 0 none, 1 xp==wp, 2 yp==wp, 3 xp==yp==wp,
   4 yp==xp; any other value makes the routine return -1.0.  Returns the
   speed_endtime result for s->reps calls, or -1.0 when s->size < 1.  */
797 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
798   {									\
799     mp_ptr     wp;							\
800     mp_ptr     xp, yp;							\
801     mp_ptr     zp[K];							\
802     mp_limb_t  ep[2*K];							\
803     unsigned   i;							\
804     double     t;							\
805     TMP_DECL;								\
806 									\
807     SPEED_RESTRICT_COND (s->size >= 1);					\
808 									\
809     TMP_MARK;								\
810     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
811 									\
812     /* (don't have a mechanism to specify zp alignments) */		\
813     for (i = 0; i < K; i++)						\
814       SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
815 									\
816     xp = s->xp;								\
817     yp = s->yp;								\
818 									\
819     if (s->r == 0)	;						\
820     else if (s->r == 1) { xp = wp;	    }				\
821     else if (s->r == 2) {	   yp = wp; }				\
822     else if (s->r == 3) { xp = wp; yp = wp; }				\
823     else if (s->r == 4) {     yp = xp;	    }				\
824     else		{						\
825       TMP_FREE;								\
826       return -1.0;							\
827     }									\
828 									\
829     /* initialize wp if operand overlap */				\
830     if (xp == wp || yp == wp)						\
831       MPN_COPY (wp, s->xp, s->size);					\
832 									\
833     speed_operand_src (s, xp, s->size);					\
834     speed_operand_src (s, yp, s->size);					\
835     for (i = 0; i < K; i++)						\
836       speed_operand_src (s, zp[i], s->size);				\
837     speed_operand_dst (s, wp, s->size);					\
838     speed_cache_fill (s);						\
839 									\
840     speed_starttime ();							\
841     i = s->reps;							\
842     do									\
843       call;								\
844     while (--i != 0);							\
845     t = speed_endtime ();						\
846 									\
847     TMP_FREE;								\
848     return t;								\
849   }
850 
/* Wrappers over SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL for K = 1, 2 and 3.
   Each calls (*function) (wp, xp, yp, ep, zp[0] .. zp[K-1], s->size, 0).  */
851 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
852   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
853 
854 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
855   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
856 
857 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
858   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
859 
860 
861 /* For routines with two sources and two destinations (presumably
   mpn_add_n_sub_n or similar -- confirm against the users of this macro).
   "call" may refer to the locals ap and sp (destinations, s->size limbs
   each), xp and yp (sources) and to s.  s->r is a bit mask selecting
   overlap: bit 0 xp==ap, bit 1 yp==ap, bit 2 xp==sp, bit 3 yp==sp.  The
   routine returns -1.0 when bits 0 and 1 are both set or bits 2 and 3 are
   both set.  A destination which is also used as a source is first filled
   from s->xp.  Returns the speed_endtime result for s->reps calls, or -1.0
   when s->size < 1.  */
862 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
863   {									\
864     mp_ptr     ap, sp;							\
865     mp_ptr     xp, yp;							\
866     unsigned   i;							\
867     double     t;							\
868     TMP_DECL;								\
869 									\
870     SPEED_RESTRICT_COND (s->size >= 1);					\
871 									\
872     TMP_MARK;								\
873     SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
874     SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
875 									\
876     xp = s->xp;								\
877     yp = s->yp;								\
878 									\
879     if ((s->r & 1) != 0) { xp = ap; }					\
880     if ((s->r & 2) != 0) { yp = ap; }					\
881     if ((s->r & 4) != 0) { xp = sp; }					\
882     if ((s->r & 8) != 0) { yp = sp; }					\
883     if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
884       {									\
885 	TMP_FREE;							\
886 	return -1.0;							\
887       }									\
888 									\
889     /* initialize ap if operand overlap */				\
890     if (xp == ap || yp == ap)						\
891       MPN_COPY (ap, s->xp, s->size);					\
892     /* initialize sp if operand overlap */				\
893     if (xp == sp || yp == sp)						\
894       MPN_COPY (sp, s->xp, s->size);					\
895 									\
896     speed_operand_src (s, xp, s->size);					\
897     speed_operand_src (s, yp, s->size);					\
898     speed_operand_dst (s, ap, s->size);					\
899     speed_operand_dst (s, sp, s->size);					\
900     speed_cache_fill (s);						\
901 									\
902     speed_starttime ();							\
903     i = s->reps;							\
904     do									\
905       call;								\
906     while (--i != 0);							\
907     t = speed_endtime ();						\
908 									\
909     TMP_FREE;								\
910     return t;								\
911   }
912 
/* Time (*function) (wp, xp, yp, s->size) through
   SPEED_ROUTINE_MPN_BINARY_N_CALL.  */
913 #define SPEED_ROUTINE_MPN_BINARY_N(function)				\
914    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
915 
/* As SPEED_ROUTINE_MPN_BINARY_N, with an extra trailing argument passed
   as 0.  */
916 #define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
917    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
918 
919 
920 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.  "call" may
   refer to the local wp (a destination of s->size limbs allocated here)
   and to s.  Returns the speed_endtime result for s->reps executions of
   "call", or -1.0 when s->size < 1.  */
921 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
922   {									\
923     mp_ptr    wp;							\
924     unsigned  i;							\
925     double    t;							\
926     TMP_DECL;								\
927 									\
928     SPEED_RESTRICT_COND (s->size >= 1);					\
929 									\
930     TMP_MARK;								\
931     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
932 									\
933     speed_operand_src (s, s->xp, s->size);				\
934     speed_operand_dst (s, wp, s->size);					\
935     speed_cache_fill (s);						\
936 									\
937     speed_starttime ();							\
938     i = s->reps;							\
939     do									\
940       call;								\
941     while (--i != 0);							\
942     t = speed_endtime ();						\
943 									\
944     TMP_FREE;								\
945     return t;								\
946   }
947 
/* Time (*function) (wp, s->xp, s->size, s->r) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
948 #define SPEED_ROUTINE_MPN_UNARY_1(function)				\
949   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
950 
/* As SPEED_ROUTINE_MPN_UNARY_1, with an extra trailing argument passed
   as 0.  */
951 #define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
952   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
953 
954 /* In-place variant of SPEED_ROUTINE_MPN_UNARY_1: times
   (*function) (wp, wp, s->size, s->r), i.e. the destination is also the
   source.  wp is seeded from s->xp before the timed loop so that the
   routine under test never reads uninitialized limbs.  The operand
   registration (s->xp as source, wp as destination) is unchanged from the
   previous SPEED_ROUTINE_MPN_UNARY_1_CALL based definition.  Returns the
   speed_endtime result for s->reps calls, or -1.0 when s->size < 1.  */
955 #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* give the in-place operand defined contents */			\
    MPN_COPY (wp, s->xp, s->size);					\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      (*function) (wp, wp, s->size, s->r);				\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
957 
/* Time (*function) (wp, s->xp, s->size, s->r) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL, with s->r passed as the last argument.  */
958 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
959   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
960 
/* Same call shape as SPEED_ROUTINE_MPN_DIVEXACT_1.  */
961 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
962     SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
963 
/* Precompute values from s->r outside the timed region, then time "call"
   through SPEED_ROUTINE_MPN_UNARY_1_CALL.  shift is set by
   count_trailing_zeros (shift, s->r) and dinv by
   binvert_limb (dinv, s->r >> shift); "call" may refer to both, as well as
   to wp and s.  Returns -1.0 when s->size <= 0 or s->r == 0.  */
964 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
965   {									\
966     unsigned   shift;							\
967     mp_limb_t  dinv;							\
968 									\
969     SPEED_RESTRICT_COND (s->size > 0);					\
970     SPEED_RESTRICT_COND (s->r != 0);					\
971 									\
972     count_trailing_zeros (shift, s->r);					\
973     binvert_limb (dinv, s->r >> shift);					\
974 									\
975     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
976   }
/* Time (*function) (wp, s->xp, s->size, s->r, dinv, shift), with dinv and
   shift supplied by SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL.  */
977 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
978   SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
979   ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
980 
/* The wrappers below all go through SPEED_ROUTINE_MPN_UNARY_1_CALL and
   differ only in the argument list given to (*function).  */

/* (*function) (wp, s->xp, s->size, s->r, 0) */
981 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
982   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
983 
/* (*function) (wp, 0, s->xp, s->size, s->r) */
984 #define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
985   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
986 
/* As SPEED_ROUTINE_MPN_DIVREM_1, with an extra trailing 0 argument.  */
987 #define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
988   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
989 
/* (*function) (wp, s->size, s->xp, 0, s->r): s->size is passed in the
   second position and 0 in the fourth.  */
990 #define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
991   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
992 
/* As SPEED_ROUTINE_MPN_DIVREM_1F, with an extra trailing 0 argument.  */
993 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
994   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
995 
996 
/* Precompute values from s->r outside the timed region, then time "call"
   through SPEED_ROUTINE_MPN_UNARY_1_CALL.  shift is set by
   count_leading_zeros (shift, s->r) and dinv by
   invert_limb (dinv, s->r << shift); "call" may refer to both, as well as
   to wp and s.  Returns -1.0 when s->size is negative or s->r == 0 (the
   nested SPEED_ROUTINE_MPN_UNARY_1_CALL additionally requires
   s->size >= 1).  The closing brace carries no line continuation, so the
   definition ends there and does not depend on a following blank line.  */
997 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
998   {									\
999     unsigned   shift;							\
1000     mp_limb_t  dinv;							\
1001 									\
1002     SPEED_RESTRICT_COND (s->size >= 0);					\
1003     SPEED_RESTRICT_COND (s->r != 0);					\
1004 									\
1005     count_leading_zeros (shift, s->r);					\
1006     invert_limb (dinv, s->r << shift);					\
1007 									\
1008     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
1009   }
1010 
/* Time (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift), with dinv
   and shift supplied by SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL.  */
1011 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
1012   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
1013   ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
1014 
1015 /* s->size limbs worth of fraction part: the call is
   (*function) (wp, s->size, s->xp, 0, s->r, dinv, shift).  */
1016 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
1017   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
1018   ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
1019 
1020 
1021 /* s->r is duplicated to form the multiplier, defaulting to
1022    MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
1023    least it provides some control.
   Times function (wp, s->xp, s->size, yp), where yp is a local array of N
   limbs each set to that value and wp is a destination of s->size + N-1
   limbs allocated here.  Returns the speed_endtime result for s->reps
   calls, or -1.0 when s->size < N.  */
1024 #define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
1025   {									\
1026     mp_ptr     wp;							\
1027     mp_size_t  wn;							\
1028     unsigned   i;							\
1029     double     t;							\
1030     mp_limb_t  yp[N];							\
1031     TMP_DECL;								\
1032 									\
1033     SPEED_RESTRICT_COND (s->size >= N);					\
1034 									\
1035     TMP_MARK;								\
1036     wn = s->size + N-1;							\
1037     SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
1038     for (i = 0; i < N; i++)						\
1039       yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
1040 									\
1041     speed_operand_src (s, s->xp, s->size);				\
1042     speed_operand_src (s, yp, (mp_size_t) N);				\
1043     speed_operand_dst (s, wp, wn);					\
1044     speed_cache_fill (s);						\
1045 									\
1046     speed_starttime ();							\
1047     i = s->reps;							\
1048     do									\
1049       function (wp, s->xp, s->size, yp);				\
1050     while (--i != 0);							\
1051     t = speed_endtime ();						\
1052 									\
1053     TMP_FREE;								\
1054     return t;								\
1055   }
1056 
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N, for N = 2 to 8.  */
1057 #define SPEED_ROUTINE_MPN_UNARY_2(function)				\
1058   SPEED_ROUTINE_MPN_UNARY_N (function, 2)
1059 #define SPEED_ROUTINE_MPN_UNARY_3(function)				\
1060   SPEED_ROUTINE_MPN_UNARY_N (function, 3)
1061 #define SPEED_ROUTINE_MPN_UNARY_4(function)				\
1062   SPEED_ROUTINE_MPN_UNARY_N (function, 4)
1063 #define SPEED_ROUTINE_MPN_UNARY_5(function)				\
1064   SPEED_ROUTINE_MPN_UNARY_N (function, 5)
1065 #define SPEED_ROUTINE_MPN_UNARY_6(function)				\
1066   SPEED_ROUTINE_MPN_UNARY_N (function, 6)
1067 #define SPEED_ROUTINE_MPN_UNARY_7(function)				\
1068   SPEED_ROUTINE_MPN_UNARY_N (function, 7)
1069 #define SPEED_ROUTINE_MPN_UNARY_8(function)				\
1070   SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1071 
1072 
1073 /* For mpn_mul, mpn_mul_basecase: xsize=s->size, ysize=size1.  size1 is
   s->r, or s->size when s->r is 0; a value which is negative once stored in
   size1 is replaced by -size1 - s->size.  The call timed is
   function (wp, s->xp, s->size, s->yp, size1) with wp a destination of
   size1 + s->size limbs.  Returns the speed_endtime result for s->reps
   calls, or -1.0 unless 1 <= size1 <= s->size.  */
1074 #define SPEED_ROUTINE_MPN_MUL(function)					\
1075   {									\
1076     mp_ptr    wp;							\
1077     mp_size_t size1;							\
1078     unsigned  i;							\
1079     double    t;							\
1080     TMP_DECL;								\
1081 									\
1082     size1 = (s->r == 0 ? s->size : s->r);				\
1083     if (size1 < 0) size1 = -size1 - s->size;				\
1084 									\
1085     SPEED_RESTRICT_COND (size1 >= 1);					\
1086     SPEED_RESTRICT_COND (s->size >= size1);				\
1087 									\
1088     TMP_MARK;								\
1089     SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
1090 									\
1091     speed_operand_src (s, s->xp, s->size);				\
1092     speed_operand_src (s, s->yp, size1);				\
1093     speed_operand_dst (s, wp, size1 + s->size);				\
1094     speed_cache_fill (s);						\
1095 									\
1096     speed_starttime ();							\
1097     i = s->reps;							\
1098     do									\
1099       function (wp, s->xp, s->size, s->yp, size1);			\
1100     while (--i != 0);							\
1101     t = speed_endtime ();						\
1102 									\
1103     TMP_FREE;								\
1104     return t;								\
1105   }
1106 
1107 
/* For equal-size multiplication.  "call" may refer to the local wp (a
   destination of 2*s->size limbs allocated here) and to s; s->xp and s->yp
   are registered as sources of s->size limbs each.  Returns the
   speed_endtime result for s->reps executions of "call", or -1.0 when
   s->size < 1.  */
1108 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
1109   {									\
1110     mp_ptr    wp;							\
1111     unsigned  i;							\
1112     double    t;							\
1113     TMP_DECL;								\
1114 									\
1115     SPEED_RESTRICT_COND (s->size >= 1);					\
1116 									\
1117     TMP_MARK;								\
1118     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1119 									\
1120     speed_operand_src (s, s->xp, s->size);				\
1121     speed_operand_src (s, s->yp, s->size);				\
1122     speed_operand_dst (s, wp, 2*s->size);				\
1123     speed_cache_fill (s);						\
1124 									\
1125     speed_starttime ();							\
1126     i = s->reps;							\
1127     do									\
1128       call;								\
1129     while (--i != 0);							\
1130     t = speed_endtime ();						\
1131 									\
1132     TMP_FREE;								\
1133     return t;								\
1134   }
1135 
/* Time function (wp, s->xp, s->yp, s->size) through
   SPEED_ROUTINE_MPN_MUL_N_CALL.  The definition deliberately does not end
   in a semicolon: the expansion is a complete braced block, and a built-in
   semicolon would leave a stray empty statement after it at every use.  */
1136 #define SPEED_ROUTINE_MPN_MUL_N(function)				\
1137   SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
1138 
/* For low-half multiplication.  "call" may refer to the local wp (a
   destination of s->size limbs allocated here) and to s; s->xp and s->yp
   are registered as sources of s->size limbs each.  Returns the
   speed_endtime result for s->reps executions of "call", or -1.0 when
   s->size < 1.  */
1139 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
1140   {									\
1141     mp_ptr    wp;							\
1142     unsigned  i;							\
1143     double    t;							\
1144     TMP_DECL;								\
1145 									\
1146     SPEED_RESTRICT_COND (s->size >= 1);					\
1147 									\
1148     TMP_MARK;								\
1149     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1150 									\
1151     speed_operand_src (s, s->xp, s->size);				\
1152     speed_operand_src (s, s->yp, s->size);				\
1153     speed_operand_dst (s, wp, s->size);					\
1154     speed_cache_fill (s);						\
1155 									\
1156     speed_starttime ();							\
1157     i = s->reps;							\
1158     do									\
1159       call;								\
1160     while (--i != 0);							\
1161     t = speed_endtime ();						\
1162 									\
1163     TMP_FREE;								\
1164     return t;								\
1165   }
1166 
/* Time function (wp, s->xp, s->yp, s->size) through
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  The definition deliberately does not end
   in a semicolon: the expansion is a complete braced block, and a built-in
   semicolon would leave a stray empty statement after it at every use.  */
1167 #define SPEED_ROUTINE_MPN_MULLO_N(function)				\
1168   SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1169 
/* Time function (wp, s->xp, s->yp, s->size) through
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  The definition deliberately does not end
   in a semicolon: the expansion is a complete braced block, and a built-in
   semicolon would leave a stray empty statement after it at every use.  */
1170 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
1171   SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1172 
/* Time function (wp, s->xp, s->size), with wp a destination of s->size
   limbs allocated here.  Returns the speed_endtime result for s->reps
   calls, or -1.0 when s->size < 1.  */
1173 #define SPEED_ROUTINE_MPN_SQRLO(function)				\
1174   {									\
1175     mp_ptr    wp;							\
1176     unsigned  i;							\
1177     double    t;							\
1178     TMP_DECL;								\
1179 									\
1180     SPEED_RESTRICT_COND (s->size >= 1);					\
1181 									\
1182     TMP_MARK;								\
1183     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1184 									\
1185     speed_operand_src (s, s->xp, s->size);				\
1186     speed_operand_dst (s, wp, s->size);					\
1187     speed_cache_fill (s);						\
1188 									\
1189     speed_starttime ();							\
1190     i = s->reps;							\
1191     do									\
1192       function (wp, s->xp, s->size);					\
1193     while (--i != 0);							\
1194     t = speed_endtime ();						\
1195 									\
1196     TMP_FREE;								\
1197     return t;								\
1198   }
1199 
1200 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size.  xsize
   (size1) defaults to 2*s->size - 1 when s->r is 0.  The call timed is
   function (wp, xp, size1, s->yp, s->size), where xp (size1 limbs) and wp
   (size1 - s->size + 3 limbs) are both allocated here; xp is not filled by
   this macro.  Returns the speed_endtime result for s->reps calls, or -1.0
   unless s->size >= 1 and size1 >= s->size.  */
1201 #define SPEED_ROUTINE_MPN_MULMID(function)				\
1202   {									\
1203     mp_ptr    wp, xp;							\
1204     mp_size_t size1;							\
1205     unsigned  i;							\
1206     double    t;							\
1207     TMP_DECL;								\
1208 									\
1209     size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
1210 									\
1211     SPEED_RESTRICT_COND (s->size >= 1);					\
1212     SPEED_RESTRICT_COND (size1 >= s->size);				\
1213 									\
1214     TMP_MARK;								\
1215     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1216     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1217 									\
1218     speed_operand_src (s, xp, size1);					\
1219     speed_operand_src (s, s->yp, s->size);				\
1220     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1221     speed_cache_fill (s);						\
1222 									\
1223     speed_starttime ();							\
1224     i = s->reps;							\
1225     do									\
1226       function (wp, xp, size1, s->yp, s->size);				\
1227     while (--i != 0);							\
1228     t = speed_endtime ();						\
1229 									\
1230     TMP_FREE;								\
1231     return t;								\
1232   }
1233 
/* Time function (wp, xp, s->yp, s->size), where xp has
   size1 = 2*s->size - 1 limbs and wp has size1 - s->size + 3 limbs, both
   allocated here; xp is not filled by this macro.  Returns the
   speed_endtime result for s->reps calls, or -1.0 when s->size < 1.  */
1234 #define SPEED_ROUTINE_MPN_MULMID_N(function)				\
1235   {									\
1236     mp_ptr    wp, xp;							\
1237     mp_size_t size1;							\
1238     unsigned  i;							\
1239     double    t;							\
1240     TMP_DECL;								\
1241 									\
1242     size1 = 2 * s->size - 1;						\
1243 									\
1244     SPEED_RESTRICT_COND (s->size >= 1);					\
1245 									\
1246     TMP_MARK;								\
1247     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1248     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1249 									\
1250     speed_operand_src (s, xp, size1);					\
1251     speed_operand_src (s, s->yp, s->size);				\
1252     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1253     speed_cache_fill (s);						\
1254 									\
1255     speed_starttime ();							\
1256     i = s->reps;							\
1257     do									\
1258       function (wp, xp, s->yp, s->size);				\
1259     while (--i != 0);							\
1260     t = speed_endtime ();						\
1261 									\
1262     TMP_FREE;								\
1263     return t;								\
1264   }
1265 
/* Time function (wp, xp, s->yp, s->size, scratch).  xp has
   size1 = 2*s->size - 1 limbs, wp has size1 - s->size + 3 limbs, and
   scratch has mpn_toom42_mulmid_itch (s->size) limbs; all three are
   allocated here and xp is not filled by this macro.  Returns the
   speed_endtime result for s->reps calls, or -1.0 when s->size < 1.  */
1266 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
1267   {									\
1268     mp_ptr    wp, xp, scratch;						\
1269     mp_size_t size1, scratch_size;					\
1270     unsigned  i;							\
1271     double    t;							\
1272     TMP_DECL;								\
1273 									\
1274     size1 = 2 * s->size - 1;						\
1275 									\
1276     SPEED_RESTRICT_COND (s->size >= 1);					\
1277 									\
1278     TMP_MARK;								\
1279     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1280     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1281     scratch_size = mpn_toom42_mulmid_itch (s->size);			\
1282     SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
1283 									\
1284     speed_operand_src (s, xp, size1);					\
1285     speed_operand_src (s, s->yp, s->size);				\
1286     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1287     speed_cache_fill (s);						\
1288 									\
1289     speed_starttime ();							\
1290     i = s->reps;							\
1291     do									\
1292       function (wp, xp, s->yp, s->size, scratch);			\
1293     while (--i != 0);							\
1294     t = speed_endtime ();						\
1295 									\
1296     TMP_FREE;								\
1297     return t;								\
1298   }
1299 
/* "call" may refer to the locals wp (a destination of 2*s->size limbs) and
   tp (scratch of mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs),
   both allocated here, and to s.  Returns the speed_endtime result for
   s->reps executions of "call", or -1.0 when s->size < 1.  */
1300 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
1301   {									\
1302     mp_ptr    wp, tp;							\
1303     unsigned  i;							\
1304     double    t;							\
1305     mp_size_t itch;							\
1306     TMP_DECL;								\
1307 									\
1308     SPEED_RESTRICT_COND (s->size >= 1);					\
1309 									\
1310     itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
1311 									\
1312     TMP_MARK;								\
1313     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1314     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1315 									\
1316     speed_operand_src (s, s->xp, s->size);				\
1317     speed_operand_src (s, s->yp, s->size);				\
1318     speed_operand_dst (s, wp, 2 * s->size);				\
1319     speed_operand_dst (s, tp, itch);					\
1320     speed_cache_fill (s);						\
1321 									\
1322     speed_starttime ();							\
1323     i = s->reps;							\
1324     do									\
1325       call;								\
1326     while (--i != 0);							\
1327     t = speed_endtime ();						\
1328 									\
1329     TMP_FREE;								\
1330     return t;								\
1331   }
/* Time function (wp, size, s->xp, s->size, s->yp, s->size, tp), where
   size = mpn_mulmod_bnm1_next_size (s->size), wp is a destination of size
   limbs and tp is scratch of mpn_mulmod_bnm1_itch (size, size, size) limbs.
   Returns the speed_endtime result for s->reps calls, or -1.0 when
   s->size < 1.  */
1332 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
1333   {									\
1334     mp_ptr    wp, tp;							\
1335     unsigned  i;							\
1336     double    t;							\
1337     mp_size_t size, itch;						\
1338     TMP_DECL;								\
1339 									\
1340     SPEED_RESTRICT_COND (s->size >= 1);					\
1341 									\
1342     size = mpn_mulmod_bnm1_next_size (s->size);				\
1343     itch = mpn_mulmod_bnm1_itch (size, size, size);			\
1344 									\
1345     TMP_MARK;								\
1346     SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
1347     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1348 									\
1349     speed_operand_src (s, s->xp, s->size);				\
1350     speed_operand_src (s, s->yp, s->size);				\
1351     speed_operand_dst (s, wp, size);					\
1352     speed_operand_dst (s, tp, itch);					\
1353     speed_cache_fill (s);						\
1354 									\
1355     speed_starttime ();							\
1356     i = s->reps;							\
1357     do									\
1358       function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
1359     while (--i != 0);							\
1360     t = speed_endtime ();						\
1361 									\
1362     TMP_FREE;								\
1363     return t;								\
1364   }
1365 
/* For multiplication routines needing scratch space.  "call" may refer to
   the locals wp (a destination of 2*s->size limbs) and tspace (scratch of
   "tsize" limbs), both allocated here, and to s.  "tsize" is expanded more
   than once, so it should be free of side effects.  Returns the
   speed_endtime result for s->reps executions of "call", or -1.0 when
   s->size < minsize.  */
1366 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
1367   {									\
1368     mp_ptr    wp, tspace;						\
1369     unsigned  i;							\
1370     double    t;							\
1371     TMP_DECL;								\
1372 									\
1373     SPEED_RESTRICT_COND (s->size >= minsize);				\
1374 									\
1375     TMP_MARK;								\
1376     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1377     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1378 									\
1379     speed_operand_src (s, s->xp, s->size);				\
1380     speed_operand_src (s, s->yp, s->size);				\
1381     speed_operand_dst (s, wp, 2*s->size);				\
1382     speed_operand_dst (s, tspace, tsize);				\
1383     speed_cache_fill (s);						\
1384 									\
1385     speed_starttime ();							\
1386     i = s->reps;							\
1387     do									\
1388       call;								\
1389     while (--i != 0);							\
1390     t = speed_endtime ();						\
1391 									\
1392     TMP_FREE;								\
1393     return t;								\
1394   }
1395 
/* Balanced Toom wrappers over SPEED_ROUTINE_MPN_MUL_N_TSPACE.  Each times
   function (wp, s->xp, s->size, s->yp, s->size, tspace), sizing tspace with
   the matching mpn_toomNN_mul_itch (s->size, s->size) and requiring
   s->size >= the matching MPN_TOOMNN_MUL_MINSIZE.  */
1396 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
1397   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1398     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1399      mpn_toom22_mul_itch (s->size, s->size),				\
1400      MPN_TOOM22_MUL_MINSIZE)
1401 
1402 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
1403   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1404     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1405      mpn_toom33_mul_itch (s->size, s->size),				\
1406      MPN_TOOM33_MUL_MINSIZE)
1407 
1408 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
1409   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1410     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1411      mpn_toom44_mul_itch (s->size, s->size),				\
1412      MPN_TOOM44_MUL_MINSIZE)
1413 
1414 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
1415   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1416     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1417      mpn_toom6h_mul_itch (s->size, s->size),				\
1418      MPN_TOOM6H_MUL_MINSIZE)
1419 
1420 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
1421   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1422     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1423      mpn_toom8h_mul_itch (s->size, s->size),				\
1424      MPN_TOOM8H_MUL_MINSIZE)
1425 
/* Unbalanced Toom wrappers over SPEED_ROUTINE_MPN_MUL_N_TSPACE.  The first
   operand has s->size limbs; the second operand size is a fixed fraction of
   s->size (2/3, 1/2, 3/4 and 1/2 respectively, using integer division), and
   the same size is passed to the matching itch function.  */
1426 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
1427   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1428     (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
1429      mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
1430      MPN_TOOM32_MUL_MINSIZE)
1431 
1432 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
1433   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1434     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1435      mpn_toom42_mul_itch (s->size, s->size/2),				\
1436      MPN_TOOM42_MUL_MINSIZE)
1437 
1438 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
1439   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1440     (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
1441      mpn_toom43_mul_itch (s->size, s->size*3/4),			\
1442      MPN_TOOM43_MUL_MINSIZE)
1443 
1444 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
1445   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1446     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1447      mpn_toom63_mul_itch (s->size, s->size/2),				\
1448      MPN_TOOM63_MUL_MINSIZE)
1449 
/* Paired wrappers TOOMaa_FOR_TOOMbb / TOOMbb_FOR_TOOMaa.  Both members of a
   pair pass the same second operand size (17/24, 19/30 or 11/20 of s->size,
   using integer division); each member uses the itch function and minimum
   size of the TOOMaa named first in its own macro name.  */
1450 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
1451   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1452     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1453      mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
1454      MPN_TOOM32_MUL_MINSIZE)
1455 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
1456   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1457     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1458      mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
1459      MPN_TOOM43_MUL_MINSIZE)
1460 
1461 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
1462   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1463     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1464      mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
1465      MPN_TOOM32_MUL_MINSIZE)
1466 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
1467   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1468     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1469      mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
1470      MPN_TOOM53_MUL_MINSIZE)
1471 
1472 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
1473   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1474     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1475      mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
1476      MPN_TOOM42_MUL_MINSIZE)
1477 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
1478   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1479     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1480      mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
1481      MPN_TOOM53_MUL_MINSIZE)
1482 
/* The toom43 side of the toom43/toom54 pair: times
   function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace).  Scratch is
   sized with mpn_toom43_mul_itch, matching the other TOOM43_FOR_* wrapper
   in this file (this previously used the toom42 itch function).
   NOTE(review): the minimum size is left at MPN_TOOM54_MUL_MINSIZE, the
   same bound as the toom54 side of the pair; confirm whether
   MPN_TOOM43_MUL_MINSIZE was intended.  */
1483 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
1484   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1485     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1486      mpn_toom43_mul_itch (s->size, 5*s->size/6),			\
1487      MPN_TOOM54_MUL_MINSIZE)
/* The toom54 side of the toom43/toom54 pair: times
   function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace) with scratch of
   mpn_toom54_mul_itch (s->size, 5*s->size/6) limbs, requiring
   s->size >= MPN_TOOM54_MUL_MINSIZE.  */
1488 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
1489   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1490     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1491      mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
1492      MPN_TOOM54_MUL_MINSIZE)
1493 
1494 
1495 
/* For squaring.  "call" may refer to the local wp (a destination of
   2*s->size limbs allocated here) and to s; s->xp is registered as the only
   source.  Returns the speed_endtime result for s->reps executions of
   "call", or -1.0 when s->size < 1.  */
1496 #define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
1497   {									\
1498     mp_ptr    wp;							\
1499     unsigned  i;							\
1500     double    t;							\
1501     TMP_DECL;								\
1502 									\
1503     SPEED_RESTRICT_COND (s->size >= 1);					\
1504 									\
1505     TMP_MARK;								\
1506     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1507 									\
1508     speed_operand_src (s, s->xp, s->size);				\
1509     speed_operand_dst (s, wp, 2*s->size);				\
1510     speed_cache_fill (s);						\
1511 									\
1512     speed_starttime ();							\
1513     i = s->reps;							\
1514     do									\
1515       call;								\
1516     while (--i != 0);							\
1517     t = speed_endtime ();						\
1518 									\
1519     TMP_FREE;								\
1520     return t;								\
1521   }
1522 
/* Time function (wp, s->xp, s->size) through SPEED_ROUTINE_MPN_SQR_CALL.  */
1523 #define SPEED_ROUTINE_MPN_SQR(function)					\
1524   SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
1525 
/* "call" may refer to the locals tp and wp (2*s->size limbs each, allocated
   here; tp is registered as a source and wp as the destination) and to s.
   tp is not filled by this macro.  The value returned is the speed_endtime
   result for s->reps executions of "call" divided by 2, or -1.0 when
   s->size < 2.  NOTE(review): the reason for halving the time is not
   visible here.  */
1526 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
1527   {									\
1528     mp_ptr    wp, tp;							\
1529     unsigned  i;							\
1530     double    t;							\
1531     TMP_DECL;								\
1532 									\
1533     SPEED_RESTRICT_COND (s->size >= 2);					\
1534 									\
1535     TMP_MARK;								\
1536     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
1537     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1538 									\
1539     speed_operand_src (s, s->xp, s->size);				\
1540     speed_operand_src (s, tp, 2 * s->size);				\
1541     speed_operand_dst (s, wp, 2 * s->size);				\
1542     speed_cache_fill (s);						\
1543 									\
1544     speed_starttime ();							\
1545     i = s->reps;							\
1546     do									\
1547       call;								\
1548     while (--i != 0);							\
1549     t = speed_endtime () / 2;						\
1550 									\
1551     TMP_FREE;								\
1552     return t;								\
1553   }
1554 
/* For squaring routines needing scratch space.  "call" may refer to the
   locals wp (a destination of 2*s->size limbs) and tspace (scratch of
   "tsize" limbs), both allocated here, and to s.  "tsize" is expanded more
   than once, so it should be free of side effects.  Returns the
   speed_endtime result for s->reps executions of "call", or -1.0 when
   s->size < minsize.  */
1555 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
1556   {									\
1557     mp_ptr    wp, tspace;						\
1558     unsigned  i;							\
1559     double    t;							\
1560     TMP_DECL;								\
1561 									\
1562     SPEED_RESTRICT_COND (s->size >= minsize);				\
1563 									\
1564     TMP_MARK;								\
1565     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1566     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1567 									\
1568     speed_operand_src (s, s->xp, s->size);				\
1569     speed_operand_dst (s, wp, 2*s->size);				\
1570     speed_operand_dst (s, tspace, tsize);				\
1571     speed_cache_fill (s);						\
1572 									\
1573     speed_starttime ();							\
1574     i = s->reps;							\
1575     do									\
1576       call;								\
1577     while (--i != 0);							\
1578     t = speed_endtime ();						\
1579 									\
1580     TMP_FREE;								\
1581     return t;								\
1582   }
1583 
/* Time FUNCTION, invoked as function (wp, s->xp, s->size, tspace), with
   mpn_toom2_sqr_itch (s->size) limbs of scratch and a minimum size of
   MPN_TOOM2_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE						\
    (function (wp, s->xp, s->size, tspace),				\
     mpn_toom2_sqr_itch (s->size),					\
     MPN_TOOM2_SQR_MINSIZE)
1588 
/* Time FUNCTION, invoked as function (wp, s->xp, s->size, tspace), with
   mpn_toom3_sqr_itch (s->size) limbs of scratch and a minimum size of
   MPN_TOOM3_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE						\
    (function (wp, s->xp, s->size, tspace),				\
     mpn_toom3_sqr_itch (s->size),					\
     MPN_TOOM3_SQR_MINSIZE)
1593 
1594 
/* Time FUNCTION, invoked as function (wp, s->xp, s->size, tspace), with
   mpn_toom4_sqr_itch (s->size) limbs of scratch and a minimum size of
   MPN_TOOM4_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE						\
    (function (wp, s->xp, s->size, tspace),				\
     mpn_toom4_sqr_itch (s->size),					\
     MPN_TOOM4_SQR_MINSIZE)
1599 
/* Time FUNCTION, invoked as function (wp, s->xp, s->size, tspace), with
   mpn_toom6_sqr_itch (s->size) limbs of scratch and a minimum size of
   MPN_TOOM6_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE						\
    (function (wp, s->xp, s->size, tspace),				\
     mpn_toom6_sqr_itch (s->size),					\
     MPN_TOOM6_SQR_MINSIZE)
1604 
/* Time FUNCTION, invoked as function (wp, s->xp, s->size, tspace), with
   mpn_toom8_sqr_itch (s->size) limbs of scratch and a minimum size of
   MPN_TOOM8_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
  SPEED_ROUTINE_MPN_SQR_TSPACE						\
    (function (wp, s->xp, s->size, tspace),				\
     mpn_toom8_sqr_itch (s->size),					\
     MPN_TOOM8_SQR_MINSIZE)
1609 
/* Measure CALL, which is evaluated s->reps times between
   speed_starttime and speed_endtime.  s->xp (s->size limbs) is
   registered as the only source operand; no temporary memory is
   allocated here.  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
  {									\
    unsigned   i;							\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    /* register the operand, then prime the cache */			\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
									\
    return speed_endtime ();						\
  }
1627 
/* Time FUNCTION, invoked through a pointer dereference as
   (*function) (s->xp, s->size, s->r).  */
#define SPEED_ROUTINE_MPN_MOD_1(function)				\
  SPEED_ROUTINE_MPN_MOD_CALL						\
    ((*function) (s->xp, s->size, s->r))
1630 
/* Time FUNCTION, invoked as (*function) (s->xp, s->size, s->r, 0) with
   the final argument spelled CNST_LIMB(0).  */
#define SPEED_ROUTINE_MPN_MOD_1C(function)				\
  SPEED_ROUTINE_MPN_MOD_CALL						\
    ((*function) (s->xp, s->size, s->r, CNST_LIMB(0)))
1633 
/* Time FUNCTION, invoked as function (s->xp, s->size, s->r), using the
   loop from SPEED_ROUTINE_MPN_MOD_CALL.  The expansion carries no
   trailing semicolon, like the other SPEED_ROUTINE_MPN_MOD_CALL
   wrappers; the user of the macro supplies it.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
1636 
/* Time FUNCTION, invoked as function (s->xp, s->size, s->r, 0) with the
   final argument spelled CNST_LIMB(0), using the loop from
   SPEED_ROUTINE_MPN_MOD_CALL.  The expansion carries no trailing
   semicolon, like the other SPEED_ROUTINE_MPN_MOD_CALL wrappers; the
   user of the macro supplies it.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
1639 
/* Time FUNCTION, invoked as (*function) (s->xp, s->size); s->r is not
   passed.  */
#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)				\
  SPEED_ROUTINE_MPN_MOD_CALL						\
    ((*function) (s->xp, s->size))
1642 
/* Time FUNCTION, invoked as (*function) (s->xp, s->size, s->r, inverse)
   where inverse is obtained once, outside the timed loop, from
   invert_limb (inverse, s->r).  s->r must have its high bit set
   (GMP_LIMB_HIGHBIT), which is checked with SPEED_RESTRICT_COND.  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)			\
  {									\
    mp_limb_t  inverse;							\
    unsigned   reps_left;						\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);			\
									\
    invert_limb (inverse, s->r);					\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	(*function) (s->xp, s->size, s->r, inverse);			\
      }									\
    while (--reps_left != 0);						\
									\
    return speed_endtime ();						\
  }
1663 
/* Time PFUNC followed by FUNCTION.  A 4-limb table is first filled by
   mpn_mod_1_1p_cps outside the timed loop; each repetition then calls
   pfunc (table, s->r) and function (s->xp, s->size, s->r << table[1],
   table).  s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)			\
  {									\
    mp_limb_t  pre[4];							\
    unsigned   reps_left;						\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    mpn_mod_1_1p_cps (pre, s->r);					\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	pfunc (pre, s->r);						\
	function (s->xp, s->size, s->r << pre[1], pre);			\
      }									\
    while (--reps_left != 0);						\
									\
    return speed_endtime ();						\
  }
/* Time PFUNC followed by FUNCTION.  Each repetition calls
   pfunc (inv, s->r) and then function (s->xp, s->size, s->r, inv), with
   inv an array of N+3 limbs.  s->size must be at least 1 and s->r must
   not exceed the maximum limb value divided by N.  N is parenthesized
   wherever it is used so that an expression can be passed safely.  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
  {									\
    unsigned   i;							\
    mp_limb_t  inv[(N)+3];						\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / (N));			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      pfunc (inv, s->r);						\
      function (s->xp, s->size, s->r, inv);				\
    } while (--i != 0);							\
									\
    return speed_endtime ();						\
  }
1704 
1705 
/* A division of 2*s->size by s->size limbs.

   Measure CALL, which is evaluated s->reps times and may refer to the
   locals a (dividend, 2*s->size limbs), d (divisor, s->size limbs),
   q (s->size+1 limbs), r (s->size limbs) and dinv (filled in by
   invert_pi1 from the two most significant limbs of d).

   Because d[s->size-2] is read, s->size must be at least 2; smaller
   sizes are rejected with SPEED_RESTRICT_COND rather than indexing
   below the start of d.  */

#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
  {									\
    unsigned  i;							\
    mp_ptr    a, d, q, r;						\
    double    t;							\
    gmp_pi1_t dinv;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
									\
    /* dividend is s->xp repeated twice */				\
    MPN_COPY (a, s->xp, s->size);					\
    MPN_COPY (a+s->size, s->xp, s->size);				\
									\
    MPN_COPY (d, s->yp, s->size);					\
									\
    /* normalize the data */						\
    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    a[2*s->size-1] = d[s->size-1] - 1;					\
									\
    invert_pi1 (dinv, d[s->size-1], d[s->size-2]);			\
									\
    speed_operand_src (s, a, 2*s->size);				\
    speed_operand_src (s, d, s->size);					\
    speed_operand_dst (s, q, s->size+1);				\
    speed_operand_dst (s, r, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1751 
1752 
/* A remainder of 2*s->size by s->size limbs.

   Time FUNCTION, invoked as function (r, a, d) on mpz_t operands.  d is
   built from s->yp.  a is built from s->xp, reduced mod d, shifted left
   by GMP_LIMB_BITS * s->size bits, and then has s->xp copied into its
   low s->size limbs.  The three mpz_t temporaries are cleared before
   returning so that repeated measurements do not leak memory.  */

#define SPEED_ROUTINE_MPZ_MOD(function)					\
  {									\
    unsigned   i;							\
    double     t;							\
    mpz_t      a, d, r;							\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    mpz_init_set_n (d, s->yp, s->size);					\
									\
    /* high part less than d, low part a duplicate copied in */		\
    mpz_init_set_n (a, s->xp, s->size);					\
    mpz_mod (a, a, d);							\
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
    MPN_COPY (PTR(a), s->xp, s->size);					\
									\
    mpz_init (r);							\
									\
    speed_operand_src (s, PTR(a), SIZ(a));				\
    speed_operand_src (s, PTR(d), SIZ(d));				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (r, a, d);						\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* release the temporaries, outside the timed region */		\
    mpz_clear (r);							\
    mpz_clear (a);							\
    mpz_clear (d);							\
    return t;								\
  }
1783 
/* Time FUNCTION, invoked as function (qp, tp, size1, dp, s->size, INV),
   dividing size1 limbs by s->size limbs.  size1 is s->r, or 2*s->size
   when s->r is 0.  INV is an expression that may refer to the local
   gmp_pi1_t inv, which is filled in by invert_pi1.  DMIN and QMIN are
   the smallest divisor and quotient sizes accepted; both are
   parenthesized on use so that expressions can be passed safely.
   dp[s->size-2] is read, so callers should pass a DMIN of at least 2.
   The dividend is copied afresh into tp inside the timed loop.  */
#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)		\
  {									\
    unsigned   i;							\
    mp_ptr     dp, tp, ap, qp;						\
    gmp_pi1_t  inv;							\
    double     t;							\
    mp_size_t size1;							\
    TMP_DECL;								\
									\
    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= (DMIN));				\
    SPEED_RESTRICT_COND (size1 - s->size >= (QMIN));			\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);			\
									\
    /* we don't fill in dividend completely when size1 > s->size */	\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap + size1 - s->size, s->xp, s->size);			\
									\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* normalize the data */						\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    ap[size1 - 1] = dp[s->size - 1] - 1;				\
									\
    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);			\
									\
    speed_operand_src (s, ap, size1);					\
    speed_operand_dst (s, tp, size1);					\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, qp, size1 - s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      MPN_COPY (tp, ap, size1);						\
      function (qp, tp, size1, dp, s->size, INV);			\
    } while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* Time FUNCTION, invoked as
   function (qp, tp, 2*s->size, dp, s->size, scratch), with scratch
   sized by itchfn (2*s->size, s->size, 0).  The dividend tp is s->xp
   repeated twice; the divisor dp is a copy of s->yp with its high bit
   forced on.  s->size must be at least 2.

   The divisor is copied from s->yp before being normalized; without
   that copy dp would hold whatever the temporary allocation returned.  */
#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
  {									\
    unsigned   i;							\
    mp_ptr     dp, tp, qp, scratch;					\
    double     t;							\
    mp_size_t itch;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    itch = itchfn (2 * s->size, s->size, 0);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
									\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp+s->size, s->xp, s->size);				\
									\
    /* give the divisor defined contents before normalizing it */	\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* normalize the data */						\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    tp[2*s->size-1] = dp[s->size-1] - 1;				\
									\
    speed_operand_dst (s, qp, s->size);					\
    speed_operand_src (s, tp, 2 * s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
    } while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* Time FUNCTION, invoked as
   function (qp, rp, tp, size1, dp, s->size, scratch), with scratch sized
   by itchfn (size1, s->size, 0).  size1 is s->r, or 2*s->size when s->r
   is 0, and must not be smaller than s->size; s->size must be at least
   2.  The divisor is a copy of s->yp with its high bit forced on.  */
#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)			\
  {									\
    mp_ptr     dp, tp, qp, rp, scratch;					\
    mp_size_t  size1, itch;						\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
    SPEED_RESTRICT_COND (size1 >= s->size);				\
									\
    itch = itchfn (size1, s->size, 0);					\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
									\
    /* only the low and high s->size limbs of the dividend are set */	\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
									\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* force the divisor's high bit; top dividend limb just below it */	\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    tp[size1 - 1] = dp[s->size - 1] - 1;				\
									\
    speed_operand_dst (s, qp, size1 - s->size);				\
    speed_operand_dst (s, rp, s->size);					\
    speed_operand_src (s, tp, size1);					\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (qp, rp, tp, size1, dp, s->size, scratch);		\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as
   function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch), with
   scratch sized by itchfn (size1, s->size, s->size).  ip is computed
   once, outside the timed loop, by mpn_invert on the normalized
   divisor.  size1 is s->r, or 2*s->size when s->r is 0, and must not be
   smaller than s->size; s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)			\
  {									\
    mp_ptr     dp, tp, qp, rp, ip, scratch, tmp;			\
    mp_size_t  size1, itch;						\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    size1 = (s->r == 0 ? 2 * s->size : s->r);				\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
    SPEED_RESTRICT_COND (size1 >= s->size);				\
									\
    itch = itchfn (size1, s->size, s->size);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */	\
									\
    /* only the low and high s->size limbs of the dividend are set */	\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp + size1 - s->size, s->xp, s->size);			\
									\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* force the divisor's high bit; top dividend limb just below it */	\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    tp[size1 - 1] = dp[s->size-1] - 1;					\
									\
    /* inverse of the divisor, computed before timing starts */		\
    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size));			\
    mpn_invert (ip, dp, s->size, tmp);					\
									\
    speed_operand_dst (s, qp, size1 - s->size);				\
    speed_operand_dst (s, rp, s->size);					\
    speed_operand_src (s, tp, size1);					\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_src (s, ip, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
1974 
/* Time FUNCTION, invoked as
   function (qp, tp, 2*s->size, dp, s->size, inv).  The divisor is a copy
   of s->yp with its low bit forced on, and inv is the negated result of
   binvert_limb on dp[0].  The 2*s->size limb dividend (s->xp repeated
   twice) is copied afresh into tp inside the timed loop.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)				\
  {									\
    mp_ptr     dp, tp, ap, qp;						\
    mp_limb_t  inv;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
									\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap+s->size, s->xp, s->size);				\
									\
    /* the divisor is made odd before its limb inverse is taken */	\
    MPN_COPY (dp, s->yp, s->size);					\
    dp[0] |= 1;								\
    binvert_limb (inv, dp[0]);						\
    inv = -inv;								\
									\
    speed_operand_src (s, ap, 2*s->size);				\
    speed_operand_dst (s, tp, 2*s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, qp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (tp, ap, 2*s->size);					\
	function (qp, tp, 2*s->size, dp, s->size, inv);			\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as
   function (qp, tp, s->size, dp, s->size, inv).  The divisor is a copy
   of s->yp with its low bit forced on, and inv is the negated result of
   binvert_limb on dp[0].  s->xp is copied afresh into tp inside the
   timed loop.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)				\
  {									\
    mp_ptr     dp, tp, qp;						\
    mp_limb_t  inv;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);			\
									\
    /* the divisor is made odd before its limb inverse is taken */	\
    MPN_COPY (dp, s->yp, s->size);					\
    dp[0] |= 1;								\
    binvert_limb (inv, dp[0]);						\
    inv = -inv;								\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, qp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (tp, s->xp, s->size);					\
	function (qp, tp, s->size, dp, s->size, inv);			\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as function (tp, 2*s->size, dp, s->size, inv);
   no quotient buffer is passed.  The divisor is a copy of s->yp with its
   low bit forced on, and inv is the negated result of binvert_limb on
   dp[0].  The 2*s->size limb dividend (s->xp repeated twice) is copied
   afresh into tp inside the timed loop.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_R(function)				\
  {									\
    mp_ptr     dp, tp, ap;						\
    mp_limb_t  inv;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);		\
									\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap+s->size, s->xp, s->size);				\
									\
    /* the divisor is made odd before its limb inverse is taken */	\
    MPN_COPY (dp, s->yp, s->size);					\
    dp[0] |= 1;								\
    binvert_limb (inv, dp[0]);						\
    inv = -inv;								\
									\
    speed_operand_src (s, ap, 2*s->size);				\
    speed_operand_dst (s, tp, 2*s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (tp, ap, 2*s->size);					\
	function (tp, 2*s->size, dp, s->size, inv);			\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as
   function (qp, s->xp, s->size, dp, s->size, scratch), with scratch
   sized by itchfn (s->size, s->size).  The divisor is a copy of s->yp
   with its low bit forced on.  s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)			\
  {									\
    mp_ptr     dp, qp, scratch;						\
    mp_size_t  itch;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    itch = itchfn (s->size, s->size);					\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
									\
    /* the divisor is made odd */					\
    MPN_COPY (dp, s->yp, s->size);					\
    dp[0] |= 1;								\
									\
    speed_operand_dst (s, qp, s->size);					\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (qp, s->xp, s->size, dp, s->size, scratch);		\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as
   function (qp, rp, tp, 2*s->size, dp, s->size, scratch), with scratch
   sized by itchfn (2*s->size, s->size).  The dividend tp is s->xp
   repeated twice; the divisor is a copy of s->yp with its low bit forced
   on.  s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)			\
  {									\
    mp_ptr     dp, tp, qp, rp, scratch;					\
    mp_size_t  itch;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    itch = itchfn (2 * s->size, s->size);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */	\
									\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp+s->size, s->xp, s->size);				\
									\
    /* the divisor is made odd */					\
    MPN_COPY (dp, s->yp, s->size);					\
    dp[0] |= 1;								\
									\
    speed_operand_dst (s, qp, s->size);					\
    speed_operand_dst (s, rp, s->size);					\
    speed_operand_src (s, tp, 2 * s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);	\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2173 
/* Time FUNCTION, invoked as (*function) (wp, s->xp, s->size, s->r)
   through SPEED_ROUTINE_MPN_UNARY_1_CALL.  s->r must be odd, and the low
   bit of s->xp[0] is set in place before measuring.  */
#define SPEED_ROUTINE_MPN_BROOT(function)				\
  {									\
    SPEED_RESTRICT_COND (s->r & 1);					\
									\
    /* note: this modifies the caller's s->xp */			\
    s->xp[0] |= 1;							\
									\
    SPEED_ROUTINE_MPN_UNARY_1_CALL					\
      ((*function) (wp, s->xp, s->size, s->r));				\
  }
2181 
/* Time FUNCTION, invoked as (*function) (wp, s->xp, s->size, s->r, tp),
   where wp has s->size limbs and tp has ITCH limbs.  s->size must be at
   least 1 and s->r must be odd; the low bit of s->xp[0] is set in place
   before measuring.

   The SPEED_RESTRICT_COND checks are made before TMP_MARK, as in the
   other routines of this file, so that a rejected size never leaves
   between a TMP_MARK and its TMP_FREE.  */
#define SPEED_ROUTINE_MPN_BROOTINV(function, itch)			\
  {									\
    mp_ptr    wp, tp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (s->r & 1);					\
									\
    TMP_MARK;								\
    wp = TMP_ALLOC_LIMBS (s->size);					\
    tp = TMP_ALLOC_LIMBS ((itch));					\
    s->xp[0] |= 1;							\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      (*function) (wp, s->xp, s->size, s->r, tp);			\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
2209 
/* Time FUNCTION, invoked as function (ip, up, s->size, tp), with tp
   sized by itchfn (s->size).  up is a copy of s->xp with the high bit of
   its top limb forced on.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_INVERT(function,itchfn)			\
  {									\
    mp_ptr    up, tp, ip;						\
    long      reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
									\
    MPN_COPY (up, s->xp, s->size);					\
									\
    /* force the high bit of the operand */				\
    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
									\
    speed_operand_src (s, up, s->size);					\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_dst (s, ip, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (ip, up, s->size, tp);					\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2244 
/* Time FUNCTION, invoked as function (ip, up, s->size, tp), with tp
   sized by itchfn (s->size).  up is a copy of s->xp with the high bit of
   its top limb forced on.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)			\
  {									\
    mp_ptr    up, tp, ip;						\
    long      reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
									\
    MPN_COPY (up, s->xp, s->size);					\
									\
    /* force the high bit of the operand */				\
    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
									\
    speed_operand_src (s, up, s->size);					\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_dst (s, ip, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (ip, up, s->size, tp);					\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2279 
/* Time FUNCTION, invoked as function (ip, up, s->size, tp), with tp
   sized by itchfn (s->size).  up is a copy of s->xp with the high bit of
   its top limb forced on.  s->size must be at least 3.  */
#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)		\
  {									\
    mp_ptr    up, tp, ip;						\
    long      reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 3);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
									\
    MPN_COPY (up, s->xp, s->size);					\
									\
    /* force the high bit of the operand */				\
    up[s->size-1] |= GMP_NUMB_HIGHBIT;					\
									\
    speed_operand_src (s, up, s->size);					\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_dst (s, ip, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (ip, up, s->size, tp);					\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2314 
/* Time FUNCTION, invoked as function (ip, up, s->size, tp), with tp
   sized by itchfn (s->size).  up is a copy of s->xp with its low bit
   forced on.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)			\
  {									\
    mp_ptr    up, tp, ip;						\
    long      reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
									\
    MPN_COPY (up, s->xp, s->size);					\
									\
    /* force the operand to be odd */					\
    up[0] |= 1;								\
									\
    speed_operand_src (s, up, s->size);					\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_dst (s, ip, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	function (ip, up, s->size, tp);					\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2349 
/* Time FUNCTION, invoked as
   function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp), with tp
   sized by itchfn (s->size).  mp is a copy of s->yp with its low bit
   forced on.  up is refilled from s->xp inside the timed loop, before
   every call.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn)			\
  {									\
    mp_ptr    up, mp, tp, ip;						\
    long      reps_left;						\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);		\
									\
    speed_operand_src (s, up, s->size);					\
    speed_operand_dst (s, tp, s->size);					\
    speed_operand_dst (s, ip, s->size);					\
    speed_cache_fill (s);						\
									\
    /* the modulus is made odd */					\
    MPN_COPY (mp, s->yp, s->size);					\
    mp[0] |= 1;								\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (up, s->xp, s->size);					\
	function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp);	\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
2386 
/* Time FUNCTION, invoked as function (cp, tp, mp, s->size, inv).  mp is
   a copy of s->yp with its low bit forced on, and inv is the negated
   result of binvert_limb on mp[0].  The input (s->xp repeated twice) is
   copied afresh into tp, 2*s->size limbs at a time, inside the timed
   loop.  */
#define SPEED_ROUTINE_REDC_1(function)					\
  {									\
    mp_ptr     cp, mp, tp, ap;						\
    mp_limb_t  inv;							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
									\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap+s->size, s->xp, s->size);				\
									\
    /* the modulus is made odd before its limb inverse is taken */	\
    MPN_COPY (mp, s->yp, s->size);					\
    mp[0] |= 1;								\
    binvert_limb (inv, mp[0]);						\
    inv = -inv;								\
									\
    speed_operand_src (s, ap, 2*s->size+1);				\
    speed_operand_dst (s, tp, 2*s->size+1);				\
    speed_operand_src (s, mp, s->size);					\
    speed_operand_dst (s, cp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (tp, ap, 2*s->size);					\
	function (cp, tp, mp, s->size, inv);				\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Time FUNCTION, invoked as function (cp, tp, mp, s->size, invp).  mp is
   a copy of s->yp with its low bit forced on.  invp is a 2-limb value
   produced by mpn_binvert (invp, mp, 2, tp) and then adjusted with
   invp[0] = -invp[0] and invp[1] = ~invp[1].  The input (s->xp repeated
   twice) is copied afresh into tp, 2*s->size limbs at a time, inside the
   timed loop.  */
#define SPEED_ROUTINE_REDC_2(function)					\
  {									\
    mp_ptr     cp, mp, tp, ap;						\
    mp_limb_t  invp[2];							\
    unsigned   reps_left;						\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
									\
    MPN_COPY (ap,         s->xp, s->size);				\
    MPN_COPY (ap+s->size, s->xp, s->size);				\
									\
    /* the modulus is made odd; tp serves as scratch for mpn_binvert */	\
    MPN_COPY (mp, s->yp, s->size);					\
    mp[0] |= 1;								\
    mpn_binvert (invp, mp, 2, tp);					\
    invp[0] = -invp[0];							\
    invp[1] = ~invp[1];							\
									\
    speed_operand_src (s, ap, 2*s->size+1);				\
    speed_operand_dst (s, tp, 2*s->size+1);				\
    speed_operand_src (s, mp, s->size);					\
    speed_operand_dst (s, cp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps_left = s->reps;						\
    do									\
      {									\
	MPN_COPY (tp, ap, 2*s->size);					\
	function (cp, tp, mp, s->size, invp);				\
      }									\
    while (--reps_left != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
/* Speed-routine body timing `function' called as
   function (cp, tp, mp, s->size, invp), with invp an s->size limb buffer
   filled by mpn_binvert (invp, mp, s->size, tp).  Unlike the two-limb
   variant, the result of mpn_binvert is passed on without being negated.

   ap holds two back-to-back copies of s->xp, mp is s->yp with its low bit
   set.  tp is reloaded from ap inside the timed loop, so that copy is part
   of the measured time.  Only s->size > 8 is accepted.  Expects `s' in
   scope and returns the value of speed_endtime ().  */
2471 #define SPEED_ROUTINE_REDC_N(function)					\
2472   {									\
2473     unsigned   i;							\
2474     mp_ptr     cp, mp, tp, ap, invp;					\
2475     double     t;							\
2476     TMP_DECL;								\
2477 									\
2478     SPEED_RESTRICT_COND (s->size > 8);					\
2479 									\
2480     TMP_MARK;								\
2481     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);		\
2482     SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp);		\
2483     SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);		\
2484     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);		\
2485     SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); /* align? */	\
2486 									\
2487     MPN_COPY (ap,         s->xp, s->size);				\
2488     MPN_COPY (ap+s->size, s->xp, s->size);				\
2489 									\
2490     /* modulus must be odd */						\
2491     MPN_COPY (mp, s->yp, s->size);					\
2492     mp[0] |= 1;								\
2493     mpn_binvert (invp, mp, s->size, tp);				\
2494 									\
2495     speed_operand_src (s, ap, 2*s->size+1);				\
2496     speed_operand_dst (s, tp, 2*s->size+1);				\
2497     speed_operand_src (s, mp, s->size);					\
2498     speed_operand_dst (s, cp, s->size);					\
2499     speed_cache_fill (s);						\
2500 									\
2501     speed_starttime ();							\
2502     i = s->reps;							\
2503     do {								\
2504       MPN_COPY (tp, ap, 2*s->size);					\
2505       function (cp, tp, mp, s->size, invp);				\
2506     } while (--i != 0);							\
2507     t = speed_endtime ();						\
2508 									\
2509     TMP_FREE;								\
2510     return t;								\
2511   }
2512 
2513 
/* Speed-routine body timing `function' called as function (s->xp, s->size),
   repeated s->reps times.  The value returned by `function' is discarded.
   Sizes below 1 are rejected by SPEED_RESTRICT_COND.  Expects `s' in scope
   and returns the value of speed_endtime ().  */
2514 #define SPEED_ROUTINE_MPN_POPCOUNT(function)				\
2515   {									\
2516     unsigned i;								\
2517 									\
2518     SPEED_RESTRICT_COND (s->size >= 1);					\
2519 									\
2520     speed_operand_src (s, s->xp, s->size);				\
2521     speed_cache_fill (s);						\
2522 									\
2523     speed_starttime ();							\
2524     i = s->reps;							\
2525     do									\
2526       function (s->xp, s->size);					\
2527     while (--i != 0);							\
2528 									\
2529     return speed_endtime ();						\
2530   }
2531 
/* Speed-routine body timing `function' called as
   function (s->xp, s->yp, s->size), repeated s->reps times.  Both operands
   are registered as sources before speed_cache_fill; the value returned by
   `function' is discarded.  Sizes below 1 are rejected.  Expects `s' in
   scope and returns the value of speed_endtime ().  */
2532 #define SPEED_ROUTINE_MPN_HAMDIST(function)				\
2533   {									\
2534     unsigned i;								\
2535 									\
2536     SPEED_RESTRICT_COND (s->size >= 1);					\
2537 									\
2538     speed_operand_src (s, s->xp, s->size);				\
2539     speed_operand_src (s, s->yp, s->size);				\
2540     speed_cache_fill (s);						\
2541 									\
2542     speed_starttime ();							\
2543     i = s->reps;							\
2544     do									\
2545       function (s->xp, s->yp, s->size);					\
2546     while (--i != 0);							\
2547 									\
2548     return speed_endtime ();						\
2549   }
2550 
2551 
/* Speed-routine body timing `function' called as function (z, s->size),
   where z is an mpz_t initialised with mpz_init before the timer starts and
   cleared after it stops.  Note that s->size is handed to `function' as its
   numeric argument rather than used as a limb count.  No operands are
   registered and speed_cache_fill is not called.  Negative sizes are
   rejected.  Expects `s' in scope and returns the elapsed time.  */
2552 #define SPEED_ROUTINE_MPZ_UI(function)					\
2553   {									\
2554     mpz_t     z;							\
2555     unsigned  i;							\
2556     double    t;							\
2557 									\
2558     SPEED_RESTRICT_COND (s->size >= 0);					\
2559 									\
2560     mpz_init (z);							\
2561 									\
2562     speed_starttime ();							\
2563     i = s->reps;							\
2564     do									\
2565       function (z, s->size);						\
2566     while (--i != 0);							\
2567     t = speed_endtime ();						\
2568 									\
2569     mpz_clear (z);							\
2570     return t;								\
2571   }
2572 
/* The following three are plain aliases: each expands to
   SPEED_ROUTINE_MPZ_UI with the same `function' argument.  */
2573 #define SPEED_ROUTINE_MPZ_FAC_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2574 #define SPEED_ROUTINE_MPZ_FIB_UI(function)    SPEED_ROUTINE_MPZ_UI(function)
2575 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2576 
2577 
/* Speed-routine body timing `function' called as function (z, z2, s->size),
   where z and z2 are mpz_t variables initialised before the timer starts
   and cleared after it stops.  s->size is handed to `function' as its
   numeric argument.  No operands are registered and speed_cache_fill is not
   called.  Negative sizes are rejected.  Expects `s' in scope and returns
   the elapsed time.  */
2578 #define SPEED_ROUTINE_MPZ_2_UI(function)				\
2579   {									\
2580     mpz_t     z, z2;							\
2581     unsigned  i;							\
2582     double    t;							\
2583 									\
2584     SPEED_RESTRICT_COND (s->size >= 0);					\
2585 									\
2586     mpz_init (z);							\
2587     mpz_init (z2);							\
2588 									\
2589     speed_starttime ();							\
2590     i = s->reps;							\
2591     do									\
2592       function (z, z2, s->size);					\
2593     while (--i != 0);							\
2594     t = speed_endtime ();						\
2595 									\
2596     mpz_clear (z);							\
2597     mpz_clear (z2);							\
2598     return t;								\
2599   }
2600 
/* Plain aliases: both expand to SPEED_ROUTINE_MPZ_2_UI with the same
   `function' argument.  */
2601 #define SPEED_ROUTINE_MPZ_FIB2_UI(function)    SPEED_ROUTINE_MPZ_2_UI(function)
2602 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2603 
2604 
/* Speed-routine body timing `function' called as function (fp, f1p, s->size).
   fp and f1p are temporary buffers of MPN_FIB2_SIZE (s->size) limbs each;
   s->size is handed to `function' as its numeric argument.  No operands are
   registered and speed_cache_fill is not called.  Negative sizes are
   rejected.  Expects `s' in scope and returns the elapsed time.  */
2605 #define SPEED_ROUTINE_MPN_FIB2_UI(function)				\
2606   {									\
2607     mp_ptr     fp, f1p;							\
2608     mp_size_t  alloc;							\
2609     unsigned   i;							\
2610     double     t;							\
2611     TMP_DECL;								\
2612 									\
2613     SPEED_RESTRICT_COND (s->size >= 0);					\
2614 									\
2615     TMP_MARK;								\
2616     alloc = MPN_FIB2_SIZE (s->size);					\
2617     SPEED_TMP_ALLOC_LIMBS (fp,	alloc, s->align_xp);			\
2618     SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp);			\
2619 									\
2620     speed_starttime ();							\
2621     i = s->reps;							\
2622     do									\
2623       function (fp, f1p, s->size);					\
2624     while (--i != 0);							\
2625     t = speed_endtime ();						\
2626 									\
2627     TMP_FREE;								\
2628     return t;								\
2629   }
2630 
2631 
2632 
2633 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
2634    limbs.  m is forced to odd so that redc can be used.  e is limited in
2635    size so the calculation doesn't take too long. */
/* Operand selection in this macro: the base b is the s->size limbs at s->xp
   when s->r < 2, otherwise b is set to the single value s->r.  The modulus
   m is the s->size limbs at s->yp with bit 0 set, and the exponent e is the
   first 6 limbs of s->xp_block.  `function' is called as
   function (r, b, e, m); all four mpz_t variables are cleared after timing.
   Sizes below 1 are rejected.  Expects `s' in scope and returns the elapsed
   time.  */
2636 #define SPEED_ROUTINE_MPZ_POWM(function)				\
2637   {									\
2638     mpz_t     r, b, e, m;						\
2639     unsigned  i;							\
2640     double    t;							\
2641 									\
2642     SPEED_RESTRICT_COND (s->size >= 1);					\
2643 									\
2644     mpz_init (r);							\
2645     if (s->r < 2)							\
2646       mpz_init_set_n (b, s->xp, s->size);				\
2647     else								\
2648       mpz_init_set_ui (b, s->r);					\
2649     mpz_init_set_n (m, s->yp, s->size);					\
2650     mpz_setbit (m, 0);	/* force m to odd */				\
2651     mpz_init_set_n (e, s->xp_block, 6);					\
2652 									\
2653     speed_starttime ();							\
2654     i = s->reps;							\
2655     do									\
2656       function (r, b, e, m);						\
2657     while (--i != 0);							\
2658     t = speed_endtime ();						\
2659 									\
2660     mpz_clear (r);							\
2661     mpz_clear (b);							\
2662     mpz_clear (e);							\
2663     mpz_clear (m);							\
2664     return t;								\
2665   }
2666 
2667 /* (m-2)^e mod m, where e defaults to (~0UL)/3, i.e. 0x55...55, and is
   replaced by s->r when s->r is non-zero. */
/* Speed-routine body timing `function' called as function (r, b, e, m).
   m is the s->size limbs at s->xp with the low bit of its first limb set,
   b is m - 2, and e is an unsigned long equal to (~0UL)/3 unless s->r is
   non-zero, in which case e = s->r.  Sizes below 1 are rejected.  Expects
   `s' in scope and returns the elapsed time.  */
2668 #define SPEED_ROUTINE_MPZ_POWM_UI(function)				\
2669   {									\
2670     mpz_t     r, b, m;							\
2671     unsigned  long  e;							\
2672     unsigned  i;							\
2673     double    t;							\
2674 									\
2675     SPEED_RESTRICT_COND (s->size >= 1);					\
2676 									\
2677     mpz_init (r);							\
2678 									\
2679     /* force m to odd */						\
2680     mpz_init (m);							\
2681     mpz_set_n (m, s->xp, s->size);					\
2682     PTR(m)[0] |= 1;							\
2683 									\
2684     e = (~ (unsigned long) 0) / 3;					\
2685     if (s->r != 0)							\
2686       e = s->r;								\
2687 									\
2688     mpz_init_set (b, m);						\
2689     mpz_sub_ui (b, b, 2);						\
2690 /* printf ("%X\n", mpz_get_ui(m)); */					\
2691     i = s->reps;							\
2692     speed_starttime ();							\
2693     do									\
2694       function (r, b, e, m);						\
2695     while (--i != 0);							\
2696     t = speed_endtime ();						\
2697 									\
2698     mpz_clear (r);							\
2699     mpz_clear (b);							\
2700     mpz_clear (m);							\
2701     return t;								\
2702   }
2703 
2704 
/* Speed-routine body that times the statement `call', which may refer to
   the locals wp, wp2 (s->size limb destinations), xp, yp (sources) and to
   `s'.

   s->r selects how the sources overlap the destinations:
     0  xp = s->xp, yp = s->yp  (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other s->r frees the temporaries and returns -1.0.  When a source is
   redirected, the corresponding s->xp / s->yp data is copied into it before
   timing.  Negative sizes are rejected.  Returns the elapsed time.  */
2705 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)				\
2706   {									\
2707     mp_ptr    wp, wp2, xp, yp;						\
2708     unsigned  i;							\
2709     double    t;							\
2710     TMP_DECL;								\
2711 									\
2712     SPEED_RESTRICT_COND (s->size >= 0);					\
2713 									\
2714     TMP_MARK;								\
2715     SPEED_TMP_ALLOC_LIMBS (wp,	s->size, s->align_wp);			\
2716     SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
2717     xp = s->xp;								\
2718     yp = s->yp;								\
2719 									\
2720     if (s->r == 0)	;						\
2721     else if (s->r == 1) { xp = wp;	      }				\
2722     else if (s->r == 2) {	    yp = wp2; }				\
2723     else if (s->r == 3) { xp = wp;  yp = wp2; }				\
2724     else if (s->r == 4) { xp = wp2; yp = wp;  }				\
2725     else {								\
2726       TMP_FREE;								\
2727       return -1.0;							\
2728     }									\
2729     if (xp != s->xp) MPN_COPY (xp, s->xp, s->size);			\
2730     if (yp != s->yp) MPN_COPY (yp, s->yp, s->size);			\
2731 									\
2732     speed_operand_src (s, xp, s->size);					\
2733     speed_operand_src (s, yp, s->size);					\
2734     speed_operand_dst (s, wp, s->size);					\
2735     speed_operand_dst (s, wp2, s->size);				\
2736     speed_cache_fill (s);						\
2737 									\
2738     speed_starttime ();							\
2739     i = s->reps;							\
2740     do									\
2741       call;								\
2742     while (--i != 0);							\
2743     t = speed_endtime ();						\
2744 									\
2745     TMP_FREE;								\
2746     return t;								\
2747   }
2748 
/* SPEED_ROUTINE_MPN_ADDSUB_CALL with the timed call
   function (wp, wp2, xp, yp, s->size).  Note the expansion ends with a
   `;' of its own.  */
2749 #define SPEED_ROUTINE_MPN_ADDSUB_N(function)				\
2750   SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2751     (function (wp, wp2, xp, yp, s->size));
2752 
/* SPEED_ROUTINE_MPN_ADDSUB_CALL with the timed call
   function (wp, wp2, xp, yp, s->size, 0), i.e. a constant 0 as the final
   argument (presumably the carry-in -- confirm).  Note the expansion ends
   with a `;' of its own.  */
2753 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function)				\
2754   SPEED_ROUTINE_MPN_ADDSUB_CALL						\
2755     (function (wp, wp2, xp, yp, s->size, 0));
2756 
2757 
/* Doing an Nx1 gcd with the given r.

   `function' is called as function (xp, s->size, s->r), where xp is a
   private copy of the s->size limbs at s->xp.  The low limb of the copy is
   OR'ed with refmpn_zero_p (xp, s->size), so that an all-zero operand is
   not passed on (presumably refmpn_zero_p yields 1 for zero -- confirm).

   Sizes below 1 and s->r == 0 are rejected by SPEED_RESTRICT_COND.  Expects
   `s' in scope and returns the elapsed time from speed_endtime ().  */
#define SPEED_ROUTINE_MPN_GCD_1N(function)				\
  {									\
    mp_ptr    xp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (s->r != 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
    MPN_COPY (xp, s->xp, s->size);					\
    xp[0] |= refmpn_zero_p (xp, s->size);				\
									\
    /* Register the buffer the timed loop really reads (the local	\
       copy), not the s->xp it was copied from.  */			\
    speed_operand_src (s, xp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (xp, s->size, s->r);					\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
2787 
2788 
2789 /* SPEED_BLOCK_SIZE many single-limb GCDs of s->size bits each. */
2790 
/* Speed-routine body timing the statement `call' over SPEED_BLOCK_SIZE
   pairs of single-limb operands.

   px and py are copies of s->xp_block and s->yp_block.  Each px[i] is
   masked to its low s->size bits and each py[i] to its low s->r bits (or
   s->size bits when s->r is 0); a limb that becomes zero is replaced by 1.
   `setup' is then evaluated once per index i and may adjust px[i] / py[i]
   further.

   In the timed region `call' is evaluated with j running from
   SPEED_BLOCK_SIZE down to 1, so callers index with j-1.  s->time_divisor
   is set to SPEED_BLOCK_SIZE.  s->size must be between 1 and
   mp_bits_per_limb.  Expects `s' in scope and returns the elapsed time.  */
2791 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)			\
2792   {									\
2793     unsigned  i, j;							\
2794     mp_ptr    px, py;							\
2795     mp_limb_t x_mask, y_mask;						\
2796     double    t;							\
2797     TMP_DECL;								\
2798 									\
2799     SPEED_RESTRICT_COND (s->size >= 1);					\
2800     SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);			\
2801 									\
2802     TMP_MARK;								\
2803     SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);		\
2804     SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);		\
2805     MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);			\
2806     MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);			\
2807 									\
2808     x_mask = MP_LIMB_T_LOWBITMASK (s->size);				\
2809     y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);		\
2810     for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
2811       {									\
2812 	px[i] &= x_mask; px[i] += (px[i] == 0);				\
2813 	py[i] &= y_mask; py[i] += (py[i] == 0);				\
2814 	setup;								\
2815       }									\
2816 									\
2817     speed_operand_src (s, px, SPEED_BLOCK_SIZE);			\
2818     speed_operand_src (s, py, SPEED_BLOCK_SIZE);			\
2819     speed_cache_fill (s);						\
2820 									\
2821     speed_starttime ();							\
2822     i = s->reps;							\
2823     do									\
2824       {									\
2825 	j = SPEED_BLOCK_SIZE;						\
2826 	do								\
2827 	  {								\
2828 	    call;							\
2829 	  }								\
2830 	while (--j != 0);						\
2831       }									\
2832     while (--i != 0);							\
2833     t = speed_endtime ();						\
2834 									\
2835     TMP_FREE;								\
2836 									\
2837     s->time_divisor = SPEED_BLOCK_SIZE;					\
2838     return t;								\
2839   }
2840 
/* SPEED_ROUTINE_MPN_GCD_1_CALL with an empty setup and the timed call
   function (&px[j-1], 1, py[j-1]): a one-limb operand passed by pointer
   with length 1, and a second limb passed by value.  */
2841 #define SPEED_ROUTINE_MPN_GCD_1(function)				\
2842   SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
2843 
/* SPEED_ROUTINE_MPN_GCD_1_CALL where the setup forces both px[i] and py[i]
   odd, and the timed call is function (px[j-1], py[j-1]) with both limbs
   passed by value.  */
2844 #define SPEED_ROUTINE_MPN_GCD_11(function)				\
2845   SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2846 			       function (px[j-1], py[j-1]))
2847 
2848 /* Multiply limbs by (B+1). Then we get a gcd exceeding one limb, so
2849    we can measure gcd_22 loop only, without gcd_11. */
/* The setup forces both px[i] and py[i] odd; the timed call passes each
   limb twice, as function (px[j-1], px[j-1], py[j-1], py[j-1]).  */
2850 #define SPEED_ROUTINE_MPN_GCD_22(function)				\
2851   SPEED_ROUTINE_MPN_GCD_1_CALL((px[i] |= 1, py[i] |= 1),		\
2852 			       function (px[j-1], px[j-1], py[j-1], py[j-1]))
2853 
/* SPEED_ROUTINE_MPN_GCD_1_CALL with the timed call
   function (px[j-1], py[j-1], 0).  The setup is written as a ({ ... })
   statement expression; it reduces px[i] modulo py[i], forces both limbs
   odd, and replaces py[i] == 1 by 3.

   NOTE(review): after px[i] %= py[i], OR-ing in the low bit can make px[i]
   equal to an odd py[i] (e.g. 4 -> 5 with py[i] == 5), so the stated
   "x<y" is really x <= y -- confirm whether that matters to `function'.  */
2854 #define SPEED_ROUTINE_MPN_JACBASE(function)				\
2855   SPEED_ROUTINE_MPN_GCD_1_CALL						\
2856     ({									\
2857        /* require x<y, y odd, y!=1 */					\
2858        px[i] %= py[i];							\
2859        px[i] |= 1;							\
2860        py[i] |= 1;							\
2861        if (py[i]==1) py[i]=3;						\
2862      },									\
2863      function (px[j-1], py[j-1], 0))
2864 
/* Speed-routine body timing `function' called with four limbs and a pointer
   to a struct hgcd_matrix1.

   Operands are taken pairwise from s->xp_block and s->yp_block (index j and
   j+1, with j stepping by 2).  The first limb of each pair has
   GMP_NUMB_HIGHBIT OR'ed in.  `chain' accumulates m.u[0][0] after every
   call and is added to the second x limb of the next call, making
   successive calls data-dependent; it is handed to noop_1 after timing so
   the accumulation is not optimised away.

   There is no size restriction here and s->size is not used.
   s->time_divisor is set to SPEED_BLOCK_SIZE / 2, the number of calls per
   repetition.  Expects `s' in scope and returns the elapsed time.  */
2865 #define SPEED_ROUTINE_MPN_HGCD2(function)				\
2866   {									\
2867     unsigned   i, j;							\
2868     struct hgcd_matrix1 m = {{{0,0},{0,0}}};				\
2869     double     t;							\
2870 									\
2871     speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
2872     speed_operand_src (s, s->yp_block, SPEED_BLOCK_SIZE);		\
2873     speed_cache_fill (s);						\
2874 									\
2875     speed_starttime ();							\
2876     i = s->reps;							\
2877     mp_limb_t chain = 0;						\
2878     do									\
2879       {									\
2880 	for (j = 0; j < SPEED_BLOCK_SIZE; j+= 2)			\
2881 	  {								\
2882 	    /* randomized but successively dependent */			\
2883 	    function (s->xp_block[j] | GMP_NUMB_HIGHBIT,		\
2884 		      s->xp_block[j+1] + chain,				\
2885 		      s->yp_block[j] | GMP_NUMB_HIGHBIT,		\
2886 		      s->yp_block[j+1], &m);				\
2887 	    chain += m.u[0][0];						\
2888 	  }								\
2889       }									\
2890     while (--i != 0);							\
2891     t = speed_endtime ();						\
2892 									\
2893     /* make sure the compiler won't optimize away chain */		\
2894     noop_1 (chain);							\
2895 									\
2896     s->time_divisor = SPEED_BLOCK_SIZE / 2;				\
2897     return t;								\
2898   }
2899 
/* Speed-routine body timing `func' called as
   func (ap, bp, s->size, &hgcd, wp).

   itchfunc (s->size) gives the number of scratch limbs allocated for wp;
   MPN_HGCD_MATRIX_INIT_ITCH (s->size) gives the size of tmp1, the storage
   handed to mpn_hgcd_matrix_init.

   Note that the top limbs of s->xp and s->yp are modified in place (low bit
   OR'ed in), so the caller's operand data is changed by this macro.  The
   copies into ap / bp and the matrix initialisation are inside the timed
   loop and therefore counted.  `res' receives the return value of func but
   is not read afterwards.

   Returns -1 when s->size < 2, otherwise the elapsed time.  Expects `s' in
   scope.  */
2900 #define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)			\
2901   {									\
2902     mp_size_t hgcd_init_itch, hgcd_itch;				\
2903     mp_ptr ap, bp, wp, tmp1;						\
2904     struct hgcd_matrix hgcd;						\
2905     int res;								\
2906     unsigned i;								\
2907     double t;								\
2908     TMP_DECL;								\
2909 									\
2910     if (s->size < 2)							\
2911       return -1;							\
2912 									\
2913     TMP_MARK;								\
2914 									\
2915     SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2916     SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2917 									\
2918     s->xp[s->size - 1] |= 1;						\
2919     s->yp[s->size - 1] |= 1;						\
2920 									\
2921     hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2922     hgcd_itch = itchfunc (s->size);					\
2923 									\
2924     SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2925     SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp);			\
2926 									\
2927     speed_operand_src (s, s->xp, s->size);				\
2928     speed_operand_src (s, s->yp, s->size);				\
2929     speed_operand_dst (s, ap, s->size + 1);				\
2930     speed_operand_dst (s, bp, s->size + 1);				\
2931     speed_operand_dst (s, wp, hgcd_itch);				\
2932     speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2933     speed_cache_fill (s);						\
2934 									\
2935     speed_starttime ();							\
2936     i = s->reps;							\
2937     do									\
2938       {									\
2939 	MPN_COPY (ap, s->xp, s->size);					\
2940 	MPN_COPY (bp, s->yp, s->size);					\
2941 	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2942 	res = func (ap, bp, s->size, &hgcd, wp);			\
2943       }									\
2944     while (--i != 0);							\
2945     t = speed_endtime ();						\
2946     TMP_FREE;								\
2947     return t;								\
2948   }
2949 
/* Speed-routine body timing `func' called as
   func (&hgcd, ap, bp, s->size, p, wp), with p fixed at s->size/2.

   itchfunc (s->size, p) gives the number of scratch limbs allocated for wp;
   MPN_HGCD_MATRIX_INIT_ITCH (s->size) gives the size of tmp1, the storage
   handed to mpn_hgcd_matrix_init.

   As in the plain hgcd variant, the top limbs of s->xp and s->yp are
   modified in place (low bit OR'ed in), and the operand copies plus the
   matrix initialisation are inside the timed loop.  `res' receives the
   return value of func but is not read afterwards.

   Returns -1 when s->size < 2, otherwise the elapsed time.  Expects `s' in
   scope.  */
2950 #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)		\
2951   {									\
2952     mp_size_t hgcd_init_itch, hgcd_step_itch;				\
2953     mp_ptr ap, bp, wp, tmp1;						\
2954     struct hgcd_matrix hgcd;						\
2955     mp_size_t p = s->size/2;						\
2956     int res;								\
2957     unsigned i;								\
2958     double t;								\
2959     TMP_DECL;								\
2960 									\
2961     if (s->size < 2)							\
2962       return -1;							\
2963 									\
2964     TMP_MARK;								\
2965 									\
2966     SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);		\
2967     SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);		\
2968 									\
2969     s->xp[s->size - 1] |= 1;						\
2970     s->yp[s->size - 1] |= 1;						\
2971 									\
2972     hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);		\
2973     hgcd_step_itch = itchfunc (s->size, p);				\
2974 									\
2975     SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp);		\
2976     SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp);			\
2977 									\
2978     speed_operand_src (s, s->xp, s->size);				\
2979     speed_operand_src (s, s->yp, s->size);				\
2980     speed_operand_dst (s, ap, s->size + 1);				\
2981     speed_operand_dst (s, bp, s->size + 1);				\
2982     speed_operand_dst (s, wp, hgcd_step_itch);				\
2983     speed_operand_dst (s, tmp1, hgcd_init_itch);			\
2984     speed_cache_fill (s);						\
2985 									\
2986     speed_starttime ();							\
2987     i = s->reps;							\
2988     do									\
2989       {									\
2990 	MPN_COPY (ap, s->xp, s->size);					\
2991 	MPN_COPY (bp, s->yp, s->size);					\
2992 	mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);			\
2993 	res = func (&hgcd, ap, bp, s->size, p, wp);			\
2994       }									\
2995     while (--i != 0);							\
2996     t = speed_endtime ();						\
2997     TMP_FREE;								\
2998     return t;								\
2999   }
3000 
3001 /* Run some GCDs of s->size limbs each.  The number of different data values
3002    is decreased as s->size**2, since GCD is a quadratic algorithm.
3003    SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
3004    though, because the plain gcd is about twice as fast as gcdext.  */
3005 
/* Parameters and locals of this macro:

   datafactor  scales the number of operand pairs: pieces starts as
               SPEED_BLOCK_SIZE * datafactor / s->size^2, is capped at
               SPEED_BLOCK_SIZE / s->size and is at least 1.
   call        the timed statement; it may refer to wp, wp2 (s->size+1 limb
               outputs), xtmp, ytmp (the operands for this call) and `s'.

   px / py hold `pieces' operands of s->size limbs each, copied from
   s->xp / s->yp when pieces == 1 and from s->xp_block / s->yp_block
   otherwise.  Each pair is adjusted so both high limbs are non-zero, the
   high limb of x is strictly greater than that of y, and y is odd.

   Each pair is copied into xtmp / ytmp inside the timed loop before `call',
   so those copies are part of the measured time.  s->time_divisor is set to
   pieces.  Sizes below 1 are rejected.  Returns the elapsed time.  */
3006 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)			\
3007   {									\
3008     unsigned  i;							\
3009     mp_size_t j, pieces, psize;						\
3010     mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
3011     double    t;							\
3012     TMP_DECL;								\
3013 									\
3014     SPEED_RESTRICT_COND (s->size >= 1);					\
3015 									\
3016     TMP_MARK;								\
3017     SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
3018     SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
3019     SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);		\
3020     SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);		\
3021 									\
3022     pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;		\
3023     pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);			\
3024     pieces = MAX (pieces, 1);						\
3025 									\
3026     psize = pieces * s->size;						\
3027     px = TMP_ALLOC_LIMBS (psize);					\
3028     py = TMP_ALLOC_LIMBS (psize);					\
3029     MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
3030     MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
3031 									\
3032     /* Requirements: x >= y, y must be odd, high limbs != 0.		\
3033        No need to ensure random numbers are really great.  */		\
3034     for (j = 0; j < pieces; j++)					\
3035       {									\
3036 	mp_ptr	x = px + j * s->size;					\
3037 	mp_ptr	y = py + j * s->size;					\
3038 	if (x[s->size - 1] == 0) x[s->size - 1] = 1;			\
3039 	if (y[s->size - 1] == 0) y[s->size - 1] = 1;			\
3040 									\
3041 	if (x[s->size - 1] < y[s->size - 1])				\
3042 	  MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);		\
3043 	else if (x[s->size - 1] == y[s->size - 1])			\
3044 	  {								\
3045 	    x[s->size - 1] = 2;						\
3046 	    y[s->size - 1] = 1;						\
3047 	  }								\
3048 	y[0] |= 1;							\
3049       }									\
3050 									\
3051     speed_operand_src (s, px, psize);					\
3052     speed_operand_src (s, py, psize);					\
3053     speed_operand_dst (s, xtmp, s->size);				\
3054     speed_operand_dst (s, ytmp, s->size);				\
3055     speed_operand_dst (s, wp, s->size);					\
3056     speed_cache_fill (s);						\
3057 									\
3058     speed_starttime ();							\
3059     i = s->reps;							\
3060     do									\
3061       {									\
3062 	j = pieces;							\
3063 	do								\
3064 	  {								\
3065 	    MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);		\
3066 	    MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);		\
3067 	    call;							\
3068 	  }								\
3069 	while (--j != 0);						\
3070       }									\
3071     while (--i != 0);							\
3072     t = speed_endtime ();						\
3073 									\
3074     TMP_FREE;								\
3075 									\
3076     s->time_divisor = pieces;						\
3077     return t;								\
3078   }
3079 
/* SPEED_ROUTINE_MPN_GCD_CALL with datafactor 8 and the timed call
   function (wp, xtmp, s->size, ytmp, s->size).  */
3080 #define SPEED_ROUTINE_MPN_GCD(function)	\
3081   SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
3082 
/* SPEED_ROUTINE_MPN_GCD_CALL with datafactor 4.  The timed statement is a
   block declaring a local wp2size and calling
   function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); the value
   stored in wp2size is not used afterwards.  */
3083 #define SPEED_ROUTINE_MPN_GCDEXT(function)				\
3084   SPEED_ROUTINE_MPN_GCD_CALL						\
3085     (4, { mp_size_t  wp2size;						\
3086 	  function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
3087 
3088 
/* Speed-routine body timing `function' called as
   function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size).

   xtmp and ytmp start out as copies of s->xp and s->yp.  px / py hold
   `pieces' = SPEED_BLOCK_SIZE / 3 three-limb pieces copied from
   s->xp_block / s->yp_block, adjusted so that the top limb of each piece is
   non-zero and the top limb of x is not smaller than that of y.  Before
   every call only the three most significant limbs of xtmp / ytmp are
   overwritten with the next piece, and ytmp[0] is forced odd; those stores
   are inside the timed loop.

   Because three limbs are stored at xtmp[s->size-3 ..] and
   ytmp[s->size-3 ..], s->size must be at least 3; smaller sizes are
   rejected by SPEED_RESTRICT_COND instead of writing in front of the
   buffers.

   s->time_divisor is set to pieces.  Expects `s' in scope and returns the
   elapsed time.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)				\
  {									\
    unsigned  i;							\
    mp_size_t j, pieces, psize, wp2size;				\
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;				\
    double    t;							\
    TMP_DECL;								\
									\
    /* the timed loop stores into limbs s->size-3 .. s->size-1 */	\
    SPEED_RESTRICT_COND (s->size >= 3);					\
									\
    TMP_MARK;								\
									\
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);		\
    MPN_COPY (xtmp, s->xp, s->size);					\
    MPN_COPY (ytmp, s->yp, s->size);					\
									\
    SPEED_TMP_ALLOC_LIMBS (wp,	s->size+1, s->align_wp);		\
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);		\
									\
    pieces = SPEED_BLOCK_SIZE / 3;					\
    psize = 3 * pieces;							\
    px = TMP_ALLOC_LIMBS (psize);					\
    py = TMP_ALLOC_LIMBS (psize);					\
    MPN_COPY (px, s->xp_block, psize);					\
    MPN_COPY (py, s->yp_block, psize);					\
									\
    /* x must have at least as many bits as y,				\
       high limbs must be non-zero */					\
    for (j = 0; j < pieces; j++)					\
      {									\
	mp_ptr	x = px+3*j;						\
	mp_ptr	y = py+3*j;						\
	x[2] += (x[2] == 0);						\
	y[2] += (y[2] == 0);						\
	if (x[2] < y[2])						\
	  MP_LIMB_T_SWAP (x[2], y[2]);					\
      }									\
									\
    speed_operand_src (s, px, psize);					\
    speed_operand_src (s, py, psize);					\
    speed_operand_dst (s, xtmp, s->size);				\
    speed_operand_dst (s, ytmp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	mp_ptr	x = px;							\
	mp_ptr	y = py;							\
	mp_ptr	xth = &xtmp[s->size-3];					\
	mp_ptr	yth = &ytmp[s->size-3];					\
	j = pieces;							\
	do								\
	  {								\
	    xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];		\
	    yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];		\
									\
	    ytmp[0] |= 1; /* y must be odd, */				\
									\
	    function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size);	\
									\
	    x += 3;							\
	    y += 3;							\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
									\
    s->time_divisor = pieces;						\
    return t;								\
  }
3166 
3167 /* Calculate nextprime(n) for random n of s->size bits (not limbs). */
/* Details of the body below: n is built from the s->size limbs at s->xp and
   then reduced to its low s->size bits, with bit s->size-1 set and bit
   s->size-2 cleared.  Only s->size >= 10 is accepted.

   Each repetition makes SPEED_BLOCK_SIZE calls: one function (wp, n)
   followed by SPEED_BLOCK_SIZE - 1 calls of function (wp, wp), each
   starting from the previous result.  s->time_divisor is set to
   SPEED_BLOCK_SIZE.  Expects `s' in scope and returns the elapsed time.  */
3168 #define SPEED_ROUTINE_MPZ_NEXTPRIME(function)				\
3169   {									\
3170     unsigned  i, j;							\
3171     mpz_t     wp, n;							\
3172     double    t;							\
3173 									\
3174     SPEED_RESTRICT_COND (s->size >= 10);				\
3175 									\
3176     mpz_init (wp);							\
3177     mpz_init_set_n (n, s->xp, s->size);					\
3178     /* limit to s->size bits, as this function is very slow */		\
3179     mpz_tdiv_r_2exp (n, n, s->size);					\
3180     /* set high bits so operand and result are general s->size bits */	\
3181     mpz_setbit (n, s->size - 1);					\
3182     mpz_clrbit (n, s->size - 2);					\
3183 									\
3184     speed_starttime ();							\
3185     i = s->reps;							\
3186     do									\
3187       {									\
3188         /* nextprime timing is variable, so average over many calls */	\
3189         j = SPEED_BLOCK_SIZE - 1;					\
3190         /* starts on random, after measures prime to next prime */	\
3191         function (wp, n);						\
3192         do								\
3193           {								\
3194             function (wp, wp);						\
3195           }								\
3196         while (--j != 0);						\
3197       }									\
3198     while (--i != 0);							\
3199     t = speed_endtime ();						\
3200 									\
3201     mpz_clear (wp);							\
3202     mpz_clear (n);							\
3203 									\
3204     s->time_divisor = SPEED_BLOCK_SIZE;					\
3205     return t;								\
3206   }
3207 
/* Speed-routine body timing `function' called as function (a, b) on
   `pieces' pairs of s->size limb operands.

   px / py hold the operand data, copied from s->xp / s->yp when
   pieces == 1 and from s->xp_block / s->yp_block otherwise.  Every y is
   forced odd and every operand gets a non-zero high limb.  The mpz_t
   variables a and b are never mpz_init'ed: only their size and limb pointer
   fields are set (via SIZ and PTR), pointing straight into px / py, so they
   must not be cleared or reallocated.

   The setup indexes limb s->size-1 of every operand, so sizes below 1 are
   rejected by SPEED_RESTRICT_COND.

   s->time_divisor is set to pieces.  Expects `s' in scope and returns the
   elapsed time.  */
#define SPEED_ROUTINE_MPZ_JACOBI(function)				\
  {									\
    mpz_t     a, b;							\
    unsigned  i;							\
    mp_size_t j, pieces, psize;						\
    mp_ptr    px, py;							\
    double    t;							\
    TMP_DECL;								\
									\
    /* x[s->size-1] and y[0] are accessed below */			\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    pieces = SPEED_BLOCK_SIZE / s->size;				\
    pieces = MAX (pieces, 1);						\
    s->time_divisor = pieces;						\
									\
    psize = pieces * s->size;						\
    px = TMP_ALLOC_LIMBS (psize);					\
    py = TMP_ALLOC_LIMBS (psize);					\
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);		\
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);		\
									\
    for (j = 0; j < pieces; j++)					\
      {									\
	mp_ptr	x = px+j*s->size;					\
	mp_ptr	y = py+j*s->size;					\
									\
	/* y odd */							\
	y[0] |= 1;							\
									\
	/* high limbs non-zero */					\
	if (x[s->size-1] == 0) x[s->size-1] = 1;			\
	if (y[s->size-1] == 0) y[s->size-1] = 1;			\
      }									\
									\
    SIZ(a) = s->size;							\
    SIZ(b) = s->size;							\
									\
    speed_operand_src (s, px, psize);					\
    speed_operand_src (s, py, psize);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = pieces;							\
	do								\
	  {								\
	    PTR(a) = px+(j-1)*s->size;					\
	    PTR(b) = py+(j-1)*s->size;					\
	    function (a, b);						\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3267 
/* Speed-routine body timing `function' called as
   function (wp, 0, xp, s->size, yp).

   xp is a private copy of the s->size limbs at s->xp; it is made once,
   before the timer starts, and is not refreshed between repetitions.  yp is
   a two-limb divisor taken from the first two limbs of s->yp_block with
   GMP_NUMB_HIGHBIT set in its high limb.  wp is an s->size limb output
   buffer.  Sizes below 2 are rejected.  Expects `s' in scope and returns
   the elapsed time.  */
3268 #define SPEED_ROUTINE_MPN_DIVREM_2(function)				\
3269   {									\
3270     mp_ptr    wp, xp;							\
3271     mp_limb_t yp[2];							\
3272     unsigned  i;							\
3273     double    t;							\
3274     TMP_DECL;								\
3275 									\
3276     SPEED_RESTRICT_COND (s->size >= 2);					\
3277 									\
3278     TMP_MARK;								\
3279     SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);			\
3280     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
3281 									\
3282     /* source is destroyed */						\
3283     MPN_COPY (xp, s->xp, s->size);					\
3284 									\
3285     /* divisor must be normalized */					\
3286     MPN_COPY (yp, s->yp_block, 2);					\
3287     yp[1] |= GMP_NUMB_HIGHBIT;						\
3288 									\
3289     speed_operand_src (s, xp, s->size);					\
3290     speed_operand_src (s, yp, 2);					\
3291     speed_operand_dst (s, wp, s->size);					\
3292     speed_cache_fill (s);						\
3293 									\
3294     speed_starttime ();							\
3295     i = s->reps;							\
3296     do									\
3297       function (wp, 0, xp, s->size, yp);				\
3298     while (--i != 0);							\
3299     t = speed_endtime ();						\
3300 									\
3301     TMP_FREE;								\
3302     return t;								\
3303   }
3304 
/* Speed-routine body timing `function' called as
   function (wp, wp+s->size-1, s->xp, s->size, d).

   wp is an s->size limb output buffer; its last limb is passed separately
   as the second argument.  The single-limb divisor d is s->r, or 1 when
   s->r is 0.  The dividend is read directly from s->xp.  The return value
   of `function' is stored in r but not examined.  Sizes below 1 are
   rejected.  Expects `s' in scope and returns the elapsed time.  */
#define SPEED_ROUTINE_MPN_DIV_QR_1(function)				\
  {									\
    mp_ptr    wp;							\
    mp_limb_t d;							\
    mp_limb_t r;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* a zero divisor is replaced by 1 */				\
    d = s->r;								\
    if (d == 0)								\
      d = 1;								\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      r = function (wp, wp+s->size-1, s->xp, s->size, d);		\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3336 
/* Speed-routine body timing `function' called as
   function (wp, s->xp, s->size, 0, d, dinv).

   The single-limb divisor d is s->r and must have GMP_NUMB_HIGHBIT set;
   otherwise the routine is rejected via SPEED_RESTRICT_COND.  dinv is
   obtained from invert_limb (dinv, d) before the timer starts.  wp is an
   s->size limb output buffer and the dividend is read directly from s->xp.
   The return value of `function' is stored in r but not examined.

   Both restrictions are checked before TMP_MARK, so that a rejected call
   never leaves the function between TMP_MARK and TMP_FREE.

   Expects `s' in scope and returns the elapsed time.  */
#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function)			\
  {									\
    mp_ptr    wp;							\
    mp_limb_t d, dinv;							\
    mp_limb_t r;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    d = s->r;								\
    /* divisor must be normalized */					\
    SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT);				\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    invert_limb (dinv, d);						\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      r = function (wp, s->xp, s->size, 0, d, dinv);			\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3369 
/* Time FUNCTION called as function (wp, rp, s->xp, s->size, yp), with a two
   limb yp copied from s->yp_block and a two limb rp.  NORM selects the
   divisor form: when non-zero the high bit of yp[1] is forced on, otherwise
   it is cleared and yp[1] is kept non-zero.  Requires s->size >= 2.  */
#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)			\
  {									\
    mp_ptr    wp;							\
    mp_limb_t yp[2];							\
    mp_limb_t rp[2];							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* normalized divisor if norm, else high bit clear but yp[1] != 0 */	\
    MPN_COPY (yp, s->yp_block, 2);					\
    if (norm)								\
      yp[1] |= GMP_NUMB_HIGHBIT;					\
    else								\
      {									\
	yp[1] &= ~GMP_NUMB_HIGHBIT;					\
	if (yp[1] == 0)							\
	  yp[1] = 1;							\
      }									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_src (s, yp, 2);					\
    speed_operand_dst (s, wp, s->size);					\
    speed_operand_dst (s, rp, 2);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (wp, rp, s->xp, s->size, yp);				\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3410 
/* Time FUNCTION invoked as function (n, n), SPEED_BLOCK_SIZE times per rep.
   n starts at 1 and before each call has twice a limb of s->xp_block added
   to it, so each call depends on the previous one.  s->time_divisor is set
   to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function)				\
  {									\
    unsigned   i, j;							\
    mp_ptr     xp;							\
    mp_limb_t  n = 1;							\
    double     t;							\
									\
    /* Index as xp[j-1] instead of biasing the pointer to		\
       s->xp_block-1, which would point before the block.  */		\
    xp = s->xp_block;							\
									\
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    /* randomized but successively dependent */			\
	    n += (xp[j-1] << 1);					\
									\
	    function (n, n);						\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* make sure the compiler won't optimize away n */			\
    noop_1 (n);								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return t;								\
  }
3446 
3447 
/* Time the statement CALL, executed s->reps times.  CALL may refer to the
   two destination areas wp and wp2 allocated here (s->size limbs each) and
   to the source s->xp.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    mp_ptr    wp2;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    /* two result areas, each as long as the source */			\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);			\
									\
    /* register the operands, then prime the cache */			\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_operand_dst (s, wp2, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3476 
3477 
/* Time FUNCTION (limbs, size) on a worst case operand for perfect_power,
   namely one with multiple prime factors larger than the trial division
   limit.  The operand is (2^17-1)^(e-1) * (2^16+1) with
   e = s->size * GMP_NUMB_BITS / 17.  Requires s->size >= 10.  */
#define SPEED_ROUTINE_MPN_PERFECT_POWER(function)			\
  {									\
    mpz_t     operand;							\
    unsigned  reps, exponent;						\
    double    elapsed;							\
									\
    SPEED_RESTRICT_COND (s->size >= 10);				\
									\
    exponent = s->size * GMP_NUMB_BITS / 17;				\
									\
    mpz_init (operand);							\
    mpz_ui_pow_ui (operand, (1 << 17) - 1, exponent - 1);		\
    /* second factor is larger than the 1000th prime */			\
    mpz_mul_ui (operand, operand, (1 << 16) + 1);			\
									\
    speed_starttime ();							\
    reps = s->reps;							\
    do									\
      {									\
	function (PTR(operand), SIZ(operand));				\
      }									\
    while (--reps != 0);						\
    elapsed = speed_endtime ();						\
									\
    mpz_clear (operand);						\
    return elapsed;							\
  }
3503 
/* Time FUNCTION (limbs, size) on a worst case (larger prime) operand for
   perfect_square.  The operand is the square of a value made from the low
   s->size/2 limbs of s->xp with bit s->size*GMP_NUMB_BITS/2 - 1 set.
   Requires s->size >= 2.  */
#define SPEED_ROUTINE_MPN_PERFECT_SQUARE(function)			\
  {									\
    mpz_t     sq;							\
    unsigned  reps;							\
    double    elapsed;							\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    /* build the root in sq, then square it in place */			\
    mpz_init_set_n (sq, s->xp, s->size / 2);				\
    mpz_setbit (sq, s->size * GMP_NUMB_BITS / 2 - 1);			\
    mpz_mul (sq, sq, sq);						\
									\
    speed_starttime ();							\
    reps = s->reps;							\
    do									\
      {									\
	function (PTR(sq), SIZ(sq));					\
      }									\
    while (--reps != 0);						\
    elapsed = speed_endtime ();						\
									\
    mpz_clear (sq);							\
    return elapsed;							\
  }
3526 
3527 
/* Time FUNCTION (str, base, limbs, size) converting the s->size limbs of
   s->xp.  The base is s->r, or decimal when s->r is zero, and must lie in
   2..256.  The source is copied to a scratch area before every call, and
   that copy is part of the timed loop.  */
#define SPEED_ROUTINE_MPN_GET_STR(function)				\
  {									\
    unsigned char *str;							\
    mp_size_t      str_alloc;						\
    mp_ptr         scratch;						\
    unsigned       reps;						\
    int            base;						\
    double         elapsed;						\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    if (s->r == 0)							\
      base = 10;							\
    else								\
      base = s->r;							\
    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (scratch, s->size + 1, s->align_xp);		\
									\
    MPN_SIZEINBASE (str_alloc, s->xp, s->size, base);			\
    str = (unsigned char *) TMP_ALLOC (str_alloc);			\
									\
    /* use this during development to guard against overflowing str */	\
    /*									\
    MPN_COPY (scratch, s->xp, s->size);					\
    ASSERT_ALWAYS (mpn_get_str (str, base, scratch, s->size)		\
		   <= str_alloc);					\
    */									\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, scratch, s->size);				\
    speed_operand_dst (s, (mp_ptr) str, str_alloc/GMP_LIMB_BYTES);	\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    reps = s->reps;							\
    do									\
      {									\
	/* fresh copy of the source for each conversion */		\
	MPN_COPY (scratch, s->xp, s->size);				\
	function (str, base, scratch, s->size);				\
      }									\
    while (--reps != 0);						\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
3575 
/* Time the statement CALL on a string of s->size digits.  The base is s->r,
   or decimal when s->r is zero, and must lie in 2..256.  CALL may refer to
   the digit string xp (one digit per byte, each being a limb of s->xp
   reduced mod base), the destination wp of wn limbs, and base.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)				\
  {									\
    unsigned char *xp;							\
    mp_ptr     wp;							\
    mp_size_t  wn;							\
    unsigned   i;							\
    int        base;							\
    double     t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    base = s->r == 0 ? 10 : s->r;					\
    SPEED_RESTRICT_COND (base >= 2 && base <= 256);			\
									\
    TMP_MARK;								\
									\
    /* digit string: one byte per digit, derived from the limb data */	\
    xp = (unsigned char *) TMP_ALLOC (s->size);				\
    for (i = 0; i < s->size; i++)					\
      {									\
	xp[i] = s->xp[i] % base;					\
      }									\
									\
    /* destination sized from the digit count and base */		\
    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);			\
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
									\
    /* use this during development to check wn is big enough */		\
    /*									\
    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn);		\
    */									\
									\
    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES);		\
    speed_operand_dst (s, wp, wn);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3621 
3622 
/* Run an accel gcd find_a() function over a fixed set of SPEED_BLOCK_SIZE
   two-limb values, built from s->xp_block (low limbs) and s->yp_block (high
   limbs).  Several values are used in case some run particularly fast or
   slow.  s->size is ignored; s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
  {									\
    mp_limb_t pairs[SPEED_BLOCK_SIZE][2];				\
    unsigned  rep, k;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
									\
    /* low must be odd, high must be non-zero */			\
    for (k = 0; k < SPEED_BLOCK_SIZE; k++)				\
      {									\
	mp_limb_t  high = s->yp_block[k];				\
	if (high == 0)							\
	  high = 1;							\
	pairs[k][0] = s->xp_block[k] | 1;				\
	pairs[k][1] = high;						\
      }									\
									\
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      {									\
	/* walk the pairs from last to first */				\
	for (k = SPEED_BLOCK_SIZE; k != 0; k--)				\
	  function (pairs[k-1]);					\
      }									\
    while (--rep != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return elapsed;							\
  }
3665 
3666 
/* The count zeros routine body is written in two halves.
   SPEED_ROUTINE_COUNT_ZEROS_A opens the body and both loops and leaves the
   next operand in n; it is to be followed by a "call" and then by
   SPEED_ROUTINE_COUNT_ZEROS_B, which closes the loops.

   "call" should do "count_foo_zeros(c,n)".
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   If speed_routine_count_zeros_setup fails, the temporary space is released
   and -1.0 is returned.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
  {									\
    mp_ptr     xp;							\
    int        i, c;							\
    unsigned   j;							\
    mp_limb_t  n;							\
    double     t;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
									\
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
      {									\
	/* don't return with xp still allocated */			\
	TMP_FREE;							\
	return -1.0;							\
      }									\
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
    speed_cache_fill (s);						\
									\
    c = 0;								\
    speed_starttime ();							\
    j = s->reps;							\
    do {								\
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
	{								\
	  n = xp[i];							\
	  /* mix in c so each operand depends on the value c held */	\
	  n ^= c;
3696 
/* Closing half of the routine body opened by SPEED_ROUTINE_COUNT_ZEROS_A:
   ends the per-limb loop and the reps loop, stops the clock, sets
   s->time_divisor to SPEED_BLOCK_SIZE and returns the time.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
	}								\
    } while (--j != 0);							\
    t = speed_endtime ();						\
									\
    /* keep c live so the counting is not optimized away */		\
    noop_1 (c);								\
									\
    TMP_FREE;								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return t;								\
  }
3710 
/* Complete count zeros routine: the _A opening half, then "call" as the
   timed statement, then the _B closing half.  leading and zero are passed
   through to SPEED_ROUTINE_COUNT_ZEROS_A.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
  do									\
    {									\
      SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);			\
      call;								\
      SPEED_ROUTINE_COUNT_ZEROS_B ();					\
    }									\
  while (0)
3717 
/* Leading and trailing flavours of SPEED_ROUTINE_COUNT_ZEROS_C.  The _C
   forms take a complete "call" statement and the zero flag.  The plain
   forms take a function or macro name "fun", invoke it as fun (c, n), and
   pass zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)

#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
/* The parameter must be named fun to match the expansion; it used to be
   named call, which left fun unresolved and ignored the argument.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3727 
3728 
/* Time the statement CALL, run SPEED_BLOCK_SIZE times per rep.  Before each
   run d is set to dinv xor a limb of s->xp_block, with GMP_LIMB_HIGHBIT
   forced on; CALL is expected to use d and dinv.  dinv starts at 0 and is
   handed to noop_1 once the clock has been stopped.  s->time_divisor is set
   to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
  {									\
    unsigned   i, j;							\
    mp_limb_t  d, dinv=0;						\
    /* indexed as xp[j-1]; s->xp_block-1 would point before the block */	\
    mp_ptr     xp = s->xp_block;					\
    double     t;							\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    d = dinv ^ xp[j-1];						\
	    d |= GMP_LIMB_HIGHBIT;					\
	    do { call; } while (0);					\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* don't let the compiler optimize everything away */		\
    noop_1 (dinv);							\
									\
    return t;								\
  }
3757 
3758 
/* Time s->reps consecutive calls of FUNCTION, which takes no arguments.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
  {									\
    unsigned  remaining;						\
									\
    speed_starttime ();							\
    remaining = s->reps;						\
    do									\
      {									\
	function ();							\
      }									\
    while (--remaining != 0);						\
									\
    return speed_endtime ();						\
  }
3769 
3770 
/* Time the statement CALL, executed s->reps times.  CALL may refer to the
   destination wp of s->size limbs allocated here.  Requires s->size >= 0.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* the destination is the only operand */				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }

/* As SPEED_ROUTINE_MPN_ZERO_CALL, with FUNCTION invoked as
   function (wp, s->size).  */
#define SPEED_ROUTINE_MPN_ZERO(function)				\
  SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))
3798 
3799 
3800 #endif
3801