xref: /netbsd-src/external/lgpl3/gmp/dist/tune/speed.h (revision 5dd36a3bc8bf2a9dec29ceb6349550414570c447)
1 /* Header for speed and threshold things.
2 
3 Copyright 1999-2003, 2005, 2006, 2008-2015 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 #ifndef __SPEED_H__
32 #define __SPEED_H__
33 
34 
35 /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
36    newsize long.  ASSERTs newsize >= oldsize; each size is expanded twice. */
37 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize)		\
38   do {							\
39     ASSERT ((newsize) >= (oldsize));			\
40     MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));	\
41   } while (0)
42 
43 /* A mask of the least significant n bits, 0 <= n <= GMP_LIMB_BITS.  A shift by
44    the full width is undefined in C (1<<32 isn't zero on x86), hence the ?: test. */
45 #define MP_LIMB_T_LOWBITMASK(n)	\
46   ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
47 
48 
49 /* align must be a power of 2 here; usually CACHE_LINE_SIZE is a good choice. */
50 
51 #define TMP_ALLOC_ALIGNED(bytes, align)	\
52   align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))  /* align-1 bytes of slack */
53 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
54   ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))  /* size given in limbs */
55 
56 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the
57    limit on what s->align_xp etc can then request for off-alignment.  Maybe
58    this should be an option of some sort, but in any case here are some line
59    sizes,
60 
61        bytes
62 	 32   pentium
63 	 64   athlon
64 	 64   itanium-2 L1
65 	128   itanium-2 L2
66 */
67 #define CACHE_LINE_SIZE   64 /* bytes */
68 /* Mask for a limb offset within a cache line (limbs per line, minus 1). */
69 #define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/GMP_LIMB_BYTES - 1)
70 
/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment: the address of the block, counted in limbs, is made congruent
   to align modulo the number of limbs in a cache line.
   SPEED_TMP_ALLOC_ADJUST_MASK extra limbs are allocated to leave room for
   that adjustment.

   The address is turned into a limb count with an integer cast.  Subtracting
   a null pointer to get the same number is undefined behaviour, since
   pointer subtraction is only defined within one array object.  Only the low
   bits selected by the mask are kept, so the narrowing to mp_size_t cannot
   lose anything that matters.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
  do {									\
    mp_ptr     __ptr;							\
    mp_size_t  __ptr_align, __ptr_add;					\
									\
    ASSERT ((CACHE_LINE_SIZE % GMP_LIMB_BYTES) == 0);			\
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
    /* current position of __ptr within its cache line, in limbs */	\
    __ptr_align = (mp_size_t) (((size_t) __ptr / GMP_LIMB_BYTES)	\
			       & SPEED_TMP_ALLOC_ADJUST_MASK);		\
    /* limbs to skip to reach the requested position */			\
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
    (ptr) = __ptr + __ptr_add;						\
  } while (0)
84 
85 
86 /* This is the size for s->xp_block and s->yp_block, used in certain
87    routines that want to run across many different data values and use
88    s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
89 
90    512 limbs is 2kbytes with 4-byte limbs or 4kbytes with 8-byte limbs for each
91    of xp_block and yp_block, which should fit easily in any L1 data cache. */
92 
93 #define SPEED_BLOCK_SIZE   512 /* limbs */
94 
95 
96 extern double  speed_unittime;
97 extern double  speed_cycletime;
98 extern int     speed_precision;
99 extern char    speed_time_string[];
100 void speed_time_init (void);
101 void speed_cycletime_fail (const char *str);
102 void speed_cycletime_init (void);
103 void speed_cycletime_need_cycles (void);
104 void speed_cycletime_need_seconds (void);
105 void speed_starttime (void);
106 double speed_endtime (void);
107 
108 
109 struct speed_params {	/* parameters handed to every speed_xxx routine */
110   unsigned   reps;	/* how many times to run the routine in the timed loop */
111   mp_ptr     xp;	/* first argument */
112   mp_ptr     yp;	/* second argument */
113   mp_size_t  size;	/* size of both arguments, in limbs */
114   mp_limb_t  r;		/* user supplied parameter, meaning depends on the routine */
115   mp_size_t  align_xp;	/* alignment of xp */
116   mp_size_t  align_yp;	/* alignment of yp */
117   mp_size_t  align_wp;	/* intended alignment of wp, in limbs */
118   mp_size_t  align_wp2; /* intended alignment of wp2 */
119   mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
120   mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
121 
122   double     time_divisor; /* optionally set by the speed routine */
123 
124   /* used by the cache priming things */
125   int	     cache;
126   unsigned   src_num, dst_num;	/* presumably entries in use in src[], dst[] -- confirm */
127   struct {
128     mp_ptr    ptr;
129     mp_size_t size;
130   } src[5], dst[4];	/* presumably filled by speed_operand_src/dst -- confirm */
131 };
132 
133 typedef double (*speed_function_t) (struct speed_params *);
134 
135 double speed_measure (speed_function_t fun, struct speed_params *);
136 
137 /* Prototypes for speed measuring routines; each has the speed_function_t signature. */
138 
139 double speed_back_to_back (struct speed_params *);
140 double speed_count_leading_zeros (struct speed_params *);
141 double speed_count_trailing_zeros (struct speed_params *);
142 double speed_find_a (struct speed_params *);
143 double speed_gmp_allocate_free (struct speed_params *);
144 double speed_gmp_allocate_reallocate_free (struct speed_params *);
145 double speed_invert_limb (struct speed_params *);
146 double speed_malloc_free (struct speed_params *);
147 double speed_malloc_realloc_free (struct speed_params *);
148 double speed_memcpy (struct speed_params *);
149 double speed_binvert_limb (struct speed_params *);
150 double speed_binvert_limb_mul1 (struct speed_params *);
151 double speed_binvert_limb_loop (struct speed_params *);
152 double speed_binvert_limb_cond (struct speed_params *);
153 double speed_binvert_limb_arith (struct speed_params *);
154 
155 double speed_mpf_init_clear (struct speed_params *);
156 
157 double speed_mpn_add_n (struct speed_params *);
158 double speed_mpn_add_1 (struct speed_params *);
159 double speed_mpn_add_1_inplace (struct speed_params *);
160 double speed_mpn_add_err1_n (struct speed_params *);
161 double speed_mpn_add_err2_n (struct speed_params *);
162 double speed_mpn_add_err3_n (struct speed_params *);
163 double speed_mpn_addlsh_n (struct speed_params *);
164 double speed_mpn_addlsh1_n (struct speed_params *);
165 double speed_mpn_addlsh2_n (struct speed_params *);
166 double speed_mpn_addlsh_n_ip1 (struct speed_params *);
167 double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
168 double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
169 double speed_mpn_addlsh_n_ip2 (struct speed_params *);
170 double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
171 double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
172 double speed_mpn_add_n_sub_n (struct speed_params *);
173 double speed_mpn_and_n (struct speed_params *);
174 double speed_mpn_andn_n (struct speed_params *);
175 double speed_mpn_addmul_1 (struct speed_params *);
176 double speed_mpn_addmul_2 (struct speed_params *);
177 double speed_mpn_addmul_3 (struct speed_params *);
178 double speed_mpn_addmul_4 (struct speed_params *);
179 double speed_mpn_addmul_5 (struct speed_params *);
180 double speed_mpn_addmul_6 (struct speed_params *);
181 double speed_mpn_addmul_7 (struct speed_params *);
182 double speed_mpn_addmul_8 (struct speed_params *);
183 double speed_mpn_cnd_add_n (struct speed_params *);
184 double speed_mpn_cnd_sub_n (struct speed_params *);
185 double speed_mpn_com (struct speed_params *);
186 double speed_mpn_neg (struct speed_params *);
187 double speed_mpn_copyd (struct speed_params *);
188 double speed_mpn_copyi (struct speed_params *);
189 double speed_MPN_COPY (struct speed_params *);
190 double speed_MPN_COPY_DECR (struct speed_params *);
191 double speed_MPN_COPY_INCR (struct speed_params *);
192 double speed_mpn_sec_tabselect (struct speed_params *);
193 double speed_mpn_divexact_1 (struct speed_params *);
194 double speed_mpn_divexact_by3 (struct speed_params *);
195 double speed_mpn_bdiv_q_1 (struct speed_params *);
196 double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
197 double speed_mpn_bdiv_dbm1c (struct speed_params *);
198 double speed_mpn_divrem_1 (struct speed_params *);
199 double speed_mpn_divrem_1f (struct speed_params *);
200 double speed_mpn_divrem_1c (struct speed_params *);
201 double speed_mpn_divrem_1cf (struct speed_params *);
202 double speed_mpn_divrem_1_div (struct speed_params *);
203 double speed_mpn_divrem_1f_div (struct speed_params *);
204 double speed_mpn_divrem_1_inv (struct speed_params *);
205 double speed_mpn_divrem_1f_inv (struct speed_params *);
206 double speed_mpn_divrem_2 (struct speed_params *);
207 double speed_mpn_divrem_2_div (struct speed_params *);
208 double speed_mpn_divrem_2_inv (struct speed_params *);
209 double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
210 double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
211 double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
212 double speed_mpn_div_qr_1 (struct speed_params *);
213 double speed_mpn_div_qr_2n (struct speed_params *);
214 double speed_mpn_div_qr_2u (struct speed_params *);
215 double speed_mpn_fib2_ui (struct speed_params *);
216 double speed_mpn_matrix22_mul (struct speed_params *);
217 double speed_mpn_hgcd (struct speed_params *);
218 double speed_mpn_hgcd_lehmer (struct speed_params *);
219 double speed_mpn_hgcd_appr (struct speed_params *);
220 double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
221 double speed_mpn_hgcd_reduce (struct speed_params *);
222 double speed_mpn_hgcd_reduce_1 (struct speed_params *);
223 double speed_mpn_hgcd_reduce_2 (struct speed_params *);
224 double speed_mpn_gcd (struct speed_params *);
225 double speed_mpn_gcd_1 (struct speed_params *);
226 double speed_mpn_gcd_1N (struct speed_params *);
227 double speed_mpn_gcdext (struct speed_params *);
228 double speed_mpn_gcdext_double (struct speed_params *);
229 double speed_mpn_gcdext_one_double (struct speed_params *);
230 double speed_mpn_gcdext_one_single (struct speed_params *);
231 double speed_mpn_gcdext_single (struct speed_params *);
232 double speed_mpn_get_str (struct speed_params *);
233 double speed_mpn_hamdist (struct speed_params *);
234 double speed_mpn_ior_n (struct speed_params *);
235 double speed_mpn_iorn_n (struct speed_params *);
236 double speed_mpn_jacobi_base (struct speed_params *);
237 double speed_mpn_jacobi_base_1 (struct speed_params *);
238 double speed_mpn_jacobi_base_2 (struct speed_params *);
239 double speed_mpn_jacobi_base_3 (struct speed_params *);
240 double speed_mpn_jacobi_base_4 (struct speed_params *);
241 double speed_mpn_lshift (struct speed_params *);
242 double speed_mpn_lshiftc (struct speed_params *);
243 double speed_mpn_mod_1 (struct speed_params *);
244 double speed_mpn_mod_1c (struct speed_params *);
245 double speed_mpn_mod_1_div (struct speed_params *);
246 double speed_mpn_mod_1_inv (struct speed_params *);
247 double speed_mpn_mod_1_1 (struct speed_params *);
248 double speed_mpn_mod_1_1_1 (struct speed_params *);
249 double speed_mpn_mod_1_1_2 (struct speed_params *);
250 double speed_mpn_mod_1_2 (struct speed_params *);
251 double speed_mpn_mod_1_3 (struct speed_params *);
252 double speed_mpn_mod_1_4 (struct speed_params *);
253 double speed_mpn_mod_34lsub1 (struct speed_params *);
254 double speed_mpn_modexact_1_odd (struct speed_params *);
255 double speed_mpn_modexact_1c_odd (struct speed_params *);
256 double speed_mpn_mul_1 (struct speed_params *);
257 double speed_mpn_mul_1_inplace (struct speed_params *);
258 double speed_mpn_mul_2 (struct speed_params *);
259 double speed_mpn_mul_3 (struct speed_params *);
260 double speed_mpn_mul_4 (struct speed_params *);
261 double speed_mpn_mul_5 (struct speed_params *);
262 double speed_mpn_mul_6 (struct speed_params *);
263 double speed_mpn_mul (struct speed_params *);
264 double speed_mpn_mul_basecase (struct speed_params *);
265 double speed_mpn_mulmid (struct speed_params *);
266 double speed_mpn_mulmid_basecase (struct speed_params *);
267 double speed_mpn_mul_fft (struct speed_params *);
268 double speed_mpn_mul_fft_sqr (struct speed_params *);
269 double speed_mpn_fft_mul (struct speed_params *);
270 double speed_mpn_fft_sqr (struct speed_params *);
271 #if WANT_OLD_FFT_FULL
272 double speed_mpn_mul_fft_full (struct speed_params *);
273 double speed_mpn_mul_fft_full_sqr (struct speed_params *);
274 #endif
275 double speed_mpn_nussbaumer_mul (struct speed_params *);
276 double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
277 double speed_mpn_mul_n (struct speed_params *);
278 double speed_mpn_mul_n_sqr (struct speed_params *);
279 double speed_mpn_mulmid_n (struct speed_params *);
280 double speed_mpn_sqrlo (struct speed_params *);
281 double speed_mpn_sqrlo_basecase (struct speed_params *);
282 double speed_mpn_mullo_n (struct speed_params *);
283 double speed_mpn_mullo_basecase (struct speed_params *);
284 double speed_mpn_nand_n (struct speed_params *);
285 double speed_mpn_nior_n (struct speed_params *);
286 double speed_mpn_popcount (struct speed_params *);
287 double speed_mpn_preinv_divrem_1 (struct speed_params *);
288 double speed_mpn_preinv_divrem_1f (struct speed_params *);
289 double speed_mpn_preinv_mod_1 (struct speed_params *);
290 double speed_mpn_sbpi1_div_qr (struct speed_params *);
291 double speed_mpn_dcpi1_div_qr (struct speed_params *);
292 double speed_mpn_sbpi1_divappr_q (struct speed_params *);
293 double speed_mpn_dcpi1_divappr_q (struct speed_params *);
294 double speed_mpn_mu_div_qr (struct speed_params *);
295 double speed_mpn_mu_divappr_q (struct speed_params *);
296 double speed_mpn_mupi_div_qr (struct speed_params *);
297 double speed_mpn_mu_div_q (struct speed_params *);
298 double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
299 double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
300 double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
301 double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
302 double speed_mpn_mu_bdiv_q (struct speed_params *);
303 double speed_mpn_mu_bdiv_qr (struct speed_params *);
304 double speed_mpn_broot (struct speed_params *);
305 double speed_mpn_broot_invm1 (struct speed_params *);
306 double speed_mpn_brootinv (struct speed_params *);
307 double speed_mpn_invert (struct speed_params *);
308 double speed_mpn_invertappr (struct speed_params *);
309 double speed_mpn_ni_invertappr (struct speed_params *);
310 double speed_mpn_sec_invert (struct speed_params *s);
311 double speed_mpn_binvert (struct speed_params *);
312 double speed_mpn_redc_1 (struct speed_params *);
313 double speed_mpn_redc_2 (struct speed_params *);
314 double speed_mpn_redc_n (struct speed_params *);
315 double speed_mpn_rsblsh_n (struct speed_params *);
316 double speed_mpn_rsblsh1_n (struct speed_params *);
317 double speed_mpn_rsblsh2_n (struct speed_params *);
318 double speed_mpn_rsh1add_n (struct speed_params *);
319 double speed_mpn_rsh1sub_n (struct speed_params *);
320 double speed_mpn_rshift (struct speed_params *);
321 double speed_mpn_sb_divrem_m3 (struct speed_params *);
322 double speed_mpn_sb_divrem_m3_div (struct speed_params *);
323 double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
324 double speed_mpn_set_str (struct speed_params *);
325 double speed_mpn_bc_set_str (struct speed_params *);
326 double speed_mpn_dc_set_str (struct speed_params *);
327 double speed_mpn_set_str_pre (struct speed_params *);
328 double speed_mpn_sqr_basecase (struct speed_params *);
329 double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
330 double speed_mpn_sqr_diagonal (struct speed_params *);
331 double speed_mpn_sqr (struct speed_params *);
332 double speed_mpn_sqrtrem (struct speed_params *);
333 double speed_mpn_rootrem (struct speed_params *);
334 double speed_mpn_sqrt (struct speed_params *);
335 double speed_mpn_root (struct speed_params *);
336 double speed_mpn_sub_n (struct speed_params *);
337 double speed_mpn_sub_1 (struct speed_params *);
338 double speed_mpn_sub_1_inplace (struct speed_params *);
339 double speed_mpn_sub_err1_n (struct speed_params *);
340 double speed_mpn_sub_err2_n (struct speed_params *);
341 double speed_mpn_sub_err3_n (struct speed_params *);
342 double speed_mpn_sublsh_n (struct speed_params *);
343 double speed_mpn_sublsh1_n (struct speed_params *);
344 double speed_mpn_sublsh2_n (struct speed_params *);
345 double speed_mpn_sublsh_n_ip1 (struct speed_params *);
346 double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
347 double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
348 double speed_mpn_submul_1 (struct speed_params *);
349 double speed_mpn_toom2_sqr (struct speed_params *);
350 double speed_mpn_toom3_sqr (struct speed_params *);
351 double speed_mpn_toom4_sqr (struct speed_params *);
352 double speed_mpn_toom6_sqr (struct speed_params *);
353 double speed_mpn_toom8_sqr (struct speed_params *);
354 double speed_mpn_toom22_mul (struct speed_params *);
355 double speed_mpn_toom33_mul (struct speed_params *);
356 double speed_mpn_toom44_mul (struct speed_params *);
357 double speed_mpn_toom6h_mul (struct speed_params *);
358 double speed_mpn_toom8h_mul (struct speed_params *);
359 double speed_mpn_toom32_mul (struct speed_params *);
360 double speed_mpn_toom42_mul (struct speed_params *);
361 double speed_mpn_toom43_mul (struct speed_params *);
362 double speed_mpn_toom63_mul (struct speed_params *);
363 double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
364 double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
365 double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
366 double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
367 double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
368 double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
369 double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
370 double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
371 double speed_mpn_toom42_mulmid (struct speed_params *);
372 double speed_mpn_mulmod_bnm1 (struct speed_params *);
373 double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
374 double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
375 double speed_mpn_sqrmod_bnm1 (struct speed_params *);
376 double speed_mpn_udiv_qrnnd (struct speed_params *);
377 double speed_mpn_udiv_qrnnd_r (struct speed_params *);
378 double speed_mpn_umul_ppmm (struct speed_params *);
379 double speed_mpn_umul_ppmm_r (struct speed_params *);
380 double speed_mpn_xnor_n (struct speed_params *);
381 double speed_mpn_xor_n (struct speed_params *);
382 double speed_MPN_ZERO (struct speed_params *);
383 
384 double speed_mpq_init_clear (struct speed_params *);
385 
386 double speed_mpz_add (struct speed_params *);
387 double speed_mpz_bin_uiui (struct speed_params *);
388 double speed_mpz_bin_ui (struct speed_params *);
389 double speed_mpz_fac_ui (struct speed_params *);
390 double speed_mpz_2fac_ui (struct speed_params *);
391 double speed_mpz_fib_ui (struct speed_params *);
392 double speed_mpz_fib2_ui (struct speed_params *);
393 double speed_mpz_init_clear (struct speed_params *);
394 double speed_mpz_init_realloc_clear (struct speed_params *);
395 double speed_mpz_jacobi (struct speed_params *);
396 double speed_mpz_lucnum_ui (struct speed_params *);
397 double speed_mpz_lucnum2_ui (struct speed_params *);
398 double speed_mpz_mod (struct speed_params *);
399 double speed_mpz_powm (struct speed_params *);
400 double speed_mpz_powm_mod (struct speed_params *);
401 double speed_mpz_powm_redc (struct speed_params *);
402 double speed_mpz_powm_sec (struct speed_params *);
403 double speed_mpz_powm_ui (struct speed_params *);
404 double speed_mpz_urandomb (struct speed_params *);
405 
406 double speed_gmp_randseed (struct speed_params *);
407 double speed_gmp_randseed_ui (struct speed_params *);
408 
409 double speed_noop (struct speed_params *);
410 double speed_noop_wxs (struct speed_params *);
411 double speed_noop_wxys (struct speed_params *);
412 
413 double speed_operator_div (struct speed_params *);
414 double speed_operator_mod (struct speed_params *);
415 
416 double speed_udiv_qrnnd (struct speed_params *);
417 double speed_udiv_qrnnd_preinv1 (struct speed_params *);
418 double speed_udiv_qrnnd_preinv2 (struct speed_params *);
419 double speed_udiv_qrnnd_preinv3 (struct speed_params *);
420 double speed_udiv_qrnnd_c (struct speed_params *);
421 double speed_umul_ppmm (struct speed_params *);
422 
423 /* Prototypes for other routines */
424 
425 #if defined (__cplusplus)
426 extern "C" {
427 #endif
428 
429 /* low 32-bits in p[0], high 32-bits in p[1] */
430 void speed_cyclecounter (unsigned p[2]);
431 
432 #if defined (__cplusplus)
433 }
434 #endif
435 
436 void mftb_function (unsigned p[2]);
437 
438 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
439 int gettimeofday_microseconds_p (void);
440 int getrusage_microseconds_p (void);
441 int cycles_works_p (void);
442 long clk_tck (void);
443 double freq_measure (const char *, double (*)(void));
444 
445 int double_cmp_ptr (const double *, const double *);
446 void pentium_wbinvd (void);
447 typedef int (*qsort_function_t) (const void *, const void *);
448 
449 void noop (void);
450 void noop_1 (mp_limb_t);
451 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
452 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
453 void mpn_cache_fill (mp_srcptr, mp_size_t);
454 void mpn_cache_fill_dummy (mp_limb_t);
455 void speed_cache_fill (struct speed_params *);
456 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
457 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
458 
459 extern int  speed_option_addrs;
460 extern int  speed_option_verbose;
461 extern int  speed_option_cycles_broken;
462 void speed_option_set (const char *);
463 
464 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
465 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
466 
467 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
468 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
469 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
470 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
471 
472 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
473 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
474 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
475 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
476 
477 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
478 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
479 
480 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
481 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, const mp_limb_t [4]);
482 
483 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
484 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
485 
486 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
487 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
488 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
489 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
490 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
491 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
492 
493 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
494 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
495 
496 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
497 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
498 
499 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
500 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
501 
502 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
503 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
504 
505 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
506 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
507 
508 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
509 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
510 
511 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
512 
513 
/* Body of a function measuring the cycle counter period, in seconds per
   cycle, against a clock read with "get" and split into seconds and
   microseconds by "sec" and "usec".  "getc" reads the cycle counter, and
   "name" only labels the verbose output.

   "get" is called repeatedly until it ticks over, just in case on a fast
   processor it takes less than a microsecond, though this is probably
   unlikely if it's a system call.

   The cycle counter is sampled on the same side of the "get" for the start
   and end measurements.  It doesn't matter how long it takes from the "get"
   sample to the cycles sample, since that period will cancel out in the
   difference calculation (assuming it's the same each time).

   Letting the test run for more than a process time slice is probably only
   going to reduce accuracy, especially for getrusage when the cycle counter
   is real time, or for gettimeofday if the cycle counter is in fact process
   time.  Use CLK_TCK/2 as a reasonable stop.

   It'd be desirable to be quite accurate here.  The default speed_precision
   for a cycle counter is 10000 cycles, so to mix that with getrusage or
   gettimeofday the frequency should be at least that accurate.  But running
   measurements for 10000 microseconds (or more) is too long.  Be satisfied
   with just a half clock tick (5000 microseconds usually).  */

#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
  do {									\
    type      st1, st, et1, et;						\
    unsigned  sc[2], ec[2];						\
    long      dt, half_tick;						\
    double    dc, cyc;							\
									\
    /* half a clock tick, in microseconds */				\
    half_tick = (1000000L / clk_tck()) / 2;				\
									\
    /* start: wait for the clock to change, then read the cycles */	\
    get (st1);								\
    do {								\
      get (st);								\
    } while (usec(st) == usec(st1) && sec(st) == sec(st1));		\
									\
    getc (sc);								\
									\
    /* end: same sampling, repeated until half a tick has elapsed */	\
    do									\
      {									\
	get (et1);							\
	do {								\
	  get (et);							\
	} while (usec(et) == usec(et1) && sec(et) == sec(et1));		\
									\
	getc (ec);							\
									\
	dc = speed_cyclecounter_diff (ec, sc);				\
									\
	/* allow secs to cancel before multiplying */			\
	dt = sec(et) - sec(st);						\
	dt = dt * 1000000L + (usec(et) - usec(st));			\
      }									\
    while (dt < half_tick);						\
									\
    /* elapsed seconds per elapsed cycle */				\
    cyc = dt * 1e-6 / dc;						\
									\
    if (speed_option_verbose >= 2)					\
      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
	      name, dc, dt, cyc);					\
									\
    return cyc;								\
									\
  } while (0)
578 
579 
580 
581 
582 /* The measuring routines use these big macros to save duplication for
583    similar forms.  They also get used for some automatically generated
584    measuring of new implementations of functions.
585 
586    Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
587    function pointer is considered undesirable since it's not the way a
588    normal application will be calling, and some processors might do
589    different things with an indirect call, like not branch predicting, or
590    doing a full pipe flush.  At least some of the "functions" measured are
591    actually macros too.
592 
593    The net effect is to bloat the object code, possibly in a big way, but
594    only what's being measured is being run, so that doesn't matter.
595 
596    The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
597    ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
598    function pointer doesn't work in gcc 3.2.  Using an actual non-pure
599    function pointer variable works, but stands a real risk of a
600    non-optimizing compiler generating unnecessary overheads in the call.
601    Currently the best idea is not to use those attributes for a timing
602    program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
603    gmp-impl.h to omit them from routines there.  */
604 
/* Return -1.0 from the enclosing measuring routine when cond is false.
   The do/while(0) wrapper makes the expansion a single statement, so
   "SPEED_RESTRICT_COND (x);" is safe inside an unbraced if/else.  Every use
   must be followed by a semicolon.  */
#define SPEED_RESTRICT_COND(cond)		\
  do {						\
    if (!(cond))				\
      return -1.0;				\
  } while (0)
606 
607 /* For mpn_copy or similar.  Expands to a function body using "s"; "call" may use wp. */
608 #define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
609   {									\
610     mp_ptr    wp;							\
611     unsigned  i;							\
612     double    t;							\
613     TMP_DECL;								\
614     /* a negative size is rejected by returning -1.0 */		\
615     SPEED_RESTRICT_COND (s->size >= 0);					\
616     /* wp is a temporary destination at limb alignment s->align_wp */	\
617     TMP_MARK;								\
618     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
619     /* pass the operands on, presumably for the cache priming */	\
620     speed_operand_src (s, s->xp, s->size);				\
621     speed_operand_dst (s, wp, s->size);					\
622     speed_cache_fill (s);						\
623     /* timed loop: call runs before the test, so s->reps must be >= 1 */ \
624     speed_starttime ();							\
625     i = s->reps;							\
626     do									\
627       call;								\
628     while (--i != 0);							\
629     t = speed_endtime ();						\
630 									\
631     TMP_FREE;								\
632     return t;								\
633   }
634 #define SPEED_ROUTINE_MPN_COPY(function)				\
635   SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))  /* wp: local of _CALL */
636 
/* For mpn_sec_tabselect or similar.  Expands to a function body using "s".

   "function" is called as function (wp, xp, s->size, s->r, s->r / 2), with
   xp a temporary block of s->size * s->r limbs and wp one of s->size limbs.
   s->r == 0 is replaced by s->size (and stored back into s->r).  The
   measured time is divided by s->r before being returned.

   Returns -1.0 when s->size is negative, or when s->r is still zero after
   the default is applied (s->size == 0 with s->r == 0), since the final
   division would then be by zero.

   NOTE(review): xp is allocated here but never filled in, so "function"
   reads whatever TMP_ALLOC returned -- confirm that is intended.  */
#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
  {									\
    mp_ptr    xp, wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    if (s->r == 0)							\
      s->r = s->size;	/* default to a quadratic shape */		\
									\
    /* nothing to divide the time by if s->r is still zero */		\
    SPEED_RESTRICT_COND (s->r != 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, s->size * s->r, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    speed_operand_src (s, xp, s->size * s->r);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    /* timed loop: runs before the test, so s->reps must be >= 1 */	\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (wp, xp, s->size, s->r, (s->r) / 2);			\
    while (--i != 0);							\
    t = speed_endtime () / s->r;					\
									\
    TMP_FREE;								\
    return t;								\
  }
667 
668 
669 #define SPEED_ROUTINE_MPN_COPYC(function)				\
670   {	/* function body using "s"; function gets a 4th argument of 0 */ \
671     mp_ptr    wp;							\
672     unsigned  i;							\
673     double    t;							\
674     TMP_DECL;								\
675     /* a negative size is rejected by returning -1.0 */		\
676     SPEED_RESTRICT_COND (s->size >= 0);					\
677     /* wp is a temporary destination at limb alignment s->align_wp */	\
678     TMP_MARK;								\
679     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
680     /* pass the operands on, presumably for the cache priming */	\
681     speed_operand_src (s, s->xp, s->size);				\
682     speed_operand_dst (s, wp, s->size);					\
683     speed_cache_fill (s);						\
684     /* timed loop: runs before the test, so s->reps must be >= 1 */	\
685     speed_starttime ();							\
686     i = s->reps;							\
687     do									\
688       function (wp, s->xp, s->size, 0);					\
689     while (--i != 0);							\
690     t = speed_endtime ();						\
691 									\
692     TMP_FREE;								\
693     return t;								\
694   }
695 
696 /* s->size is still in limbs, and it's limbs which are copied, but
697    "function" takes a size in bytes not limbs, so GMP_LIMB_BYTES scales it.  */
698 #define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
699   {									\
700     mp_ptr    wp;							\
701     unsigned  i;							\
702     double    t;							\
703     TMP_DECL;								\
704     /* a negative size is rejected by returning -1.0 */		\
705     SPEED_RESTRICT_COND (s->size >= 0);					\
706     /* wp is a temporary destination at limb alignment s->align_wp */	\
707     TMP_MARK;								\
708     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
709     /* pass the operands on, presumably for the cache priming */	\
710     speed_operand_src (s, s->xp, s->size);				\
711     speed_operand_dst (s, wp, s->size);					\
712     speed_cache_fill (s);						\
713     /* timed loop: runs before the test, so s->reps must be >= 1 */	\
714     speed_starttime ();							\
715     i = s->reps;							\
716     do									\
717       function (wp, s->xp, s->size * GMP_LIMB_BYTES);		\
718     while (--i != 0);							\
719     t = speed_endtime ();						\
720 									\
721     TMP_FREE;								\
722     return t;								\
723   }
724 
725 
726 /* For mpn_add_n, mpn_sub_n, or similar. */
/* "call" is an expression evaluated s->reps times; it can use the locals wp
   (temporary destination of s->size limbs), xp and yp (sources) as well as
   s.  s->r selects operand overlap:
     0  xp = s->xp, yp = s->yp (no overlap)
     1  xp = wp
     2  yp = wp
     3  xp = wp and yp = wp
     4  yp = xp
   Any other s->r makes the routine return -1.0 without timing anything.
   When a source aliases wp, wp is first filled from s->xp so that the
   sources hold defined data.  */
727 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
728   {									\
729     mp_ptr     wp;							\
730     mp_ptr     xp, yp;							\
731     unsigned   i;							\
732     double     t;							\
733     TMP_DECL;								\
734 									\
735     SPEED_RESTRICT_COND (s->size >= 1);					\
736 									\
737     TMP_MARK;								\
738     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
739 									\
740     xp = s->xp;								\
741     yp = s->yp;								\
742 									\
743     if (s->r == 0)	;						\
744     else if (s->r == 1) { xp = wp;	    }				\
745     else if (s->r == 2) {	   yp = wp; }				\
746     else if (s->r == 3) { xp = wp; yp = wp; }				\
747     else if (s->r == 4) {     yp = xp;	    }				\
748     else		{						\
749       TMP_FREE;								\
750       return -1.0;							\
751     }									\
752 									\
753     /* initialize wp if operand overlap */				\
754     if (xp == wp || yp == wp)						\
755       MPN_COPY (wp, s->xp, s->size);					\
756 									\
757     speed_operand_src (s, xp, s->size);					\
758     speed_operand_src (s, yp, s->size);					\
759     speed_operand_dst (s, wp, s->size);					\
760     speed_cache_fill (s);						\
761 									\
762     speed_starttime ();							\
763     i = s->reps;							\
764     do									\
765       call;								\
766     while (--i != 0);							\
767     t = speed_endtime ();						\
768 									\
769     TMP_FREE;								\
770     return t;								\
771   }
772 
773 
774 /* For mpn_aors_errK_n, where 1 <= K <= 3. */
/* "call" is evaluated s->reps times and can use the locals wp (temporary
   destination, s->size limbs), xp and yp (sources), zp[0..K-1] (temporaries
   of s->size limbs each, registered as sources) and ep (a local array of
   2*K limbs).  The zp blocks are allocated but not filled in by this macro.
   s->r selects xp/yp overlap with wp exactly as in
   SPEED_ROUTINE_MPN_BINARY_N_CALL: 0 none, 1 xp=wp, 2 yp=wp, 3 both,
   4 yp=xp, anything else returns -1.0.  */
775 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
776   {									\
777     mp_ptr     wp;							\
778     mp_ptr     xp, yp;							\
779     mp_ptr     zp[K];							\
780     mp_limb_t  ep[2*K];							\
781     unsigned   i;							\
782     double     t;							\
783     TMP_DECL;								\
784 									\
785     SPEED_RESTRICT_COND (s->size >= 1);					\
786 									\
787     TMP_MARK;								\
788     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
789 									\
790     /* (don't have a mechanism to specify zp alignments) */		\
791     for (i = 0; i < K; i++)						\
792       SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
793 									\
794     xp = s->xp;								\
795     yp = s->yp;								\
796 									\
797     if (s->r == 0)	;						\
798     else if (s->r == 1) { xp = wp;	    }				\
799     else if (s->r == 2) {	   yp = wp; }				\
800     else if (s->r == 3) { xp = wp; yp = wp; }				\
801     else if (s->r == 4) {     yp = xp;	    }				\
802     else		{						\
803       TMP_FREE;								\
804       return -1.0;							\
805     }									\
806 									\
807     /* initialize wp if operand overlap */				\
808     if (xp == wp || yp == wp)						\
809       MPN_COPY (wp, s->xp, s->size);					\
810 									\
811     speed_operand_src (s, xp, s->size);					\
812     speed_operand_src (s, yp, s->size);					\
813     for (i = 0; i < K; i++)						\
814       speed_operand_src (s, zp[i], s->size);				\
815     speed_operand_dst (s, wp, s->size);					\
816     speed_cache_fill (s);						\
817 									\
818     speed_starttime ();							\
819     i = s->reps;							\
820     do									\
821       call;								\
822     while (--i != 0);							\
823     t = speed_endtime ();						\
824 									\
825     TMP_FREE;								\
826     return t;								\
827   }
828 
/* K = 1, 2 and 3 instances of SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL.
   "function" is called through a pointer dereference with wp, xp, yp, ep,
   then K of the zp[] blocks, then s->size and a final argument of 0.  */
829 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
830   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
831 
832 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
833   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
834 
835 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
836   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
837 
838 
839 /* For routines with two destinations and two sources of s->size limbs
   each (presumably mpn_add_n_sub_n or similar, going by the macro name).
   "call" is evaluated s->reps times and can use the locals ap and sp
   (temporary destinations), xp and yp (sources) and s.  s->r is a bit mask
   selecting source/destination overlap:
     bit 0 (1)  xp = ap
     bit 1 (2)  yp = ap
     bit 2 (4)  xp = sp
     bit 3 (8)  yp = sp
   The bits are applied in that order, so a later bit overrides an earlier
   one for the same source.  Having both sources on the same destination,
   ie. (s->r & 3) == 3 or (s->r & 12) == 12, returns -1.0.  A destination
   which is also used as a source is first filled from s->xp.  */
840 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
841   {									\
842     mp_ptr     ap, sp;							\
843     mp_ptr     xp, yp;							\
844     unsigned   i;							\
845     double     t;							\
846     TMP_DECL;								\
847 									\
848     SPEED_RESTRICT_COND (s->size >= 1);					\
849 									\
850     TMP_MARK;								\
851     SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
852     SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
853 									\
854     xp = s->xp;								\
855     yp = s->yp;								\
856 									\
857     if ((s->r & 1) != 0) { xp = ap; }					\
858     if ((s->r & 2) != 0) { yp = ap; }					\
859     if ((s->r & 4) != 0) { xp = sp; }					\
860     if ((s->r & 8) != 0) { yp = sp; }					\
861     if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
862       {									\
863 	TMP_FREE;							\
864 	return -1.0;							\
865       }									\
866 									\
867     /* initialize ap if operand overlap */				\
868     if (xp == ap || yp == ap)						\
869       MPN_COPY (ap, s->xp, s->size);					\
870     /* initialize sp if operand overlap */				\
871     if (xp == sp || yp == sp)						\
872       MPN_COPY (sp, s->xp, s->size);					\
873 									\
874     speed_operand_src (s, xp, s->size);					\
875     speed_operand_src (s, yp, s->size);					\
876     speed_operand_dst (s, ap, s->size);					\
877     speed_operand_dst (s, sp, s->size);					\
878     speed_cache_fill (s);						\
879 									\
880     speed_starttime ();							\
881     i = s->reps;							\
882     do									\
883       call;								\
884     while (--i != 0);							\
885     t = speed_endtime ();						\
886 									\
887     TMP_FREE;								\
888     return t;								\
889   }
890 
/* SPEED_ROUTINE_MPN_BINARY_N_CALL with "function" called through a pointer
   dereference as (wp, xp, yp, s->size); the NC form appends an extra
   argument of 0.  */
891 #define SPEED_ROUTINE_MPN_BINARY_N(function)				\
892    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
893 
894 #define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
895    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
896 
897 
898 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
/* "call" is evaluated s->reps times and can use the local wp (temporary of
   s->size limbs, registered as the destination) together with s; s->xp is
   registered as the source.  wp is not initialized by this macro.  */
899 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
900   {									\
901     mp_ptr    wp;							\
902     unsigned  i;							\
903     double    t;							\
904     TMP_DECL;								\
905 									\
906     SPEED_RESTRICT_COND (s->size >= 1);					\
907 									\
908     TMP_MARK;								\
909     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
910 									\
911     speed_operand_src (s, s->xp, s->size);				\
912     speed_operand_dst (s, wp, s->size);					\
913     speed_cache_fill (s);						\
914 									\
915     speed_starttime ();							\
916     i = s->reps;							\
917     do									\
918       call;								\
919     while (--i != 0);							\
920     t = speed_endtime ();						\
921 									\
922     TMP_FREE;								\
923     return t;								\
924   }
925 
/* SPEED_ROUTINE_MPN_UNARY_1_CALL with "function" called through a pointer
   dereference as (wp, s->xp, s->size, s->r); the 1C form appends an extra
   argument of 0.  */
926 #define SPEED_ROUTINE_MPN_UNARY_1(function)				\
927   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
928 
929 #define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
930   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
931 
932 /* FIXME: wp is uninitialized here, should start it off from xp */
/* In-place variant: wp is passed as both destination and source, so the
   timed routine reads whatever the freshly allocated temporary contains.  */
933 #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
934   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
935 
/* Both of these make the same call as SPEED_ROUTINE_MPN_UNARY_1, namely
   (*function) (wp, s->xp, s->size, s->r), with s->r as the last argument
   (presumably the divisor, going by the macro names).  */
936 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
937   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
938 
939 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
940     SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
941 
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL in an outer block which, outside
   the timed region, sets shift to the count of trailing zeros of s->r and
   dinv to binvert_limb of s->r >> shift.  "call" can use shift and dinv in
   addition to wp and s.  Requires s->size > 0 and s->r != 0.  */
942 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
943   {									\
944     unsigned   shift;							\
945     mp_limb_t  dinv;							\
946 									\
947     SPEED_RESTRICT_COND (s->size > 0);					\
948     SPEED_RESTRICT_COND (s->r != 0);					\
949 									\
950     count_trailing_zeros (shift, s->r);					\
951     binvert_limb (dinv, s->r >> shift);					\
952 									\
953     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
954   }
/* Times (*function) (wp, s->xp, s->size, s->r, dinv, shift), with dinv and
   shift precomputed by SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL.  Note s->r is
   passed as is, not shifted right by shift.  */
955 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
956   SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
957   ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
958 
/* Times (*function) (wp, s->xp, s->size, s->r, 0); the same call shape as
   SPEED_ROUTINE_MPN_UNARY_1C.  */
959 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
960   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
961 
/* divrem_1 style calls, built on SPEED_ROUTINE_MPN_UNARY_1_CALL.  The
   second argument and the size following s->xp are (0, s->size) for the
   plain and C forms, and (s->size, 0) for the F and CF forms (compare the
   "fraction part" note on SPEED_ROUTINE_MPN_PREINV_DIVREM_1F).  The C and
   CF forms append an extra argument of 0.  */
962 #define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
963   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
964 
965 #define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
966   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
967 
968 #define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
969   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
970 
971 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
972   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
973 
974 
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL in an outer block which, outside
   the timed region, sets shift to the count of leading zeros of s->r and
   dinv to invert_limb of s->r << shift.  "call" can use shift and dinv in
   addition to wp and s.  Requires s->r != 0.
   The closing brace must not be followed by a line continuation, otherwise
   whatever line comes next is spliced into this macro.  */
975 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
976   {									\
977     unsigned   shift;							\
978     mp_limb_t  dinv;							\
979 									\
980     SPEED_RESTRICT_COND (s->size >= 0);					\
981     SPEED_RESTRICT_COND (s->r != 0);					\
982 									\
983     count_leading_zeros (shift, s->r);					\
984     invert_limb (dinv, s->r << shift);					\
985 									\
986     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
987   }
988 
/* Times (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift), with dinv
   and shift precomputed by SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL.  */
989 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
990   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
991   ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
992 
993 /* s->size limbs worth of fraction part, ie. s->size is passed as the second
   argument and 0 as the size following s->xp.  */
994 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
995   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
996   ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
997 
998 
999 /* s->r is duplicated to form the multiplier, defaulting to
1000    MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
1001    least it provides some control.  */
/* Times "function (wp, s->xp, s->size, yp)" where yp is a local array of N
   limbs, each set to s->r (or MP_BASES_BIG_BASE_10 when s->r is 0), and wp
   is a temporary of s->size + N-1 limbs.  Requires s->size >= N.  */
1002 #define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
1003   {									\
1004     mp_ptr     wp;							\
1005     mp_size_t  wn;							\
1006     unsigned   i;							\
1007     double     t;							\
1008     mp_limb_t  yp[N];							\
1009     TMP_DECL;								\
1010 									\
1011     SPEED_RESTRICT_COND (s->size >= N);					\
1012 									\
1013     TMP_MARK;								\
1014     wn = s->size + N-1;							\
1015     SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
1016     for (i = 0; i < N; i++)						\
1017       yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
1018 									\
1019     speed_operand_src (s, s->xp, s->size);				\
1020     speed_operand_src (s, yp, (mp_size_t) N);				\
1021     speed_operand_dst (s, wp, wn);					\
1022     speed_cache_fill (s);						\
1023 									\
1024     speed_starttime ();							\
1025     i = s->reps;							\
1026     do									\
1027       function (wp, s->xp, s->size, yp);				\
1028     while (--i != 0);							\
1029     t = speed_endtime ();						\
1030 									\
1031     TMP_FREE;								\
1032     return t;								\
1033   }
1034 
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N, for N from 2 to 8.  */
1035 #define SPEED_ROUTINE_MPN_UNARY_2(function)				\
1036   SPEED_ROUTINE_MPN_UNARY_N (function, 2)
1037 #define SPEED_ROUTINE_MPN_UNARY_3(function)				\
1038   SPEED_ROUTINE_MPN_UNARY_N (function, 3)
1039 #define SPEED_ROUTINE_MPN_UNARY_4(function)				\
1040   SPEED_ROUTINE_MPN_UNARY_N (function, 4)
1041 #define SPEED_ROUTINE_MPN_UNARY_5(function)				\
1042   SPEED_ROUTINE_MPN_UNARY_N (function, 5)
1043 #define SPEED_ROUTINE_MPN_UNARY_6(function)				\
1044   SPEED_ROUTINE_MPN_UNARY_N (function, 6)
1045 #define SPEED_ROUTINE_MPN_UNARY_7(function)				\
1046   SPEED_ROUTINE_MPN_UNARY_N (function, 7)
1047 #define SPEED_ROUTINE_MPN_UNARY_8(function)				\
1048   SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1049 
1050 
1051 /* For mpn_mul, mpn_mul_basecase, xsize=s->size, ysize=size1.
   size1 is s->r, or s->size when s->r is 0; if the resulting size1 is
   negative it is replaced by -size1 - s->size.  The routine is restricted
   to 1 <= size1 <= s->size.  Times
   "function (wp, s->xp, s->size, s->yp, size1)" with wp a temporary of
   size1 + s->size limbs.  */
1052 #define SPEED_ROUTINE_MPN_MUL(function)					\
1053   {									\
1054     mp_ptr    wp;							\
1055     mp_size_t size1;							\
1056     unsigned  i;							\
1057     double    t;							\
1058     TMP_DECL;								\
1059 									\
1060     size1 = (s->r == 0 ? s->size : s->r);				\
1061     if (size1 < 0) size1 = -size1 - s->size;				\
1062 									\
1063     SPEED_RESTRICT_COND (size1 >= 1);					\
1064     SPEED_RESTRICT_COND (s->size >= size1);				\
1065 									\
1066     TMP_MARK;								\
1067     SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
1068 									\
1069     speed_operand_src (s, s->xp, s->size);				\
1070     speed_operand_src (s, s->yp, size1);				\
1071     speed_operand_dst (s, wp, size1 + s->size);				\
1072     speed_cache_fill (s);						\
1073 									\
1074     speed_starttime ();							\
1075     i = s->reps;							\
1076     do									\
1077       function (wp, s->xp, s->size, s->yp, size1);			\
1078     while (--i != 0);							\
1079     t = speed_endtime ();						\
1080 									\
1081     TMP_FREE;								\
1082     return t;								\
1083   }
1084 
1085 
/* For multiplies of two s->size limb operands giving a 2*s->size limb
   result.  "call" is evaluated s->reps times and can use the local wp
   (temporary of 2*s->size limbs) and s; s->xp and s->yp are registered as
   the sources.  Requires s->size >= 1.  */
1086 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
1087   {									\
1088     mp_ptr    wp;							\
1089     unsigned  i;							\
1090     double    t;							\
1091     TMP_DECL;								\
1092 									\
1093     SPEED_RESTRICT_COND (s->size >= 1);					\
1094 									\
1095     TMP_MARK;								\
1096     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1097 									\
1098     speed_operand_src (s, s->xp, s->size);				\
1099     speed_operand_src (s, s->yp, s->size);				\
1100     speed_operand_dst (s, wp, 2*s->size);				\
1101     speed_cache_fill (s);						\
1102 									\
1103     speed_starttime ();							\
1104     i = s->reps;							\
1105     do									\
1106       call;								\
1107     while (--i != 0);							\
1108     t = speed_endtime ();						\
1109 									\
1110     TMP_FREE;								\
1111     return t;								\
1112   }
1113 
/* Times "function (wp, s->xp, s->yp, s->size)" via
   SPEED_ROUTINE_MPN_MUL_N_CALL.  The expansion is a complete compound
   statement, so no trailing semicolon is included (matching
   SPEED_ROUTINE_MPN_SQR and the other wrappers).  */
1114 #define SPEED_ROUTINE_MPN_MUL_N(function)				\
1115   SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
1116 
/* Like SPEED_ROUTINE_MPN_MUL_N_CALL, but the destination wp is only
   s->size limbs (presumably for a low-half product, going by the MULLO
   name).  "call" is evaluated s->reps times and can use wp and s.  */
1117 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
1118   {									\
1119     mp_ptr    wp;							\
1120     unsigned  i;							\
1121     double    t;							\
1122     TMP_DECL;								\
1123 									\
1124     SPEED_RESTRICT_COND (s->size >= 1);					\
1125 									\
1126     TMP_MARK;								\
1127     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1128 									\
1129     speed_operand_src (s, s->xp, s->size);				\
1130     speed_operand_src (s, s->yp, s->size);				\
1131     speed_operand_dst (s, wp, s->size);					\
1132     speed_cache_fill (s);						\
1133 									\
1134     speed_starttime ();							\
1135     i = s->reps;							\
1136     do									\
1137       call;								\
1138     while (--i != 0);							\
1139     t = speed_endtime ();						\
1140 									\
1141     TMP_FREE;								\
1142     return t;								\
1143   }
1144 
/* Times "function (wp, s->xp, s->yp, s->size)" via
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  The expansion is a complete compound
   statement, so no trailing semicolon is included.  */
1145 #define SPEED_ROUTINE_MPN_MULLO_N(function)				\
1146   SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1147 
/* Same call as SPEED_ROUTINE_MPN_MULLO_N:
   "function (wp, s->xp, s->yp, s->size)" via
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  The expansion is a complete compound
   statement, so no trailing semicolon is included.  */
1148 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
1149   SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1150 
/* Times "function (wp, s->xp, s->size)" with wp a temporary of s->size
   limbs, ie. a single source and a destination of the same size.  Requires
   s->size >= 1.  */
1151 #define SPEED_ROUTINE_MPN_SQRLO(function)				\
1152   {									\
1153     mp_ptr    wp;							\
1154     unsigned  i;							\
1155     double    t;							\
1156     TMP_DECL;								\
1157 									\
1158     SPEED_RESTRICT_COND (s->size >= 1);					\
1159 									\
1160     TMP_MARK;								\
1161     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1162 									\
1163     speed_operand_src (s, s->xp, s->size);				\
1164     speed_operand_dst (s, wp, s->size);					\
1165     speed_cache_fill (s);						\
1166 									\
1167     speed_starttime ();							\
1168     i = s->reps;							\
1169     do									\
1170       function (wp, s->xp, s->size);					\
1171     while (--i != 0);							\
1172     t = speed_endtime ();						\
1173 									\
1174     TMP_FREE;								\
1175     return t;								\
1176   }
1177 
1178 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
/* xsize is size1 = s->r, defaulting to 2*s->size - 1 when s->r is 0, and
   must be >= s->size.  The x operand is a temporary xp of size1 limbs
   allocated here (not s->xp) and not filled in by this macro; the y operand
   is s->yp.  The destination wp is size1 - s->size + 3 limbs.  Times
   "function (wp, xp, size1, s->yp, s->size)".  */
1179 #define SPEED_ROUTINE_MPN_MULMID(function)				\
1180   {									\
1181     mp_ptr    wp, xp;							\
1182     mp_size_t size1;							\
1183     unsigned  i;							\
1184     double    t;							\
1185     TMP_DECL;								\
1186 									\
1187     size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
1188 									\
1189     SPEED_RESTRICT_COND (s->size >= 1);					\
1190     SPEED_RESTRICT_COND (size1 >= s->size);				\
1191 									\
1192     TMP_MARK;								\
1193     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1194     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1195 									\
1196     speed_operand_src (s, xp, size1);					\
1197     speed_operand_src (s, s->yp, s->size);				\
1198     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1199     speed_cache_fill (s);						\
1200 									\
1201     speed_starttime ();							\
1202     i = s->reps;							\
1203     do									\
1204       function (wp, xp, size1, s->yp, s->size);				\
1205     while (--i != 0);							\
1206     t = speed_endtime ();						\
1207 									\
1208     TMP_FREE;								\
1209     return t;								\
1210   }
1211 
/* Like SPEED_ROUTINE_MPN_MULMID but with the x size fixed at
   size1 = 2*s->size - 1 and not passed to the routine: times
   "function (wp, xp, s->yp, s->size)".  xp is a temporary of size1 limbs
   allocated here and not filled in by this macro; wp is
   size1 - s->size + 3 limbs.  */
1212 #define SPEED_ROUTINE_MPN_MULMID_N(function)				\
1213   {									\
1214     mp_ptr    wp, xp;							\
1215     mp_size_t size1;							\
1216     unsigned  i;							\
1217     double    t;							\
1218     TMP_DECL;								\
1219 									\
1220     size1 = 2 * s->size - 1;						\
1221 									\
1222     SPEED_RESTRICT_COND (s->size >= 1);					\
1223 									\
1224     TMP_MARK;								\
1225     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1226     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1227 									\
1228     speed_operand_src (s, xp, size1);					\
1229     speed_operand_src (s, s->yp, s->size);				\
1230     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1231     speed_cache_fill (s);						\
1232 									\
1233     speed_starttime ();							\
1234     i = s->reps;							\
1235     do									\
1236       function (wp, xp, s->yp, s->size);				\
1237     while (--i != 0);							\
1238     t = speed_endtime ();						\
1239 									\
1240     TMP_FREE;								\
1241     return t;								\
1242   }
1243 
/* As SPEED_ROUTINE_MPN_MULMID_N, plus a scratch block of
   mpn_toom42_mulmid_itch (s->size) limbs (allocated with alignment 0)
   passed as a final argument: times
   "function (wp, xp, s->yp, s->size, scratch)".  The scratch block is not
   registered with speed_operand_src/dst.  */
1244 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
1245   {									\
1246     mp_ptr    wp, xp, scratch;						\
1247     mp_size_t size1, scratch_size;					\
1248     unsigned  i;							\
1249     double    t;							\
1250     TMP_DECL;								\
1251 									\
1252     size1 = 2 * s->size - 1;						\
1253 									\
1254     SPEED_RESTRICT_COND (s->size >= 1);					\
1255 									\
1256     TMP_MARK;								\
1257     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1258     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1259     scratch_size = mpn_toom42_mulmid_itch (s->size);			\
1260     SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
1261 									\
1262     speed_operand_src (s, xp, size1);					\
1263     speed_operand_src (s, s->yp, s->size);				\
1264     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1265     speed_cache_fill (s);						\
1266 									\
1267     speed_starttime ();							\
1268     i = s->reps;							\
1269     do									\
1270       function (wp, xp, s->yp, s->size, scratch);			\
1271     while (--i != 0);							\
1272     t = speed_endtime ();						\
1273 									\
1274     TMP_FREE;								\
1275     return t;								\
1276   }
1277 
/* "call" is evaluated s->reps times and can use the locals wp (temporary of
   2*s->size limbs), tp (scratch of
   mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs, aligned per
   s->align_wp2) and s.  Both wp and tp are registered as destinations;
   s->xp and s->yp are the sources.  */
1278 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
1279   {									\
1280     mp_ptr    wp, tp;							\
1281     unsigned  i;							\
1282     double    t;							\
1283     mp_size_t itch;							\
1284     TMP_DECL;								\
1285 									\
1286     SPEED_RESTRICT_COND (s->size >= 1);					\
1287 									\
1288     itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
1289 									\
1290     TMP_MARK;								\
1291     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1292     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1293 									\
1294     speed_operand_src (s, s->xp, s->size);				\
1295     speed_operand_src (s, s->yp, s->size);				\
1296     speed_operand_dst (s, wp, 2 * s->size);				\
1297     speed_operand_dst (s, tp, itch);					\
1298     speed_cache_fill (s);						\
1299 									\
1300     speed_starttime ();							\
1301     i = s->reps;							\
1302     do									\
1303       call;								\
1304     while (--i != 0);							\
1305     t = speed_endtime ();						\
1306 									\
1307     TMP_FREE;								\
1308     return t;								\
1309   }
/* Times "function (wp, size, s->xp, s->size, s->yp, s->size, tp)" where
   size = mpn_mulmod_bnm1_next_size (s->size).  wp is a temporary of size
   limbs and tp a scratch block of mpn_mulmod_bnm1_itch (size, size, size)
   limbs; the source operands remain s->size limbs each.  */
1310 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
1311   {									\
1312     mp_ptr    wp, tp;							\
1313     unsigned  i;							\
1314     double    t;							\
1315     mp_size_t size, itch;						\
1316     TMP_DECL;								\
1317 									\
1318     SPEED_RESTRICT_COND (s->size >= 1);					\
1319 									\
1320     size = mpn_mulmod_bnm1_next_size (s->size);				\
1321     itch = mpn_mulmod_bnm1_itch (size, size, size);			\
1322 									\
1323     TMP_MARK;								\
1324     SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
1325     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1326 									\
1327     speed_operand_src (s, s->xp, s->size);				\
1328     speed_operand_src (s, s->yp, s->size);				\
1329     speed_operand_dst (s, wp, size);					\
1330     speed_operand_dst (s, tp, itch);					\
1331     speed_cache_fill (s);						\
1332 									\
1333     speed_starttime ();							\
1334     i = s->reps;							\
1335     do									\
1336       function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
1337     while (--i != 0);							\
1338     t = speed_endtime ();						\
1339 									\
1340     TMP_FREE;								\
1341     return t;								\
1342   }
1343 
/* Multiply with caller-sized scratch space.  "call" is evaluated s->reps
   times and can use the locals wp (temporary of 2*s->size limbs), tspace
   (scratch of "tsize" limbs, aligned per s->align_wp2) and s.  "tsize" is
   expanded twice (allocation and speed_operand_dst).  The routine is
   restricted to s->size >= minsize.  wp is always 2*s->size limbs and
   s->yp is registered with s->size limbs, whatever y size "call" actually
   passes.  */
1344 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
1345   {									\
1346     mp_ptr    wp, tspace;						\
1347     unsigned  i;							\
1348     double    t;							\
1349     TMP_DECL;								\
1350 									\
1351     SPEED_RESTRICT_COND (s->size >= minsize);				\
1352 									\
1353     TMP_MARK;								\
1354     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1355     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1356 									\
1357     speed_operand_src (s, s->xp, s->size);				\
1358     speed_operand_src (s, s->yp, s->size);				\
1359     speed_operand_dst (s, wp, 2*s->size);				\
1360     speed_operand_dst (s, tspace, tsize);				\
1361     speed_cache_fill (s);						\
1362 									\
1363     speed_starttime ();							\
1364     i = s->reps;							\
1365     do									\
1366       call;								\
1367     while (--i != 0);							\
1368     t = speed_endtime ();						\
1369 									\
1370     TMP_FREE;								\
1371     return t;								\
1372   }
1373 
/* Balanced Toom multiplies: both operands are s->size limbs.  Each wrapper
   passes the matching mpn_toomNN_mul_itch (s->size, s->size) as the scratch
   size and the matching MPN_TOOMNN_MUL_MINSIZE as the minimum size to
   SPEED_ROUTINE_MPN_MUL_N_TSPACE.  */
1374 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
1375   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1376     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1377      mpn_toom22_mul_itch (s->size, s->size),				\
1378      MPN_TOOM22_MUL_MINSIZE)
1379 
1380 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
1381   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1382     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1383      mpn_toom33_mul_itch (s->size, s->size),				\
1384      MPN_TOOM33_MUL_MINSIZE)
1385 
1386 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
1387   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1388     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1389      mpn_toom44_mul_itch (s->size, s->size),				\
1390      MPN_TOOM44_MUL_MINSIZE)
1391 
1392 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
1393   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1394     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1395      mpn_toom6h_mul_itch (s->size, s->size),				\
1396      MPN_TOOM6H_MUL_MINSIZE)
1397 
1398 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
1399   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1400     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1401      mpn_toom8h_mul_itch (s->size, s->size),				\
1402      MPN_TOOM8H_MUL_MINSIZE)
1403 
/* Unbalanced Toom multiplies: x is s->size limbs and y is a fixed fraction
   of that (2/3 for toom32, 1/2 for toom42 and toom63, 3/4 for toom43, using
   truncating integer division).  The same y size is passed to the matching
   itch function for the scratch size.  */
1404 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
1405   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1406     (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
1407      mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
1408      MPN_TOOM32_MUL_MINSIZE)
1409 
1410 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
1411   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1412     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1413      mpn_toom42_mul_itch (s->size, s->size/2),				\
1414      MPN_TOOM42_MUL_MINSIZE)
1415 
1416 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
1417   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1418     (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
1419      mpn_toom43_mul_itch (s->size, s->size*3/4),			\
1420      MPN_TOOM43_MUL_MINSIZE)
1421 
1422 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
1423   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1424     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1425      mpn_toom63_mul_itch (s->size, s->size/2),				\
1426      MPN_TOOM63_MUL_MINSIZE)
1427 
/* Paired macros TOOMaa_FOR_TOOMbb / TOOMbb_FOR_TOOMaa: both members of a
   pair use the same operand shape (y = 17/24, 19/30 or 11/20 of s->size),
   while the scratch size and minimum size are those of the routine named
   first.  Presumably each pair is used to compare the two algorithms at
   their crossover shape; the callers are not visible here.  */
1428 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
1429   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1430     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1431      mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
1432      MPN_TOOM32_MUL_MINSIZE)
1433 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
1434   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1435     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1436      mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
1437      MPN_TOOM43_MUL_MINSIZE)
1438 
1439 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
1440   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1441     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1442      mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
1443      MPN_TOOM32_MUL_MINSIZE)
1444 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
1445   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1446     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1447      mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
1448      MPN_TOOM53_MUL_MINSIZE)
1449 
1450 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
1451   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1452     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1453      mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
1454      MPN_TOOM42_MUL_MINSIZE)
1455 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
1456   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1457     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1458      mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
1459      MPN_TOOM53_MUL_MINSIZE)
1460 
/* Times "function" (expected to be a toom43 multiply, going by the macro
   name) on the operand shape shared with
   SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL, y = 5*s->size/6.  The scratch
   space is sized with mpn_toom43_mul_itch, ie. the itch function of the
   routine being timed, as in the other TOOMaa_FOR_TOOMbb macros.  The
   minimum size is deliberately left at MPN_TOOM54_MUL_MINSIZE, the same
   as its partner macro.  */
1461 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
1462   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1463     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1464      mpn_toom43_mul_itch (s->size, 5*s->size/6),			\
1465      MPN_TOOM54_MUL_MINSIZE)
/* Times "function" with y = 5*s->size/6, scratch sized by
   mpn_toom54_mul_itch and a minimum size of MPN_TOOM54_MUL_MINSIZE.  */
1466 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
1467   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1468     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1469      mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
1470      MPN_TOOM54_MUL_MINSIZE)
1471 
1472 
1473 
/* For squaring an s->size limb operand into 2*s->size limbs.  "call" is
   evaluated s->reps times and can use the local wp (temporary of 2*s->size
   limbs) and s; s->xp is the only registered source.  Requires
   s->size >= 1.  */
1474 #define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
1475   {									\
1476     mp_ptr    wp;							\
1477     unsigned  i;							\
1478     double    t;							\
1479     TMP_DECL;								\
1480 									\
1481     SPEED_RESTRICT_COND (s->size >= 1);					\
1482 									\
1483     TMP_MARK;								\
1484     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1485 									\
1486     speed_operand_src (s, s->xp, s->size);				\
1487     speed_operand_dst (s, wp, 2*s->size);				\
1488     speed_cache_fill (s);						\
1489 									\
1490     speed_starttime ();							\
1491     i = s->reps;							\
1492     do									\
1493       call;								\
1494     while (--i != 0);							\
1495     t = speed_endtime ();						\
1496 									\
1497     TMP_FREE;								\
1498     return t;								\
1499   }
1500 
/* Times "function (wp, s->xp, s->size)" via SPEED_ROUTINE_MPN_SQR_CALL.  */
1501 #define SPEED_ROUTINE_MPN_SQR(function)					\
1502   SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
1503 
/* "call" is evaluated s->reps times and can use the locals wp (temporary
   destination, 2*s->size limbs), tp (temporary of 2*s->size limbs,
   registered as a source but not filled in by this macro) and s.  Requires
   s->size >= 2.  The returned time is speed_endtime () divided by 2; the
   reason for halving is not evident from this macro alone.  */
1504 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
1505   {									\
1506     mp_ptr    wp, tp;							\
1507     unsigned  i;							\
1508     double    t;							\
1509     TMP_DECL;								\
1510 									\
1511     SPEED_RESTRICT_COND (s->size >= 2);					\
1512 									\
1513     TMP_MARK;								\
1514     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
1515     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1516 									\
1517     speed_operand_src (s, s->xp, s->size);				\
1518     speed_operand_src (s, tp, 2 * s->size);				\
1519     speed_operand_dst (s, wp, 2 * s->size);				\
1520     speed_cache_fill (s);						\
1521 									\
1522     speed_starttime ();							\
1523     i = s->reps;							\
1524     do									\
1525       call;								\
1526     while (--i != 0);							\
1527     t = speed_endtime () / 2;						\
1528 									\
1529     TMP_FREE;								\
1530     return t;								\
1531   }
1532 
/* Squaring with caller-sized scratch space.  "call" is evaluated s->reps
   times and can use the locals wp (temporary of 2*s->size limbs), tspace
   (scratch of "tsize" limbs, aligned per s->align_wp2) and s.  "tsize" is
   expanded twice.  The routine is restricted to s->size >= minsize.  */
1533 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
1534   {									\
1535     mp_ptr    wp, tspace;						\
1536     unsigned  i;							\
1537     double    t;							\
1538     TMP_DECL;								\
1539 									\
1540     SPEED_RESTRICT_COND (s->size >= minsize);				\
1541 									\
1542     TMP_MARK;								\
1543     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1544     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1545 									\
1546     speed_operand_src (s, s->xp, s->size);				\
1547     speed_operand_dst (s, wp, 2*s->size);				\
1548     speed_operand_dst (s, tspace, tsize);				\
1549     speed_cache_fill (s);						\
1550 									\
1551     speed_starttime ();							\
1552     i = s->reps;							\
1553     do									\
1554       call;								\
1555     while (--i != 0);							\
1556     t = speed_endtime ();						\
1557 									\
1558     TMP_FREE;								\
1559     return t;								\
1560   }
1561 
/* Toom squarings: each times "function (wp, s->xp, s->size, tspace)" via
   SPEED_ROUTINE_MPN_SQR_TSPACE, with the matching
   mpn_toomN_sqr_itch (s->size) as the scratch size and the matching
   MPN_TOOMN_SQR_MINSIZE as the minimum size.  */
1562 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
1563   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1564 				mpn_toom2_sqr_itch (s->size),		\
1565 				MPN_TOOM2_SQR_MINSIZE)
1566 
1567 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
1568   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1569 				mpn_toom3_sqr_itch (s->size),		\
1570 				MPN_TOOM3_SQR_MINSIZE)
1571 
1572 
1573 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
1574   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1575 				mpn_toom4_sqr_itch (s->size),		\
1576 				MPN_TOOM4_SQR_MINSIZE)
1577 
1578 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
1579   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1580 				mpn_toom6_sqr_itch (s->size),		\
1581 				MPN_TOOM6_SQR_MINSIZE)
1582 
1583 #define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
1584   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1585 				mpn_toom8_sqr_itch (s->size),		\
1586 				MPN_TOOM8_SQR_MINSIZE)
1587 
/* Timing body shared by the single-limb remainder style routines.  CALL is
   run s->reps times over the source operand s->xp of s->size limbs; any
   value it produces is discarded.  Yields the speed_endtime value.  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    unsigned   i; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    /* only one operand, the source, needs registering */ \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
        i--; \
      } \
    while (i != 0); \
    \
    return speed_endtime (); \
  }
1605 
/* Times (*FUNCTION) (s->xp, s->size, s->r) through the MOD_CALL body.  */
#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL \
    ((*function) (s->xp, s->size, s->r))
1608 
/* Times (*FUNCTION) (s->xp, s->size, s->r, 0), i.e. the variant taking an
   extra limb argument, which is fixed at CNST_LIMB(0) here.  */
#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL \
    ((*function) (s->xp, s->size, s->r, CNST_LIMB(0)))
1611 
/* Times FUNCTION (s->xp, s->size, s->r) through the MOD_CALL body.

   The expansion is a complete brace-enclosed block, so no trailing semicolon
   is emitted: one would leave a stray ';' after the block wherever this is
   used as a function body, which the sibling MOD_1 wrappers avoid.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
1614 
/* Times FUNCTION (s->xp, s->size, s->r, 0) through the MOD_CALL body; the
   extra limb argument is fixed at CNST_LIMB(0).

   The expansion is a complete brace-enclosed block, so no trailing semicolon
   is emitted: one would leave a stray ';' after the block wherever this is
   used as a function body, which the sibling MOD_1C wrapper avoids.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)			\
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
1617 
/* Times (*FUNCTION) (s->xp, s->size) through the MOD_CALL body; s->r is not
   passed to the routine.  */
#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL \
    ((*function) (s->xp, s->size))
1620 
/* Times (*FUNCTION) (s->xp, s->size, s->r, inv).  inv comes from a single
   invert_limb (inv, s->r) done before the clock starts, so only the
   remainder step itself is measured.  s->r must have GMP_LIMB_HIGHBIT set
   to pass the restriction.  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    unsigned   rep; \
    mp_limb_t  inv; \
    double     elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
    \
    /* one-off precomputation, outside the timed region */ \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        (*function) (s->xp, s->size, s->r, inv); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
1641 
/* Times the pair PFUNC (inv, s->r) followed by
   FUNCTION (s->xp, s->size, s->r << inv[1], inv), so the precomputation is
   part of every measured repetition.  inv has 4 limbs; inv[1] is used as the
   shift applied to s->r.  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
  { \
    unsigned   rep; \
    mp_limb_t  inv[4]; \
    double     elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    /* fill inv once up front; PFUNC redoes this inside the loop */ \
    mpn_mod_1_1p_cps (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        pfunc (inv, s->r); \
        function (s->xp, s->size, s->r << inv[1], inv); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
/* Times the pair PFUNC (inv, s->r) followed by
   FUNCTION (s->xp, s->size, s->r << inv[1], inv) for the N-limbs-at-a-time
   remainder routines.  inv holds N+3 limbs and is recomputed by PFUNC in
   every repetition, so the precomputation cost is included.

   s->r is restricted to at most the maximum limb value divided by N.

   The divisor handed to FUNCTION is shifted by inv[1], the same way
   SPEED_ROUTINE_MPN_MOD_1_1 does after the identical PFUNC step; passing the
   unshifted s->r would pair the divisor with a table that was built for the
   shifted value.  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)			\
  {									\
    unsigned   i;							\
    mp_limb_t  inv[N+3];						\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      pfunc (inv, s->r);						\
      /* inv[1] is read only after PFUNC has filled it in */		\
      function (s->xp, s->size, s->r << inv[1], inv);			\
    } while (--i != 0);							\
									\
    return speed_endtime ();						\
  }
1682 
1683 
/* A division of 2*s->size by s->size limbs.

   CALL is the statement to time.  It can use these locals:
     a     dividend, 2*s->size limbs: s->xp copied twice, with the top limb
           then set one below the divisor's top limb
     d     divisor, s->size limbs copied from s->yp with GMP_NUMB_HIGHBIT
           forced on in the top limb
     q     quotient area, s->size+1 limbs
     r     remainder area, s->size limbs
     dinv  set up by invert_pi1 from the two most significant divisor limbs

   When s->size is 1 there is no second divisor limb, so zero is supplied to
   invert_pi1 instead of reading d[-1].  */

#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)				\
  {									\
    unsigned  i;							\
    mp_ptr    a, d, q, r;						\
    double    t;							\
    gmp_pi1_t dinv;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);			\
    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);			\
									\
    MPN_COPY (a, s->xp, s->size);					\
    MPN_COPY (a+s->size, s->xp, s->size);				\
									\
    MPN_COPY (d, s->yp, s->size);					\
									\
    /* normalize the data */						\
    d[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    a[2*s->size-1] = d[s->size-1] - 1;					\
									\
    /* d[s->size-2] only exists for two or more limbs */		\
    invert_pi1 (dinv, d[s->size-1],					\
		(s->size >= 2 ? d[s->size-2] : CNST_LIMB(0)));		\
									\
    speed_operand_src (s, a, 2*s->size);				\
    speed_operand_src (s, d, s->size);					\
    speed_operand_dst (s, q, s->size+1);				\
    speed_operand_dst (s, r, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      call;								\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
1729 
1730 
/* A remainder 2*s->size by s->size limbs.

   Times FUNCTION (r, a, d) on mpz_t operands:
     d  built from s->yp, s->size limbs
     a  (s->xp mod d) shifted up by s->size limbs, with s->xp then copied
        into the low limbs
     r  result variable, initialised empty

   All three variables are cleared once timing has finished, so repeated
   measurements do not leak their storage.  */

#define SPEED_ROUTINE_MPZ_MOD(function)					\
  {									\
    unsigned   i;							\
    mpz_t      a, d, r;							\
    double     t;							\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    mpz_init_set_n (d, s->yp, s->size);					\
									\
    /* high part less than d, low part a duplicate copied in */		\
    mpz_init_set_n (a, s->xp, s->size);					\
    mpz_mod (a, a, d);							\
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);			\
    MPN_COPY (PTR(a), s->xp, s->size);					\
									\
    mpz_init (r);							\
									\
    speed_operand_src (s, PTR(a), SIZ(a));				\
    speed_operand_src (s, PTR(d), SIZ(d));				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      function (r, a, d);						\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    /* release the operands; the clock has already been stopped */	\
    mpz_clear (r);							\
    mpz_clear (a);							\
    mpz_clear (d);							\
    return t;								\
  }
1761 
/* Times FUNCTION (qp, tp, size1, dp, s->size, INV) for division routines
   that take a precomputed inverse.  The dividend length size1 is s->r when
   that is nonzero and 2*s->size otherwise.  DMIN is the smallest divisor
   size accepted, QMIN the smallest size1 - s->size.  INV is pasted as the
   last argument and may refer to the local inv, which invert_pi1 fills from
   the two top divisor limbs.  Each repetition first restores the dividend
   into tp from the pristine copy ap, and that copy is part of the
   measurement.  */
#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, ap, qp; \
    gmp_pi1_t  inv; \
    double     elapsed; \
    mp_size_t  size1; \
    TMP_DECL; \
    \
    if (s->r == 0) \
      size1 = 2 * s->size; \
    else \
      size1 = s->r; \
    \
    SPEED_RESTRICT_COND (s->size >= DMIN); \
    SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \
    \
    /* only the low and high s->size limbs of the dividend are set; */ \
    /* anything between them is left as allocated when size1 is larger */ \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* set the divisor's high bit, keep the dividend's top limb below it */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    ap[size1 - 1] = dp[s->size - 1] - 1; \
    \
    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \
    \
    speed_operand_src (s, ap, size1); \
    speed_operand_dst (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, size1); \
        function (qp, tp, size1, dp, s->size, INV); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (qp, tp, 2*s->size, dp, s->size, scratch), a quotient-only
   division of 2*s->size by s->size limbs.  ITCHFN (2*s->size, s->size, 0)
   gives the number of scratch limbs.

   Operands:
     tp       dividend: s->xp copied twice, top limb then set one below the
              divisor's top limb
     dp       divisor: s->yp copied, GMP_NUMB_HIGHBIT forced on in the top
              limb
     qp       quotient area, s->size limbs
     scratch  itch limbs

   The divisor is copied from s->yp before it is normalized; without that
   copy everything in dp except the forced high bit would be whatever the
   fresh allocation happened to contain.  */
#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)			\
  {									\
    unsigned   i;							\
    mp_ptr     dp, tp, qp, scratch;					\
    double     t;							\
    mp_size_t itch;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 2);					\
									\
    itch = itchfn (2 * s->size, s->size, 0);				\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);			\
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);			\
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);		\
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);		\
									\
    MPN_COPY (tp,         s->xp, s->size);				\
    MPN_COPY (tp+s->size, s->xp, s->size);				\
									\
    /* give the divisor defined contents */				\
    MPN_COPY (dp,         s->yp, s->size);				\
									\
    /* normalize the data */						\
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;					\
    tp[2*s->size-1] = dp[s->size-1] - 1;				\
									\
    speed_operand_dst (s, qp, s->size);					\
    speed_operand_src (s, tp, 2 * s->size);				\
    speed_operand_src (s, dp, s->size);					\
    speed_operand_dst (s, scratch, itch);				\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do {								\
      function (qp, tp, 2 * s->size, dp, s->size, scratch);		\
    } while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
/* Times FUNCTION (qp, rp, tp, size1, dp, s->size, scratch), a division
   giving both quotient and remainder.  The dividend length size1 is s->r
   when nonzero, else 2*s->size, and must be at least s->size.
   ITCHFN (size1, s->size, 0) gives the number of scratch limbs.  */
#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, qp, rp, scratch; \
    double     elapsed; \
    mp_size_t  size1, itch; \
    TMP_DECL; \
    \
    if (s->r == 0) \
      size1 = 2 * s->size; \
    else \
      size1 = s->r; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    itch = itchfn (size1, s->size, 0); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp reuses align_wp2; the original asks whether that is right */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    \
    /* only the low and high s->size limbs of the dividend are set */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* set the divisor's high bit, keep the dividend's top limb below it */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    tp[size1 - 1] = dp[s->size - 1] - 1; \
    \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, rp, tp, size1, dp, s->size, scratch); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch), a
   quotient-and-remainder division that is handed a precomputed inverse.
   ip is produced by mpn_invert from the normalized divisor before the clock
   starts, so the inversion is not measured.  size1 is s->r when nonzero,
   else 2*s->size.  ITCHFN (size1, s->size, s->size) gives the number of
   scratch limbs.  */
#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, qp, rp, ip, scratch, tmp; \
    double     elapsed; \
    mp_size_t  size1, itch; \
    TMP_DECL; \
    \
    if (s->r == 0) \
      size1 = 2 * s->size; \
    else \
      size1 = s->r; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    itch = itchfn (size1, s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp and ip reuse align_wp2; the original asks whether that is right */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); \
    \
    /* only the low and high s->size limbs of the dividend are set */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* set the divisor's high bit, keep the dividend's top limb below it */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    tp[size1 - 1] = dp[s->size-1] - 1; \
    \
    /* invert the divisor once, using its own temporary workspace */ \
    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \
    mpn_invert (ip, dp, s->size, tmp); \
    \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_src (s, ip, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
1952 
/* Times FUNCTION (qp, tp, 2*s->size, dp, s->size, inv).  The divisor is
   s->yp with its lowest bit forced on, and inv is the negated result of
   binvert_limb on the divisor's low limb.  Every repetition first restores
   the 2*s->size limb dividend into tp from the pristine copy ap, and that
   copy is part of the measurement.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, ap, qp; \
    mp_limb_t  inv; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
    \
    /* dividend: s->xp twice over */ \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* the divisor has to be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size); \
    speed_operand_dst (s, tp, 2*s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (qp, tp, 2*s->size, dp, s->size, inv); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (qp, tp, s->size, dp, s->size, inv).  The divisor is s->yp
   with its lowest bit forced on, and inv is the negated result of
   binvert_limb on the divisor's low limb.  Every repetition first copies
   s->xp into tp, and that copy is part of the measurement.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, qp; \
    mp_limb_t  inv; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \
    \
    /* the divisor has to be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, s->xp, s->size); \
        function (qp, tp, s->size, dp, s->size, inv); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (qp, s->xp, s->size, dp, s->size, scratch).  The divisor
   is s->yp with its lowest bit forced on; the dividend is s->xp used in
   place.  ITCHFN (s->size, s->size) gives the number of scratch limbs.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \
  { \
    unsigned   rep; \
    mp_ptr     dp, qp, scratch; \
    double     elapsed; \
    mp_size_t  itch; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    \
    /* the divisor has to be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, s->xp, s->size, dp, s->size, scratch); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (qp, rp, tp, 2*s->size, dp, s->size, scratch).  The
   dividend tp is s->xp copied twice; the divisor is s->yp with its lowest
   bit forced on.  ITCHFN (2*s->size, s->size) gives the number of scratch
   limbs.  */
#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \
  { \
    unsigned   rep; \
    mp_ptr     dp, tp, qp, rp, scratch; \
    double     elapsed; \
    mp_size_t  itch; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (2 * s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp reuses align_wp2; the original asks whether that is right */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    \
    /* dividend: s->xp twice over */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp+s->size, s->xp, s->size); \
    \
    /* the divisor has to be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, 2 * s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2111 
/* Times (*FUNCTION) (wp, s->xp, s->size, s->r) via
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  Only odd s->r is accepted, and the low
   bit of s->xp[0] is set in place before measuring (this modifies the
   caller's s->xp).  */
#define SPEED_ROUTINE_MPN_BROOT(function) \
  { \
    SPEED_RESTRICT_COND (s->r & 1); \
    /* make the operand odd */ \
    s->xp[0] = s->xp[0] | 1; \
    SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r)); \
  }
2119 
/* Times (*FUNCTION) (wp, s->xp, s->size, s->r, tp).  wp receives s->size
   limbs and tp is ITCH limbs of workspace.  Only odd s->r is accepted, and
   the low bit of s->xp[0] is set in place before measuring (this modifies
   the caller's s->xp).

   The size and parity restrictions are checked before TMP_MARK, as in the
   other timing bodies in this file, so that no early return happens between
   TMP_MARK and TMP_FREE.  */
#define SPEED_ROUTINE_MPN_BROOTINV(function, itch)	\
  {							\
    mp_ptr    wp, tp;					\
    unsigned  i;					\
    double    t;					\
    TMP_DECL;						\
							\
    SPEED_RESTRICT_COND (s->size >= 1);			\
    SPEED_RESTRICT_COND (s->r & 1);			\
							\
    TMP_MARK;						\
    wp = TMP_ALLOC_LIMBS (s->size);			\
    tp = TMP_ALLOC_LIMBS ( (itch));			\
    /* make the operand odd */				\
    s->xp[0] |= 1;					\
							\
    speed_operand_src (s, s->xp, s->size);		\
    speed_operand_dst (s, wp, s->size);			\
    speed_cache_fill (s);				\
							\
    speed_starttime ();					\
    i = s->reps;					\
    do							\
      (*function) (wp, s->xp, s->size, s->r, tp);	\
    while (--i != 0);					\
    t = speed_endtime ();				\
							\
    TMP_FREE;						\
    return t;						\
  }
2147 
/* Times FUNCTION (ip, up, s->size, tp).  up is s->xp with GMP_NUMB_HIGHBIT
   forced on in its top limb, ip is the s->size limb result area, and tp is
   workspace of ITCHFN (s->size) limbs.  */
#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \
  { \
    long      rep; \
    mp_ptr    up, tp, ip; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* force the top bit so the operand is normalized */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2182 
/* Times FUNCTION (ip, up, s->size, tp) for an approximate-inverse routine.
   up is s->xp with GMP_NUMB_HIGHBIT forced on in its top limb, ip is the
   s->size limb result area, and tp is workspace of ITCHFN (s->size)
   limbs.  */
#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \
  { \
    long      rep; \
    mp_ptr    up, tp, ip; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* force the top bit so the operand is normalized */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2217 
/* Times FUNCTION (ip, up, s->size, tp) exactly like the INVERTAPPR body,
   except that sizes below 3 limbs are refused.  up is s->xp with
   GMP_NUMB_HIGHBIT forced on in its top limb, and tp is workspace of
   ITCHFN (s->size) limbs.  */
#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \
  { \
    long      rep; \
    mp_ptr    up, tp, ip; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* force the top bit so the operand is normalized */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2252 
/* Times FUNCTION (ip, up, s->size, tp).  up is s->xp with the lowest bit of
   its low limb forced on, ip is the s->size limb result area, and tp is
   workspace of ITCHFN (s->size) limbs.  */
#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \
  { \
    long      rep; \
    mp_ptr    up, tp, ip; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* force the operand odd */ \
    up[0] |= 1; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (ip, up, s->size, tp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2287 
/* Times FUNCTION (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp).  mp is
   s->yp with its lowest bit forced on.  up is refilled from s->xp at the
   start of every repetition, and that copy is part of the measurement.  tp
   is workspace of ITCHFN (s->size) limbs.  */
#define SPEED_ROUTINE_MPN_SEC_INVERT(function,itchfn) \
  { \
    long      rep; \
    mp_ptr    up, mp, tp, ip; \
    double    elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    /* mp is copied from s->yp and forced odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (up, s->xp, s->size); \
        function (ip, up, mp, s->size, 2*s->size*GMP_NUMB_BITS, tp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2324 
/* Times FUNCTION (cp, tp, mp, s->size, inv).  mp is s->yp with its lowest
   bit forced on, and inv is the negated result of binvert_limb on the low
   limb of mp.  Every repetition first restores 2*s->size limbs of tp from
   the pristine copy ap (s->xp twice over), and that copy is part of the
   measurement.  */
#define SPEED_ROUTINE_REDC_1(function) \
  { \
    unsigned   rep; \
    mp_ptr     cp, mp, tp, ap; \
    mp_limb_t  inv; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* the modulus has to be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    binvert_limb (inv, mp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, inv); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (cp, tp, mp, s->size, invp).  mp is s->yp with its lowest
   bit forced on.  invp is a two-limb value: mpn_binvert of the low two
   limbs of mp, then the low limb negated and the high limb complemented.
   Every repetition first restores 2*s->size limbs of tp from the pristine
   copy ap, and that copy is part of the measurement.  */
#define SPEED_ROUTINE_REDC_2(function) \
  { \
    unsigned   rep; \
    mp_ptr     cp, mp, tp, ap; \
    mp_limb_t  invp[2]; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* the modulus has to be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    /* NOTE(review): two limbs of mp are requested here although only */ \
    /* s->size were copied; looks like an over-read when s->size == 1 */ \
    /* -- confirm.  tp doubles as the workspace for this call.  */ \
    mpn_binvert (invp, mp, 2, tp); \
    invp[0] = -invp[0]; \
    invp[1] = ~invp[1]; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
/* Times FUNCTION (cp, tp, mp, s->size, invp) for sizes above 8 limbs.  mp
   is s->yp with its lowest bit forced on, and invp is the full s->size limb
   result of mpn_binvert on mp.  Every repetition first restores 2*s->size
   limbs of tp from the pristine copy ap, and that copy is part of the
   measurement.  */
#define SPEED_ROUTINE_REDC_N(function) \
  { \
    unsigned   rep; \
    mp_ptr     cp, mp, tp, ap, invp; \
    double     elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size > 8); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    /* invp reuses align_wp2; the original asks whether that is right */ \
    SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* the modulus has to be odd; tp doubles as binvert workspace */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    mpn_binvert (invp, mp, s->size, tp); \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2450 
2451 
/* Times FUNCTION (s->xp, s->size); whatever it returns is discarded.  */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned  rep; \
    double    elapsed; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    rep = s->reps; \
    do \
      { \
        function (s->xp, s->size); \
        rep--; \
      } \
    while (rep != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
2469 
/* Measure FUNCTION (s->xp, s->yp, s->size), a two-operand read-only
   routine.  The result of FUNCTION is discarded.  Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned  i; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (s->xp, s->yp, s->size); \
      } \
    while (--i != 0); \
\
    return speed_endtime (); \
  }
2488 
2489 
/* Measure FUNCTION (z, s->size), where z is a scratch mpz_t result and
   s->size is passed as the integer argument rather than used as a limb
   count.  No operands are registered for cache priming.  */
#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t     z; \
    double    t; \
    unsigned  i; \
\
    SPEED_RESTRICT_COND (s->size >= 0); \
\
    mpz_init (z); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (z, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    mpz_clear (z); \
    return t; \
  }
2510 
/* Aliases: each of these expands to SPEED_ROUTINE_MPZ_UI unchanged.  */
#define SPEED_ROUTINE_MPZ_FAC_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
#define SPEED_ROUTINE_MPZ_FIB_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
2514 
2515 
/* Measure FUNCTION (z, z2, s->size), where z and z2 are scratch mpz_t
   results and s->size is passed as the integer argument.  No operands are
   registered for cache priming.  */
#define SPEED_ROUTINE_MPZ_2_UI(function) \
  { \
    mpz_t     z, z2; \
    double    t; \
    unsigned  i; \
\
    SPEED_RESTRICT_COND (s->size >= 0); \
\
    mpz_init (z); \
    mpz_init (z2); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (z, z2, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    mpz_clear (z); \
    mpz_clear (z2); \
    return t; \
  }
2538 
/* Aliases: each of these expands to SPEED_ROUTINE_MPZ_2_UI unchanged.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI(function)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI(function)
2541 
2542 
/* Measure FUNCTION (fp, f1p, s->size), where s->size is the integer
   argument and fp, f1p are result areas of MPN_FIB2_SIZE (s->size) limbs
   each.  No operands are registered for cache priming.  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function) \
  { \
    mp_size_t  alloc; \
    mp_ptr     fp, f1p; \
    double     t; \
    unsigned   i; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 0); \
\
    TMP_MARK; \
    alloc = MPN_FIB2_SIZE (s->size); \
    SPEED_TMP_ALLOC_LIMBS (fp,  alloc, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (fp, f1p, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
2568 
2569 
2570 
/* Measure FUNCTION (r, b, e, m), i.e. b^e mod m.  b is s->xp and m is
   s->yp, both s->size limbs, and e is the first 6 limbs of s->xp_block.
   m is forced odd so that redc can be used; e is kept small so the
   calculation doesn't take too long.  */
#define SPEED_ROUTINE_MPZ_POWM(function) \
  { \
    mpz_t     r, b, e, m; \
    double    t; \
    unsigned  i; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    mpz_init (r); \
    mpz_init_set_n (b, s->xp, s->size); \
    mpz_init_set_n (m, s->yp, s->size); \
    mpz_setbit (m, 0);  /* make m odd */ \
    mpz_init_set_n (e, s->xp_block, 6); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (r, b, e, m); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (e); \
    mpz_clear (m); \
    return t; \
  }
2601 
/* Measure FUNCTION (r, b, e, m) computing (m-2)^e mod m.  m is s->xp
   (s->size limbs) forced odd.  e is s->r when that is non-zero, otherwise
   ~0UL/3, which is the alternating bit pattern 0x55...55.  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function) \
  { \
    mpz_t          r, b, m; \
    unsigned long  e; \
    unsigned       i; \
    double         t; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    mpz_init (r); \
\
    /* Force m odd.  mpz_setbit is used, as in SPEED_ROUTINE_MPZ_POWM, \
       rather than poking the low limb directly, so the size field of m \
       stays consistent with its limb data even if s->xp is all zeros.  */ \
    mpz_init (m); \
    mpz_set_n (m, s->xp, s->size); \
    mpz_setbit (m, 0); \
\
    /* default exponent unless the caller supplied one in s->r */ \
    e = (~ (unsigned long) 0) / 3; \
    if (s->r != 0) \
      e = s->r; \
\
    /* base is m-2 */ \
    mpz_init_set (b, m); \
    mpz_sub_ui (b, b, 2); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      function (r, b, e, m); \
    while (--i != 0); \
    t = speed_endtime (); \
\
    mpz_clear (r); \
    mpz_clear (b); \
    mpz_clear (m); \
    return t; \
  }
2638 
2639 
/* Measure CALL, an expression using the locals wp, wp2 (destinations) and
   xp, yp (sources), each s->size limbs.  s->r selects source/destination
   overlap:
     0  xp = s->xp, yp = s->yp  (no overlap)
     1  xp = wp
     2  yp = wp2
     3  xp = wp,  yp = wp2
     4  xp = wp2, yp = wp
   Any other s->r gives -1.0.  Overlapping sources are loaded with the
   s->xp / s->yp data before timing starts.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
  { \
    mp_ptr    wp, wp2, xp, yp; \
    double    t; \
    unsigned  i; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 0); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
    xp = s->xp; \
    yp = s->yp; \
\
    switch (s->r) \
      { \
      case 0: \
        break; \
      case 1: \
        xp = wp; \
        break; \
      case 2: \
        yp = wp2; \
        break; \
      case 3: \
        xp = wp; \
        yp = wp2; \
        break; \
      case 4: \
        xp = wp2; \
        yp = wp; \
        break; \
      default: \
        /* release the temporaries before bailing out */ \
        TMP_FREE; \
        return -1.0; \
      } \
\
    if (xp != s->xp) \
      MPN_COPY (xp, s->xp, s->size); \
    if (yp != s->yp) \
      MPN_COPY (yp, s->yp, s->size); \
\
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
2683 
/* FUNCTION (wp, wp2, xp, yp, size) under SPEED_ROUTINE_MPN_ADDSUB_CALL.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL (function (wp, wp2, xp, yp, s->size));

/* As SPEED_ROUTINE_MPN_ADDSUB_N, with a trailing argument of 0 passed to
   FUNCTION.  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
  SPEED_ROUTINE_MPN_ADDSUB_CALL (function (wp, wp2, xp, yp, s->size, 0));
2691 
2692 
/* Measure FUNCTION (xp, s->size, s->r): an Nx1 gcd of an s->size limb
   operand with the single limb s->r.  Requires s->size >= 1 and s->r
   non-zero.  The operand is a private copy of s->xp, whose low bit is set
   when refmpn_zero_p reports the copy as zero (presumably so the gcd
   never sees an all-zero operand).  */
#define SPEED_ROUTINE_MPN_GCD_1N(function) \
  { \
    mp_ptr    xp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
    MPN_COPY (xp, s->xp, s->size); \
    xp[0] |= refmpn_zero_p (xp, s->size); \
\
    /* Register the copy that FUNCTION actually reads, not s->xp, so the \
       cache priming applies to the measured operand (the same way \
       SPEED_ROUTINE_MPN_DIVREM_2 registers its copy).  */ \
    speed_operand_src (s, xp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      function (xp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
2722 
2723 
/* Run SPEED_BLOCK_SIZE single-limb operations per repetition, on x values
   of s->size bits and y values of s->r bits (or s->size bits if s->r is
   zero).  Data comes from s->xp_block and s->yp_block, masked to the
   requested bit counts, with zeros replaced by 1.

   SETUP is executed once per data index i (0 to SPEED_BLOCK_SIZE-1) after
   masking, and may adjust px[i] and py[i].  CALL is the timed operation,
   executed with j running from SPEED_BLOCK_SIZE down to 1, and is
   expected to use px[j-1] and py[j-1].  */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
  { \
    mp_limb_t x_mask, y_mask; \
    mp_ptr    px, py; \
    unsigned  i, j; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
\
    x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
      { \
        /* trim to the wanted bit count, never leaving zero */ \
        px[i] &= x_mask; \
        if (px[i] == 0) \
          px[i] = 1; \
        py[i] &= y_mask; \
        if (py[i] == 0) \
          py[i] = 1; \
        setup; \
      } \
\
    speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
\
    /* one repetition covers SPEED_BLOCK_SIZE calls */ \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
2775 
/* FUNCTION (ptr, 1, limb) on each px[j-1], py[j-1] pair, with no extra
   per-element setup.  */
#define SPEED_ROUTINE_MPN_GCD_1(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL \
    (/* no setup */, \
     function (&px[j-1], 1, py[j-1]))
2778 
/* FUNCTION (x, y, 0) on each px[j-1], py[j-1] pair.  The setup step
   reduces x modulo the masked y, then sets the low bit of both and bumps
   y from 1 to 3; the original comment states the aim as x<y, y odd,
   y!=1.  */
#define SPEED_ROUTINE_MPN_JACBASE(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL \
    ({ \
       /* reduce first, while py[i] is still the masked value */ \
       px[i] %= py[i]; \
       py[i] |= 1; \
       px[i] |= 1; \
       if (py[i] == 1) \
         py[i] = 3; \
     }, \
     function (px[j-1], py[j-1], 0))
2789 
2790 
/* Measure FUNC (ap, bp, size, &hgcd, wp) on s->size limb operands.
   ITCHFUNC (size) gives the limb count of the workspace wp.  Each
   repetition re-copies the operands and re-initialises the matrix, and
   that work is included in the measured time.  Sizes below 2 give -1.

   Note this sets the low bit of the top limb of the caller's s->xp and
   s->yp in place, which keeps those top limbs non-zero.  */
#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \
  { \
    struct hgcd_matrix hgcd; \
    mp_size_t hgcd_init_itch, hgcd_itch; \
    mp_ptr    ap, bp, wp, tmp1; \
    unsigned  i; \
    double    t; \
    int       res; \
    TMP_DECL; \
\
    if (s->size < 2) \
      return -1; \
\
    TMP_MARK; \
\
    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
\
    s->xp[s->size - 1] |= 1; \
    s->yp[s->size - 1] |= 1; \
\
    /* tmp1 backs the matrix, wp is func's scratch space */ \
    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
    hgcd_itch = itchfunc (s->size); \
\
    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp,   hgcd_itch,      s->align_wp); \
\
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, ap, s->size + 1); \
    speed_operand_dst (s, bp, s->size + 1); \
    speed_operand_dst (s, wp, hgcd_itch); \
    speed_operand_dst (s, tmp1, hgcd_init_itch); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        MPN_COPY (ap, s->xp, s->size); \
        MPN_COPY (bp, s->yp, s->size); \
        mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \
        /* res is stored but not examined */ \
        res = func (ap, bp, s->size, &hgcd, wp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
2840 
/* Measure FUNC (&hgcd, ap, bp, size, p, wp) on s->size limb operands with
   p = s->size/2.  ITCHFUNC (size, p) gives the limb count of the
   workspace wp.  Each repetition re-copies the operands and
   re-initialises the matrix, and that work is included in the measured
   time.  Sizes below 2 give -1.

   Note this sets the low bit of the top limb of the caller's s->xp and
   s->yp in place, which keeps those top limbs non-zero.  */
#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \
  { \
    struct hgcd_matrix hgcd; \
    mp_size_t hgcd_init_itch, hgcd_step_itch; \
    mp_size_t p = s->size/2; \
    mp_ptr    ap, bp, wp, tmp1; \
    unsigned  i; \
    double    t; \
    int       res; \
    TMP_DECL; \
\
    if (s->size < 2) \
      return -1; \
\
    TMP_MARK; \
\
    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
\
    s->xp[s->size - 1] |= 1; \
    s->yp[s->size - 1] |= 1; \
\
    /* tmp1 backs the matrix, wp is func's scratch space */ \
    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
    hgcd_step_itch = itchfunc (s->size, p); \
\
    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp,   hgcd_step_itch, s->align_wp); \
\
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, ap, s->size + 1); \
    speed_operand_dst (s, bp, s->size + 1); \
    speed_operand_dst (s, wp, hgcd_step_itch); \
    speed_operand_dst (s, tmp1, hgcd_init_itch); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        MPN_COPY (ap, s->xp, s->size); \
        MPN_COPY (bp, s->yp, s->size); \
        mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \
        /* res is stored but not examined */ \
        res = func (&hgcd, ap, bp, s->size, p, wp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
2891 
/* Run a batch of GCDs of s->size limbs each.  The number of distinct
   operand pairs ("pieces") shrinks in proportion to s->size squared,
   since GCD is a quadratic algorithm, and is scaled by DATAFACTOR.
   SPEED_ROUTINE_MPN_GCD uses a larger DATAFACTOR than
   SPEED_ROUTINE_MPN_GCDEXT, because the plain gcd is about twice as fast
   as gcdext.

   CALL is the timed operation.  It runs after the current pair has been
   copied into the locals xtmp and ytmp, and may also use the locals wp
   and wp2 (s->size+1 limbs each).  The copies are part of the measured
   time.  */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
  { \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py; \
    mp_size_t j, pieces, psize; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2); \
\
    /* between 1 and SPEED_BLOCK_SIZE/size operand pairs */ \
    pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
    pieces = MAX (pieces, 1); \
\
    /* a single piece is taken from xp/yp, several from the block data */ \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
\
    /* Requirements: x >= y, y must be odd, high limbs != 0. \
       No need to ensure random numbers are really great.  */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_size_t  top = s->size - 1; \
        mp_ptr     x = px + j * s->size; \
        mp_ptr     y = py + j * s->size; \
\
        if (x[top] == 0) \
          x[top] = 1; \
        if (y[top] == 0) \
          y[top] = 1; \
\
        if (x[top] < y[top]) \
          MP_LIMB_T_SWAP (x[top], y[top]); \
        else if (x[top] == y[top]) \
          { \
            x[top] = 2; \
            y[top] = 1; \
          } \
        y[0] |= 1; \
      } \
\
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \
            MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
\
    /* one repetition covers "pieces" calls */ \
    s->time_divisor = pieces; \
    return t; \
  }
2970 
/* FUNCTION (wp, xtmp, size, ytmp, size) with a data factor of 8.  */
#define SPEED_ROUTINE_MPN_GCD(function) \
  SPEED_ROUTINE_MPN_GCD_CALL \
    (8, function (wp, xtmp, s->size, ytmp, s->size))
2973 
/* FUNCTION (wp, wp2, &wp2size, xtmp, size, ytmp, size) with a data factor
   of 4.  wp2size is a throwaway local that receives FUNCTION's size
   output.  */
#define SPEED_ROUTINE_MPN_GCDEXT(function) \
  SPEED_ROUTINE_MPN_GCD_CALL \
    (4, \
     { \
       mp_size_t  wp2size; \
       function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
     })
2978 
2979 
/* Measure FUNCTION (wp, wp2, &wp2size, xtmp, size, ytmp, size) on operands
   whose low limbs are s->xp / s->yp and whose top three limbs are replaced
   on every call by successive triples from s->xp_block / s->yp_block.
   SPEED_BLOCK_SIZE/3 such triples ("pieces") are used per repetition.

   Requires s->size >= 3, since the top three limbs of xtmp and ytmp are
   written on every call; smaller sizes would address limbs before the
   start of those buffers.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \
  { \
    unsigned  i; \
    mp_size_t j, pieces, psize, wp2size; \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 3); \
\
    TMP_MARK; \
\
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
    MPN_COPY (xtmp, s->xp, s->size); \
    MPN_COPY (ytmp, s->yp, s->size); \
\
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size+1, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \
\
    pieces = SPEED_BLOCK_SIZE / 3; \
    psize = 3 * pieces; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, s->xp_block, psize); \
    MPN_COPY (py, s->yp_block, psize); \
\
    /* x must have at least as many bits as y, \
       high limbs must be non-zero */ \
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr  x = px+3*j; \
        mp_ptr  y = py+3*j; \
        x[2] += (x[2] == 0); \
        y[2] += (y[2] == 0); \
        if (x[2] < y[2]) \
          MP_LIMB_T_SWAP (x[2], y[2]); \
      } \
\
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_operand_dst (s, xtmp, s->size); \
    speed_operand_dst (s, ytmp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        mp_ptr  x = px; \
        mp_ptr  y = py; \
        /* top three limbs of each operand; in bounds as size >= 3 */ \
        mp_ptr  xth = &xtmp[s->size-3]; \
        mp_ptr  yth = &ytmp[s->size-3]; \
        j = pieces; \
        do \
          { \
            xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \
            yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \
\
            ytmp[0] |= 1; /* y must be odd, */ \
\
            function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
\
            x += 3; \
            y += 3; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
\
    /* one repetition covers "pieces" calls */ \
    s->time_divisor = pieces; \
    return t; \
  }
3057 
/* Measure FUNCTION (a, b) on a batch of s->size limb operand pairs.  The
   number of pairs ("pieces") is SPEED_BLOCK_SIZE/size, at least 1.  Each y
   is made odd and both high limbs are made non-zero.

   a and b are never passed to mpz_init; only their size and limb pointer
   fields are set, pointing straight into the px/py data.

   Requires s->size >= 1: the data preparation writes limb 0 and limb
   size-1 of every piece, which would fall outside the (empty) buffers
   for a zero size.  */
#define SPEED_ROUTINE_MPZ_JACOBI(function) \
  { \
    mpz_t     a, b; \
    unsigned  i; \
    mp_size_t j, pieces, psize; \
    mp_ptr    px, py; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    TMP_MARK; \
    pieces = SPEED_BLOCK_SIZE / s->size; \
    pieces = MAX (pieces, 1); \
    s->time_divisor = pieces; \
\
    /* a single piece is taken from xp/yp, several from the block data */ \
    psize = pieces * s->size; \
    px = TMP_ALLOC_LIMBS (psize); \
    py = TMP_ALLOC_LIMBS (psize); \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
\
    for (j = 0; j < pieces; j++) \
      { \
        mp_ptr  x = px+j*s->size; \
        mp_ptr  y = py+j*s->size; \
\
        /* y odd */ \
        y[0] |= 1; \
\
        /* high limbs non-zero */ \
        if (x[s->size-1] == 0) x[s->size-1] = 1; \
        if (y[s->size-1] == 0) y[s->size-1] = 1; \
      } \
\
    SIZ(a) = s->size; \
    SIZ(b) = s->size; \
\
    speed_operand_src (s, px, psize); \
    speed_operand_src (s, py, psize); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = pieces; \
        do \
          { \
            PTR(a) = px+(j-1)*s->size; \
            PTR(b) = py+(j-1)*s->size; \
            function (a, b); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3117 
/* Measure FUNCTION (wp, 0, xp, s->size, yp), a division of an s->size
   limb operand by a 2-limb divisor.  The dividend is a private copy of
   s->xp (the original comment notes the source is destroyed); it is
   copied once, before timing, and not refreshed between repetitions.
   The divisor is the first two limbs of s->yp_block with the high bit
   set.  Requires s->size >= 2.  */
#define SPEED_ROUTINE_MPN_DIVREM_2(function) \
  { \
    mp_limb_t yp[2]; \
    mp_ptr    wp, xp; \
    double    t; \
    unsigned  i; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 2); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
    /* working copy of the dividend */ \
    MPN_COPY (xp, s->xp, s->size); \
\
    /* normalized divisor: top bit of the high limb set */ \
    MPN_COPY (yp, s->yp_block, 2); \
    yp[1] |= GMP_NUMB_HIGHBIT; \
\
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, 2); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        function (wp, 0, xp, s->size, yp); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3154 
/* Measure FUNCTION (wp, wp+size-1, s->xp, s->size, d), a division of an
   s->size limb operand by the single limb d.  d is s->r, or 1 if s->r is
   zero.  The return value is stored in r but not otherwise examined.
   Requires s->size >= 1.  */
#define SPEED_ROUTINE_MPN_DIV_QR_1(function) \
  { \
    mp_ptr    wp; \
    mp_limb_t d; \
    mp_limb_t r; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
    /* never divide by zero */ \
    d = s->r; \
    if (d == 0) \
      d = 1; \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      r = function (wp, wp+s->size-1, s->xp, s->size, d); \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3186 
/* Measure FUNCTION (wp, s->xp, s->size, 0, d, dinv), a division of an
   s->size limb operand by the single limb d = s->r using the precomputed
   inverse dinv from invert_limb.  The return value is stored in r but not
   otherwise examined.  Requires s->size >= 1 and d to have its high bit
   set (normalized).

   Both restrictions are checked before TMP_MARK, so that a rejected
   divisor returns without leaving temporary allocations unreleased.  */
#define SPEED_ROUTINE_MPN_DIV_QR_1N_PI1(function) \
  { \
    mp_ptr    wp; \
    mp_limb_t d, dinv; \
    mp_limb_t r; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    d = s->r; \
    /* divisor must be normalized */ \
    SPEED_RESTRICT_COND (d & GMP_NUMB_HIGHBIT); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
    invert_limb (dinv, d); \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      r = function (wp, s->xp, s->size, 0, d, dinv); \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3219 
/* Measure FUNCTION (wp, rp, s->xp, s->size, yp), a division of an s->size
   limb operand by a 2-limb divisor taken from the first two limbs of
   s->yp_block.  NORM selects the divisor shape: non-zero sets the high
   bit of the high limb (normalized), zero clears it and keeps the high
   limb non-zero (unnormalized).  Requires s->size >= 2.  */
#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \
  { \
    mp_ptr    wp; \
    mp_limb_t yp[2]; \
    mp_limb_t rp[2]; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 2); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
    MPN_COPY (yp, s->yp_block, 2); \
    if (norm) \
      /* normalized divisor */ \
      yp[1] |= GMP_NUMB_HIGHBIT; \
    else \
      { \
        /* unnormalized divisor, but still a genuine 2-limb value */ \
        yp[1] &= ~GMP_NUMB_HIGHBIT; \
        if (yp[1] == 0) \
          yp[1] = 1; \
      } \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, yp, 2); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, rp, 2); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, rp, s->xp, s->size, yp); \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3260 
/* Measure FUNCTION (n, n), SPEED_BLOCK_SIZE times per repetition.  n
   starts at 1 and before each call has twice a limb of s->xp_block added
   to it, so n stays odd and each call depends on the result of the one
   before.  n is handed to noop_1 afterwards so the compiler cannot
   discard the calculation.  s->size is not used.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function) \
  { \
    unsigned   i, j; \
    mp_ptr     xp; \
    mp_limb_t  n = 1; \
    double     t; \
\
    /* Index as xp[j-1] below rather than forming s->xp_block-1, which \
       would be a pointer before the start of the array.  */ \
    xp = s->xp_block; \
\
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            /* randomized but successively dependent */ \
            n += (xp[j-1] << 1); \
\
            function (n, n); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    /* make sure the compiler won't optimize away n */ \
    noop_1 (n); \
\
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }
3296 
3297 
/* Measure CALL, an expression reading s->xp (s->size limbs) and writing
   to the locals wp and/or wp2, which are s->size limbs each.  Requires
   s->size >= 1.  */
#define SPEED_ROUTINE_MPN_SQRTROOT_CALL(call) \
  { \
    mp_ptr    wp, wp2; \
    double    t; \
    unsigned  i; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
\
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_operand_dst (s, wp2, s->size); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3326 
3327 
/* Measure FUNCTION (wp, base, xp, s->size), a limbs-to-digits conversion.
   s->size is the number of input limbs and s->r is the base, with 0
   meaning decimal.  Bases outside 2..256 are rejected.  The input is
   re-copied from s->xp into xp before every call, and that copy is part
   of the measured time.  The digit buffer wp is sized by
   MPN_SIZEINBASE.  */
#define SPEED_ROUTINE_MPN_GET_STR(function) \
  { \
    unsigned char *wp; \
    mp_size_t wn; \
    mp_ptr    xp; \
    unsigned  i; \
    double    t; \
    int       base; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    base = s->r == 0 ? 10 : s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
\
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \
\
    MPN_SIZEINBASE (wn, s->xp, s->size, base); \
    wp = (unsigned char *) TMP_ALLOC (wn); \
\
    /* development aid, guards against overflowing wp: \
    MPN_COPY (xp, s->xp, s->size); \
    ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \
    */ \
\
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, xp, s->size); \
    /* the byte buffer is registered in whole-limb units */ \
    speed_operand_dst (s, (mp_ptr) wp, wn/GMP_LIMB_BYTES); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        MPN_COPY (xp, s->xp, s->size); \
        function (wp, base, xp, s->size); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3375 
/* Measure CALL, a digits-to-limbs conversion expression using the locals
   xp (digit bytes), wp (result limbs) and base.  s->size is the number of
   input digits and s->r is the base, with 0 meaning decimal.  Bases
   outside 2..256 are rejected.  Digit i is s->xp[i] reduced modulo base,
   and wp has the limb count given by LIMBS_PER_DIGIT_IN_BASE.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
  { \
    unsigned char *xp; \
    mp_size_t  wn; \
    mp_ptr     wp; \
    unsigned   i; \
    double     t; \
    int        base; \
    TMP_DECL; \
\
    SPEED_RESTRICT_COND (s->size >= 1); \
\
    base = s->r == 0 ? 10 : s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
\
    TMP_MARK; \
\
    /* one digit per byte, derived from the random limb data */ \
    xp = (unsigned char *) TMP_ALLOC (s->size); \
    for (i = 0; i < s->size; i++) \
      { \
        xp[i] = s->xp[i] % base; \
      } \
\
    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
\
    /* development aid, checks wn is big enough: \
    ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \
    */ \
\
    /* the byte buffer is registered in whole-limb units */ \
    speed_operand_src (s, (mp_ptr) xp, s->size/GMP_LIMB_BYTES); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
\
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        call; \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
\
    TMP_FREE; \
    return t; \
  }
3421 
3422 
/* Time an accel gcd find_a() function over SPEED_BLOCK_SIZE two-limb
   operands.  Each operand takes its low limb from s->xp_block (forced odd)
   and its high limb from s->yp_block (forced non-zero).  A set of values is
   used in case some run particularly fast or slow.  The size parameter is
   ignored, the amount of data tested is fixed, and s->time_divisor is set
   to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)				\
  {									\
    mp_limb_t cp[SPEED_BLOCK_SIZE][2];					\
    unsigned  k, rep, idx;						\
    double    t;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
									\
    for (k = 0; k < SPEED_BLOCK_SIZE; k++)				\
      {									\
	/* low limb must be odd */					\
	cp[k][0] = s->xp_block[k] | 1;					\
	/* high limb must be non-zero, so a zero is replaced by 1 */	\
	cp[k][1] = (s->yp_block[k] != 0 ? s->yp_block[k] : 1);		\
      }									\
									\
    speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE);		\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      {									\
	/* walk the operands from the last one down to the first */	\
	idx = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    function (cp[idx-1]);					\
	  }								\
	while (--idx != 0);						\
      }									\
    while (--rep != 0);							\
    t = speed_endtime ();						\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
									\
    TMP_FREE;								\
    return t;								\
  }
3465 
3466 
/* First half of a count-zeros timing routine.  It opens the function body,
   the s->reps loop and the loop over the SPEED_BLOCK_SIZE limbs of xp; the
   measured "call" goes directly after it, and SPEED_ROUTINE_COUNT_ZEROS_B
   closes everything again.

   "call" should do "count_foo_zeros(c,n)".
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   Returns -1.0 from the enclosing function if
   speed_routine_count_zeros_setup() fails.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)			\
  {									\
    mp_ptr     xp;							\
    int        i, c;							\
    unsigned   j;							\
    mp_limb_t  n;							\
    double     t;							\
    TMP_DECL;								\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);		\
									\
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))	\
      {									\
	/* xp is already allocated, release it before bailing out */	\
	TMP_FREE;							\
	return -1.0;							\
      }									\
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);			\
    speed_cache_fill (s);						\
									\
    c = 0;								\
    speed_starttime ();							\
    j = s->reps;							\
    do {								\
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)				\
	{								\
	  n = xp[i];							\
	  /* fold in c, presumably the previous call's result */	\
	  n ^= c;
3496 
/* Second half of a count-zeros timing routine.  Closes the per-limb loop and
   the s->reps loop opened by SPEED_ROUTINE_COUNT_ZEROS_A, stops the timer,
   sets s->time_divisor to SPEED_BLOCK_SIZE and returns the measured time
   from the enclosing function.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B()					\
	}								\
    } while (--j != 0);							\
    t = speed_endtime ();						\
									\
    /* use c after the loop so it doesn't go dead */			\
    noop_1 (c);								\
									\
    TMP_FREE;								\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
    return t;								\
  }
3510 
/* Complete count-zeros timing routine: the A half, then "call" as the body
   of the innermost loop, then the B half.  The whole thing is wrapped in
   do/while(0) so that a use followed by ";" forms a single statement.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)		\
  do {									\
    SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero)				\
    call;								\
    SPEED_ROUTINE_COUNT_ZEROS_B ()					\
  } while (0)
3517 
/* Leading-zeros forms of SPEED_ROUTINE_COUNT_ZEROS_C (leading=1).  The _C
   form takes the complete call expression and the zero flag.  The plain form
   takes a function-like name, invokes it as fun (c, n), and uses zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call, zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_LEADING_ZEROS_C (fun (c, n), 0)
3522 
/* Trailing-zeros forms of SPEED_ROUTINE_COUNT_ZEROS_C (leading=0).  The _C
   form takes the complete call expression and the zero flag.  The plain form
   takes a function-like name, invokes it as fun (c, n), and uses zero=0.
   The parameter must be spelled "fun" to match the name used in the
   replacement text, otherwise the argument is never substituted.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)			\
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)				\
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3527 
3528 
/* Time a limb inversion given as "call".  For each of the SPEED_BLOCK_SIZE
   limbs of s->xp_block a divisor d is formed by XORing the limb with the
   current dinv and setting the high bit, then "call" is run; presumably it
   reads d and writes dinv (confirm against the callers).  dinv is handed to
   noop_1 after the loops so the work isn't optimized away.
   s->time_divisor is set to SPEED_BLOCK_SIZE.

   The block is indexed as xp[j-1] rather than through a pointer to one
   element before s->xp_block, since forming such a pointer is undefined
   when xp_block is the start of its object.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)				\
  {									\
    unsigned   i, j;							\
    mp_limb_t  d, dinv=0;						\
    mp_ptr     xp = s->xp_block;					\
									\
    s->time_divisor = SPEED_BLOCK_SIZE;					\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	j = SPEED_BLOCK_SIZE;						\
	do								\
	  {								\
	    /* j counts SPEED_BLOCK_SIZE down to 1 */			\
	    d = dinv ^ xp[j-1];						\
	    d |= GMP_LIMB_HIGHBIT;					\
	    do { call; } while (0);					\
	  }								\
	while (--j != 0);						\
      }									\
    while (--i != 0);							\
									\
    /* don't let the compiler optimize everything away */		\
    noop_1 (dinv);							\
									\
    return speed_endtime();						\
  }
3557 
3558 
/* Time s->reps consecutive calls of function (), which is invoked with no
   arguments.  No operands are registered and nothing is allocated.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)			\
  {									\
    unsigned  remaining;						\
									\
    speed_starttime ();							\
    remaining = s->reps;						\
    do									\
      {									\
	function ();							\
      }									\
    while (--remaining != 0);						\
    return speed_endtime ();						\
  }
3569 
3570 
/* Time "call" operating on a destination wp of s->size limbs, allocated
   with alignment s->align_wp.  "call" is presumably written in terms of wp
   and s->size, so the name wp must not change.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  i;							\
    double    t;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* wp is the only operand, and it is written, not read */		\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    i = s->reps;							\
    do									\
      {									\
	call;								\
      }									\
    while (--i != 0);							\
    t = speed_endtime ();						\
									\
    TMP_FREE;								\
    return t;								\
  }
3595 
/* SPEED_ROUTINE_MPN_ZERO_CALL for a function taking (destination, size),
   invoked on the wp and s->size of that macro.  */
#define SPEED_ROUTINE_MPN_ZERO(fn)					\
  SPEED_ROUTINE_MPN_ZERO_CALL (fn (wp, s->size))
3598 
3599 
3600 #endif
3601