xref: /netbsd-src/external/lgpl3/gmp/dist/tune/speed.h (revision 6cd39ddb8550f6fa1bff3fed32053d7f19fd0453)
1 /* Header for speed and threshold things.
2 
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010, 2011,
4 2012 Free Software Foundation, Inc.
5 
6 This file is part of the GNU MP Library.
7 
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12 
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17 
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20 
21 #ifndef __SPEED_H__
22 #define __SPEED_H__
23 
24 
/* Grow {ptr,oldsize} to newsize limbs by storing zeros in the added limbs
   at the most significant end.  newsize must be at least oldsize.  Each
   argument is evaluated more than once.  */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize)				\
  do {									\
    ASSERT ((oldsize) <= (newsize));					\
    MPN_ZERO ((ptr) + (oldsize), (newsize) - (oldsize));		\
  } while (0)
32 
/* A limb with only its n least significant bits set.  n == GMP_LIMB_BITS
   gets its own case since a shift by the full limb width can't be relied
   on to give zero (1<<32 doesn't on x86 family CPUs for instance).  */
#define MP_LIMB_T_LOWBITMASK(n)	\
  ((n) != GMP_LIMB_BITS ? ((mp_limb_t) 1 << (n)) - 1 : MP_LIMB_T_MAX)
37 
38 
/* TMP_ALLOC with an alignment request.  align must be a power of 2, and
   CACHE_LINE_SIZE is usually a good choice for it.  align-1 bytes of slack
   are added to the request before the result is handed to align_pointer.  */

#define TMP_ALLOC_ALIGNED(bytes, align)	\
  align_pointer (TMP_ALLOC ((bytes) + (align) - 1), (align))

/* The same, but sized in limbs and giving an mp_ptr.  */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)	\
  ((mp_ptr) TMP_ALLOC_ALIGNED (sizeof (mp_limb_t) * (limbs), (align)))
45 
46 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the
47    limit on what s->align_xp etc can then request for off-alignment.  Maybe
48    this should be an option of some sort, but in any case here are some line
49    sizes,
50 
51        bytes
52 	 32   pentium
53 	 64   athlon
54 	 64   itanium-2 L1
55 	128   itanium-2 L2
56 */
57 #define CACHE_LINE_SIZE   64 /* bytes */
58 
59 #define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
60 
/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment.  That is, the limb index of ptr (its address divided by the
   limb size) is made congruent to align modulo the number of limbs in a
   cache line.

   The limb index is taken from the integer value of the address.  It is
   not formed by subtracting a null pointer, since pointer subtraction is
   only defined for two pointers into the same array.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)			\
  do {									\
    mp_ptr     __ptr;							\
    mp_size_t  __ptr_align, __ptr_add;					\
									\
    ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0);		\
    /* extra limbs leave room to move up to the alignment */		\
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);	\
    __ptr_align = (mp_size_t) ((size_t) __ptr / sizeof (mp_limb_t));	\
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;	\
    (ptr) = __ptr + __ptr_add;						\
  } while (0)
74 
75 
76 /* This is the size for s->xp_block and s->yp_block, used in certain
77    routines that want to run across many different data values and use
78    s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
79 
80    512 means 2kbytes of data for each of xp_block and yp_block, making 4k
81    total, which should fit easily in any L1 data cache. */
82 
83 #define SPEED_BLOCK_SIZE   512 /* limbs */
84 
85 
86 extern double  speed_unittime;
87 extern double  speed_cycletime;
88 extern int     speed_precision;
89 extern char    speed_time_string[];
90 void speed_time_init (void);
91 void speed_cycletime_fail (const char *str);
92 void speed_cycletime_init (void);
93 void speed_cycletime_need_cycles (void);
94 void speed_cycletime_need_seconds (void);
95 void speed_starttime (void);
96 double speed_endtime (void);
97 
98 
99 struct speed_params {
100   unsigned   reps;	/* how many times to run the routine */
101   mp_ptr     xp;	/* first argument */
102   mp_ptr     yp;	/* second argument */
103   mp_size_t  size;	/* size of both arguments */
104   mp_limb_t  r;		/* user supplied parameter */
105   mp_size_t  align_xp;	/* alignment of xp */
106   mp_size_t  align_yp;	/* alignment of yp */
107   mp_size_t  align_wp;	/* intended alignment of wp */
108   mp_size_t  align_wp2; /* intended alignment of wp2 */
109   mp_ptr     xp_block;	/* first special SPEED_BLOCK_SIZE block */
110   mp_ptr     yp_block;	/* second special SPEED_BLOCK_SIZE block */
111 
112   double     time_divisor; /* optionally set by the speed routine */
113 
114   /* used by the cache priming things */
115   int	     cache;
116   unsigned   src_num, dst_num;
117   struct {
118     mp_ptr    ptr;
119     mp_size_t size;
120   } src[5], dst[4];
121 };
122 
123 typedef double (*speed_function_t) (struct speed_params *);
124 
125 double speed_measure (speed_function_t fun, struct speed_params *);
126 
127 /* Prototypes for speed measuring routines */
128 
129 double speed_back_to_back (struct speed_params *);
130 double speed_count_leading_zeros (struct speed_params *);
131 double speed_count_trailing_zeros (struct speed_params *);
132 double speed_find_a (struct speed_params *);
133 double speed_gmp_allocate_free (struct speed_params *);
134 double speed_gmp_allocate_reallocate_free (struct speed_params *);
135 double speed_invert_limb (struct speed_params *);
136 double speed_malloc_free (struct speed_params *);
137 double speed_malloc_realloc_free (struct speed_params *);
138 double speed_memcpy (struct speed_params *);
139 double speed_binvert_limb (struct speed_params *);
140 double speed_binvert_limb_mul1 (struct speed_params *);
141 double speed_binvert_limb_loop (struct speed_params *);
142 double speed_binvert_limb_cond (struct speed_params *);
143 double speed_binvert_limb_arith (struct speed_params *);
144 
145 double speed_mpf_init_clear (struct speed_params *);
146 
147 double speed_mpn_add_n (struct speed_params *);
148 double speed_mpn_add_err1_n (struct speed_params *);
149 double speed_mpn_add_err2_n (struct speed_params *);
150 double speed_mpn_add_err3_n (struct speed_params *);
151 double speed_mpn_addcnd_n (struct speed_params *);
152 double speed_mpn_addlsh_n (struct speed_params *);
153 double speed_mpn_addlsh1_n (struct speed_params *);
154 double speed_mpn_addlsh2_n (struct speed_params *);
155 double speed_mpn_addlsh_n_ip1 (struct speed_params *);
156 double speed_mpn_addlsh1_n_ip1 (struct speed_params *);
157 double speed_mpn_addlsh2_n_ip1 (struct speed_params *);
158 double speed_mpn_addlsh_n_ip2 (struct speed_params *);
159 double speed_mpn_addlsh1_n_ip2 (struct speed_params *);
160 double speed_mpn_addlsh2_n_ip2 (struct speed_params *);
161 double speed_mpn_add_n_sub_n (struct speed_params *);
162 double speed_mpn_and_n (struct speed_params *);
163 double speed_mpn_andn_n (struct speed_params *);
164 double speed_mpn_addmul_1 (struct speed_params *);
165 double speed_mpn_addmul_2 (struct speed_params *);
166 double speed_mpn_addmul_3 (struct speed_params *);
167 double speed_mpn_addmul_4 (struct speed_params *);
168 double speed_mpn_addmul_5 (struct speed_params *);
169 double speed_mpn_addmul_6 (struct speed_params *);
170 double speed_mpn_addmul_7 (struct speed_params *);
171 double speed_mpn_addmul_8 (struct speed_params *);
172 double speed_mpn_com (struct speed_params *);
173 double speed_mpn_copyd (struct speed_params *);
174 double speed_mpn_copyi (struct speed_params *);
175 double speed_MPN_COPY (struct speed_params *);
176 double speed_MPN_COPY_DECR (struct speed_params *);
177 double speed_MPN_COPY_INCR (struct speed_params *);
178 double speed_mpn_tabselect (struct speed_params *);
179 double speed_mpn_divexact_1 (struct speed_params *);
180 double speed_mpn_divexact_by3 (struct speed_params *);
181 double speed_mpn_bdiv_q_1 (struct speed_params *);
182 double speed_mpn_pi1_bdiv_q_1 (struct speed_params *);
183 double speed_mpn_bdiv_dbm1c (struct speed_params *);
184 double speed_mpn_divrem_1 (struct speed_params *);
185 double speed_mpn_divrem_1f (struct speed_params *);
186 double speed_mpn_divrem_1c (struct speed_params *);
187 double speed_mpn_divrem_1cf (struct speed_params *);
188 double speed_mpn_divrem_1_div (struct speed_params *);
189 double speed_mpn_divrem_1f_div (struct speed_params *);
190 double speed_mpn_divrem_1_inv (struct speed_params *);
191 double speed_mpn_divrem_1f_inv (struct speed_params *);
192 double speed_mpn_divrem_2 (struct speed_params *);
193 double speed_mpn_divrem_2_div (struct speed_params *);
194 double speed_mpn_divrem_2_inv (struct speed_params *);
195 double speed_mpn_div_qr_2n (struct speed_params *);
196 double speed_mpn_div_qr_2u (struct speed_params *);
197 double speed_mpn_fib2_ui (struct speed_params *);
198 double speed_mpn_matrix22_mul (struct speed_params *);
199 double speed_mpn_hgcd (struct speed_params *);
200 double speed_mpn_hgcd_lehmer (struct speed_params *);
201 double speed_mpn_hgcd_appr (struct speed_params *);
202 double speed_mpn_hgcd_appr_lehmer (struct speed_params *);
203 double speed_mpn_hgcd_reduce (struct speed_params *);
204 double speed_mpn_hgcd_reduce_1 (struct speed_params *);
205 double speed_mpn_hgcd_reduce_2 (struct speed_params *);
206 double speed_mpn_gcd (struct speed_params *);
207 double speed_mpn_gcd_1 (struct speed_params *);
208 double speed_mpn_gcd_1N (struct speed_params *);
209 double speed_mpn_gcdext (struct speed_params *);
210 double speed_mpn_gcdext_double (struct speed_params *);
211 double speed_mpn_gcdext_one_double (struct speed_params *);
212 double speed_mpn_gcdext_one_single (struct speed_params *);
213 double speed_mpn_gcdext_single (struct speed_params *);
214 double speed_mpn_get_str (struct speed_params *);
215 double speed_mpn_hamdist (struct speed_params *);
216 double speed_mpn_ior_n (struct speed_params *);
217 double speed_mpn_iorn_n (struct speed_params *);
218 double speed_mpn_jacobi_base (struct speed_params *);
219 double speed_mpn_jacobi_base_1 (struct speed_params *);
220 double speed_mpn_jacobi_base_2 (struct speed_params *);
221 double speed_mpn_jacobi_base_3 (struct speed_params *);
222 double speed_mpn_jacobi_base_4 (struct speed_params *);
223 double speed_mpn_lshift (struct speed_params *);
224 double speed_mpn_lshiftc (struct speed_params *);
225 double speed_mpn_mod_1 (struct speed_params *);
226 double speed_mpn_mod_1c (struct speed_params *);
227 double speed_mpn_mod_1_div (struct speed_params *);
228 double speed_mpn_mod_1_inv (struct speed_params *);
229 double speed_mpn_mod_1_1 (struct speed_params *);
230 double speed_mpn_mod_1_1_1 (struct speed_params *);
231 double speed_mpn_mod_1_1_2 (struct speed_params *);
232 double speed_mpn_mod_1_2 (struct speed_params *);
233 double speed_mpn_mod_1_3 (struct speed_params *);
234 double speed_mpn_mod_1_4 (struct speed_params *);
235 double speed_mpn_mod_34lsub1 (struct speed_params *);
236 double speed_mpn_modexact_1_odd (struct speed_params *);
237 double speed_mpn_modexact_1c_odd (struct speed_params *);
238 double speed_mpn_mul_1 (struct speed_params *);
239 double speed_mpn_mul_1_inplace (struct speed_params *);
240 double speed_mpn_mul_2 (struct speed_params *);
241 double speed_mpn_mul_3 (struct speed_params *);
242 double speed_mpn_mul_4 (struct speed_params *);
243 double speed_mpn_mul_5 (struct speed_params *);
244 double speed_mpn_mul_6 (struct speed_params *);
245 double speed_mpn_mul (struct speed_params *);
246 double speed_mpn_mul_basecase (struct speed_params *);
247 double speed_mpn_mulmid (struct speed_params *);
248 double speed_mpn_mulmid_basecase (struct speed_params *);
249 double speed_mpn_mul_fft (struct speed_params *);
250 double speed_mpn_mul_fft_sqr (struct speed_params *);
251 double speed_mpn_fft_mul (struct speed_params *);
252 double speed_mpn_fft_sqr (struct speed_params *);
253 #if WANT_OLD_FFT_FULL
254 double speed_mpn_mul_fft_full (struct speed_params *);
255 double speed_mpn_mul_fft_full_sqr (struct speed_params *);
256 #endif
257 double speed_mpn_nussbaumer_mul (struct speed_params *);
258 double speed_mpn_nussbaumer_mul_sqr (struct speed_params *);
259 double speed_mpn_mul_n (struct speed_params *);
260 double speed_mpn_mul_n_sqr (struct speed_params *);
261 double speed_mpn_mulmid_n (struct speed_params *);
262 double speed_mpn_mullo_n (struct speed_params *);
263 double speed_mpn_mullo_basecase (struct speed_params *);
264 double speed_mpn_nand_n (struct speed_params *);
265 double speed_mpn_nior_n (struct speed_params *);
266 double speed_mpn_popcount (struct speed_params *);
267 double speed_mpn_preinv_divrem_1 (struct speed_params *);
268 double speed_mpn_preinv_divrem_1f (struct speed_params *);
269 double speed_mpn_preinv_mod_1 (struct speed_params *);
270 double speed_mpn_sbpi1_div_qr (struct speed_params *);
271 double speed_mpn_dcpi1_div_qr (struct speed_params *);
272 double speed_mpn_sbpi1_divappr_q (struct speed_params *);
273 double speed_mpn_dcpi1_divappr_q (struct speed_params *);
274 double speed_mpn_mu_div_qr (struct speed_params *);
275 double speed_mpn_mu_divappr_q (struct speed_params *);
276 double speed_mpn_mupi_div_qr (struct speed_params *);
277 double speed_mpn_mu_div_q (struct speed_params *);
278 double speed_mpn_sbpi1_bdiv_qr (struct speed_params *);
279 double speed_mpn_dcpi1_bdiv_qr (struct speed_params *);
280 double speed_mpn_sbpi1_bdiv_q (struct speed_params *);
281 double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
282 double speed_mpn_mu_bdiv_q (struct speed_params *);
283 double speed_mpn_mu_bdiv_qr (struct speed_params *);
284 double speed_mpn_broot (struct speed_params *);
285 double speed_mpn_broot_invm1 (struct speed_params *);
286 double speed_mpn_brootinv (struct speed_params *);
287 double speed_mpn_invert (struct speed_params *);
288 double speed_mpn_invertappr (struct speed_params *);
289 double speed_mpn_ni_invertappr (struct speed_params *);
290 double speed_mpn_binvert (struct speed_params *);
291 double speed_mpn_redc_1 (struct speed_params *);
292 double speed_mpn_redc_2 (struct speed_params *);
293 double speed_mpn_redc_n (struct speed_params *);
294 double speed_mpn_rsblsh_n (struct speed_params *);
295 double speed_mpn_rsblsh1_n (struct speed_params *);
296 double speed_mpn_rsblsh2_n (struct speed_params *);
297 double speed_mpn_rsh1add_n (struct speed_params *);
298 double speed_mpn_rsh1sub_n (struct speed_params *);
299 double speed_mpn_rshift (struct speed_params *);
300 double speed_mpn_sb_divrem_m3 (struct speed_params *);
301 double speed_mpn_sb_divrem_m3_div (struct speed_params *);
302 double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
303 double speed_mpn_set_str (struct speed_params *);
304 double speed_mpn_bc_set_str (struct speed_params *);
305 double speed_mpn_dc_set_str (struct speed_params *);
306 double speed_mpn_set_str_pre (struct speed_params *);
307 double speed_mpn_sqr_basecase (struct speed_params *);
308 double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
309 double speed_mpn_sqr_diagonal (struct speed_params *);
310 double speed_mpn_sqr (struct speed_params *);
311 double speed_mpn_sqrtrem (struct speed_params *);
312 double speed_mpn_rootrem (struct speed_params *);
313 double speed_mpn_sub_n (struct speed_params *);
314 double speed_mpn_sub_err1_n (struct speed_params *);
315 double speed_mpn_sub_err2_n (struct speed_params *);
316 double speed_mpn_sub_err3_n (struct speed_params *);
317 double speed_mpn_subcnd_n (struct speed_params *);
318 double speed_mpn_sublsh_n (struct speed_params *);
319 double speed_mpn_sublsh1_n (struct speed_params *);
320 double speed_mpn_sublsh2_n (struct speed_params *);
321 double speed_mpn_sublsh_n_ip1 (struct speed_params *);
322 double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
323 double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
324 double speed_mpn_submul_1 (struct speed_params *);
325 double speed_mpn_toom2_sqr (struct speed_params *);
326 double speed_mpn_toom3_sqr (struct speed_params *);
327 double speed_mpn_toom4_sqr (struct speed_params *);
328 double speed_mpn_toom6_sqr (struct speed_params *);
329 double speed_mpn_toom8_sqr (struct speed_params *);
330 double speed_mpn_toom22_mul (struct speed_params *);
331 double speed_mpn_toom33_mul (struct speed_params *);
332 double speed_mpn_toom44_mul (struct speed_params *);
333 double speed_mpn_toom6h_mul (struct speed_params *);
334 double speed_mpn_toom8h_mul (struct speed_params *);
335 double speed_mpn_toom32_mul (struct speed_params *);
336 double speed_mpn_toom42_mul (struct speed_params *);
337 double speed_mpn_toom43_mul (struct speed_params *);
338 double speed_mpn_toom63_mul (struct speed_params *);
339 double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
340 double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
341 double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
342 double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
343 double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
344 double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
345 double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
346 double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
347 double speed_mpn_toom42_mulmid (struct speed_params *);
348 double speed_mpn_mulmod_bnm1 (struct speed_params *);
349 double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
350 double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
351 double speed_mpn_sqrmod_bnm1 (struct speed_params *);
352 double speed_mpn_udiv_qrnnd (struct speed_params *);
353 double speed_mpn_udiv_qrnnd_r (struct speed_params *);
354 double speed_mpn_umul_ppmm (struct speed_params *);
355 double speed_mpn_umul_ppmm_r (struct speed_params *);
356 double speed_mpn_xnor_n (struct speed_params *);
357 double speed_mpn_xor_n (struct speed_params *);
358 double speed_MPN_ZERO (struct speed_params *);
359 
360 double speed_mpq_init_clear (struct speed_params *);
361 
362 double speed_mpz_add (struct speed_params *);
363 double speed_mpz_bin_uiui (struct speed_params *);
364 double speed_mpz_bin_ui (struct speed_params *);
365 double speed_mpz_fac_ui (struct speed_params *);
366 double speed_mpz_fib_ui (struct speed_params *);
367 double speed_mpz_fib2_ui (struct speed_params *);
368 double speed_mpz_init_clear (struct speed_params *);
369 double speed_mpz_init_realloc_clear (struct speed_params *);
370 double speed_mpz_jacobi (struct speed_params *);
371 double speed_mpz_lucnum_ui (struct speed_params *);
372 double speed_mpz_lucnum2_ui (struct speed_params *);
373 double speed_mpz_mod (struct speed_params *);
374 double speed_mpz_powm (struct speed_params *);
375 double speed_mpz_powm_mod (struct speed_params *);
376 double speed_mpz_powm_redc (struct speed_params *);
377 double speed_mpz_powm_sec (struct speed_params *);
378 double speed_mpz_powm_ui (struct speed_params *);
379 double speed_mpz_urandomb (struct speed_params *);
380 
381 double speed_gmp_randseed (struct speed_params *);
382 double speed_gmp_randseed_ui (struct speed_params *);
383 
384 double speed_noop (struct speed_params *);
385 double speed_noop_wxs (struct speed_params *);
386 double speed_noop_wxys (struct speed_params *);
387 
388 double speed_operator_div (struct speed_params *);
389 double speed_operator_mod (struct speed_params *);
390 
391 double speed_udiv_qrnnd (struct speed_params *);
392 double speed_udiv_qrnnd_preinv1 (struct speed_params *);
393 double speed_udiv_qrnnd_preinv2 (struct speed_params *);
394 double speed_udiv_qrnnd_preinv3 (struct speed_params *);
395 double speed_udiv_qrnnd_c (struct speed_params *);
396 double speed_umul_ppmm (struct speed_params *);
397 
398 /* Prototypes for other routines */
399 
400 /* low 32-bits in p[0], high 32-bits in p[1] */
401 void speed_cyclecounter (unsigned p[2]);
402 
403 void mftb_function (unsigned p[2]);
404 
405 /* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
406    output or a clobber for the cpuid, hence an explicit save and restore.  A
407    clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
408    the dummy output style in non-PIC, so there's an error if somehow -fPIC
409    is used without a -DPIC to tell us about it.  */
410 #if defined(__GNUC__) && ! defined (NO_ASM)	\
411   && (defined (__i386__) || defined (__i486__))
412 #if defined (PIC) || defined (__APPLE_CC__)
/* PIC (or Apple) variant: ebx is copied to a scratch operand before cpuid
   and copied back afterwards, then rdtsc is executed.  eax is stored to
   p[0] and edx to p[1]; ecx goes to a dummy.  */
413 #define speed_cyclecounter(p)						\
414   do {									\
415     int	 __speed_cyclecounter__save_ebx;				\
416     int	 __speed_cyclecounter__dummy;					\
417     __asm__ __volatile__ ("movl %%ebx, %1\n"				\
418 			  "cpuid\n"					\
419 			  "movl %1, %%ebx\n"				\
420 			  "rdtsc"					\
421 			  : "=a"   ((p)[0]),				\
422 			    "=&rm" (__speed_cyclecounter__save_ebx),	\
423 			    "=c"   (__speed_cyclecounter__dummy),	\
424 			    "=d"   ((p)[1]));				\
425   } while (0)
426 #else
/* Non-PIC variant: cpuid then rdtsc, with ebx and ecx declared as dummy
   outputs.  eax is stored to p[0] and edx to p[1].  */
427 #define speed_cyclecounter(p)						\
428   do {									\
429     int	 __speed_cyclecounter__dummy1;					\
430     int	 __speed_cyclecounter__dummy2;					\
431     __asm__ __volatile__ ("cpuid\n"					\
432 			  "rdtsc"					\
433 			  : "=a" ((p)[0]),				\
434 			    "=b" (__speed_cyclecounter__dummy1),	\
435 			    "=c" (__speed_cyclecounter__dummy2),	\
436 			    "=d" ((p)[1]));				\
437   } while (0)
438 #endif
439 #endif
440 
441 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]);
442 int gettimeofday_microseconds_p (void);
443 int getrusage_microseconds_p (void);
444 int cycles_works_p (void);
445 long clk_tck (void);
446 double freq_measure (const char *, double (*)(void));
447 
448 int double_cmp_ptr (const double *, const double *);
449 void pentium_wbinvd (void);
450 typedef int (*qsort_function_t) (const void *, const void *);
451 
452 void noop (void);
453 void noop_1 (mp_limb_t);
454 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t);
455 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
456 void mpn_cache_fill (mp_srcptr, mp_size_t);
457 void mpn_cache_fill_dummy (mp_limb_t);
458 void speed_cache_fill (struct speed_params *);
459 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t);
460 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t);
461 
462 extern int  speed_option_addrs;
463 extern int  speed_option_verbose;
464 extern int  speed_option_cycles_broken;
465 void speed_option_set (const char *);
466 
467 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
468 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
469 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
470 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr);
471 
472 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int);
473 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int);
474 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int);
475 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int);
476 
477 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t);
478 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t);
479 
480 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
481 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
482 
483 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
484 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
485 
486 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
487 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
488 mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
489 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
490 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
491 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
492 
493 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
494 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
495 
496 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
497 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
498 
499 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
500 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
501 
502 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
503 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t);
504 
505 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int);
506 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr);
507 
508 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
509 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr);
510 
511 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int);
512 
513 
514 /* "get" is called repeatedly until it ticks over, just in case on a fast
515    processor it takes less than a microsecond, though this is probably
516    unlikely if it's a system call.
517 
518    speed_cyclecounter is called on the same side of the "get" for the start
519    and end measurements.  It doesn't matter how long it takes from the "get"
520    sample to the cycles sample, since that period will cancel out in the
521    difference calculation (assuming it's the same each time).
522 
523    Letting the test run for more than a process time slice is probably only
524    going to reduce accuracy, especially for getrusage when the cycle counter
525    is real time, or for gettimeofday if the cycle counter is in fact process
526    time.  Use CLK_TCK/2 as a reasonable stop.
527 
528    It'd be desirable to be quite accurate here.  The default speed_precision
529    for a cycle counter is 10000 cycles, so to mix that with getrusage or
530    gettimeofday the frequency should be at least that accurate.  But running
531    measurements for 10000 microseconds (or more) is too long.  Be satisfied
532    with just a half clock tick (5000 microseconds usually).  */
533 
/* FREQ_MEASURE_ONE ends with a "return" of a double, so it must be the
   last thing in the function using it.  The value returned is the elapsed
   time in seconds divided by the cycle counter difference.

     name   string shown in the verbose diagnostic
     type   type of a time sample
     get    get(t) stores the current time in sample t
     getc   getc(c) stores the current cycle counter in unsigned c[2]
     sec    sec(t) is the seconds part of sample t
     usec   usec(t) is the microseconds part of sample t  */
#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)		\
  do {									\
    type      begin_prev, begin, finish_prev, finish;			\
    unsigned  begin_cyc[2], finish_cyc[2];				\
    long      usecs, half_tick;						\
    double    ncycles, cyc;						\
									\
    half_tick = (1000000L / clk_tck()) / 2;				\
									\
    /* wait for the clock to tick over, then sample the cycles */	\
    get (begin_prev);							\
    do {								\
      get (begin);							\
    } while (usec(begin) == usec(begin_prev)				\
             && sec(begin) == sec(begin_prev));				\
    getc (begin_cyc);							\
									\
    do									\
      {									\
        get (finish_prev);						\
        do {								\
          get (finish);							\
        } while (usec(finish) == usec(finish_prev)			\
                 && sec(finish) == sec(finish_prev));			\
        getc (finish_cyc);						\
									\
        ncycles = speed_cyclecounter_diff (finish_cyc, begin_cyc);	\
									\
        /* allow secs to cancel before multiplying */			\
        usecs = sec(finish) - sec(begin);				\
        usecs = usecs * 1000000L + (usec(finish) - usec(begin));	\
      }									\
    while (usecs < half_tick);						\
									\
    cyc = usecs * 1e-6 / ncycles;					\
									\
    if (speed_option_verbose >= 2)					\
      printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",	\
              name, ncycles, usecs, cyc);				\
									\
    return cyc;								\
									\
  } while (0)
578 
579 
580 
581 
582 /* The measuring routines use these big macros to save duplication for
583    similar forms.  They also get used for some automatically generated
584    measuring of new implementations of functions.
585 
586    Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
587    function pointer is considered undesirable since it's not the way a
588    normal application will be calling, and some processors might do
589    different things with an indirect call, like not branch predicting, or
590    doing a full pipe flush.  At least some of the "functions" measured are
591    actually macros too.
592 
593    The net effect is to bloat the object code, possibly in a big way, but
594    only what's being measured is being run, so that doesn't matter.
595 
596    The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
597    ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
598    function pointer doesn't work in gcc 3.2.  Using an actual non-pure
599    function pointer variable works, but stands a real risk of a
600    non-optimizing compiler generating unnecessary overheads in the call.
601    Currently the best idea is not to use those attributes for a timing
602    program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
603    gmp-impl.h to omit them from routines there.  */
604 
605 #define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
606 
/* For mpn_copy or similar.  This is a whole function body, for a function
   with a "struct speed_params *s" parameter.  "call" is the expression to
   be timed; it's run s->reps times and can refer to the destination wp as
   well as to s.  The result is -1.0 if s->size is negative.  */
#define SPEED_ROUTINE_MPN_COPY_CALL(call)				\
  {									\
    mp_ptr    wp;							\
    unsigned  rep;							\
    double    elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 0);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    speed_operand_src (s, s->xp, s->size);				\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      call;								\
    while (--rep != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }

/* function (dst, src, size) */
#define SPEED_ROUTINE_MPN_COPY(function)				\
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size))

/* function (dst, src, size, 1, s->r) */
#define SPEED_ROUTINE_MPN_TABSELECT(function)				\
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 1, s->r))
639 
/* Like SPEED_ROUTINE_MPN_COPY, but "function" takes one more argument after
   the size, which is passed as 0.  The setup, timing loop and cleanup are
   those of SPEED_ROUTINE_MPN_COPY_CALL rather than a second copy of them.  */
#define SPEED_ROUTINE_MPN_COPYC(function)				\
  SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 0))
666 
/* s->size is still in limbs, and it's limbs which are copied, but
   "function" takes a size in bytes not limbs.  The setup, timing loop and
   cleanup are those of SPEED_ROUTINE_MPN_COPY_CALL rather than a second
   copy of them.  */
#define SPEED_ROUTINE_MPN_COPY_BYTES(function)				\
  SPEED_ROUTINE_MPN_COPY_CALL						\
    (function (wp, s->xp, s->size * BYTES_PER_MP_LIMB))
695 
696 
/* For mpn_add_n, mpn_sub_n, or similar.  This is a whole function body, for
   a function with a "struct speed_params *s" parameter.  "call" is the
   expression to be timed and can use wp, xp, yp and s.  s->r selects the
   operand overlap,

       0   none, xp is s->xp and yp is s->yp
       1   xp is wp
       2   yp is wp
       3   xp and yp are both wp
       4   yp is xp

   and for any other s->r, or for s->size less than 1, the result is -1.0.  */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)				\
  {									\
    mp_ptr     wp;							\
    mp_ptr     xp = s->xp;						\
    mp_ptr     yp = s->yp;						\
    unsigned   rep;							\
    double     elapsed;							\
    TMP_DECL;								\
									\
    SPEED_RESTRICT_COND (s->size >= 1);					\
									\
    TMP_MARK;								\
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
									\
    /* s->r chooses which operands alias each other */			\
    switch (s->r)							\
      {									\
      case 0:  break;							\
      case 1:  xp = wp;  break;						\
      case 2:  yp = wp;  break;						\
      case 3:  xp = wp; yp = wp;  break;				\
      case 4:  yp = xp;  break;						\
      default:								\
        TMP_FREE;							\
        return -1.0;							\
      }									\
									\
    /* wp is read as a source when it overlaps, so give it data */	\
    if (wp == xp || wp == yp)						\
      MPN_COPY (wp, s->xp, s->size);					\
									\
    speed_operand_src (s, xp, s->size);					\
    speed_operand_src (s, yp, s->size);					\
    speed_operand_dst (s, wp, s->size);					\
    speed_cache_fill (s);						\
									\
    speed_starttime ();							\
    rep = s->reps;							\
    do									\
      call;								\
    while (--rep != 0);							\
    elapsed = speed_endtime ();						\
									\
    TMP_FREE;								\
    return elapsed;							\
  }
743 
744 
745 /* For mpn_aors_errK_n, where 1 <= K <= 3. */
/* Same operand set-up as a plain binary operation (wp destination, xp/yp
   sources, overlap chosen by s->r in 0..4, otherwise TMP_FREE and return
   -1.0), plus K extra s->size limb vectors zp[0..K-1] and a local array ep
   of 2*K limbs.  K is used as an array dimension, so it must be an integer
   constant expression.

   The zp[i] are allocated with an alignment argument of 0 and registered as
   source operands; this macro does not store anything into zp[i] or ep
   itself.  "call" may use wp, xp, yp, zp and ep.  Returns the speed_endtime
   result for s->reps repetitions of call (at least one).  */
746 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K)			\
747   {									\
748     mp_ptr     wp;							\
749     mp_ptr     xp, yp;							\
750     mp_ptr     zp[K];							\
751     mp_limb_t  ep[2*K];							\
752     unsigned   i;							\
753     double     t;							\
754     TMP_DECL;								\
755 									\
756     SPEED_RESTRICT_COND (s->size >= 1);					\
757 									\
758     TMP_MARK;								\
759     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
760 									\
761     /* (don't have a mechanism to specify zp alignments) */		\
762     for (i = 0; i < K; i++)						\
763       SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0);			\
764 									\
765     xp = s->xp;								\
766     yp = s->yp;								\
767 									\
768     if (s->r == 0)	;						\
769     else if (s->r == 1) { xp = wp;	    }				\
770     else if (s->r == 2) {	   yp = wp; }				\
771     else if (s->r == 3) { xp = wp; yp = wp; }				\
772     else if (s->r == 4) {     yp = xp;	    }				\
773     else		{						\
774       TMP_FREE;								\
775       return -1.0;							\
776     }									\
777 									\
778     /* initialize wp if operand overlap */				\
779     if (xp == wp || yp == wp)						\
780       MPN_COPY (wp, s->xp, s->size);					\
781 									\
782     speed_operand_src (s, xp, s->size);					\
783     speed_operand_src (s, yp, s->size);					\
784     for (i = 0; i < K; i++)						\
785       speed_operand_src (s, zp[i], s->size);				\
786     speed_operand_dst (s, wp, s->size);					\
787     speed_cache_fill (s);						\
788 									\
789     speed_starttime ();							\
790     i = s->reps;							\
791     do									\
792       call;								\
793     while (--i != 0);							\
794     t = speed_endtime ();						\
795 									\
796     TMP_FREE;								\
797     return t;								\
798   }
799 
/* Wrappers over SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL for K = 1, 2 and 3.
   Each calls (*function) with wp, xp, yp, ep, then the K vectors
   zp[0..K-1], then s->size and a final 0.
   NOTE(review): the final 0 is presumably a carry/borrow-in -- confirm
   against the function's prototype.  */
800 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function)			\
801   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1)
802 
803 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function)			\
804   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2)
805 
806 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function)			\
807   SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3)
808 
809 
810 /* For routines with two s->size limb destinations, ap and sp, and two
   sources xp and yp.  NOTE(review): presumably a combined add-and-subtract
   such as mpn_add_n_sub_n (ap the sum, sp the difference) -- confirm
   against the callers.

   s->r is a bit mask selecting source/destination overlap, applied in this
   order so that later bits override earlier ones:
     bit 0 (1)  xp = ap
     bit 1 (2)  yp = ap
     bit 2 (4)  xp = sp
     bit 3 (8)  yp = sp
   Masks with both bits 0 and 1 set, or both bits 2 and 3 set, are rejected
   with TMP_FREE and a return of -1.0.  A destination that doubles as a
   source is first filled with a copy of s->xp.  Returns the speed_endtime
   result for s->reps repetitions of call (at least one).  */
811 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)				\
812   {									\
813     mp_ptr     ap, sp;							\
814     mp_ptr     xp, yp;							\
815     unsigned   i;							\
816     double     t;							\
817     TMP_DECL;								\
818 									\
819     SPEED_RESTRICT_COND (s->size >= 1);					\
820 									\
821     TMP_MARK;								\
822     SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);			\
823     SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);			\
824 									\
825     xp = s->xp;								\
826     yp = s->yp;								\
827 									\
828     if ((s->r & 1) != 0) { xp = ap; }					\
829     if ((s->r & 2) != 0) { yp = ap; }					\
830     if ((s->r & 4) != 0) { xp = sp; }					\
831     if ((s->r & 8) != 0) { yp = sp; }					\
832     if ((s->r & 3) == 3  ||  (s->r & 12) == 12)				\
833       {									\
834 	TMP_FREE;							\
835 	return -1.0;							\
836       }									\
837 									\
838     /* initialize ap if operand overlap */				\
839     if (xp == ap || yp == ap)						\
840       MPN_COPY (ap, s->xp, s->size);					\
841     /* initialize sp if operand overlap */				\
842     if (xp == sp || yp == sp)						\
843       MPN_COPY (sp, s->xp, s->size);					\
844 									\
845     speed_operand_src (s, xp, s->size);					\
846     speed_operand_src (s, yp, s->size);					\
847     speed_operand_dst (s, ap, s->size);					\
848     speed_operand_dst (s, sp, s->size);					\
849     speed_cache_fill (s);						\
850 									\
851     speed_starttime ();							\
852     i = s->reps;							\
853     do									\
854       call;								\
855     while (--i != 0);							\
856     t = speed_endtime ();						\
857 									\
858     TMP_FREE;								\
859     return t;								\
860   }
861 
/* Wrappers over SPEED_ROUTINE_MPN_BINARY_N_CALL.  BINARY_N times
   (*function) (wp, xp, yp, s->size); BINARY_NC passes one extra trailing 0.
   NOTE(review): the extra 0 is presumably a carry-in -- confirm against the
   function's prototype.  */
862 #define SPEED_ROUTINE_MPN_BINARY_N(function)				\
863    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
864 
865 #define SPEED_ROUTINE_MPN_BINARY_NC(function)				\
866    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
867 
868 
869 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
/* Timing body for a one-source, one-destination operation on s->size limbs
   (s->size >= 1).  wp is a fresh s->size limb destination aligned per
   s->align_wp and is not initialized here; s->xp is the registered source.
   "call" may use wp and s.  Returns the speed_endtime result for s->reps
   repetitions of call; the do/while always runs at least once.  */
870 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)				\
871   {									\
872     mp_ptr    wp;							\
873     unsigned  i;							\
874     double    t;							\
875     TMP_DECL;								\
876 									\
877     SPEED_RESTRICT_COND (s->size >= 1);					\
878 									\
879     TMP_MARK;								\
880     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
881 									\
882     speed_operand_src (s, s->xp, s->size);				\
883     speed_operand_dst (s, wp, s->size);					\
884     speed_cache_fill (s);						\
885 									\
886     speed_starttime ();							\
887     i = s->reps;							\
888     do									\
889       call;								\
890     while (--i != 0);							\
891     t = speed_endtime ();						\
892 									\
893     TMP_FREE;								\
894     return t;								\
895   }
896 
/* Wrappers over SPEED_ROUTINE_MPN_UNARY_1_CALL.  UNARY_1 times
   (*function) (wp, s->xp, s->size, s->r); UNARY_1C passes one extra
   trailing 0.
   NOTE(review): the extra 0 is presumably a carry-in -- confirm against the
   function's prototype.  */
897 #define SPEED_ROUTINE_MPN_UNARY_1(function)				\
898   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
899 
900 #define SPEED_ROUTINE_MPN_UNARY_1C(function)				\
901   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
902 
903 /* FIXME: wp is uninitialized here, should start it off from xp */
/* In-place variant: wp is passed as both destination and source, so the
   function reads the freshly allocated limbs of wp.  s->xp is still the
   operand registered via speed_operand_src by the underlying
   SPEED_ROUTINE_MPN_UNARY_1_CALL, although the call never reads it.  */
904 #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)			\
905   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
906 
/* Single-limb exact-division style wrappers over
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  Both time
   (*function) (wp, s->xp, s->size, s->r), i.e. s->r is passed as the
   fourth argument (presumably the divisor limb -- confirm against the
   function's prototype).  No check on s->r is made here.  */
907 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function)				\
908   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
909 
910 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function)				\
911     SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
912 
/* Timing body for a function needing a precomputed inverse of s->r.
   Requires s->size > 0 and s->r != 0.  Outside the timed region it sets
   shift to the count of trailing zero bits of s->r and dinv to
   binvert_limb of s->r >> shift, then hands "call" to
   SPEED_ROUTINE_MPN_UNARY_1_CALL, whose nested block does the allocation,
   timing and return.  "call" may use wp, shift and dinv.  */
913 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)			\
914   {									\
915     unsigned   shift;							\
916     mp_limb_t  dinv;							\
917 									\
918     SPEED_RESTRICT_COND (s->size > 0);					\
919     SPEED_RESTRICT_COND (s->r != 0);					\
920 									\
921     count_trailing_zeros (shift, s->r);					\
922     binvert_limb (dinv, s->r >> shift);					\
923 									\
924     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
925   }
/* Times (*function) (wp, s->xp, s->size, s->r, dinv, shift) using the dinv
   and shift prepared by SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL.
   NOTE(review): s->r is passed unshifted while dinv was computed from
   s->r >> shift -- confirm this matches what the function expects.  */
926 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)			\
927   SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL					\
928   ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
929 
/* Single-limb division wrappers over SPEED_ROUTINE_MPN_UNARY_1_CALL.

   BDIV_DBM1C  (*function) (wp, s->xp, s->size, s->r, 0)
   DIVREM_1    (*function) (wp, 0, s->xp, s->size, s->r)
   DIVREM_1C   as DIVREM_1 with an extra trailing 0
   DIVREM_1F   (*function) (wp, s->size, s->xp, 0, s->r)
   DIVREM_1CF  as DIVREM_1F with an extra trailing 0

   NOTE(review): in the DIVREM forms the second argument is presumably the
   number of fraction limbs and the fourth the number of integer limbs, so
   the F variants generate s->size fraction limbs from a zero-length
   source; the trailing 0 of the C variants is presumably a carry-in.
   Confirm against the function prototypes.  */
930 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)				\
931   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
932 
933 #define SPEED_ROUTINE_MPN_DIVREM_1(function)				\
934   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
935 
936 #define SPEED_ROUTINE_MPN_DIVREM_1C(function)				\
937   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
938 
939 #define SPEED_ROUTINE_MPN_DIVREM_1F(function)				\
940   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
941 
942 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function)				\
943   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
944 
945 
/* Timing body for a divrem-style function taking a precomputed inverse.
   Requires s->size >= 0 here (the nested SPEED_ROUTINE_MPN_UNARY_1_CALL
   further requires s->size >= 1) and s->r != 0.  Outside the timed region
   it sets shift to the count of leading zero bits of s->r and dinv to
   invert_limb of s->r << shift, then hands "call" to
   SPEED_ROUTINE_MPN_UNARY_1_CALL, whose nested block does the allocation,
   timing and return.  "call" may use wp, shift and dinv.

   The definition ends at the closing brace with no line continuation, so
   whatever follows the macro in the file can never be spliced into it.  */
946 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)			\
947   {									\
948     unsigned   shift;							\
949     mp_limb_t  dinv;							\
950 									\
951     SPEED_RESTRICT_COND (s->size >= 0);					\
952     SPEED_RESTRICT_COND (s->r != 0);					\
953 									\
954     count_leading_zeros (shift, s->r);					\
955     invert_limb (dinv, s->r << shift);					\
956 									\
957     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);				\
958   }
959 
/* Wrappers over SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL, passing the dinv
   and shift it prepares.  The plain form passes 0 and s->size as the second
   and fourth arguments; the F form swaps them.  */
960 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)			\
961   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
962   ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
963 
964 /* s->size limbs worth of fraction part */
965 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)			\
966   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL				\
967   ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
968 
969 
970 /* s->r is duplicated to form the multiplier, defaulting to
971    MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
972    least it provides some control.  */
/* Timing body for function (wp, s->xp, s->size, yp) where yp is a local
   array of N limbs, each set to s->r (or MP_BASES_BIG_BASE_10 when s->r is
   0).  N is used as an array dimension, so it must be an integer constant
   expression.  Requires s->size >= N.  The destination wp has
   wn = s->size + N-1 limbs, aligned per s->align_wp.  Returns the
   speed_endtime result for s->reps repetitions (at least one).  */
973 #define SPEED_ROUTINE_MPN_UNARY_N(function,N)				\
974   {									\
975     mp_ptr     wp;							\
976     mp_size_t  wn;							\
977     unsigned   i;							\
978     double     t;							\
979     mp_limb_t  yp[N];							\
980     TMP_DECL;								\
981 									\
982     SPEED_RESTRICT_COND (s->size >= N);					\
983 									\
984     TMP_MARK;								\
985     wn = s->size + N-1;							\
986     SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);			\
987     for (i = 0; i < N; i++)						\
988       yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);		\
989 									\
990     speed_operand_src (s, s->xp, s->size);				\
991     speed_operand_src (s, yp, (mp_size_t) N);				\
992     speed_operand_dst (s, wp, wn);					\
993     speed_cache_fill (s);						\
994 									\
995     speed_starttime ();							\
996     i = s->reps;							\
997     do									\
998       function (wp, s->xp, s->size, yp);				\
999     while (--i != 0);							\
1000     t = speed_endtime ();						\
1001 									\
1002     TMP_FREE;								\
1003     return t;								\
1004   }
1005 
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N for N = 2 .. 8, i.e. an
   N limb multiplier array and a minimum s->size of N.  */
1006 #define SPEED_ROUTINE_MPN_UNARY_2(function)				\
1007   SPEED_ROUTINE_MPN_UNARY_N (function, 2)
1008 #define SPEED_ROUTINE_MPN_UNARY_3(function)				\
1009   SPEED_ROUTINE_MPN_UNARY_N (function, 3)
1010 #define SPEED_ROUTINE_MPN_UNARY_4(function)				\
1011   SPEED_ROUTINE_MPN_UNARY_N (function, 4)
1012 #define SPEED_ROUTINE_MPN_UNARY_5(function)				\
1013   SPEED_ROUTINE_MPN_UNARY_N (function, 5)
1014 #define SPEED_ROUTINE_MPN_UNARY_6(function)				\
1015   SPEED_ROUTINE_MPN_UNARY_N (function, 6)
1016 #define SPEED_ROUTINE_MPN_UNARY_7(function)				\
1017   SPEED_ROUTINE_MPN_UNARY_N (function, 7)
1018 #define SPEED_ROUTINE_MPN_UNARY_8(function)				\
1019   SPEED_ROUTINE_MPN_UNARY_N (function, 8)
1020 
1021 
1022 /* For mpn_mul, mpn_mul_basecase.  The timed call is
   function (wp, s->xp, s->size, s->yp, size1): the x operand has s->size
   limbs and the y operand size1 limbs.

   size1 is s->r, or s->size when s->r is 0; if that value is negative as
   an mp_size_t it is replaced by -size1 - s->size.  Requires
   1 <= size1 <= s->size.  The destination wp has size1 + s->size limbs.
   Returns the speed_endtime result for s->reps repetitions (at least
   one).  */
1023 #define SPEED_ROUTINE_MPN_MUL(function)					\
1024   {									\
1025     mp_ptr    wp;							\
1026     mp_size_t size1;							\
1027     unsigned  i;							\
1028     double    t;							\
1029     TMP_DECL;								\
1030 									\
1031     size1 = (s->r == 0 ? s->size : s->r);				\
1032     if (size1 < 0) size1 = -size1 - s->size;				\
1033 									\
1034     SPEED_RESTRICT_COND (size1 >= 1);					\
1035     SPEED_RESTRICT_COND (s->size >= size1);				\
1036 									\
1037     TMP_MARK;								\
1038     SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);		\
1039 									\
1040     speed_operand_src (s, s->xp, s->size);				\
1041     speed_operand_src (s, s->yp, size1);				\
1042     speed_operand_dst (s, wp, size1 + s->size);				\
1043     speed_cache_fill (s);						\
1044 									\
1045     speed_starttime ();							\
1046     i = s->reps;							\
1047     do									\
1048       function (wp, s->xp, s->size, s->yp, size1);			\
1049     while (--i != 0);							\
1050     t = speed_endtime ();						\
1051 									\
1052     TMP_FREE;								\
1053     return t;								\
1054   }
1055 
1056 
/* Timing body for an equal-size multiply: sources s->xp and s->yp of
   s->size limbs each (s->size >= 1), destination wp of 2*s->size limbs
   aligned per s->align_wp.  "call" may use wp and s.  Returns the
   speed_endtime result for s->reps repetitions of call (at least one).  */
1057 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call)				\
1058   {									\
1059     mp_ptr    wp;							\
1060     unsigned  i;							\
1061     double    t;							\
1062     TMP_DECL;								\
1063 									\
1064     SPEED_RESTRICT_COND (s->size >= 1);					\
1065 									\
1066     TMP_MARK;								\
1067     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1068 									\
1069     speed_operand_src (s, s->xp, s->size);				\
1070     speed_operand_src (s, s->yp, s->size);				\
1071     speed_operand_dst (s, wp, 2*s->size);				\
1072     speed_cache_fill (s);						\
1073 									\
1074     speed_starttime ();							\
1075     i = s->reps;							\
1076     do									\
1077       call;								\
1078     while (--i != 0);							\
1079     t = speed_endtime ();						\
1080 									\
1081     TMP_FREE;								\
1082     return t;								\
1083   }
1084 
/* Times function (wp, s->xp, s->yp, s->size) via
   SPEED_ROUTINE_MPN_MUL_N_CALL.  The expansion is a complete compound
   statement, so the definition carries no trailing semicolon of its own
   (matching the other wrapper macros in this file); callers may still
   follow the invocation with one.  */
1085 #define SPEED_ROUTINE_MPN_MUL_N(function)				\
1086   SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
1087 
/* Timing body for a low-half multiply: sources s->xp and s->yp of s->size
   limbs each (s->size >= 1), destination wp of only s->size limbs aligned
   per s->align_wp.  "call" may use wp and s.  Returns the speed_endtime
   result for s->reps repetitions of call (at least one).  */
1088 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)				\
1089   {									\
1090     mp_ptr    wp;							\
1091     unsigned  i;							\
1092     double    t;							\
1093     TMP_DECL;								\
1094 									\
1095     SPEED_RESTRICT_COND (s->size >= 1);					\
1096 									\
1097     TMP_MARK;								\
1098     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1099 									\
1100     speed_operand_src (s, s->xp, s->size);				\
1101     speed_operand_src (s, s->yp, s->size);				\
1102     speed_operand_dst (s, wp, s->size);					\
1103     speed_cache_fill (s);						\
1104 									\
1105     speed_starttime ();							\
1106     i = s->reps;							\
1107     do									\
1108       call;								\
1109     while (--i != 0);							\
1110     t = speed_endtime ();						\
1111 									\
1112     TMP_FREE;								\
1113     return t;								\
1114   }
1115 
/* Times function (wp, s->xp, s->yp, s->size) via
   SPEED_ROUTINE_MPN_MULLO_N_CALL.  The expansion is a complete compound
   statement, so the definition carries no trailing semicolon of its own
   (matching the other wrapper macros in this file); callers may still
   follow the invocation with one.  */
1116 #define SPEED_ROUTINE_MPN_MULLO_N(function)				\
1117   SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
1118 
1119 /* For a low-half basecase multiply (presumably mpn_mullo_basecase --
   confirm against the callers).  Both sources and the destination are
   s->size limbs (s->size >= 1); s->r is not used.  The timed call is
   function (wp, s->xp, s->yp, s->size).  Returns the speed_endtime result
   for s->reps repetitions (at least one).  */
1120 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)			\
1121   {									\
1122     mp_ptr    wp;							\
1123     unsigned  i;							\
1124     double    t;							\
1125     TMP_DECL;								\
1126 									\
1127     SPEED_RESTRICT_COND (s->size >= 1);					\
1128 									\
1129     TMP_MARK;								\
1130     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);			\
1131 									\
1132     speed_operand_src (s, s->xp, s->size);				\
1133     speed_operand_src (s, s->yp, s->size);				\
1134     speed_operand_dst (s, wp, s->size);					\
1135     speed_cache_fill (s);						\
1136 									\
1137     speed_starttime ();							\
1138     i = s->reps;							\
1139     do									\
1140       function (wp, s->xp, s->yp, s->size);				\
1141     while (--i != 0);							\
1142     t = speed_endtime ();						\
1143 									\
1144     TMP_FREE;								\
1145     return t;								\
1146   }
1147 
1148 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */
/* The timed call is function (wp, xp, size1, s->yp, s->size), where size1
   is s->r, or 2*s->size - 1 when s->r is 0.  Requires s->size >= 1 and
   size1 >= s->size.

   xp here is a locally allocated size1 limb vector (aligned per
   s->align_xp), not s->xp, and this macro does not store anything into it
   before the timed calls.  The destination wp has size1 - s->size + 3
   limbs.  Returns the speed_endtime result for s->reps repetitions (at
   least one).  */
1149 #define SPEED_ROUTINE_MPN_MULMID(function)				\
1150   {									\
1151     mp_ptr    wp, xp;							\
1152     mp_size_t size1;							\
1153     unsigned  i;							\
1154     double    t;							\
1155     TMP_DECL;								\
1156 									\
1157     size1 = (s->r == 0 ? (2 * s->size - 1) : s->r);			\
1158 									\
1159     SPEED_RESTRICT_COND (s->size >= 1);					\
1160     SPEED_RESTRICT_COND (size1 >= s->size);				\
1161 									\
1162     TMP_MARK;								\
1163     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1164     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1165 									\
1166     speed_operand_src (s, xp, size1);					\
1167     speed_operand_src (s, s->yp, s->size);				\
1168     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1169     speed_cache_fill (s);						\
1170 									\
1171     speed_starttime ();							\
1172     i = s->reps;							\
1173     do									\
1174       function (wp, xp, size1, s->yp, s->size);				\
1175     while (--i != 0);							\
1176     t = speed_endtime ();						\
1177 									\
1178     TMP_FREE;								\
1179     return t;								\
1180   }
1181 
/* Timing body for function (wp, xp, s->yp, s->size), a middle product
   whose x operand size is fixed at size1 = 2*s->size - 1 (s->r is not
   used).  Requires s->size >= 1.

   xp is a locally allocated size1 limb vector (aligned per s->align_xp),
   not s->xp, and this macro does not store anything into it before the
   timed calls.  The destination wp has size1 - s->size + 3 limbs.  Returns
   the speed_endtime result for s->reps repetitions (at least one).  */
1182 #define SPEED_ROUTINE_MPN_MULMID_N(function)				\
1183   {									\
1184     mp_ptr    wp, xp;							\
1185     mp_size_t size1;							\
1186     unsigned  i;							\
1187     double    t;							\
1188     TMP_DECL;								\
1189 									\
1190     size1 = 2 * s->size - 1;						\
1191 									\
1192     SPEED_RESTRICT_COND (s->size >= 1);					\
1193 									\
1194     TMP_MARK;								\
1195     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1196     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1197 									\
1198     speed_operand_src (s, xp, size1);					\
1199     speed_operand_src (s, s->yp, s->size);				\
1200     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1201     speed_cache_fill (s);						\
1202 									\
1203     speed_starttime ();							\
1204     i = s->reps;							\
1205     do									\
1206       function (wp, xp, s->yp, s->size);				\
1207     while (--i != 0);							\
1208     t = speed_endtime ();						\
1209 									\
1210     TMP_FREE;								\
1211     return t;								\
1212   }
1213 
/* Timing body for function (wp, xp, s->yp, s->size, scratch).  Operand
   sizes are as for the equal-size middle product: xp is a locally
   allocated vector of size1 = 2*s->size - 1 limbs (not s->xp, and not
   written by this macro), wp has size1 - s->size + 3 limbs.  scratch gets
   mpn_toom42_mulmid_itch (s->size) limbs with an alignment argument of 0
   and is not registered as an operand.  Requires s->size >= 1.  Returns
   the speed_endtime result for s->reps repetitions (at least one).  */
1214 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function)			\
1215   {									\
1216     mp_ptr    wp, xp, scratch;						\
1217     mp_size_t size1, scratch_size;					\
1218     unsigned  i;							\
1219     double    t;							\
1220     TMP_DECL;								\
1221 									\
1222     size1 = 2 * s->size - 1;						\
1223 									\
1224     SPEED_RESTRICT_COND (s->size >= 1);					\
1225 									\
1226     TMP_MARK;								\
1227     SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp);	\
1228     SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);			\
1229     scratch_size = mpn_toom42_mulmid_itch (s->size);			\
1230     SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0);			\
1231 									\
1232     speed_operand_src (s, xp, size1);					\
1233     speed_operand_src (s, s->yp, s->size);				\
1234     speed_operand_dst (s, wp, size1 - s->size + 3);			\
1235     speed_cache_fill (s);						\
1236 									\
1237     speed_starttime ();							\
1238     i = s->reps;							\
1239     do									\
1240       function (wp, xp, s->yp, s->size, scratch);			\
1241     while (--i != 0);							\
1242     t = speed_endtime ();						\
1243 									\
1244     TMP_FREE;								\
1245     return t;								\
1246   }
1247 
/* Timing body for a mulmod_bnm1 style call at size s->size (>= 1).
   Sources are s->xp and s->yp, s->size limbs each.  wp is a 2*s->size limb
   destination aligned per s->align_wp; tp is a scratch area of
   mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs aligned per
   s->align_wp2 and registered as a second destination.  "call" may use wp,
   tp and s.  Returns the speed_endtime result for s->reps repetitions of
   call (at least one).  */
1248 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)			\
1249   {									\
1250     mp_ptr    wp, tp;							\
1251     unsigned  i;							\
1252     double    t;							\
1253     mp_size_t itch;							\
1254     TMP_DECL;								\
1255 									\
1256     SPEED_RESTRICT_COND (s->size >= 1);					\
1257 									\
1258     itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);		\
1259 									\
1260     TMP_MARK;								\
1261     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1262     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1263 									\
1264     speed_operand_src (s, s->xp, s->size);				\
1265     speed_operand_src (s, s->yp, s->size);				\
1266     speed_operand_dst (s, wp, 2 * s->size);				\
1267     speed_operand_dst (s, tp, itch);					\
1268     speed_cache_fill (s);						\
1269 									\
1270     speed_starttime ();							\
1271     i = s->reps;							\
1272     do									\
1273       call;								\
1274     while (--i != 0);							\
1275     t = speed_endtime ();						\
1276 									\
1277     TMP_FREE;								\
1278     return t;								\
1279   }
/* Timing body for function (wp, size, s->xp, s->size, s->yp, s->size, tp)
   where size = mpn_mulmod_bnm1_next_size (s->size), i.e. the modulus size
   is rounded by that helper while the operands stay at s->size limbs
   (s->size >= 1).  wp has size limbs (aligned per s->align_wp) and tp has
   mpn_mulmod_bnm1_itch (size, size, size) limbs (aligned per
   s->align_wp2).  Returns the speed_endtime result for s->reps repetitions
   (at least one).  */
1280 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)			\
1281   {									\
1282     mp_ptr    wp, tp;							\
1283     unsigned  i;							\
1284     double    t;							\
1285     mp_size_t size, itch;						\
1286     TMP_DECL;								\
1287 									\
1288     SPEED_RESTRICT_COND (s->size >= 1);					\
1289 									\
1290     size = mpn_mulmod_bnm1_next_size (s->size);				\
1291     itch = mpn_mulmod_bnm1_itch (size, size, size);			\
1292 									\
1293     TMP_MARK;								\
1294     SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp);			\
1295     SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);			\
1296 									\
1297     speed_operand_src (s, s->xp, s->size);				\
1298     speed_operand_src (s, s->yp, s->size);				\
1299     speed_operand_dst (s, wp, size);					\
1300     speed_operand_dst (s, tp, itch);					\
1301     speed_cache_fill (s);						\
1302 									\
1303     speed_starttime ();							\
1304     i = s->reps;							\
1305     do									\
1306       function (wp, size, s->xp, s->size, s->yp, s->size, tp);		\
1307     while (--i != 0);							\
1308     t = speed_endtime ();						\
1309 									\
1310     TMP_FREE;								\
1311     return t;								\
1312   }
1313 
/* Timing body for a multiply that needs caller-supplied scratch space.
     call     expression to time; may use wp, tspace and s
     tsize    scratch size in limbs; expanded twice (allocation and
	      speed_operand_dst), so it should be free of side effects
     minsize  smallest s->size accepted (expanded unparenthesized inside
	      the SPEED_RESTRICT_COND comparison)
   wp is always 2*s->size limbs (aligned per s->align_wp) and both s->xp
   and s->yp are registered with s->size limbs, whatever second operand
   size "call" actually uses.  tspace is aligned per s->align_wp2.  Returns
   the speed_endtime result for s->reps repetitions of call (at least
   one).  */
1314 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)		\
1315   {									\
1316     mp_ptr    wp, tspace;						\
1317     unsigned  i;							\
1318     double    t;							\
1319     TMP_DECL;								\
1320 									\
1321     SPEED_RESTRICT_COND (s->size >= minsize);				\
1322 									\
1323     TMP_MARK;								\
1324     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1325     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1326 									\
1327     speed_operand_src (s, s->xp, s->size);				\
1328     speed_operand_src (s, s->yp, s->size);				\
1329     speed_operand_dst (s, wp, 2*s->size);				\
1330     speed_operand_dst (s, tspace, tsize);				\
1331     speed_cache_fill (s);						\
1332 									\
1333     speed_starttime ();							\
1334     i = s->reps;							\
1335     do									\
1336       call;								\
1337     while (--i != 0);							\
1338     t = speed_endtime ();						\
1339 									\
1340     TMP_FREE;								\
1341     return t;								\
1342   }
1343 
/* Balanced Toom multiplies, both operands s->size limbs.  Each wrapper
   instantiates SPEED_ROUTINE_MPN_MUL_N_TSPACE with the timed call
   function (wp, s->xp, s->size, s->yp, s->size, tspace), the matching
   mpn_toomNN_mul_itch (s->size, s->size) as scratch size, and the matching
   MPN_TOOMNN_MUL_MINSIZE as the smallest accepted s->size.  */
1344 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)			\
1345   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1346     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1347      mpn_toom22_mul_itch (s->size, s->size),				\
1348      MPN_TOOM22_MUL_MINSIZE)
1349 
1350 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)			\
1351   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1352     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1353      mpn_toom33_mul_itch (s->size, s->size),				\
1354      MPN_TOOM33_MUL_MINSIZE)
1355 
1356 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)			\
1357   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1358     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1359      mpn_toom44_mul_itch (s->size, s->size),				\
1360      MPN_TOOM44_MUL_MINSIZE)
1361 
1362 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)			\
1363   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1364     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1365      mpn_toom6h_mul_itch (s->size, s->size),				\
1366      MPN_TOOM6H_MUL_MINSIZE)
1367 
1368 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)			\
1369   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1370     (function (wp, s->xp, s->size, s->yp, s->size, tspace),		\
1371      mpn_toom8h_mul_itch (s->size, s->size),				\
1372      MPN_TOOM8H_MUL_MINSIZE)
1373 
/* Unbalanced Toom multiplies.  The first operand is s->size limbs and the
   second a fixed fraction of it (integer division): 2/3 for toom32, 1/2
   for toom42 and toom63, 3/4 for toom43.  The same second size is passed
   to the matching mpn_toomNN_mul_itch for the scratch size, and the
   matching MPN_TOOMNN_MUL_MINSIZE bounds s->size from below.  */
1374 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function)				\
1375   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1376     (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace),		\
1377      mpn_toom32_mul_itch (s->size, 2*s->size/3),			\
1378      MPN_TOOM32_MUL_MINSIZE)
1379 
1380 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function)				\
1381   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1382     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1383      mpn_toom42_mul_itch (s->size, s->size/2),				\
1384      MPN_TOOM42_MUL_MINSIZE)
1385 
1386 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function)				\
1387   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1388     (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace),		\
1389      mpn_toom43_mul_itch (s->size, s->size*3/4),			\
1390      MPN_TOOM43_MUL_MINSIZE)
1391 
1392 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function)				\
1393   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1394     (function (wp, s->xp, s->size, s->yp, s->size/2, tspace),		\
1395      mpn_toom63_mul_itch (s->size, s->size/2),				\
1396      MPN_TOOM63_MUL_MINSIZE)
1397 
/* Paired Toom routines timed at a common operand shape.  Within each
   A_FOR_B / B_FOR_A pair both macros use the same second operand size
   (17/24, 19/30, 11/20 or 5/6 of s->size, by integer division).
   NOTE(review): presumably the pairs exist so the two algorithms can be
   compared when choosing a crossover threshold -- confirm against the
   tuning code that uses them.

   Except as noted further down, each macro takes its scratch size from the
   mpn_toomNN_mul_itch of the algorithm named first and uses that
   algorithm's MPN_TOOMNN_MUL_MINSIZE.  */
1398 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)		\
1399   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1400     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1401      mpn_toom32_mul_itch (s->size, 17*s->size/24),			\
1402      MPN_TOOM32_MUL_MINSIZE)
1403 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)		\
1404   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1405     (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace),	\
1406      mpn_toom43_mul_itch (s->size, 17*s->size/24),			\
1407      MPN_TOOM43_MUL_MINSIZE)
1408 
1409 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)		\
1410   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1411     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1412      mpn_toom32_mul_itch (s->size, 19*s->size/30),			\
1413      MPN_TOOM32_MUL_MINSIZE)
1414 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)		\
1415   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1416     (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace),	\
1417      mpn_toom53_mul_itch (s->size, 19*s->size/30),			\
1418      MPN_TOOM53_MUL_MINSIZE)
1419 
1420 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)		\
1421   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1422     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1423      mpn_toom42_mul_itch (s->size, 11*s->size/20),			\
1424      MPN_TOOM42_MUL_MINSIZE)
1425 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)		\
1426   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1427     (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace),	\
1428      mpn_toom53_mul_itch (s->size, 11*s->size/20),			\
1429      MPN_TOOM53_MUL_MINSIZE)
1430 
/* NOTE(review): unlike the pairs above, TOOM43_FOR_TOOM54 sizes its scratch
   with mpn_toom42_mul_itch (not toom43) and restricts s->size with
   MPN_TOOM54_MUL_MINSIZE (not TOOM43).  This may be deliberate or a
   copy-and-paste slip -- confirm before relying on it.  */
1431 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function)		\
1432   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1433     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1434      mpn_toom42_mul_itch (s->size, 5*s->size/6),			\
1435      MPN_TOOM54_MUL_MINSIZE)
1436 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function)		\
1437   SPEED_ROUTINE_MPN_MUL_N_TSPACE					\
1438     (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace),	\
1439      mpn_toom54_mul_itch (s->size, 5*s->size/6),			\
1440      MPN_TOOM54_MUL_MINSIZE)
1441 
1442 
1443 
/* Timing body for a squaring: source s->xp of s->size limbs (s->size >= 1),
   destination wp of 2*s->size limbs aligned per s->align_wp.  "call" may
   use wp and s.  Returns the speed_endtime result for s->reps repetitions
   of call (at least one).  */
1444 #define SPEED_ROUTINE_MPN_SQR_CALL(call)				\
1445   {									\
1446     mp_ptr    wp;							\
1447     unsigned  i;							\
1448     double    t;							\
1449     TMP_DECL;								\
1450 									\
1451     SPEED_RESTRICT_COND (s->size >= 1);					\
1452 									\
1453     TMP_MARK;								\
1454     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1455 									\
1456     speed_operand_src (s, s->xp, s->size);				\
1457     speed_operand_dst (s, wp, 2*s->size);				\
1458     speed_cache_fill (s);						\
1459 									\
1460     speed_starttime ();							\
1461     i = s->reps;							\
1462     do									\
1463       call;								\
1464     while (--i != 0);							\
1465     t = speed_endtime ();						\
1466 									\
1467     TMP_FREE;								\
1468     return t;								\
1469   }
1470 
/* Times function (wp, s->xp, s->size) via SPEED_ROUTINE_MPN_SQR_CALL.  */
1471 #define SPEED_ROUTINE_MPN_SQR(function)					\
1472   SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
1473 
/* Timing body for a sqr_diag_addlsh1 style call.  Requires s->size >= 2.
   tp and wp are both locally allocated 2*s->size limb vectors aligned per
   s->align_wp; tp is registered as a source alongside s->xp, wp as the
   destination.  This macro does not store anything into tp before the
   timed calls.  "call" may use wp, tp and s.

   The value returned is speed_endtime () divided by 2, not the raw
   elapsed time.
   NOTE(review): the reason for halving is not visible here -- presumably
   it normalizes the result against 2*s->size limbs of output; confirm
   against the code consuming this measurement.  */
1474 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call)			\
1475   {									\
1476     mp_ptr    wp, tp;							\
1477     unsigned  i;							\
1478     double    t;							\
1479     TMP_DECL;								\
1480 									\
1481     SPEED_RESTRICT_COND (s->size >= 2);					\
1482 									\
1483     TMP_MARK;								\
1484     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp);		\
1485     SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);		\
1486 									\
1487     speed_operand_src (s, s->xp, s->size);				\
1488     speed_operand_src (s, tp, 2 * s->size);				\
1489     speed_operand_dst (s, wp, 2 * s->size);				\
1490     speed_cache_fill (s);						\
1491 									\
1492     speed_starttime ();							\
1493     i = s->reps;							\
1494     do									\
1495       call;								\
1496     while (--i != 0);							\
1497     t = speed_endtime () / 2;						\
1498 									\
1499     TMP_FREE;								\
1500     return t;								\
1501   }
1502 
/* Timing body for a squaring that needs caller-supplied scratch space.
     call     expression to time; may use wp, tspace and s
     tsize    scratch size in limbs; expanded twice (allocation and
	      speed_operand_dst), so it should be free of side effects
     minsize  smallest s->size accepted
   wp is 2*s->size limbs aligned per s->align_wp; tspace is aligned per
   s->align_wp2 and registered as a second destination.  Returns the
   speed_endtime result for s->reps repetitions of call (at least one).  */
1503 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)		\
1504   {									\
1505     mp_ptr    wp, tspace;						\
1506     unsigned  i;							\
1507     double    t;							\
1508     TMP_DECL;								\
1509 									\
1510     SPEED_RESTRICT_COND (s->size >= minsize);				\
1511 									\
1512     TMP_MARK;								\
1513     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);			\
1514     SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2);		\
1515 									\
1516     speed_operand_src (s, s->xp, s->size);				\
1517     speed_operand_dst (s, wp, 2*s->size);				\
1518     speed_operand_dst (s, tspace, tsize);				\
1519     speed_cache_fill (s);						\
1520 									\
1521     speed_starttime ();							\
1522     i = s->reps;							\
1523     do									\
1524       call;								\
1525     while (--i != 0);							\
1526     t = speed_endtime ();						\
1527 									\
1528     TMP_FREE;								\
1529     return t;								\
1530   }
1531 
/* Toom squaring wrappers over SPEED_ROUTINE_MPN_SQR_TSPACE.  Each times
   function (wp, s->xp, s->size, tspace) with scratch sized by the matching
   mpn_toomN_sqr_itch (s->size) and s->size bounded below by the matching
   MPN_TOOMN_SQR_MINSIZE.  */
1532 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function)				\
1533   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1534 				mpn_toom2_sqr_itch (s->size),		\
1535 				MPN_TOOM2_SQR_MINSIZE)
1536 
1537 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function)				\
1538   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1539 				mpn_toom3_sqr_itch (s->size),		\
1540 				MPN_TOOM3_SQR_MINSIZE)
1541 
1542 
1543 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function)				\
1544   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1545 				mpn_toom4_sqr_itch (s->size),		\
1546 				MPN_TOOM4_SQR_MINSIZE)
1547 
1548 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function)				\
1549   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1550 				mpn_toom6_sqr_itch (s->size),		\
1551 				MPN_TOOM6_SQR_MINSIZE)
1552 
1553 #define SPEED_ROUTINE_MPN_TOOM8_SQR(function)				\
1554   SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace),	\
1555 				mpn_toom8_sqr_itch (s->size),		\
1556 				MPN_TOOM8_SQR_MINSIZE)
1557 
/* Timing body for an operation that only reads s->xp (s->size limbs,
   s->size >= 0) and has no destination vector: nothing is allocated, so
   there is no TMP_DECL/TMP_MARK/TMP_FREE.  "call" may use s; its value is
   discarded.  Returns speed_endtime () directly after s->reps repetitions
   of call (at least one).  */
1558 #define SPEED_ROUTINE_MPN_MOD_CALL(call)				\
1559   {									\
1560     unsigned   i;							\
1561 									\
1562     SPEED_RESTRICT_COND (s->size >= 0);					\
1563 									\
1564     speed_operand_src (s, s->xp, s->size);				\
1565     speed_cache_fill (s);						\
1566 									\
1567     speed_starttime ();							\
1568     i = s->reps;							\
1569     do									\
1570       call;								\
1571     while (--i != 0);							\
1572 									\
1573     return speed_endtime ();						\
1574   }
1575 
/* Wrappers over SPEED_ROUTINE_MPN_MOD_CALL.  MOD_1 times
   (*function) (s->xp, s->size, s->r); MOD_1C adds a trailing
   CNST_LIMB(0) argument.
   NOTE(review): the trailing zero is presumably a carry-in -- confirm
   against the function's prototype.  */
1576 #define SPEED_ROUTINE_MPN_MOD_1(function)				\
1577    SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
1578 
1579 #define SPEED_ROUTINE_MPN_MOD_1C(function)				\
1580    SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
1581 
/* Time function (s->xp, s->size, s->r) using the common
   SPEED_ROUTINE_MPN_MOD_CALL body.

   The expansion is a complete brace block, so no semicolon is appended
   here; this matches SPEED_ROUTINE_MPN_MOD_1 and friends and avoids leaving
   a stray empty statement (or empty declaration) after the block.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
1584 
/* Time function (s->xp, s->size, s->r, 0), the fourth argument being a
   constant zero limb, using the common SPEED_ROUTINE_MPN_MOD_CALL body.

   The expansion is a complete brace block, so no semicolon is appended
   here; this matches SPEED_ROUTINE_MPN_MOD_1C and friends and avoids leaving
   a stray empty statement (or empty declaration) after the block.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
1587 
/* Time (*function) (s->xp, s->size) using the common
   SPEED_ROUTINE_MPN_MOD_CALL body.  s->r is not passed to the function.  */
#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ( \
    (*function) (s->xp, s->size))
1590 
/* Time (*function) (s->xp, s->size, s->r, inv), where inv is obtained once,
   outside the timed region, from invert_limb (inv, s->r).  The routine is
   restricted to divisors s->r with GMP_LIMB_HIGHBIT set and to non-negative
   sizes.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    mp_limb_t inv; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
    \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      (*function) (s->xp, s->size, s->r, inv); \
    } while (--left != 0); \
    \
    return speed_endtime (); \
  }
1611 
/* Time the pair pfunc (inv, s->r) followed by
   function (s->xp, s->size, s->r << inv[1], inv); both calls are inside the
   timed loop.  inv[] is also filled once beforehand by mpn_mod_1_1p_cps so
   that inv[1] is defined on the first iteration.  Sizes below 2 are
   rejected.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
  { \
    mp_limb_t inv[4]; \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    mpn_mod_1_1p_cps (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        pfunc (inv, s->r); \
        function (s->xp, s->size, s->r << inv[1], inv); \
      } \
    while (--left != 0); \
    \
    return speed_endtime (); \
  }
/* Time the pair pfunc (inv, s->r) followed by
   function (s->xp, s->size, s->r, inv); both calls are inside the timed
   loop.  inv[] has N+3 limbs.  The routine is restricted to s->size >= 1
   and to divisors s->r no larger than the maximum limb value divided by N.
   Returns the value of speed_endtime ().

   N is parenthesized wherever it is used so that an expression argument
   (e.g. 1+1) is not misparsed by the division or the array bound.  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \
  { \
    unsigned   i; \
    mp_limb_t  inv[(N)+3]; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / (N)); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { \
      pfunc (inv, s->r); \
      function (s->xp, s->size, s->r, inv); \
    } while (--i != 0); \
    \
    return speed_endtime (); \
  }
1652 
1653 
/* A division of 2*s->size by s->size limbs.

   `call' is pasted in verbatim and may use the locals set up here:
     a     dividend, 2*s->size limbs (s->xp copied into both halves)
     d     divisor, s->size limbs copied from s->yp, high bit forced on
     q     quotient space, s->size+1 limbs
     r     remainder space, s->size limbs
     dinv  gmp_pi1_t filled by invert_pi1 from the two high limbs of d
   The top dividend limb is set to d[s->size-1] - 1.

   invert_pi1 reads d[s->size-2], so at least 2 limbs are required; a size
   of 1 would read d[-1].  Returns the value of speed_endtime (), after
   releasing the temporary space.  */

#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
  { \
    unsigned  i; \
    mp_ptr    a, d, q, r; \
    double    t; \
    gmp_pi1_t dinv; \
    TMP_DECL; \
    \
    /* d[s->size-2] is read below, hence 2 and not 1 */ \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2); \
    \
    MPN_COPY (a, s->xp, s->size); \
    MPN_COPY (a+s->size, s->xp, s->size); \
    \
    MPN_COPY (d, s->yp, s->size); \
    \
    /* normalize the data */ \
    d[s->size-1] |= GMP_NUMB_HIGHBIT; \
    a[2*s->size-1] = d[s->size-1] - 1; \
    \
    invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \
    \
    speed_operand_src (s, a, 2*s->size); \
    speed_operand_src (s, d, s->size); \
    speed_operand_dst (s, q, s->size+1); \
    speed_operand_dst (s, r, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1699 
1700 
/* A remainder 2*s->size by s->size limbs.

   Times function (r, a, d) where d is built from s->yp,s->size and a is
   built so that its high part is (s->xp mod d) and its low s->size limbs
   are a copy of s->xp.  Returns the value of speed_endtime ().

   The three mpz_t temporaries are cleared before returning, so that
   repeated measurements do not leak their limb storage.  */

#define SPEED_ROUTINE_MPZ_MOD(function) \
  { \
    unsigned   i; \
    mpz_t      a, d, r; \
    double     t; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    mpz_init_set_n (d, s->yp, s->size); \
    \
    /* high part less than d, low part a duplicate copied in */ \
    mpz_init_set_n (a, s->xp, s->size); \
    mpz_mod (a, a, d); \
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \
    MPN_COPY (PTR(a), s->xp, s->size); \
    \
    mpz_init (r); \
    \
    speed_operand_src (s, PTR(a), SIZ(a)); \
    speed_operand_src (s, PTR(d), SIZ(d)); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (r, a, d); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    /* release the operands, outside the timed region */ \
    mpz_clear (a); \
    mpz_clear (d); \
    mpz_clear (r); \
    return t; \
  }
1731 
/* Time function (qp, tp, size1, dp, s->size, INV).

   The dividend has size1 limbs: 2*s->size when s->r is 0, otherwise s->r.
   The divisor has s->size limbs, copied from s->yp with its high bit forced
   on.  INV is pasted in verbatim and may refer to the local `inv', which
   invert_pi1 fills from the two high divisor limbs.  DMIN is a lower bound
   on s->size and QMIN a lower bound on size1 - s->size.

   The dividend is recopied from ap into tp before every call, so that copy
   is part of the measured time.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \
  { \
    mp_ptr     ap, dp, qp, tp; \
    mp_size_t  size1; \
    gmp_pi1_t  inv; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? 2 * s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= DMIN); \
    SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \
    \
    /* Only the low and the high s->size limbs of the dividend are set, \
       so the middle stays unfilled when size1 > 2*s->size.  */ \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* normalized divisor, top dividend limb just below top divisor limb */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    ap[size1 - 1] = dp[s->size - 1] - 1; \
    \
    invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \
    \
    speed_operand_src (s, ap, size1); \
    speed_operand_dst (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, size1); \
        function (qp, tp, size1, dp, s->size, INV); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, tp, 2*s->size, dp, s->size, scratch), a quotient-only
   division of 2*s->size by s->size limbs.  The scratch size comes from
   itchfn (2*s->size, s->size, 0).

   The dividend is s->xp copied into both halves of tp; the divisor is a
   copy of s->yp with its high bit forced on, and the top dividend limb is
   set to one less than the top divisor limb.  Returns the value of
   speed_endtime ().

   The divisor is copied from s->yp before being normalized, the same as
   SPEED_ROUTINE_MPN_MU_DIV_QR does; without that copy dp would hold
   whatever happened to be in the freshly allocated space.  */
#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \
  { \
    unsigned   i; \
    mp_ptr     dp, tp, qp, scratch; \
    double     t; \
    mp_size_t itch; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (2 * s->size, s->size, 0); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    \
    MPN_COPY (tp,         s->xp, s->size); \
    MPN_COPY (tp+s->size, s->xp, s->size); \
    \
    MPN_COPY (dp,         s->yp, s->size); \
    \
    /* normalize the data */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    tp[2*s->size-1] = dp[s->size-1] - 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_src (s, tp, 2 * s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { \
      function (qp, tp, 2 * s->size, dp, s->size, scratch); \
    } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, rp, tp, size1, dp, s->size, scratch), a division
   giving both quotient and remainder.  The dividend has size1 limbs
   (2*s->size when s->r is 0, otherwise s->r), the divisor s->size limbs
   copied from s->yp with its high bit forced on.  The scratch size comes
   from itchfn (size1, s->size, 0).  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \
  { \
    mp_ptr     dp, qp, rp, tp, scratch; \
    mp_size_t  size1, itch; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? 2 * s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    itch = itchfn (size1, s->size, 0); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp shares the wp2 alignment with scratch (alignment?) */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    \
    /* Only the low and the high s->size limbs of the dividend are set, \
       so the middle stays unfilled when size1 > 2*s->size.  */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* normalized divisor, top dividend limb just below top divisor limb */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    tp[size1 - 1] = dp[s->size - 1] - 1; \
    \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      function (qp, rp, tp, size1, dp, s->size, scratch); \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch), a
   division with a precomputed s->size limb inverse ip.  ip is produced once,
   outside the timed region, by mpn_invert on the normalized divisor.  The
   dividend has size1 limbs (2*s->size when s->r is 0, otherwise s->r).  The
   scratch size comes from itchfn (size1, s->size, s->size).  Returns the
   value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \
  { \
    mp_ptr     dp, qp, rp, tp, ip, scratch, tmp; \
    mp_size_t  size1, itch; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    size1 = (s->r == 0 ? 2 * s->size : s->r); \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    SPEED_RESTRICT_COND (size1 >= s->size); \
    \
    itch = itchfn (size1, s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp and ip share the wp2 alignment with scratch (alignment?) */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); \
    \
    /* Only the low and the high s->size limbs of the dividend are set, \
       so the middle stays unfilled when size1 > 2*s->size.  */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
    \
    MPN_COPY (dp, s->yp, s->size); \
    \
    /* normalized divisor, top dividend limb just below top divisor limb */ \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
    tp[size1 - 1] = dp[s->size-1] - 1; \
    \
    /* the inverse is set up front and is not part of the timing */ \
    tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \
    mpn_invert (ip, dp, s->size, tmp); \
    \
    speed_operand_dst (s, qp, size1 - s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, size1); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_src (s, ip, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
1922 
/* Time function (qp, tp, 2*s->size, dp, s->size, inv) with a 2*s->size limb
   dividend and an s->size limb divisor.  The divisor is s->yp with its low
   bit forced on, and inv is the negated result of binvert_limb on dp[0].
   The dividend is recopied from ap into tp before every call, so that copy
   is part of the measured time.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \
  { \
    mp_ptr     ap, dp, qp, tp; \
    mp_limb_t  inv; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
    \
    /* dividend: s->xp in both halves */ \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size); \
    speed_operand_dst (s, tp, 2*s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (qp, tp, 2*s->size, dp, s->size, inv); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, tp, s->size, dp, s->size, inv) with dividend and
   divisor both of s->size limbs.  The divisor is s->yp with its low bit
   forced on, and inv is the negated result of binvert_limb on dp[0].  The
   dividend s->xp is recopied into tp before every call, so that copy is
   part of the measured time.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \
  { \
    mp_ptr     dp, qp, tp; \
    mp_limb_t  inv; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    binvert_limb (inv, dp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, qp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, s->xp, s->size); \
        function (qp, tp, s->size, dp, s->size, inv); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, s->xp, s->size, dp, s->size, scratch) with dividend
   and divisor both of s->size limbs.  The divisor is s->yp with its low bit
   forced on; the dividend is used directly from s->xp.  The scratch size
   comes from itchfn (s->size, s->size).  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \
  { \
    mp_ptr     dp, qp, scratch; \
    mp_size_t  itch; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      function (qp, s->xp, s->size, dp, s->size, scratch); \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (qp, rp, tp, 2*s->size, dp, s->size, scratch) with a
   2*s->size limb dividend (s->xp in both halves) and an s->size limb
   divisor, which is s->yp with its low bit forced on.  The scratch size
   comes from itchfn (2*s->size, s->size).  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \
  { \
    mp_ptr     dp, qp, rp, tp, scratch; \
    mp_size_t  itch; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    itch = itchfn (2 * s->size, s->size); \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
    /* rp shares the wp2 alignment with scratch (alignment?) */ \
    SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); \
    \
    /* dividend: s->xp in both halves */ \
    MPN_COPY (tp, s->xp, s->size); \
    MPN_COPY (tp+s->size, s->xp, s->size); \
    \
    /* divisor must be odd */ \
    MPN_COPY (dp, s->yp, s->size); \
    dp[0] |= 1; \
    \
    speed_operand_dst (s, qp, s->size); \
    speed_operand_dst (s, rp, s->size); \
    speed_operand_src (s, tp, 2 * s->size); \
    speed_operand_src (s, dp, s->size); \
    speed_operand_dst (s, scratch, itch); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2081 
/* Time (*function) (wp, s->xp, s->size, s->r) through
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  Only odd s->r is accepted, and the low
   bit of s->xp[0] is forced on in place before measuring.  wp is not
   declared here; presumably SPEED_ROUTINE_MPN_UNARY_1_CALL provides it.  */
#define SPEED_ROUTINE_MPN_BROOT(function) \
  { \
    SPEED_RESTRICT_COND (s->r & 1); \
    s->xp[0] |= 1; \
    SPEED_ROUTINE_MPN_UNARY_1_CALL ( \
      (*function) (wp, s->xp, s->size, s->r)); \
  }
2089 
/* Time (*function) (wp, s->xp, s->size, s->r, tp), where wp has s->size
   limbs and tp has `itch' limbs of scratch.  Only odd s->r and sizes of at
   least 1 are accepted, and the low bit of s->xp[0] is forced on in place
   before measuring.  Returns the value of speed_endtime ().

   The SPEED_RESTRICT_COND checks come before TMP_MARK, as in the other
   routines in this file, so that the checks are done before any temporary
   allocation state is opened.  */
#define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \
  { \
    mp_ptr    wp, tp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r & 1); \
    \
    TMP_MARK; \
    wp = TMP_ALLOC_LIMBS (s->size); \
    tp = TMP_ALLOC_LIMBS ( (itch)); \
    s->xp[0] |= 1; \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, s->xp, s->size, s->r, tp); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2117 
/* Time function (ip, up, s->size, tp).  up is a copy of s->xp with the high
   bit of its top limb forced on, ip receives s->size limbs, and tp is
   scratch of itchfn (s->size) limbs (only its first s->size limbs are
   registered as a destination operand).  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \
  { \
    mp_ptr  ip, up, tp; \
    long    left; \
    double  t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* the operand must have its most significant bit set */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (ip, up, s->size, tp); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2152 
/* Time function (ip, up, s->size, tp).  up is a copy of s->xp with the high
   bit of its top limb forced on, ip receives s->size limbs, and tp is
   scratch of itchfn (s->size) limbs (only its first s->size limbs are
   registered as a destination operand).  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \
  { \
    mp_ptr  ip, up, tp; \
    long    left; \
    double  t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* the operand must have its most significant bit set */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (ip, up, s->size, tp); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2187 
/* Time function (ip, up, s->size, tp), for sizes of at least 3 limbs.  up
   is a copy of s->xp with the high bit of its top limb forced on, ip
   receives s->size limbs, and tp is scratch of itchfn (s->size) limbs (only
   its first s->size limbs are registered as a destination operand).
   Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \
  { \
    mp_ptr  ip, up, tp; \
    long    left; \
    double  t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 3); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* the operand must have its most significant bit set */ \
    up[s->size-1] |= GMP_NUMB_HIGHBIT; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (ip, up, s->size, tp); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2222 
/* Time function (ip, up, s->size, tp).  up is a copy of s->xp with its low
   bit forced on (i.e. made odd), ip receives s->size limbs, and tp is
   scratch of itchfn (s->size) limbs (only its first s->size limbs are
   registered as a destination operand).  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \
  { \
    mp_ptr  ip, up, tp; \
    long    left; \
    double  t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
    \
    MPN_COPY (up, s->xp, s->size); \
    \
    /* the operand must be odd */ \
    up[0] |= 1; \
    \
    speed_operand_src (s, up, s->size); \
    speed_operand_dst (s, tp, s->size); \
    speed_operand_dst (s, ip, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (ip, up, s->size, tp); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2257 
/* Time function (cp, tp, mp, s->size, inv).  The modulus mp is s->yp with
   its low bit forced on, and inv is the negated result of binvert_limb on
   mp[0].  The input ap holds s->xp in both halves; its low 2*s->size limbs
   are recopied into tp before every call, so that copy is part of the
   measured time.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_REDC_1(function) \
  { \
    mp_ptr     ap, mp, cp, tp; \
    mp_limb_t  inv; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    binvert_limb (inv, mp[0]); \
    inv = -inv; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, inv); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (cp, tp, mp, s->size, invp).  The modulus mp is s->yp with
   its low bit forced on.  invp[] is a 2-limb value: mpn_binvert of the low
   two limbs of mp (using tp as scratch), then invp[0] negated and invp[1]
   complemented.  The input ap holds s->xp in both halves; its low 2*s->size
   limbs are recopied into tp before every call, so that copy is part of the
   measured time.  Returns the value of speed_endtime ().

   NOTE(review): mpn_binvert is asked to read 2 limbs of mp while sizes down
   to 1 are accepted -- confirm that is intended for s->size == 1.  */
#define SPEED_ROUTINE_REDC_2(function) \
  { \
    mp_ptr     ap, mp, cp, tp; \
    mp_limb_t  invp[2]; \
    unsigned   left; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    mpn_binvert (invp, mp, 2, tp); \
    invp[0] = -invp[0]; \
    invp[1] = ~invp[1]; \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time function (cp, tp, mp, s->size, invp), for sizes above 8 limbs.  The
   modulus mp is s->yp with its low bit forced on, and invp is the s->size
   limb result of mpn_binvert on mp (using tp as scratch).  The input ap
   holds s->xp in both halves; its low 2*s->size limbs are recopied into tp
   before every call, so that copy is part of the measured time.  Returns
   the value of speed_endtime ().  */
#define SPEED_ROUTINE_REDC_N(function) \
  { \
    mp_ptr    ap, mp, cp, tp, invp; \
    unsigned  left; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size > 8); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size,     s->align_yp); \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
    /* invp shares the wp2 alignment with tp (align?) */ \
    SPEED_TMP_ALLOC_LIMBS (invp, s->size,   s->align_wp2); \
    \
    MPN_COPY (ap, s->xp, s->size); \
    MPN_COPY (ap+s->size, s->xp, s->size); \
    \
    /* modulus must be odd */ \
    MPN_COPY (mp, s->yp, s->size); \
    mp[0] |= 1; \
    mpn_binvert (invp, mp, s->size, tp); \
    \
    speed_operand_src (s, ap, 2*s->size+1); \
    speed_operand_dst (s, tp, 2*s->size+1); \
    speed_operand_src (s, mp, s->size); \
    speed_operand_dst (s, cp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do \
      { \
        MPN_COPY (tp, ap, 2*s->size); \
        function (cp, tp, mp, s->size, invp); \
      } \
    while (--left != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2383 
2384 
/* Time function (s->xp, s->size), a one-source operation whose result is
   discarded.  Sizes below 1 are rejected.  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_POPCOUNT(function) \
  { \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (s->xp, s->size); \
    } while (--left != 0); \
    \
    return speed_endtime (); \
  }
2402 
/* Time function (s->xp, s->yp, s->size), a two-source operation whose
   result is discarded.  Sizes below 1 are rejected.  Returns the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_HAMDIST(function) \
  { \
    unsigned left; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (s->xp, s->yp, s->size); \
    } while (--left != 0); \
    \
    return speed_endtime (); \
  }
2421 
2422 
/* Time function (z, s->size), where z is a freshly initialized mpz_t and
   s->size is passed as the integer argument rather than used as a limb
   count.  No operands are registered.  z is cleared after timing.  Returns
   the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPZ_UI(function) \
  { \
    mpz_t     z; \
    unsigned  left; \
    double    t; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (z, s->size); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (z); \
    return t; \
  }
2443 
/* The following all take (mpz_t result, integer n) and so share the
   SPEED_ROUTINE_MPZ_UI body unchanged.  */
#define SPEED_ROUTINE_MPZ_FAC_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
#define SPEED_ROUTINE_MPZ_FIB_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) \
  SPEED_ROUTINE_MPZ_UI(function)
2447 
2448 
/* Time function (z, z2, s->size), where z and z2 are freshly initialized
   mpz_t variables and s->size is passed as the integer argument rather than
   used as a limb count.  No operands are registered.  Both variables are
   cleared after timing.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPZ_2_UI(function) \
  { \
    mpz_t     z, z2; \
    unsigned  left; \
    double    t; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    mpz_init (z); \
    mpz_init (z2); \
    \
    speed_starttime (); \
    left = s->reps; \
    do { \
      function (z, z2, s->size); \
    } while (--left != 0); \
    t = speed_endtime (); \
    \
    mpz_clear (z); \
    mpz_clear (z2); \
    return t; \
  }
2471 
/* The FIB2_UI and LUCNUM2_UI routines are plain aliases for
   SPEED_ROUTINE_MPZ_2_UI.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(function)                             \
  SPEED_ROUTINE_MPZ_2_UI (function)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function)                          \
  SPEED_ROUTINE_MPZ_2_UI (function)
2474 
2475 
/* Time function (fp, f1p, n) with n = s->size.  Both destination areas are
   MPN_FIB2_SIZE (n) limbs, allocated with the xp and yp alignments
   respectively.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function)                             \
  {                                                                     \
    double     elapsed;                                                 \
    unsigned   remaining;                                               \
    mp_size_t  limbs;                                                   \
    mp_ptr     fn, fn_prev;                                             \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    limbs = MPN_FIB2_SIZE (s->size);                                    \
    SPEED_TMP_ALLOC_LIMBS (fn, limbs, s->align_xp);                     \
    SPEED_TMP_ALLOC_LIMBS (fn_prev, limbs, s->align_yp);                \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (fn, fn_prev, s->size);                                \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
2501 
2502 
2503 
/* Time function (r, b, e, m), i.e. b^e mod m.  b and m are built from the
   s->size limbs at s->xp and s->yp, and e from the first 6 limbs of
   s->xp_block.  Bit 0 of m is set so the modulus is odd (so that redc can
   be used); e is kept to 6 limbs so one call doesn't take too long.  */
#define SPEED_ROUTINE_MPZ_POWM(function)                                \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  remaining;                                                \
    mpz_t     result, base, exponent, modulus;                          \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    mpz_init_set_n (base, s->xp, s->size);                              \
    mpz_init_set_n (exponent, s->xp_block, 6);                          \
    mpz_init_set_n (modulus, s->yp, s->size);                           \
    mpz_setbit (modulus, 0);    /* odd modulus */                       \
    mpz_init (result);                                                  \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (result, base, exponent, modulus);                     \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    mpz_clear (modulus);                                                \
    mpz_clear (exponent);                                               \
    mpz_clear (base);                                                   \
    mpz_clear (result);                                                 \
    return elapsed;                                                     \
  }
2534 
/* Time function (r, b, e, m) computing (m-2)^e mod m.  m is the s->size
   limbs at s->xp with its low bit forced to 1.  The exponent e is s->r when
   that is non-zero, otherwise ULONG_MAX/3, i.e. the alternating bit
   pattern 0x5555...55.  Returns the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function)                             \
  {                                                                     \
    mpz_t     r, b, m;                                                  \
    unsigned  long  e;                                                  \
    unsigned  i;                                                        \
    double    t;                                                        \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    mpz_init (r);                                                       \
                                                                        \
    /* force m to odd */                                                \
    mpz_init (m);                                                       \
    mpz_set_n (m, s->xp, s->size);                                      \
    PTR(m)[0] |= 1;                                                     \
                                                                        \
    /* default exponent unless the caller supplied one in s->r */       \
    e = (~ (unsigned long) 0) / 3;                                      \
    if (s->r != 0)                                                      \
      e = s->r;                                                         \
                                                                        \
    /* b = m - 2 */                                                     \
    mpz_init_set (b, m);                                                \
    mpz_sub_ui (b, b, 2);                                               \
                                                                        \
    /* same timer/reps ordering as the other routines in this file */   \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (r, b, e, m);                                            \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    mpz_clear (r);                                                      \
    mpz_clear (b);                                                      \
    mpz_clear (m);                                                      \
    return t;                                                           \
  }
2571 
2572 
/* Time "call", which may refer to the locals wp, wp2 (destinations of
   s->size limbs) and xp, yp (sources of s->size limbs).  s->r selects how
   sources overlap the destinations:
       0   xp = s->xp,       yp = s->yp        (no overlap)
       1   xp = wp
       2                     yp = wp2
       3   xp = wp,          yp = wp2
       4   xp = wp2,         yp = wp
   Any other s->r gives a -1.0 return.  When a source is redirected into a
   destination, the s->xp / s->yp data is copied there first.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)                             \
  {                                                                     \
    mp_ptr    wp, wp2, xp, yp;                                          \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp);                  \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);                 \
    xp = s->xp;                                                         \
    yp = s->yp;                                                         \
                                                                        \
    switch (s->r)                                                       \
      {                                                                 \
      case 0:                                                           \
        break;                                                          \
      case 1:                                                           \
        xp = wp;                                                        \
        break;                                                          \
      case 2:                                                           \
        yp = wp2;                                                       \
        break;                                                          \
      case 3:                                                           \
        xp = wp;                                                        \
        yp = wp2;                                                       \
        break;                                                          \
      case 4:                                                           \
        xp = wp2;                                                       \
        yp = wp;                                                        \
        break;                                                          \
      default:                                                          \
        TMP_FREE;                                                       \
        return -1.0;                                                    \
      }                                                                 \
                                                                        \
    if (xp != s->xp)                                                    \
      {                                                                 \
        MPN_COPY (xp, s->xp, s->size);                                  \
      }                                                                 \
    if (yp != s->yp)                                                    \
      {                                                                 \
        MPN_COPY (yp, s->yp, s->size);                                  \
      }                                                                 \
                                                                        \
    speed_operand_src (s, xp, s->size);                                 \
    speed_operand_src (s, yp, s->size);                                 \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_operand_dst (s, wp2, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        call;                                                           \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
2616 
/* Time function (wp, wp2, xp, yp, s->size) using the operand setup of
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  No semicolon is baked into the
   expansion; the user of the macro supplies one if wanted.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function)                            \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                                         \
    (function (wp, wp2, xp, yp, s->size))
2620 
/* As SPEED_ROUTINE_MPN_ADDSUB_N, but function takes an extra trailing
   argument, which is passed as 0.  No semicolon is baked into the
   expansion.  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)                           \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                                         \
    (function (wp, wp2, xp, yp, s->size, 0))
2624 
2625 
/* Doing an Nx1 gcd with the given r: times function (xp, s->size, s->r)
   where xp is a private copy of the s->size limbs at s->xp.  s->r must be
   non-zero.  The low limb of the copy is OR-ed with the result of
   refmpn_zero_p (presumably 1 for an all-zero operand, so the operand
   never ends up zero -- confirm against refmpn_zero_p).  */
#define SPEED_ROUTINE_MPN_GCD_1N(function)                              \
  {                                                                     \
    mp_ptr    xp;                                                       \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);                   \
    MPN_COPY (xp, s->xp, s->size);                                      \
    xp[0] |= refmpn_zero_p (xp, s->size);                               \
                                                                        \
    /* register the copy, since that is what the timed call reads */    \
    speed_operand_src (s, xp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (xp, s->size, s->r);                                     \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
2655 
2656 
/* SPEED_BLOCK_SIZE many single-limb operations per rep.  The x values are
   copies of s->xp_block masked to s->size bits; the y values are copies of
   s->yp_block masked to s->r bits, or to s->size bits when s->r is 0.  Any
   value that masks to zero is bumped to 1.

   "setup" is run once per index i right after px[i],py[i] have been
   prepared.  "call" is run in the timed loop with j counting down from
   SPEED_BLOCK_SIZE to 1, so it should operate on px[j-1],py[j-1].
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)                       \
  {                                                                     \
    unsigned  i, j;                                                     \
    mp_ptr    px, py;                                                   \
    mp_limb_t x_mask, y_mask;                                           \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);                  \
    /* s->r is used as a bit count too; more than a limb's worth would  \
       make the mask shift below undefined */                           \
    SPEED_RESTRICT_COND ((mp_limb_t) s->r                               \
                         <= (mp_limb_t) mp_bits_per_limb);              \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);          \
    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);          \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);                       \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);                       \
                                                                        \
    x_mask = MP_LIMB_T_LOWBITMASK (s->size);                            \
    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);         \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++)                              \
      {                                                                 \
        /* mask to the requested width, then avoid zero operands */     \
        px[i] &= x_mask; px[i] += (px[i] == 0);                         \
        py[i] &= y_mask; py[i] += (py[i] == 0);                         \
        setup;                                                          \
      }                                                                 \
                                                                        \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE);                        \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE);                        \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            call;                                                       \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return t;                                                           \
  }
2708 
/* Time function (&x, 1, y) over the x,y pairs prepared by
   SPEED_ROUTINE_MPN_GCD_1_CALL; no extra per-pair setup is given.  */
#define SPEED_ROUTINE_MPN_GCD_1(function)                               \
  SPEED_ROUTINE_MPN_GCD_1_CALL (/* no setup */,                         \
                                function (px + (j - 1), 1, py[j - 1]))
2711 
/* Time function (x, y, 0) over the pairs prepared by
   SPEED_ROUTINE_MPN_GCD_1_CALL.  The setup step reduces x modulo y, then
   sets the low bit of both, and replaces y == 1 by 3.  */
#define SPEED_ROUTINE_MPN_JACBASE(function)                             \
  SPEED_ROUTINE_MPN_GCD_1_CALL                                          \
    ({                                                                  \
       /* require x<y, y odd, y!=1 */                                   \
       /* NOTE(review): for odd y with x%y == y-1 this leaves x == y */ \
       px[i] = (px[i] % py[i]) | 1;                                     \
       py[i] |= 1;                                                      \
       if (py[i] == 1)                                                  \
         py[i] = 3;                                                     \
     },                                                                 \
     function (px[j - 1], py[j - 1], 0))
2722 
2723 
/* Time func (ap, bp, n, &matrix, scratch) with n = s->size, which must be
   at least 2 (otherwise -1 is returned).  itchfunc (n) gives the scratch
   size.  The operands are re-copied from s->xp / s->yp and the matrix is
   re-initialised inside the timed loop, so those costs are part of the
   measurement.  Note the low bit of the top limb of s->xp and s->yp
   themselves is set.  */
#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc)                     \
  {                                                                     \
    double elapsed;                                                     \
    unsigned remaining;                                                 \
    int status;                                                         \
    struct hgcd_matrix matrix;                                          \
    mp_ptr acopy, bcopy, scratch, matrix_mem;                           \
    mp_size_t matrix_itch, scratch_itch;                                \
    TMP_DECL;                                                           \
                                                                        \
    if (s->size < 2)                                                    \
      return -1;                                                        \
                                                                        \
    TMP_MARK;                                                           \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (acopy, s->size + 1, s->align_xp);            \
    SPEED_TMP_ALLOC_LIMBS (bcopy, s->size + 1, s->align_yp);            \
                                                                        \
    /* make sure both top limbs are non-zero */                         \
    s->xp[s->size - 1] |= 1;                                            \
    s->yp[s->size - 1] |= 1;                                            \
                                                                        \
    matrix_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);                  \
    scratch_itch = itchfunc (s->size);                                  \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (matrix_mem, matrix_itch, s->align_wp);       \
    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_itch, s->align_wp);         \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, acopy, s->size + 1);                          \
    speed_operand_dst (s, bcopy, s->size + 1);                          \
    speed_operand_dst (s, scratch, scratch_itch);                       \
    speed_operand_dst (s, matrix_mem, matrix_itch);                     \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        MPN_COPY (acopy, s->xp, s->size);                               \
        MPN_COPY (bcopy, s->yp, s->size);                               \
        mpn_hgcd_matrix_init (&matrix, s->size, matrix_mem);            \
        status = func (acopy, bcopy, s->size, &matrix, scratch);        \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
2773 
/* Time func (&matrix, ap, bp, n, p, scratch) with n = s->size (at least 2,
   otherwise -1 is returned) and p = n/2.  itchfunc (n, p) gives the
   scratch size.  The operands are re-copied from s->xp / s->yp and the
   matrix is re-initialised inside the timed loop.  Note the low bit of the
   top limb of s->xp and s->yp themselves is set.  */
#define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc)              \
  {                                                                     \
    double elapsed;                                                     \
    unsigned remaining;                                                 \
    int status;                                                         \
    mp_size_t split = s->size/2;                                        \
    struct hgcd_matrix matrix;                                          \
    mp_ptr acopy, bcopy, scratch, matrix_mem;                           \
    mp_size_t matrix_itch, scratch_itch;                                \
    TMP_DECL;                                                           \
                                                                        \
    if (s->size < 2)                                                    \
      return -1;                                                        \
                                                                        \
    TMP_MARK;                                                           \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (acopy, s->size + 1, s->align_xp);            \
    SPEED_TMP_ALLOC_LIMBS (bcopy, s->size + 1, s->align_yp);            \
                                                                        \
    /* make sure both top limbs are non-zero */                         \
    s->xp[s->size - 1] |= 1;                                            \
    s->yp[s->size - 1] |= 1;                                            \
                                                                        \
    matrix_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);                  \
    scratch_itch = itchfunc (s->size, split);                           \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (matrix_mem, matrix_itch, s->align_wp);       \
    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_itch, s->align_wp);         \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, acopy, s->size + 1);                          \
    speed_operand_dst (s, bcopy, s->size + 1);                          \
    speed_operand_dst (s, scratch, scratch_itch);                       \
    speed_operand_dst (s, matrix_mem, matrix_itch);                     \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        MPN_COPY (acopy, s->xp, s->size);                               \
        MPN_COPY (bcopy, s->yp, s->size);                               \
        mpn_hgcd_matrix_init (&matrix, s->size, matrix_mem);            \
        status = func (&matrix, acopy, bcopy, s->size, split, scratch); \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
2824 
/* Run some GCDs of s->size limbs each.  The number of different data values
   is decreased as s->size**2, since GCD is a quadratic algorithm.
   SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
   though, because the plain gcd is about twice as fast as gcdext.

   "datafactor" scales the number of data pieces: pieces is
   SPEED_BLOCK_SIZE * datafactor / size^2, capped at SPEED_BLOCK_SIZE / size
   and at least 1.  "call" is run once per piece per rep and may use the
   locals wp, wp2 (destinations, s->size+1 limbs each) and xtmp, ytmp (fresh
   copies of the current operand pair, s->size limbs).  s->time_divisor is
   set to the number of pieces.  */

#define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)                    \
  {                                                                     \
    unsigned  i;                                                        \
    mp_size_t j, pieces, psize;                                         \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;                              \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);               \
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);               \
    SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);               \
    SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);              \
                                                                        \
    /* datafactor is parenthesized so an expression argument keeps its  \
       own precedence */                                                \
    pieces = SPEED_BLOCK_SIZE * (datafactor) / s->size / s->size;       \
    pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);                  \
    pieces = MAX (pieces, 1);                                           \
                                                                        \
    /* a single piece may be longer than the block data, so it is taken \
       from s->xp / s->yp instead */                                    \
    psize = pieces * s->size;                                           \
    px = TMP_ALLOC_LIMBS (psize);                                       \
    py = TMP_ALLOC_LIMBS (psize);                                       \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);              \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);              \
                                                                        \
    /* Requirements: x >= y, y must be odd, high limbs != 0.            \
       No need to ensure random numbers are really great.  */           \
    for (j = 0; j < pieces; j++)                                        \
      {                                                                 \
        mp_ptr  x = px + j * s->size;                                   \
        mp_ptr  y = py + j * s->size;                                   \
        if (x[s->size - 1] == 0) x[s->size - 1] = 1;                    \
        if (y[s->size - 1] == 0) y[s->size - 1] = 1;                    \
                                                                        \
        if (x[s->size - 1] < y[s->size - 1])                            \
          MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);              \
        else if (x[s->size - 1] == y[s->size - 1])                      \
          {                                                             \
            x[s->size - 1] = 2;                                         \
            y[s->size - 1] = 1;                                         \
          }                                                             \
        y[0] |= 1;                                                      \
      }                                                                 \
                                                                        \
    speed_operand_src (s, px, psize);                                   \
    speed_operand_src (s, py, psize);                                   \
    speed_operand_dst (s, xtmp, s->size);                               \
    speed_operand_dst (s, ytmp, s->size);                               \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = pieces;                                                     \
        do                                                              \
          {                                                             \
            /* operands are copied afresh for every call */             \
            MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);               \
            MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);               \
            call;                                                       \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
                                                                        \
    s->time_divisor = pieces;                                           \
    return t;                                                           \
  }
2903 
/* Time function (wp, xtmp, size, ytmp, size) through
   SPEED_ROUTINE_MPN_GCD_CALL with a data factor of 8.  */
#define SPEED_ROUTINE_MPN_GCD(function)                                 \
  SPEED_ROUTINE_MPN_GCD_CALL                                            \
    (8, function (wp, xtmp, s->size, ytmp, s->size))
2906 
/* Time function (wp, wp2, &size_out, xtmp, size, ytmp, size) through
   SPEED_ROUTINE_MPN_GCD_CALL with a data factor of 4.  The size written
   through the third argument is discarded.  */
#define SPEED_ROUTINE_MPN_GCDEXT(function)                              \
  SPEED_ROUTINE_MPN_GCD_CALL                                            \
    (4,                                                                 \
     {                                                                  \
       mp_size_t  wp2_limbs;                                            \
       function (wp, wp2, &wp2_limbs, xtmp, s->size, ytmp, s->size);    \
     })
2911 
2912 
/* Time function (wp, wp2, &wp2size, xtmp, size, ytmp, size) where xtmp and
   ytmp start as copies of s->xp and s->yp and, before each call, have
   their three most significant limbs overwritten by the next 3-limb piece
   taken from s->xp_block / s->yp_block.  There are SPEED_BLOCK_SIZE/3
   pieces, and s->time_divisor is set to that count.

   Because three high limbs are patched, s->size must be at least 3;
   smaller sizes would index before the start of xtmp and ytmp.  */
#define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)                          \
  {                                                                     \
    unsigned  i;                                                        \
    mp_size_t j, pieces, psize, wp2size;                                \
    mp_ptr    wp, wp2, xtmp, ytmp, px, py;                              \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    /* room for the 3-limb high part written at xtmp[size-3 ..] */      \
    SPEED_RESTRICT_COND (s->size >= 3);                                 \
                                                                        \
    TMP_MARK;                                                           \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);               \
    SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);               \
    MPN_COPY (xtmp, s->xp, s->size);                                    \
    MPN_COPY (ytmp, s->yp, s->size);                                    \
                                                                        \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size+1, s->align_wp);                \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);               \
                                                                        \
    pieces = SPEED_BLOCK_SIZE / 3;                                      \
    psize = 3 * pieces;                                                 \
    px = TMP_ALLOC_LIMBS (psize);                                       \
    py = TMP_ALLOC_LIMBS (psize);                                       \
    MPN_COPY (px, s->xp_block, psize);                                  \
    MPN_COPY (py, s->yp_block, psize);                                  \
                                                                        \
    /* x must have at least as many bits as y,                          \
       high limbs must be non-zero */                                   \
    for (j = 0; j < pieces; j++)                                        \
      {                                                                 \
        mp_ptr  x = px+3*j;                                             \
        mp_ptr  y = py+3*j;                                             \
        x[2] += (x[2] == 0);                                            \
        y[2] += (y[2] == 0);                                            \
        if (x[2] < y[2])                                                \
          MP_LIMB_T_SWAP (x[2], y[2]);                                  \
      }                                                                 \
                                                                        \
    speed_operand_src (s, px, psize);                                   \
    speed_operand_src (s, py, psize);                                   \
    speed_operand_dst (s, xtmp, s->size);                               \
    speed_operand_dst (s, ytmp, s->size);                               \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        mp_ptr  x = px;                                                 \
        mp_ptr  y = py;                                                 \
        mp_ptr  xth = &xtmp[s->size-3];                                 \
        mp_ptr  yth = &ytmp[s->size-3];                                 \
        j = pieces;                                                     \
        do                                                              \
          {                                                             \
            /* install the next piece as the three high limbs */        \
            xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];                \
            yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];                \
                                                                        \
            ytmp[0] |= 1; /* y must be odd, */                          \
                                                                        \
            function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
                                                                        \
            x += 3;                                                     \
            y += 3;                                                     \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
                                                                        \
    s->time_divisor = pieces;                                           \
    return t;                                                           \
  }
2990 
/* Time function (a, b) over several operand pairs of s->size limbs each.
   The number of pieces is SPEED_BLOCK_SIZE / size, at least 1, and
   s->time_divisor is set to that count.  Each y has its low bit set and
   both x and y get a non-zero high limb.

   a and b are never mpz_init'ed: only their size and limb pointer fields
   are filled in, pointing into the prepared data.

   s->size must be at least 1, since the high-limb fix-up indexes
   x[size-1] and y[size-1].  */
#define SPEED_ROUTINE_MPZ_JACOBI(function)                              \
  {                                                                     \
    mpz_t     a, b;                                                     \
    unsigned  i;                                                        \
    mp_size_t j, pieces, psize;                                         \
    mp_ptr    px, py;                                                   \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    /* checked before TMP_MARK so the early return needs no TMP_FREE */ \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    pieces = SPEED_BLOCK_SIZE / s->size;                                \
    pieces = MAX (pieces, 1);                                           \
    s->time_divisor = pieces;                                           \
                                                                        \
    /* a single piece may be longer than the block data, so it is taken \
       from s->xp / s->yp instead */                                    \
    psize = pieces * s->size;                                           \
    px = TMP_ALLOC_LIMBS (psize);                                       \
    py = TMP_ALLOC_LIMBS (psize);                                       \
    MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);              \
    MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);              \
                                                                        \
    for (j = 0; j < pieces; j++)                                        \
      {                                                                 \
        mp_ptr  x = px+j*s->size;                                       \
        mp_ptr  y = py+j*s->size;                                       \
                                                                        \
        /* y odd */                                                     \
        y[0] |= 1;                                                      \
                                                                        \
        /* high limbs non-zero */                                       \
        if (x[s->size-1] == 0) x[s->size-1] = 1;                        \
        if (y[s->size-1] == 0) y[s->size-1] = 1;                        \
      }                                                                 \
                                                                        \
    SIZ(a) = s->size;                                                   \
    SIZ(b) = s->size;                                                   \
                                                                        \
    speed_operand_src (s, px, psize);                                   \
    speed_operand_src (s, py, psize);                                   \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = pieces;                                                     \
        do                                                              \
          {                                                             \
            PTR(a) = px+(j-1)*s->size;                                  \
            PTR(b) = py+(j-1)*s->size;                                  \
            function (a, b);                                            \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
3050 
/* Time function (wp, 0, xp, size, yp) with a two-limb divisor yp taken
   from the first two limbs of s->yp_block, its top limb OR-ed with
   GMP_NUMB_HIGHBIT.  xp is a private copy of s->xp (made once, before the
   timed loop).  s->size must be at least 2.  */
#define SPEED_ROUTINE_MPN_DIVREM_2(function)                            \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  remaining;                                                \
    mp_limb_t divisor[2];                                               \
    mp_ptr    quotient, dividend;                                       \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (dividend, s->size, s->align_xp);             \
    SPEED_TMP_ALLOC_LIMBS (quotient, s->size, s->align_wp);             \
                                                                        \
    /* the function overwrites its source, so work on a copy */         \
    MPN_COPY (dividend, s->xp, s->size);                                \
                                                                        \
    /* normalized divisor: high bit of the top limb set */              \
    divisor[0] = s->yp_block[0];                                        \
    divisor[1] = s->yp_block[1] | GMP_NUMB_HIGHBIT;                     \
                                                                        \
    speed_operand_src (s, dividend, s->size);                           \
    speed_operand_src (s, divisor, 2);                                  \
    speed_operand_dst (s, quotient, s->size);                           \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (quotient, 0, dividend, s->size, divisor);             \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3087 
/* Time function (wp, rp, s->xp, size, yp) with a two-limb divisor yp taken
   from the first two limbs of s->yp_block.  When "norm" is non-zero the
   top divisor limb gets GMP_NUMB_HIGHBIT set; otherwise that bit is
   cleared and a resulting zero top limb is replaced by 1.  s->size must be
   at least 2.  */
#define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm)                      \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  remaining;                                                \
    mp_limb_t remainder[2];                                             \
    mp_limb_t divisor[2];                                               \
    mp_ptr    quotient;                                                 \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (quotient, s->size, s->align_wp);             \
                                                                        \
    divisor[0] = s->yp_block[0];                                        \
    divisor[1] = s->yp_block[1];                                        \
    if (norm)                                                           \
      {                                                                 \
        /* normalized divisor */                                        \
        divisor[1] |= GMP_NUMB_HIGHBIT;                                 \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* unnormalized, but still a genuine two-limb divisor */        \
        divisor[1] &= ~GMP_NUMB_HIGHBIT;                                \
        if (divisor[1] == 0)                                            \
          divisor[1] = 1;                                               \
      }                                                                 \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, divisor, 2);                                  \
    speed_operand_dst (s, quotient, s->size);                           \
    speed_operand_dst (s, remainder, 2);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (quotient, remainder, s->xp, s->size, divisor);        \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3128 
/* Time function (n, n) SPEED_BLOCK_SIZE times per rep.  n starts at 1 and
   before each call has twice a limb of s->xp_block added to it, so n stays
   odd and each input depends on the previous result.  The block is walked
   from its last limb down to its first.  s->time_divisor is set to
   SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MODLIMB_INVERT(function)                          \
  {                                                                     \
    unsigned   i, j;                                                    \
    mp_limb_t  n = 1;                                                   \
    double     t;                                                       \
                                                                        \
    speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            /* randomized but successively dependent; indexed as j-1    \
               rather than through a pointer to one before the block */ \
            n += (s->xp_block[j - 1] << 1);                             \
                                                                        \
            function (n, n);                                            \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    /* make sure the compiler won't optimize away n */                  \
    noop_1 (n);                                                         \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return t;                                                           \
  }
3164 
3165 
/* Time function (wp, wp2, s->xp, size), with two destination areas of
   s->size limbs each.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_SQRTREM(function)                             \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  remaining;                                                \
    mp_ptr    out1, out2;                                               \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (out1, s->size, s->align_wp);                 \
    SPEED_TMP_ALLOC_LIMBS (out2, s->size, s->align_wp2);                \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, out1, s->size);                               \
    speed_operand_dst (s, out2, s->size);                               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (out1, out2, s->xp, s->size);                          \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3194 
/* Time function (wp, wp2, s->xp, size, s->r), with two destination areas
   of s->size limbs each.  s->size must be at least 1.  */
#define SPEED_ROUTINE_MPN_ROOTREM(function)                             \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  remaining;                                                \
    mp_ptr    out1, out2;                                               \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (out1, s->size, s->align_wp);                 \
    SPEED_TMP_ALLOC_LIMBS (out2, s->size, s->align_wp2);                \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, out1, s->size);                               \
    speed_operand_dst (s, out2, s->size);                               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        function (out1, out2, s->xp, s->size, s->r);                    \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3223 
3224 
/* Time function (str, base, xp, size).  s->size is the number of limbs in
   the input; s->r is the base, with 0 meaning decimal.  Bases outside
   2..256 are rejected.  The input is re-copied from s->xp into a private
   buffer before every call, inside the timed loop.  */
#define SPEED_ROUTINE_MPN_GET_STR(function)                             \
  {                                                                     \
    double elapsed;                                                     \
    unsigned remaining;                                                 \
    int radix;                                                          \
    mp_ptr input;                                                       \
    mp_size_t str_len;                                                  \
    unsigned char *str;                                                 \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    if (s->r == 0)                                                      \
      radix = 10;                                                       \
    else                                                                \
      radix = s->r;                                                     \
    SPEED_RESTRICT_COND (radix >= 2 && radix <= 256);                   \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (input, s->size + 1, s->align_xp);            \
                                                                        \
    MPN_SIZEINBASE (str_len, s->xp, s->size, radix);                    \
    str = TMP_ALLOC (str_len);                                          \
                                                                        \
    /* To guard against overflowing str during development, copy s->xp  \
       to input and ASSERT_ALWAYS that mpn_get_str returns no more than \
       str_len.  */                                                     \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, input, s->size);                              \
    speed_operand_dst (s, (mp_ptr) str, str_len/BYTES_PER_MP_LIMB);     \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do                                                                  \
      {                                                                 \
        MPN_COPY (input, s->xp, s->size);                               \
        function (str, radix, input, s->size);                          \
      }                                                                 \
    while (--remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3272 
/* Time "call" on a digit string.  s->size is the number of digits in the
   input; s->r is the base, with 0 meaning decimal.  Bases outside 2..256
   are rejected.  "call" may use the locals xp (the s->size digit bytes,
   each s->xp[i] % base), wp (destination of wn limbs, wn coming from
   LIMBS_PER_DIGIT_IN_BASE) and base.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call)                            \
  {                                                                     \
    unsigned char *xp;                                                  \
    mp_ptr     wp;                                                      \
    mp_size_t  wn;                                                      \
    unsigned   i;                                                       \
    int        base;                                                    \
    double     t;                                                       \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    if (s->r == 0)                                                      \
      base = 10;                                                        \
    else                                                                \
      base = s->r;                                                      \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256);                     \
                                                                        \
    TMP_MARK;                                                           \
                                                                        \
    /* one digit per byte, derived from the limb data */                \
    xp = TMP_ALLOC (s->size);                                           \
    for (i = 0; i < s->size; i++)                                       \
      {                                                                 \
        xp[i] = s->xp[i] % base;                                        \
      }                                                                 \
                                                                        \
    LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base);                        \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);                        \
                                                                        \
    /* To check wn is big enough during development, ASSERT_ALWAYS that \
       mpn_set_str (wp, xp, s->size, base) returns no more than wn.  */ \
                                                                        \
    speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB);      \
    speed_operand_dst (s, wp, wn);                                      \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        call;                                                           \
      }                                                                 \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
3318 
3319 
/* Run an accel gcd find_a() function over various data values.  A set of
   values is used in case some run particularly fast or slow.  The size
   parameter is ignored, the amount of data tested is fixed: function is
   called on SPEED_BLOCK_SIZE two-limb values per rep, each with an odd low
   limb (from s->xp_block) and a non-zero high limb (from s->yp_block).
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */

#define SPEED_ROUTINE_MPN_GCD_FINDA(function)                           \
  {                                                                     \
    double    elapsed;                                                  \
    unsigned  outer, inner;                                             \
    mp_limb_t pairs[SPEED_BLOCK_SIZE][2];                               \
    TMP_DECL;                                                           \
                                                                        \
    TMP_MARK;                                                           \
                                                                        \
    /* low must be odd, high must be non-zero */                        \
    for (outer = 0; outer < SPEED_BLOCK_SIZE; outer++)                  \
      {                                                                 \
        pairs[outer][0] = s->xp_block[outer] | 1;                       \
        pairs[outer][1] = (s->yp_block[outer] != 0                      \
                           ? s->yp_block[outer] : 1);                   \
      }                                                                 \
                                                                        \
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE);            \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    outer = s->reps;                                                    \
    do                                                                  \
      {                                                                 \
        inner = SPEED_BLOCK_SIZE;                                       \
        do                                                              \
          {                                                             \
            function (pairs[inner-1]);                                  \
          }                                                             \
        while (--inner != 0);                                           \
      }                                                                 \
    while (--outer != 0);                                               \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
    return elapsed;                                                     \
  }
3362 
3363 
/* "call" should do "count_foo_zeros(c,n)".
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   SPEED_ROUTINE_COUNT_ZEROS_A is only the opening half of the routine: it
   leaves the body, the rep loop and the per-limb loop open, to be followed
   by the call and then SPEED_ROUTINE_COUNT_ZEROS_B.  Before each call, n is
   loaded from the data and XOR-ed with the previous count c, so successive
   calls depend on each other.  If speed_routine_count_zeros_setup fails,
   the temporary allocation is released and -1.0 is returned.  */

#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero)                      \
  {                                                                     \
    mp_ptr     xp;                                                      \
    int        i, c;                                                    \
    unsigned   j;                                                       \
    mp_limb_t  n;                                                       \
    double     t;                                                       \
    TMP_DECL;                                                           \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp);          \
                                                                        \
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero))       \
      {                                                                 \
        /* don't leave the TMP mark outstanding on the failure path */  \
        TMP_FREE;                                                       \
        return -1.0;                                                    \
      }                                                                 \
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE);                        \
    speed_cache_fill (s);                                               \
                                                                        \
    c = 0;                                                              \
    speed_starttime ();                                                 \
    j = s->reps;                                                        \
    do {                                                                \
      for (i = 0; i < SPEED_BLOCK_SIZE; i++)                            \
        {                                                               \
          n = xp[i];                                                    \
          n ^= c;
3393 
/* Closing half of the count-zeros routine: closes the per-limb loop and
   the rep loop opened by SPEED_ROUTINE_COUNT_ZEROS_A, stops the timer,
   sets s->time_divisor to SPEED_BLOCK_SIZE and returns the elapsed time.
   Uses the locals c, j and t declared by the _A half.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B()                                   \
        }                                                               \
    } while (--j != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    /* don't let c go dead */                                           \
    noop_1 (c);                                                         \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
3407 
/* A whole count-zeros measuring routine in one piece: the _A prologue, the
   statement "call" to be measured, then the _B epilogue.  "leading" and
   "zero" are handed to SPEED_ROUTINE_COUNT_ZEROS_A unchanged.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero)                \
  do                                                                    \
    {                                                                   \
      SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);                      \
      call;                                                             \
      SPEED_ROUTINE_COUNT_ZEROS_B ();                                   \
    }                                                                   \
  while (0)
3414 
/* Leading-zeros measuring routines, i.e. SPEED_ROUTINE_COUNT_ZEROS_C with
   leading=1.  The _C form takes the complete statement to measure and the
   "zero" flag; the plain form measures function (c, n) with zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(stmt, zero)                 \
  SPEED_ROUTINE_COUNT_ZEROS_C (stmt, 1, zero)

#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(function)                     \
  SPEED_ROUTINE_COUNT_ZEROS_C (function (c, n), 1, 0)
3419 
/* Trailing-zeros measuring routines, i.e. SPEED_ROUTINE_COUNT_ZEROS_C with
   leading=0.  The _C form takes the complete statement to measure and the
   "zero" flag; the plain form measures fun (c, n) with zero=0, where fun
   is the macro argument.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero)                 \
  SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun)                         \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
3424 
3425 
/* Function body measuring "call", a statement that may read the limb d and
   is expected to update dinv.

   For each of s->reps passes the SPEED_BLOCK_SIZE limbs of s->xp_block are
   walked from the last to the first.  Each limb is XORed with the current
   dinv and has GMP_LIMB_HIGHBIT set to give d, then "call" is executed.
   s->time_divisor is set to SPEED_BLOCK_SIZE and the value of
   speed_endtime is returned.

   The block is indexed as xp[j-1] from its real base address, so no
   pointer before the start of s->xp_block is ever formed.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call)                            \
  {                                                                     \
    unsigned   i, j;                                                    \
    mp_limb_t  d, dinv=0;                                               \
    mp_ptr     xp = s->xp_block;                                        \
                                                                        \
    s->time_divisor = SPEED_BLOCK_SIZE;                                 \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      {                                                                 \
        j = SPEED_BLOCK_SIZE;                                           \
        do                                                              \
          {                                                             \
            /* j runs SPEED_BLOCK_SIZE..1, so j-1 is a valid index */   \
            d = dinv ^ xp[j-1];                                         \
            d |= GMP_LIMB_HIGHBIT;                                      \
            do { call; } while (0);                                     \
          }                                                             \
        while (--j != 0);                                               \
      }                                                                 \
    while (--i != 0);                                                   \
                                                                        \
    /* don't let the compiler optimize everything away */               \
    noop_1 (dinv);                                                      \
                                                                        \
    return speed_endtime();                                             \
  }
3454 
3455 
/* Function body timing s->reps consecutive calls of function (), which is
   invoked with no arguments.  Returns the value of speed_endtime.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function)                        \
  {                                                                     \
    unsigned  rep;                                                      \
                                                                        \
    speed_starttime ();                                                 \
    rep = s->reps;                                                      \
    do                                                                  \
      {                                                                 \
        function ();                                                    \
      }                                                                 \
    while (--rep != 0);                                                 \
                                                                        \
    return speed_endtime ();                                            \
  }
3466 
3467 
/* Function body measuring "call", a statement operating on the destination
   wp, which is allocated here with s->size limbs.  The routine is
   restricted to s->size >= 0.  wp is registered as a destination operand,
   the cache is primed, and "call" is executed s->reps times between
   speed_starttime and speed_endtime.  The temporary allocation is released
   before the measured time is returned.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call)                               \
  {                                                                     \
    mp_ptr    wp;                                                       \
    unsigned  rep;                                                      \
    double    elapsed;                                                  \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    rep = s->reps;                                                      \
    do                                                                  \
      {                                                                 \
        call;                                                           \
      }                                                                 \
    while (--rep != 0);                                                 \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
3492 
/* Measure fun (wp, s->size), where wp is the destination set up by
   SPEED_ROUTINE_MPN_ZERO_CALL.  */
#define SPEED_ROUTINE_MPN_ZERO(fun)                                     \
  SPEED_ROUTINE_MPN_ZERO_CALL (fun (wp, s->size))
3495 
3496 
3497 #endif
3498