1 /* Header for speed and threshold things. 2 3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010, 2011, 4 2012 Free Software Foundation, Inc. 5 6 This file is part of the GNU MP Library. 7 8 The GNU MP Library is free software; you can redistribute it and/or modify 9 it under the terms of the GNU Lesser General Public License as published by 10 the Free Software Foundation; either version 3 of the License, or (at your 11 option) any later version. 12 13 The GNU MP Library is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16 License for more details. 17 18 You should have received a copy of the GNU Lesser General Public License 19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 20 21 #ifndef __SPEED_H__ 22 #define __SPEED_H__ 23 24 25 /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it 26 newsize long. */ 27 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \ 28 do { \ 29 ASSERT ((newsize) >= (oldsize)); \ 30 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \ 31 } while (0) 32 33 /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on 34 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */ 35 #define MP_LIMB_T_LOWBITMASK(n) \ 36 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1) 37 38 39 /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */ 40 41 #define TMP_ALLOC_ALIGNED(bytes, align) \ 42 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align)) 43 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \ 44 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align)) 45 46 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the 47 limit on what s->align_xp etc and then request for off-alignment. 
Maybe 48 this should be an option of some sort, but in any case here are some line 49 sizes, 50 51 bytes 52 32 pentium 53 64 athlon 54 64 itanium-2 L1 55 128 itanium-2 L2 56 */ 57 #define CACHE_LINE_SIZE 64 /* bytes */ 58 59 #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1) 60 61 /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb 62 alignment. */ 63 #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \ 64 do { \ 65 mp_ptr __ptr; \ 66 mp_size_t __ptr_align, __ptr_add; \ 67 \ 68 ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0); \ 69 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \ 70 __ptr_align = (__ptr - (mp_ptr) NULL); \ 71 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \ 72 (ptr) = __ptr + __ptr_add; \ 73 } while (0) 74 75 76 /* This is the size for s->xp_block and s->yp_block, used in certain 77 routines that want to run across many different data values and use 78 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1. 79 80 512 means 2kbytes of data for each of xp_block and yp_block, making 4k 81 total, which should fit easily in any L1 data cache. 
*/ 82 83 #define SPEED_BLOCK_SIZE 512 /* limbs */ 84 85 86 extern double speed_unittime; 87 extern double speed_cycletime; 88 extern int speed_precision; 89 extern char speed_time_string[]; 90 void speed_time_init (void); 91 void speed_cycletime_fail (const char *str); 92 void speed_cycletime_init (void); 93 void speed_cycletime_need_cycles (void); 94 void speed_cycletime_need_seconds (void); 95 void speed_starttime (void); 96 double speed_endtime (void); 97 98 99 struct speed_params { 100 unsigned reps; /* how many times to run the routine */ 101 mp_ptr xp; /* first argument */ 102 mp_ptr yp; /* second argument */ 103 mp_size_t size; /* size of both arguments */ 104 mp_limb_t r; /* user supplied parameter */ 105 mp_size_t align_xp; /* alignment of xp */ 106 mp_size_t align_yp; /* alignment of yp */ 107 mp_size_t align_wp; /* intended alignment of wp */ 108 mp_size_t align_wp2; /* intended alignment of wp2 */ 109 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */ 110 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */ 111 112 double time_divisor; /* optionally set by the speed routine */ 113 114 /* used by the cache priming things */ 115 int cache; 116 unsigned src_num, dst_num; 117 struct { 118 mp_ptr ptr; 119 mp_size_t size; 120 } src[5], dst[4]; 121 }; 122 123 typedef double (*speed_function_t) (struct speed_params *); 124 125 double speed_measure (speed_function_t fun, struct speed_params *); 126 127 /* Prototypes for speed measuring routines */ 128 129 double speed_back_to_back (struct speed_params *); 130 double speed_count_leading_zeros (struct speed_params *); 131 double speed_count_trailing_zeros (struct speed_params *); 132 double speed_find_a (struct speed_params *); 133 double speed_gmp_allocate_free (struct speed_params *); 134 double speed_gmp_allocate_reallocate_free (struct speed_params *); 135 double speed_invert_limb (struct speed_params *); 136 double speed_malloc_free (struct speed_params *); 137 double 
speed_malloc_realloc_free (struct speed_params *); 138 double speed_memcpy (struct speed_params *); 139 double speed_binvert_limb (struct speed_params *); 140 double speed_binvert_limb_mul1 (struct speed_params *); 141 double speed_binvert_limb_loop (struct speed_params *); 142 double speed_binvert_limb_cond (struct speed_params *); 143 double speed_binvert_limb_arith (struct speed_params *); 144 145 double speed_mpf_init_clear (struct speed_params *); 146 147 double speed_mpn_add_n (struct speed_params *); 148 double speed_mpn_add_err1_n (struct speed_params *); 149 double speed_mpn_add_err2_n (struct speed_params *); 150 double speed_mpn_add_err3_n (struct speed_params *); 151 double speed_mpn_addcnd_n (struct speed_params *); 152 double speed_mpn_addlsh_n (struct speed_params *); 153 double speed_mpn_addlsh1_n (struct speed_params *); 154 double speed_mpn_addlsh2_n (struct speed_params *); 155 double speed_mpn_addlsh_n_ip1 (struct speed_params *); 156 double speed_mpn_addlsh1_n_ip1 (struct speed_params *); 157 double speed_mpn_addlsh2_n_ip1 (struct speed_params *); 158 double speed_mpn_addlsh_n_ip2 (struct speed_params *); 159 double speed_mpn_addlsh1_n_ip2 (struct speed_params *); 160 double speed_mpn_addlsh2_n_ip2 (struct speed_params *); 161 double speed_mpn_add_n_sub_n (struct speed_params *); 162 double speed_mpn_and_n (struct speed_params *); 163 double speed_mpn_andn_n (struct speed_params *); 164 double speed_mpn_addmul_1 (struct speed_params *); 165 double speed_mpn_addmul_2 (struct speed_params *); 166 double speed_mpn_addmul_3 (struct speed_params *); 167 double speed_mpn_addmul_4 (struct speed_params *); 168 double speed_mpn_addmul_5 (struct speed_params *); 169 double speed_mpn_addmul_6 (struct speed_params *); 170 double speed_mpn_addmul_7 (struct speed_params *); 171 double speed_mpn_addmul_8 (struct speed_params *); 172 double speed_mpn_com (struct speed_params *); 173 double speed_mpn_copyd (struct speed_params *); 174 double speed_mpn_copyi 
(struct speed_params *); 175 double speed_MPN_COPY (struct speed_params *); 176 double speed_MPN_COPY_DECR (struct speed_params *); 177 double speed_MPN_COPY_INCR (struct speed_params *); 178 double speed_mpn_tabselect (struct speed_params *); 179 double speed_mpn_divexact_1 (struct speed_params *); 180 double speed_mpn_divexact_by3 (struct speed_params *); 181 double speed_mpn_bdiv_q_1 (struct speed_params *); 182 double speed_mpn_pi1_bdiv_q_1 (struct speed_params *); 183 double speed_mpn_bdiv_dbm1c (struct speed_params *); 184 double speed_mpn_divrem_1 (struct speed_params *); 185 double speed_mpn_divrem_1f (struct speed_params *); 186 double speed_mpn_divrem_1c (struct speed_params *); 187 double speed_mpn_divrem_1cf (struct speed_params *); 188 double speed_mpn_divrem_1_div (struct speed_params *); 189 double speed_mpn_divrem_1f_div (struct speed_params *); 190 double speed_mpn_divrem_1_inv (struct speed_params *); 191 double speed_mpn_divrem_1f_inv (struct speed_params *); 192 double speed_mpn_divrem_2 (struct speed_params *); 193 double speed_mpn_divrem_2_div (struct speed_params *); 194 double speed_mpn_divrem_2_inv (struct speed_params *); 195 double speed_mpn_div_qr_2n (struct speed_params *); 196 double speed_mpn_div_qr_2u (struct speed_params *); 197 double speed_mpn_fib2_ui (struct speed_params *); 198 double speed_mpn_matrix22_mul (struct speed_params *); 199 double speed_mpn_hgcd (struct speed_params *); 200 double speed_mpn_hgcd_lehmer (struct speed_params *); 201 double speed_mpn_hgcd_appr (struct speed_params *); 202 double speed_mpn_hgcd_appr_lehmer (struct speed_params *); 203 double speed_mpn_hgcd_reduce (struct speed_params *); 204 double speed_mpn_hgcd_reduce_1 (struct speed_params *); 205 double speed_mpn_hgcd_reduce_2 (struct speed_params *); 206 double speed_mpn_gcd (struct speed_params *); 207 double speed_mpn_gcd_1 (struct speed_params *); 208 double speed_mpn_gcd_1N (struct speed_params *); 209 double speed_mpn_gcdext (struct 
speed_params *); 210 double speed_mpn_gcdext_double (struct speed_params *); 211 double speed_mpn_gcdext_one_double (struct speed_params *); 212 double speed_mpn_gcdext_one_single (struct speed_params *); 213 double speed_mpn_gcdext_single (struct speed_params *); 214 double speed_mpn_get_str (struct speed_params *); 215 double speed_mpn_hamdist (struct speed_params *); 216 double speed_mpn_ior_n (struct speed_params *); 217 double speed_mpn_iorn_n (struct speed_params *); 218 double speed_mpn_jacobi_base (struct speed_params *); 219 double speed_mpn_jacobi_base_1 (struct speed_params *); 220 double speed_mpn_jacobi_base_2 (struct speed_params *); 221 double speed_mpn_jacobi_base_3 (struct speed_params *); 222 double speed_mpn_jacobi_base_4 (struct speed_params *); 223 double speed_mpn_lshift (struct speed_params *); 224 double speed_mpn_lshiftc (struct speed_params *); 225 double speed_mpn_mod_1 (struct speed_params *); 226 double speed_mpn_mod_1c (struct speed_params *); 227 double speed_mpn_mod_1_div (struct speed_params *); 228 double speed_mpn_mod_1_inv (struct speed_params *); 229 double speed_mpn_mod_1_1 (struct speed_params *); 230 double speed_mpn_mod_1_1_1 (struct speed_params *); 231 double speed_mpn_mod_1_1_2 (struct speed_params *); 232 double speed_mpn_mod_1_2 (struct speed_params *); 233 double speed_mpn_mod_1_3 (struct speed_params *); 234 double speed_mpn_mod_1_4 (struct speed_params *); 235 double speed_mpn_mod_34lsub1 (struct speed_params *); 236 double speed_mpn_modexact_1_odd (struct speed_params *); 237 double speed_mpn_modexact_1c_odd (struct speed_params *); 238 double speed_mpn_mul_1 (struct speed_params *); 239 double speed_mpn_mul_1_inplace (struct speed_params *); 240 double speed_mpn_mul_2 (struct speed_params *); 241 double speed_mpn_mul_3 (struct speed_params *); 242 double speed_mpn_mul_4 (struct speed_params *); 243 double speed_mpn_mul_5 (struct speed_params *); 244 double speed_mpn_mul_6 (struct speed_params *); 245 double 
speed_mpn_mul (struct speed_params *); 246 double speed_mpn_mul_basecase (struct speed_params *); 247 double speed_mpn_mulmid (struct speed_params *); 248 double speed_mpn_mulmid_basecase (struct speed_params *); 249 double speed_mpn_mul_fft (struct speed_params *); 250 double speed_mpn_mul_fft_sqr (struct speed_params *); 251 double speed_mpn_fft_mul (struct speed_params *); 252 double speed_mpn_fft_sqr (struct speed_params *); 253 #if WANT_OLD_FFT_FULL 254 double speed_mpn_mul_fft_full (struct speed_params *); 255 double speed_mpn_mul_fft_full_sqr (struct speed_params *); 256 #endif 257 double speed_mpn_nussbaumer_mul (struct speed_params *); 258 double speed_mpn_nussbaumer_mul_sqr (struct speed_params *); 259 double speed_mpn_mul_n (struct speed_params *); 260 double speed_mpn_mul_n_sqr (struct speed_params *); 261 double speed_mpn_mulmid_n (struct speed_params *); 262 double speed_mpn_mullo_n (struct speed_params *); 263 double speed_mpn_mullo_basecase (struct speed_params *); 264 double speed_mpn_nand_n (struct speed_params *); 265 double speed_mpn_nior_n (struct speed_params *); 266 double speed_mpn_popcount (struct speed_params *); 267 double speed_mpn_preinv_divrem_1 (struct speed_params *); 268 double speed_mpn_preinv_divrem_1f (struct speed_params *); 269 double speed_mpn_preinv_mod_1 (struct speed_params *); 270 double speed_mpn_sbpi1_div_qr (struct speed_params *); 271 double speed_mpn_dcpi1_div_qr (struct speed_params *); 272 double speed_mpn_sbpi1_divappr_q (struct speed_params *); 273 double speed_mpn_dcpi1_divappr_q (struct speed_params *); 274 double speed_mpn_mu_div_qr (struct speed_params *); 275 double speed_mpn_mu_divappr_q (struct speed_params *); 276 double speed_mpn_mupi_div_qr (struct speed_params *); 277 double speed_mpn_mu_div_q (struct speed_params *); 278 double speed_mpn_sbpi1_bdiv_qr (struct speed_params *); 279 double speed_mpn_dcpi1_bdiv_qr (struct speed_params *); 280 double speed_mpn_sbpi1_bdiv_q (struct speed_params *); 281 
/* Further mpn measuring routines; each has the speed_function_t
   signature.  */
double speed_mpn_dcpi1_bdiv_q (struct speed_params *);
double speed_mpn_mu_bdiv_q (struct speed_params *);
double speed_mpn_mu_bdiv_qr (struct speed_params *);
double speed_mpn_broot (struct speed_params *);
double speed_mpn_broot_invm1 (struct speed_params *);
double speed_mpn_brootinv (struct speed_params *);
double speed_mpn_invert (struct speed_params *);
double speed_mpn_invertappr (struct speed_params *);
double speed_mpn_ni_invertappr (struct speed_params *);
double speed_mpn_binvert (struct speed_params *);
double speed_mpn_redc_1 (struct speed_params *);
double speed_mpn_redc_2 (struct speed_params *);
double speed_mpn_redc_n (struct speed_params *);
double speed_mpn_rsblsh_n (struct speed_params *);
double speed_mpn_rsblsh1_n (struct speed_params *);
double speed_mpn_rsblsh2_n (struct speed_params *);
double speed_mpn_rsh1add_n (struct speed_params *);
double speed_mpn_rsh1sub_n (struct speed_params *);
double speed_mpn_rshift (struct speed_params *);
double speed_mpn_sb_divrem_m3 (struct speed_params *);
double speed_mpn_sb_divrem_m3_div (struct speed_params *);
double speed_mpn_sb_divrem_m3_inv (struct speed_params *);
double speed_mpn_set_str (struct speed_params *);
double speed_mpn_bc_set_str (struct speed_params *);
double speed_mpn_dc_set_str (struct speed_params *);
double speed_mpn_set_str_pre (struct speed_params *);
double speed_mpn_sqr_basecase (struct speed_params *);
double speed_mpn_sqr_diag_addlsh1 (struct speed_params *);
double speed_mpn_sqr_diagonal (struct speed_params *);
double speed_mpn_sqr (struct speed_params *);
double speed_mpn_sqrtrem (struct speed_params *);
double speed_mpn_rootrem (struct speed_params *);
double speed_mpn_sub_n (struct speed_params *);
double speed_mpn_sub_err1_n (struct speed_params *);
double speed_mpn_sub_err2_n (struct speed_params *);
double speed_mpn_sub_err3_n (struct speed_params *);
double speed_mpn_subcnd_n (struct speed_params *);
double speed_mpn_sublsh_n (struct speed_params *);
double speed_mpn_sublsh1_n (struct speed_params *);
double speed_mpn_sublsh2_n (struct speed_params *);
double speed_mpn_sublsh_n_ip1 (struct speed_params *);
double speed_mpn_sublsh1_n_ip1 (struct speed_params *);
double speed_mpn_sublsh2_n_ip1 (struct speed_params *);
double speed_mpn_submul_1 (struct speed_params *);
double speed_mpn_toom2_sqr (struct speed_params *);
double speed_mpn_toom3_sqr (struct speed_params *);
double speed_mpn_toom4_sqr (struct speed_params *);
double speed_mpn_toom6_sqr (struct speed_params *);
double speed_mpn_toom8_sqr (struct speed_params *);
double speed_mpn_toom22_mul (struct speed_params *);
double speed_mpn_toom33_mul (struct speed_params *);
double speed_mpn_toom44_mul (struct speed_params *);
double speed_mpn_toom6h_mul (struct speed_params *);
double speed_mpn_toom8h_mul (struct speed_params *);
double speed_mpn_toom32_mul (struct speed_params *);
double speed_mpn_toom42_mul (struct speed_params *);
double speed_mpn_toom43_mul (struct speed_params *);
double speed_mpn_toom63_mul (struct speed_params *);
double speed_mpn_toom32_for_toom43_mul (struct speed_params *);
double speed_mpn_toom43_for_toom32_mul (struct speed_params *);
double speed_mpn_toom32_for_toom53_mul (struct speed_params *);
double speed_mpn_toom53_for_toom32_mul (struct speed_params *);
double speed_mpn_toom42_for_toom53_mul (struct speed_params *);
double speed_mpn_toom53_for_toom42_mul (struct speed_params *);
double speed_mpn_toom43_for_toom54_mul (struct speed_params *);
double speed_mpn_toom54_for_toom43_mul (struct speed_params *);
double speed_mpn_toom42_mulmid (struct speed_params *);
double speed_mpn_mulmod_bnm1 (struct speed_params *);
double speed_mpn_bc_mulmod_bnm1 (struct speed_params *);
/* Remaining measuring routines; each has the speed_function_t
   signature.  */
double speed_mpn_mulmod_bnm1_rounded (struct speed_params *);
double speed_mpn_sqrmod_bnm1 (struct speed_params *);
double speed_mpn_udiv_qrnnd (struct speed_params *);
double speed_mpn_udiv_qrnnd_r (struct speed_params *);
double speed_mpn_umul_ppmm (struct speed_params *);
double speed_mpn_umul_ppmm_r (struct speed_params *);
double speed_mpn_xnor_n (struct speed_params *);
double speed_mpn_xor_n (struct speed_params *);
double speed_MPN_ZERO (struct speed_params *);

double speed_mpq_init_clear (struct speed_params *);

double speed_mpz_add (struct speed_params *);
double speed_mpz_bin_uiui (struct speed_params *);
double speed_mpz_bin_ui (struct speed_params *);
double speed_mpz_fac_ui (struct speed_params *);
double speed_mpz_fib_ui (struct speed_params *);
double speed_mpz_fib2_ui (struct speed_params *);
double speed_mpz_init_clear (struct speed_params *);
double speed_mpz_init_realloc_clear (struct speed_params *);
double speed_mpz_jacobi (struct speed_params *);
double speed_mpz_lucnum_ui (struct speed_params *);
double speed_mpz_lucnum2_ui (struct speed_params *);
double speed_mpz_mod (struct speed_params *);
double speed_mpz_powm (struct speed_params *);
double speed_mpz_powm_mod (struct speed_params *);
double speed_mpz_powm_redc (struct speed_params *);
double speed_mpz_powm_sec (struct speed_params *);
double speed_mpz_powm_ui (struct speed_params *);
double speed_mpz_urandomb (struct speed_params *);

double speed_gmp_randseed (struct speed_params *);
double speed_gmp_randseed_ui (struct speed_params *);

double speed_noop (struct speed_params *);
double speed_noop_wxs (struct speed_params *);
double speed_noop_wxys (struct speed_params *);

double speed_operator_div (struct speed_params *);
double speed_operator_mod (struct speed_params *);

double speed_udiv_qrnnd (struct speed_params *);
392 double speed_udiv_qrnnd_preinv1 (struct speed_params *); 393 double speed_udiv_qrnnd_preinv2 (struct speed_params *); 394 double speed_udiv_qrnnd_preinv3 (struct speed_params *); 395 double speed_udiv_qrnnd_c (struct speed_params *); 396 double speed_umul_ppmm (struct speed_params *); 397 398 /* Prototypes for other routines */ 399 400 /* low 32-bits in p[0], high 32-bits in p[1] */ 401 void speed_cyclecounter (unsigned p[2]); 402 403 void mftb_function (unsigned p[2]); 404 405 /* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy 406 output or a clobber for the cpuid, hence an explicit save and restore. A 407 clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use 408 the dummy output style in non-PIC, so there's an error if somehow -fPIC 409 is used without a -DPIC to tell us about it. */ 410 #if defined(__GNUC__) && ! defined (NO_ASM) \ 411 && (defined (__i386__) || defined (__i486__)) 412 #if defined (PIC) || defined (__APPLE_CC__) 413 #define speed_cyclecounter(p) \ 414 do { \ 415 int __speed_cyclecounter__save_ebx; \ 416 int __speed_cyclecounter__dummy; \ 417 __asm__ __volatile__ ("movl %%ebx, %1\n" \ 418 "cpuid\n" \ 419 "movl %1, %%ebx\n" \ 420 "rdtsc" \ 421 : "=a" ((p)[0]), \ 422 "=&rm" (__speed_cyclecounter__save_ebx), \ 423 "=c" (__speed_cyclecounter__dummy), \ 424 "=d" ((p)[1])); \ 425 } while (0) 426 #else 427 #define speed_cyclecounter(p) \ 428 do { \ 429 int __speed_cyclecounter__dummy1; \ 430 int __speed_cyclecounter__dummy2; \ 431 __asm__ __volatile__ ("cpuid\n" \ 432 "rdtsc" \ 433 : "=a" ((p)[0]), \ 434 "=b" (__speed_cyclecounter__dummy1), \ 435 "=c" (__speed_cyclecounter__dummy2), \ 436 "=d" ((p)[1])); \ 437 } while (0) 438 #endif 439 #endif 440 441 double speed_cyclecounter_diff (const unsigned [2], const unsigned [2]); 442 int gettimeofday_microseconds_p (void); 443 int getrusage_microseconds_p (void); 444 int cycles_works_p (void); 445 long clk_tck (void); 446 double freq_measure (const char *, double 
(*)(void)); 447 448 int double_cmp_ptr (const double *, const double *); 449 void pentium_wbinvd (void); 450 typedef int (*qsort_function_t) (const void *, const void *); 451 452 void noop (void); 453 void noop_1 (mp_limb_t); 454 void noop_wxs (mp_ptr, mp_srcptr, mp_size_t); 455 void noop_wxys (mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); 456 void mpn_cache_fill (mp_srcptr, mp_size_t); 457 void mpn_cache_fill_dummy (mp_limb_t); 458 void speed_cache_fill (struct speed_params *); 459 void speed_operand_src (struct speed_params *, mp_ptr, mp_size_t); 460 void speed_operand_dst (struct speed_params *, mp_ptr, mp_size_t); 461 462 extern int speed_option_addrs; 463 extern int speed_option_verbose; 464 extern int speed_option_cycles_broken; 465 void speed_option_set (const char *); 466 467 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 468 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t); 469 mp_limb_t mpn_divrem_2_div (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 470 mp_limb_t mpn_divrem_2_inv (mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr); 471 472 int mpn_jacobi_base_1 (mp_limb_t, mp_limb_t, int); 473 int mpn_jacobi_base_2 (mp_limb_t, mp_limb_t, int); 474 int mpn_jacobi_base_3 (mp_limb_t, mp_limb_t, int); 475 int mpn_jacobi_base_4 (mp_limb_t, mp_limb_t, int); 476 477 mp_limb_t mpn_mod_1_div (mp_srcptr, mp_size_t, mp_limb_t); 478 mp_limb_t mpn_mod_1_inv (mp_srcptr, mp_size_t, mp_limb_t); 479 480 mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); 481 mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); 482 483 void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); 484 void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); 485 486 mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 487 mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 488 mp_size_t mpn_gcdext_single 
(mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 489 mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t); 490 mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 491 mp_size_t mpn_hgcd_lehmer_itch (mp_size_t); 492 493 mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr); 494 mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t); 495 496 mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 497 mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t); 498 499 mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr); 500 mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t); 501 502 mp_limb_t mpn_sb_divrem_mn_div (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 503 mp_limb_t mpn_sb_divrem_mn_inv (mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t); 504 505 mp_size_t mpn_set_str_basecase (mp_ptr, const unsigned char *, size_t, int); 506 void mpn_pre_set_str (mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr); 507 508 void mpz_powm_mod (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 509 void mpz_powm_redc (mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr); 510 511 int speed_routine_count_zeros_setup (struct speed_params *, mp_ptr, int, int); 512 513 514 /* "get" is called repeatedly until it ticks over, just in case on a fast 515 processor it takes less than a microsecond, though this is probably 516 unlikely if it's a system call. 517 518 speed_cyclecounter is called on the same side of the "get" for the start 519 and end measurements. It doesn't matter how long it takes from the "get" 520 sample to the cycles sample, since that period will cancel out in the 521 difference calculation (assuming it's the same each time). 
522 523 Letting the test run for more than a process time slice is probably only 524 going to reduce accuracy, especially for getrusage when the cycle counter 525 is real time, or for gettimeofday if the cycle counter is in fact process 526 time. Use CLK_TCK/2 as a reasonable stop. 527 528 It'd be desirable to be quite accurate here. The default speed_precision 529 for a cycle counter is 10000 cycles, so to mix that with getrusage or 530 gettimeofday the frequency should be at least that accurate. But running 531 measurements for 10000 microseconds (or more) is too long. Be satisfied 532 with just a half clock tick (5000 microseconds usually). */ 533 534 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \ 535 do { \ 536 type st1, st, et1, et; \ 537 unsigned sc[2], ec[2]; \ 538 long dt, half_tick; \ 539 double dc, cyc; \ 540 \ 541 half_tick = (1000000L / clk_tck()) / 2; \ 542 \ 543 get (st1); \ 544 do { \ 545 get (st); \ 546 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \ 547 \ 548 getc (sc); \ 549 \ 550 for (;;) \ 551 { \ 552 get (et1); \ 553 do { \ 554 get (et); \ 555 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \ 556 \ 557 getc (ec); \ 558 \ 559 dc = speed_cyclecounter_diff (ec, sc); \ 560 \ 561 /* allow secs to cancel before multiplying */ \ 562 dt = sec(et) - sec(st); \ 563 dt = dt * 1000000L + (usec(et) - usec(st)); \ 564 \ 565 if (dt >= half_tick) \ 566 break; \ 567 } \ 568 \ 569 cyc = dt * 1e-6 / dc; \ 570 \ 571 if (speed_option_verbose >= 2) \ 572 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \ 573 name, dc, dt, cyc); \ 574 \ 575 return dt * 1e-6 / dc; \ 576 \ 577 } while (0) 578 579 580 581 582 /* The measuring routines use these big macros to save duplication for 583 similar forms. They also get used for some automatically generated 584 measuring of new implementations of functions. 
585 586 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a 587 function pointer is considered undesirable since it's not the way a 588 normal application will be calling, and some processors might do 589 different things with an indirect call, like not branch predicting, or 590 doing a full pipe flush. At least some of the "functions" measured are 591 actually macros too. 592 593 The net effect is to bloat the object code, possibly in a big way, but 594 only what's being measured is being run, so that doesn't matter. 595 596 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or 597 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure 598 function pointer doesn't work in gcc 3.2. Using an actual non-pure 599 function pointer variable works, but stands a real risk of a 600 non-optimizing compiler generating unnecessary overheads in the call. 601 Currently the best idea is not to use those attributes for a timing 602 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and 603 gmp-impl.h to omit them from routines there. */ 604 605 #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0; 606 607 /* For mpn_copy or similar. 
*/ 608 #define SPEED_ROUTINE_MPN_COPY_CALL(call) \ 609 { \ 610 mp_ptr wp; \ 611 unsigned i; \ 612 double t; \ 613 TMP_DECL; \ 614 \ 615 SPEED_RESTRICT_COND (s->size >= 0); \ 616 \ 617 TMP_MARK; \ 618 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 619 \ 620 speed_operand_src (s, s->xp, s->size); \ 621 speed_operand_dst (s, wp, s->size); \ 622 speed_cache_fill (s); \ 623 \ 624 speed_starttime (); \ 625 i = s->reps; \ 626 do \ 627 call; \ 628 while (--i != 0); \ 629 t = speed_endtime (); \ 630 \ 631 TMP_FREE; \ 632 return t; \ 633 } 634 #define SPEED_ROUTINE_MPN_COPY(function) \ 635 SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size)) 636 637 #define SPEED_ROUTINE_MPN_TABSELECT(function) \ 638 SPEED_ROUTINE_MPN_COPY_CALL (function (wp, s->xp, s->size, 1, s->r)) 639 640 #define SPEED_ROUTINE_MPN_COPYC(function) \ 641 { \ 642 mp_ptr wp; \ 643 unsigned i; \ 644 double t; \ 645 TMP_DECL; \ 646 \ 647 SPEED_RESTRICT_COND (s->size >= 0); \ 648 \ 649 TMP_MARK; \ 650 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 651 \ 652 speed_operand_src (s, s->xp, s->size); \ 653 speed_operand_dst (s, wp, s->size); \ 654 speed_cache_fill (s); \ 655 \ 656 speed_starttime (); \ 657 i = s->reps; \ 658 do \ 659 function (wp, s->xp, s->size, 0); \ 660 while (--i != 0); \ 661 t = speed_endtime (); \ 662 \ 663 TMP_FREE; \ 664 return t; \ 665 } 666 667 /* s->size is still in limbs, and it's limbs which are copied, but 668 "function" takes a size in bytes not limbs. 
*/ 669 #define SPEED_ROUTINE_MPN_COPY_BYTES(function) \ 670 { \ 671 mp_ptr wp; \ 672 unsigned i; \ 673 double t; \ 674 TMP_DECL; \ 675 \ 676 SPEED_RESTRICT_COND (s->size >= 0); \ 677 \ 678 TMP_MARK; \ 679 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 680 \ 681 speed_operand_src (s, s->xp, s->size); \ 682 speed_operand_dst (s, wp, s->size); \ 683 speed_cache_fill (s); \ 684 \ 685 speed_starttime (); \ 686 i = s->reps; \ 687 do \ 688 function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \ 689 while (--i != 0); \ 690 t = speed_endtime (); \ 691 \ 692 TMP_FREE; \ 693 return t; \ 694 } 695 696 697 /* For mpn_add_n, mpn_sub_n, or similar. */ 698 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \ 699 { \ 700 mp_ptr wp; \ 701 mp_ptr xp, yp; \ 702 unsigned i; \ 703 double t; \ 704 TMP_DECL; \ 705 \ 706 SPEED_RESTRICT_COND (s->size >= 1); \ 707 \ 708 TMP_MARK; \ 709 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 710 \ 711 xp = s->xp; \ 712 yp = s->yp; \ 713 \ 714 if (s->r == 0) ; \ 715 else if (s->r == 1) { xp = wp; } \ 716 else if (s->r == 2) { yp = wp; } \ 717 else if (s->r == 3) { xp = wp; yp = wp; } \ 718 else if (s->r == 4) { yp = xp; } \ 719 else { \ 720 TMP_FREE; \ 721 return -1.0; \ 722 } \ 723 \ 724 /* initialize wp if operand overlap */ \ 725 if (xp == wp || yp == wp) \ 726 MPN_COPY (wp, s->xp, s->size); \ 727 \ 728 speed_operand_src (s, xp, s->size); \ 729 speed_operand_src (s, yp, s->size); \ 730 speed_operand_dst (s, wp, s->size); \ 731 speed_cache_fill (s); \ 732 \ 733 speed_starttime (); \ 734 i = s->reps; \ 735 do \ 736 call; \ 737 while (--i != 0); \ 738 t = speed_endtime (); \ 739 \ 740 TMP_FREE; \ 741 return t; \ 742 } 743 744 745 /* For mpn_aors_errK_n, where 1 <= K <= 3. 
*/ 746 #define SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL(call, K) \ 747 { \ 748 mp_ptr wp; \ 749 mp_ptr xp, yp; \ 750 mp_ptr zp[K]; \ 751 mp_limb_t ep[2*K]; \ 752 unsigned i; \ 753 double t; \ 754 TMP_DECL; \ 755 \ 756 SPEED_RESTRICT_COND (s->size >= 1); \ 757 \ 758 TMP_MARK; \ 759 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 760 \ 761 /* (don't have a mechnanism to specify zp alignments) */ \ 762 for (i = 0; i < K; i++) \ 763 SPEED_TMP_ALLOC_LIMBS (zp[i], s->size, 0); \ 764 \ 765 xp = s->xp; \ 766 yp = s->yp; \ 767 \ 768 if (s->r == 0) ; \ 769 else if (s->r == 1) { xp = wp; } \ 770 else if (s->r == 2) { yp = wp; } \ 771 else if (s->r == 3) { xp = wp; yp = wp; } \ 772 else if (s->r == 4) { yp = xp; } \ 773 else { \ 774 TMP_FREE; \ 775 return -1.0; \ 776 } \ 777 \ 778 /* initialize wp if operand overlap */ \ 779 if (xp == wp || yp == wp) \ 780 MPN_COPY (wp, s->xp, s->size); \ 781 \ 782 speed_operand_src (s, xp, s->size); \ 783 speed_operand_src (s, yp, s->size); \ 784 for (i = 0; i < K; i++) \ 785 speed_operand_src (s, zp[i], s->size); \ 786 speed_operand_dst (s, wp, s->size); \ 787 speed_cache_fill (s); \ 788 \ 789 speed_starttime (); \ 790 i = s->reps; \ 791 do \ 792 call; \ 793 while (--i != 0); \ 794 t = speed_endtime (); \ 795 \ 796 TMP_FREE; \ 797 return t; \ 798 } 799 800 #define SPEED_ROUTINE_MPN_BINARY_ERR1_N(function) \ 801 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], s->size, 0), 1) 802 803 #define SPEED_ROUTINE_MPN_BINARY_ERR2_N(function) \ 804 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], s->size, 0), 2) 805 806 #define SPEED_ROUTINE_MPN_BINARY_ERR3_N(function) \ 807 SPEED_ROUTINE_MPN_BINARY_ERR_N_CALL ((*function) (wp, xp, yp, ep, zp[0], zp[1], zp[2], s->size, 0), 3) 808 809 810 /* For mpn_add_n, mpn_sub_n, or similar. 
*/ 811 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \ 812 { \ 813 mp_ptr ap, sp; \ 814 mp_ptr xp, yp; \ 815 unsigned i; \ 816 double t; \ 817 TMP_DECL; \ 818 \ 819 SPEED_RESTRICT_COND (s->size >= 1); \ 820 \ 821 TMP_MARK; \ 822 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \ 823 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \ 824 \ 825 xp = s->xp; \ 826 yp = s->yp; \ 827 \ 828 if ((s->r & 1) != 0) { xp = ap; } \ 829 if ((s->r & 2) != 0) { yp = ap; } \ 830 if ((s->r & 4) != 0) { xp = sp; } \ 831 if ((s->r & 8) != 0) { yp = sp; } \ 832 if ((s->r & 3) == 3 || (s->r & 12) == 12) \ 833 { \ 834 TMP_FREE; \ 835 return -1.0; \ 836 } \ 837 \ 838 /* initialize ap if operand overlap */ \ 839 if (xp == ap || yp == ap) \ 840 MPN_COPY (ap, s->xp, s->size); \ 841 /* initialize sp if operand overlap */ \ 842 if (xp == sp || yp == sp) \ 843 MPN_COPY (sp, s->xp, s->size); \ 844 \ 845 speed_operand_src (s, xp, s->size); \ 846 speed_operand_src (s, yp, s->size); \ 847 speed_operand_dst (s, ap, s->size); \ 848 speed_operand_dst (s, sp, s->size); \ 849 speed_cache_fill (s); \ 850 \ 851 speed_starttime (); \ 852 i = s->reps; \ 853 do \ 854 call; \ 855 while (--i != 0); \ 856 t = speed_endtime (); \ 857 \ 858 TMP_FREE; \ 859 return t; \ 860 } 861 862 #define SPEED_ROUTINE_MPN_BINARY_N(function) \ 863 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size)) 864 865 #define SPEED_ROUTINE_MPN_BINARY_NC(function) \ 866 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0)) 867 868 869 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. 
*/
/* CALL may use wp (an s->size limb destination) and s.  It is run s->reps
   times between speed_starttime and speed_endtime.  */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { call; } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_UNARY_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

/* In-place variant: FUNCTION (wp, wp, size, r).  wp is started off as a
   copy of s->xp so the routine never reads an uninitialized block; apart
   from that one MPN_COPY the body matches SPEED_ROUTINE_MPN_UNARY_1_CALL.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { (*function) (wp, wp, s->size, s->r); } while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }

#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))

/* As SPEED_ROUTINE_MPN_UNARY_1_CALL, but first sets up two extra locals for
   CALL: shift, from count_trailing_zeros on s->r, and dinv, from
   binvert_limb on s->r >> shift.  Requires s->r != 0.  */
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size > 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_trailing_zeros (shift, s->r); \
    binvert_limb (dinv, s->r >> shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }
#define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
  SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \
  ((*function) (wp, s->xp, s->size, s->r, dinv, shift))

#define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))

#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))

/* The F variants pass s->size as the second argument and 0 as the operand
   size, the reverse of the plain variants above.  */
#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))

#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
  SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))


/* As SPEED_ROUTINE_MPN_UNARY_1_CALL, but first sets up two extra locals for
   CALL: shift, from count_leading_zeros on s->r, and dinv, from invert_limb
   on s->r << shift.  Requires s->r != 0.  The definition ends at the
   closing brace, with no continuation backslash after it, so it cannot run
   on into a following line.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
  { \
    unsigned shift; \
    mp_limb_t dinv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_leading_zeros (shift, s->r); \
    invert_limb (dinv, s->r << shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }

#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
  ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))

/* s->size limbs worth of fraction part */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
  SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
  ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))


/* s->r is duplicated to form the multiplier, defaulting to
   MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
   least it provides some control.  The multiplier is the N limb local
   array yp, and wp gets s->size + N-1 limbs.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
  { \
    mp_ptr wp; \
    mp_size_t wn; \
    unsigned i; \
    double t; \
    mp_limb_t yp[N]; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= N); \
    \
    TMP_MARK; \
    wn = s->size + N-1; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    for (i = 0; i < N; i++) \
      yp[i] = (s->r != 0 ? \
s->r : MP_BASES_BIG_BASE_10); \ 989 \ 990 speed_operand_src (s, s->xp, s->size); \ 991 speed_operand_src (s, yp, (mp_size_t) N); \ 992 speed_operand_dst (s, wp, wn); \ 993 speed_cache_fill (s); \ 994 \ 995 speed_starttime (); \ 996 i = s->reps; \ 997 do \ 998 function (wp, s->xp, s->size, yp); \ 999 while (--i != 0); \ 1000 t = speed_endtime (); \ 1001 \ 1002 TMP_FREE; \ 1003 return t; \ 1004 } 1005 1006 #define SPEED_ROUTINE_MPN_UNARY_2(function) \ 1007 SPEED_ROUTINE_MPN_UNARY_N (function, 2) 1008 #define SPEED_ROUTINE_MPN_UNARY_3(function) \ 1009 SPEED_ROUTINE_MPN_UNARY_N (function, 3) 1010 #define SPEED_ROUTINE_MPN_UNARY_4(function) \ 1011 SPEED_ROUTINE_MPN_UNARY_N (function, 4) 1012 #define SPEED_ROUTINE_MPN_UNARY_5(function) \ 1013 SPEED_ROUTINE_MPN_UNARY_N (function, 5) 1014 #define SPEED_ROUTINE_MPN_UNARY_6(function) \ 1015 SPEED_ROUTINE_MPN_UNARY_N (function, 6) 1016 #define SPEED_ROUTINE_MPN_UNARY_7(function) \ 1017 SPEED_ROUTINE_MPN_UNARY_N (function, 7) 1018 #define SPEED_ROUTINE_MPN_UNARY_8(function) \ 1019 SPEED_ROUTINE_MPN_UNARY_N (function, 8) 1020 1021 1022 /* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */ 1023 #define SPEED_ROUTINE_MPN_MUL(function) \ 1024 { \ 1025 mp_ptr wp; \ 1026 mp_size_t size1; \ 1027 unsigned i; \ 1028 double t; \ 1029 TMP_DECL; \ 1030 \ 1031 size1 = (s->r == 0 ? 
s->size : s->r); \ 1032 if (size1 < 0) size1 = -size1 - s->size; \ 1033 \ 1034 SPEED_RESTRICT_COND (size1 >= 1); \ 1035 SPEED_RESTRICT_COND (s->size >= size1); \ 1036 \ 1037 TMP_MARK; \ 1038 SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \ 1039 \ 1040 speed_operand_src (s, s->xp, s->size); \ 1041 speed_operand_src (s, s->yp, size1); \ 1042 speed_operand_dst (s, wp, size1 + s->size); \ 1043 speed_cache_fill (s); \ 1044 \ 1045 speed_starttime (); \ 1046 i = s->reps; \ 1047 do \ 1048 function (wp, s->xp, s->size, s->yp, size1); \ 1049 while (--i != 0); \ 1050 t = speed_endtime (); \ 1051 \ 1052 TMP_FREE; \ 1053 return t; \ 1054 } 1055 1056 1057 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \ 1058 { \ 1059 mp_ptr wp; \ 1060 unsigned i; \ 1061 double t; \ 1062 TMP_DECL; \ 1063 \ 1064 SPEED_RESTRICT_COND (s->size >= 1); \ 1065 \ 1066 TMP_MARK; \ 1067 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1068 \ 1069 speed_operand_src (s, s->xp, s->size); \ 1070 speed_operand_src (s, s->yp, s->size); \ 1071 speed_operand_dst (s, wp, 2*s->size); \ 1072 speed_cache_fill (s); \ 1073 \ 1074 speed_starttime (); \ 1075 i = s->reps; \ 1076 do \ 1077 call; \ 1078 while (--i != 0); \ 1079 t = speed_endtime (); \ 1080 \ 1081 TMP_FREE; \ 1082 return t; \ 1083 } 1084 1085 #define SPEED_ROUTINE_MPN_MUL_N(function) \ 1086 SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size)); 1087 1088 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \ 1089 { \ 1090 mp_ptr wp; \ 1091 unsigned i; \ 1092 double t; \ 1093 TMP_DECL; \ 1094 \ 1095 SPEED_RESTRICT_COND (s->size >= 1); \ 1096 \ 1097 TMP_MARK; \ 1098 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 1099 \ 1100 speed_operand_src (s, s->xp, s->size); \ 1101 speed_operand_src (s, s->yp, s->size); \ 1102 speed_operand_dst (s, wp, s->size); \ 1103 speed_cache_fill (s); \ 1104 \ 1105 speed_starttime (); \ 1106 i = s->reps; \ 1107 do \ 1108 call; \ 1109 while (--i != 0); \ 1110 t = speed_endtime (); \ 1111 \ 1112 TMP_FREE; \ 1113 
return t; \ 1114 } 1115 1116 #define SPEED_ROUTINE_MPN_MULLO_N(function) \ 1117 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size)); 1118 1119 /* For mpn_mul_basecase, xsize=r, ysize=s->size. */ 1120 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \ 1121 { \ 1122 mp_ptr wp; \ 1123 unsigned i; \ 1124 double t; \ 1125 TMP_DECL; \ 1126 \ 1127 SPEED_RESTRICT_COND (s->size >= 1); \ 1128 \ 1129 TMP_MARK; \ 1130 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 1131 \ 1132 speed_operand_src (s, s->xp, s->size); \ 1133 speed_operand_src (s, s->yp, s->size); \ 1134 speed_operand_dst (s, wp, s->size); \ 1135 speed_cache_fill (s); \ 1136 \ 1137 speed_starttime (); \ 1138 i = s->reps; \ 1139 do \ 1140 function (wp, s->xp, s->yp, s->size); \ 1141 while (--i != 0); \ 1142 t = speed_endtime (); \ 1143 \ 1144 TMP_FREE; \ 1145 return t; \ 1146 } 1147 1148 /* For mpn_mulmid, mpn_mulmid_basecase, xsize=r, ysize=s->size. */ 1149 #define SPEED_ROUTINE_MPN_MULMID(function) \ 1150 { \ 1151 mp_ptr wp, xp; \ 1152 mp_size_t size1; \ 1153 unsigned i; \ 1154 double t; \ 1155 TMP_DECL; \ 1156 \ 1157 size1 = (s->r == 0 ? 
(2 * s->size - 1) : s->r); \ 1158 \ 1159 SPEED_RESTRICT_COND (s->size >= 1); \ 1160 SPEED_RESTRICT_COND (size1 >= s->size); \ 1161 \ 1162 TMP_MARK; \ 1163 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1164 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1165 \ 1166 speed_operand_src (s, xp, size1); \ 1167 speed_operand_src (s, s->yp, s->size); \ 1168 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1169 speed_cache_fill (s); \ 1170 \ 1171 speed_starttime (); \ 1172 i = s->reps; \ 1173 do \ 1174 function (wp, xp, size1, s->yp, s->size); \ 1175 while (--i != 0); \ 1176 t = speed_endtime (); \ 1177 \ 1178 TMP_FREE; \ 1179 return t; \ 1180 } 1181 1182 #define SPEED_ROUTINE_MPN_MULMID_N(function) \ 1183 { \ 1184 mp_ptr wp, xp; \ 1185 mp_size_t size1; \ 1186 unsigned i; \ 1187 double t; \ 1188 TMP_DECL; \ 1189 \ 1190 size1 = 2 * s->size - 1; \ 1191 \ 1192 SPEED_RESTRICT_COND (s->size >= 1); \ 1193 \ 1194 TMP_MARK; \ 1195 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1196 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1197 \ 1198 speed_operand_src (s, xp, size1); \ 1199 speed_operand_src (s, s->yp, s->size); \ 1200 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1201 speed_cache_fill (s); \ 1202 \ 1203 speed_starttime (); \ 1204 i = s->reps; \ 1205 do \ 1206 function (wp, xp, s->yp, s->size); \ 1207 while (--i != 0); \ 1208 t = speed_endtime (); \ 1209 \ 1210 TMP_FREE; \ 1211 return t; \ 1212 } 1213 1214 #define SPEED_ROUTINE_MPN_TOOM42_MULMID(function) \ 1215 { \ 1216 mp_ptr wp, xp, scratch; \ 1217 mp_size_t size1, scratch_size; \ 1218 unsigned i; \ 1219 double t; \ 1220 TMP_DECL; \ 1221 \ 1222 size1 = 2 * s->size - 1; \ 1223 \ 1224 SPEED_RESTRICT_COND (s->size >= 1); \ 1225 \ 1226 TMP_MARK; \ 1227 SPEED_TMP_ALLOC_LIMBS (wp, size1 - s->size + 3, s->align_wp); \ 1228 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \ 1229 scratch_size = mpn_toom42_mulmid_itch (s->size); \ 1230 SPEED_TMP_ALLOC_LIMBS (scratch, scratch_size, 0); \ 
1231 \ 1232 speed_operand_src (s, xp, size1); \ 1233 speed_operand_src (s, s->yp, s->size); \ 1234 speed_operand_dst (s, wp, size1 - s->size + 3); \ 1235 speed_cache_fill (s); \ 1236 \ 1237 speed_starttime (); \ 1238 i = s->reps; \ 1239 do \ 1240 function (wp, xp, s->yp, s->size, scratch); \ 1241 while (--i != 0); \ 1242 t = speed_endtime (); \ 1243 \ 1244 TMP_FREE; \ 1245 return t; \ 1246 } 1247 1248 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \ 1249 { \ 1250 mp_ptr wp, tp; \ 1251 unsigned i; \ 1252 double t; \ 1253 mp_size_t itch; \ 1254 TMP_DECL; \ 1255 \ 1256 SPEED_RESTRICT_COND (s->size >= 1); \ 1257 \ 1258 itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \ 1259 \ 1260 TMP_MARK; \ 1261 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1262 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1263 \ 1264 speed_operand_src (s, s->xp, s->size); \ 1265 speed_operand_src (s, s->yp, s->size); \ 1266 speed_operand_dst (s, wp, 2 * s->size); \ 1267 speed_operand_dst (s, tp, itch); \ 1268 speed_cache_fill (s); \ 1269 \ 1270 speed_starttime (); \ 1271 i = s->reps; \ 1272 do \ 1273 call; \ 1274 while (--i != 0); \ 1275 t = speed_endtime (); \ 1276 \ 1277 TMP_FREE; \ 1278 return t; \ 1279 } 1280 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \ 1281 { \ 1282 mp_ptr wp, tp; \ 1283 unsigned i; \ 1284 double t; \ 1285 mp_size_t size, itch; \ 1286 TMP_DECL; \ 1287 \ 1288 SPEED_RESTRICT_COND (s->size >= 1); \ 1289 \ 1290 size = mpn_mulmod_bnm1_next_size (s->size); \ 1291 itch = mpn_mulmod_bnm1_itch (size, size, size); \ 1292 \ 1293 TMP_MARK; \ 1294 SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \ 1295 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \ 1296 \ 1297 speed_operand_src (s, s->xp, s->size); \ 1298 speed_operand_src (s, s->yp, s->size); \ 1299 speed_operand_dst (s, wp, size); \ 1300 speed_operand_dst (s, tp, itch); \ 1301 speed_cache_fill (s); \ 1302 \ 1303 speed_starttime (); \ 1304 i = s->reps; \ 1305 do \ 1306 function (wp, size, s->xp, 
s->size, s->yp, s->size, tp); \ 1307 while (--i != 0); \ 1308 t = speed_endtime (); \ 1309 \ 1310 TMP_FREE; \ 1311 return t; \ 1312 } 1313 1314 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \ 1315 { \ 1316 mp_ptr wp, tspace; \ 1317 unsigned i; \ 1318 double t; \ 1319 TMP_DECL; \ 1320 \ 1321 SPEED_RESTRICT_COND (s->size >= minsize); \ 1322 \ 1323 TMP_MARK; \ 1324 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1325 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1326 \ 1327 speed_operand_src (s, s->xp, s->size); \ 1328 speed_operand_src (s, s->yp, s->size); \ 1329 speed_operand_dst (s, wp, 2*s->size); \ 1330 speed_operand_dst (s, tspace, tsize); \ 1331 speed_cache_fill (s); \ 1332 \ 1333 speed_starttime (); \ 1334 i = s->reps; \ 1335 do \ 1336 call; \ 1337 while (--i != 0); \ 1338 t = speed_endtime (); \ 1339 \ 1340 TMP_FREE; \ 1341 return t; \ 1342 } 1343 1344 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \ 1345 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1346 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1347 mpn_toom22_mul_itch (s->size, s->size), \ 1348 MPN_TOOM22_MUL_MINSIZE) 1349 1350 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \ 1351 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1352 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1353 mpn_toom33_mul_itch (s->size, s->size), \ 1354 MPN_TOOM33_MUL_MINSIZE) 1355 1356 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \ 1357 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1358 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1359 mpn_toom44_mul_itch (s->size, s->size), \ 1360 MPN_TOOM44_MUL_MINSIZE) 1361 1362 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \ 1363 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1364 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \ 1365 mpn_toom6h_mul_itch (s->size, s->size), \ 1366 MPN_TOOM6H_MUL_MINSIZE) 1367 1368 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \ 1369 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1370 (function (wp, s->xp, s->size, s->yp, s->size, 
tspace), \
     mpn_toom8h_mul_itch (s->size, s->size), \
     MPN_TOOM8H_MUL_MINSIZE)

/* Unbalanced Toom multiplies: the first operand is s->size limbs and the
   second is the fraction of s->size written out in each call.  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
     mpn_toom32_mul_itch (s->size, 2*s->size/3), \
     MPN_TOOM32_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom42_mul_itch (s->size, s->size/2), \
     MPN_TOOM42_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
     mpn_toom43_mul_itch (s->size, s->size*3/4), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
     mpn_toom63_mul_itch (s->size, s->size/2), \
     MPN_TOOM63_MUL_MINSIZE)

/* The A_FOR_B pairs below run two algorithms at one shared operand ratio;
   each uses the itch function and MINSIZE of the algorithm named first.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom32_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
     mpn_toom43_mul_itch (s->size, 17*s->size/24), \
     MPN_TOOM43_MUL_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
     mpn_toom32_mul_itch (s->size, 19*s->size/30), \
     MPN_TOOM32_MUL_MINSIZE)
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE \
    (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1417 mpn_toom53_mul_itch (s->size, 19*s->size/30), \ 1418 MPN_TOOM53_MUL_MINSIZE) 1419 1420 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \ 1421 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1422 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1423 mpn_toom42_mul_itch (s->size, 11*s->size/20), \ 1424 MPN_TOOM42_MUL_MINSIZE) 1425 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \ 1426 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1427 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \ 1428 mpn_toom53_mul_itch (s->size, 11*s->size/20), \ 1429 MPN_TOOM53_MUL_MINSIZE) 1430 1431 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL(function) \ 1432 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1433 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1434 mpn_toom42_mul_itch (s->size, 5*s->size/6), \ 1435 MPN_TOOM54_MUL_MINSIZE) 1436 #define SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL(function) \ 1437 SPEED_ROUTINE_MPN_MUL_N_TSPACE \ 1438 (function (wp, s->xp, s->size, s->yp, 5*s->size/6, tspace), \ 1439 mpn_toom54_mul_itch (s->size, 5*s->size/6), \ 1440 MPN_TOOM54_MUL_MINSIZE) 1441 1442 1443 1444 #define SPEED_ROUTINE_MPN_SQR_CALL(call) \ 1445 { \ 1446 mp_ptr wp; \ 1447 unsigned i; \ 1448 double t; \ 1449 TMP_DECL; \ 1450 \ 1451 SPEED_RESTRICT_COND (s->size >= 1); \ 1452 \ 1453 TMP_MARK; \ 1454 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1455 \ 1456 speed_operand_src (s, s->xp, s->size); \ 1457 speed_operand_dst (s, wp, 2*s->size); \ 1458 speed_cache_fill (s); \ 1459 \ 1460 speed_starttime (); \ 1461 i = s->reps; \ 1462 do \ 1463 call; \ 1464 while (--i != 0); \ 1465 t = speed_endtime (); \ 1466 \ 1467 TMP_FREE; \ 1468 return t; \ 1469 } 1470 1471 #define SPEED_ROUTINE_MPN_SQR(function) \ 1472 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size)) 1473 1474 #define SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL(call) \ 1475 { \ 1476 mp_ptr wp, tp; \ 1477 unsigned i; \ 1478 double t; \ 1479 TMP_DECL; \ 1480 \ 1481 SPEED_RESTRICT_COND (s->size >= 2); \ 
1482 \ 1483 TMP_MARK; \ 1484 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_wp); \ 1485 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \ 1486 \ 1487 speed_operand_src (s, s->xp, s->size); \ 1488 speed_operand_src (s, tp, 2 * s->size); \ 1489 speed_operand_dst (s, wp, 2 * s->size); \ 1490 speed_cache_fill (s); \ 1491 \ 1492 speed_starttime (); \ 1493 i = s->reps; \ 1494 do \ 1495 call; \ 1496 while (--i != 0); \ 1497 t = speed_endtime () / 2; \ 1498 \ 1499 TMP_FREE; \ 1500 return t; \ 1501 } 1502 1503 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \ 1504 { \ 1505 mp_ptr wp, tspace; \ 1506 unsigned i; \ 1507 double t; \ 1508 TMP_DECL; \ 1509 \ 1510 SPEED_RESTRICT_COND (s->size >= minsize); \ 1511 \ 1512 TMP_MARK; \ 1513 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \ 1514 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \ 1515 \ 1516 speed_operand_src (s, s->xp, s->size); \ 1517 speed_operand_dst (s, wp, 2*s->size); \ 1518 speed_operand_dst (s, tspace, tsize); \ 1519 speed_cache_fill (s); \ 1520 \ 1521 speed_starttime (); \ 1522 i = s->reps; \ 1523 do \ 1524 call; \ 1525 while (--i != 0); \ 1526 t = speed_endtime (); \ 1527 \ 1528 TMP_FREE; \ 1529 return t; \ 1530 } 1531 1532 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \ 1533 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1534 mpn_toom2_sqr_itch (s->size), \ 1535 MPN_TOOM2_SQR_MINSIZE) 1536 1537 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \ 1538 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1539 mpn_toom3_sqr_itch (s->size), \ 1540 MPN_TOOM3_SQR_MINSIZE) 1541 1542 1543 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \ 1544 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1545 mpn_toom4_sqr_itch (s->size), \ 1546 MPN_TOOM4_SQR_MINSIZE) 1547 1548 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \ 1549 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \ 1550 mpn_toom6_sqr_itch (s->size), \ 1551 
MPN_TOOM6_SQR_MINSIZE)

#define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \
  SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
                                mpn_toom8_sqr_itch (s->size), \
                                MPN_TOOM8_SQR_MINSIZE)

/* CALL may use s.  Nothing is allocated; the only registered operand is
   the source s->xp with s->size limbs.  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call) \
  { \
    unsigned i; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { call; } while (--i != 0); \
    \
    return speed_endtime (); \
  }

#define SPEED_ROUTINE_MPN_MOD_1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MOD_1C(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))

/* The two MODEXACT wrappers expand without a trailing semicolon, like the
   other MOD wrappers; the user supplies it.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))

#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))

#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
  SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))

/* FUNCTION (s->xp, s->size, s->r, inv), with inv set by invert_limb on
   s->r before timing starts.  s->r must have GMP_LIMB_HIGHBIT set.  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
  { \
    unsigned i; \
    mp_limb_t inv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
    \
    invert_limb (inv, s->r); \
    speed_operand_src (s, s->xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do { (*function) (s->xp, s->size, s->r, inv); } while (--i != 0); \
    \
    return speed_endtime (); \
  }

/* Times PFUNC (inv, s->r) followed by FUNCTION together on every
   repetition; inv is a 4 limb local array.  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
  { \
    unsigned i; \
    mp_limb_t inv[4]; \
    \
    SPEED_RESTRICT_COND (s->size >= 2); \
    \
    mpn_mod_1_1p_cps (inv, s->r); \
    speed_operand_src (s, s->xp, \
s->size); \ 1621 speed_cache_fill (s); \ 1622 \ 1623 speed_starttime (); \ 1624 i = s->reps; \ 1625 do { \ 1626 pfunc (inv, s->r); \ 1627 function (s->xp, s->size, s->r << inv[1], inv); \ 1628 } while (--i != 0); \ 1629 \ 1630 return speed_endtime (); \ 1631 } 1632 #define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \ 1633 { \ 1634 unsigned i; \ 1635 mp_limb_t inv[N+3]; \ 1636 \ 1637 SPEED_RESTRICT_COND (s->size >= 1); \ 1638 SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \ 1639 \ 1640 speed_operand_src (s, s->xp, s->size); \ 1641 speed_cache_fill (s); \ 1642 \ 1643 speed_starttime (); \ 1644 i = s->reps; \ 1645 do { \ 1646 pfunc (inv, s->r); \ 1647 function (s->xp, s->size, s->r, inv); \ 1648 } while (--i != 0); \ 1649 \ 1650 return speed_endtime (); \ 1651 } 1652 1653 1654 /* A division of 2*s->size by s->size limbs */ 1655 1656 #define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \ 1657 { \ 1658 unsigned i; \ 1659 mp_ptr a, d, q, r; \ 1660 double t; \ 1661 gmp_pi1_t dinv; \ 1662 TMP_DECL; \ 1663 \ 1664 SPEED_RESTRICT_COND (s->size >= 1); \ 1665 \ 1666 TMP_MARK; \ 1667 SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \ 1668 SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \ 1669 SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \ 1670 SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \ 1671 \ 1672 MPN_COPY (a, s->xp, s->size); \ 1673 MPN_COPY (a+s->size, s->xp, s->size); \ 1674 \ 1675 MPN_COPY (d, s->yp, s->size); \ 1676 \ 1677 /* normalize the data */ \ 1678 d[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1679 a[2*s->size-1] = d[s->size-1] - 1; \ 1680 \ 1681 invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \ 1682 \ 1683 speed_operand_src (s, a, 2*s->size); \ 1684 speed_operand_src (s, d, s->size); \ 1685 speed_operand_dst (s, q, s->size+1); \ 1686 speed_operand_dst (s, r, s->size); \ 1687 speed_cache_fill (s); \ 1688 \ 1689 speed_starttime (); \ 1690 i = s->reps; \ 1691 do \ 1692 call; \ 1693 while (--i != 0); \ 1694 t = speed_endtime (); \ 1695 \ 1696 TMP_FREE; \ 1697 return 
t; \ 1698 } 1699 1700 1701 /* A remainder 2*s->size by s->size limbs */ 1702 1703 #define SPEED_ROUTINE_MPZ_MOD(function) \ 1704 { \ 1705 unsigned i; \ 1706 mpz_t a, d, r; \ 1707 \ 1708 SPEED_RESTRICT_COND (s->size >= 1); \ 1709 \ 1710 mpz_init_set_n (d, s->yp, s->size); \ 1711 \ 1712 /* high part less than d, low part a duplicate copied in */ \ 1713 mpz_init_set_n (a, s->xp, s->size); \ 1714 mpz_mod (a, a, d); \ 1715 mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \ 1716 MPN_COPY (PTR(a), s->xp, s->size); \ 1717 \ 1718 mpz_init (r); \ 1719 \ 1720 speed_operand_src (s, PTR(a), SIZ(a)); \ 1721 speed_operand_src (s, PTR(d), SIZ(d)); \ 1722 speed_cache_fill (s); \ 1723 \ 1724 speed_starttime (); \ 1725 i = s->reps; \ 1726 do \ 1727 function (r, a, d); \ 1728 while (--i != 0); \ 1729 return speed_endtime (); \ 1730 } 1731 1732 #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \ 1733 { \ 1734 unsigned i; \ 1735 mp_ptr dp, tp, ap, qp; \ 1736 gmp_pi1_t inv; \ 1737 double t; \ 1738 mp_size_t size1; \ 1739 TMP_DECL; \ 1740 \ 1741 size1 = (s->r == 0 ? 
2 * s->size : s->r); \ 1742 \ 1743 SPEED_RESTRICT_COND (s->size >= DMIN); \ 1744 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \ 1745 \ 1746 TMP_MARK; \ 1747 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \ 1748 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1749 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1750 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \ 1751 \ 1752 /* we don't fill in dividend completely when size1 > s->size */ \ 1753 MPN_COPY (ap, s->xp, s->size); \ 1754 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \ 1755 \ 1756 MPN_COPY (dp, s->yp, s->size); \ 1757 \ 1758 /* normalize the data */ \ 1759 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1760 ap[size1 - 1] = dp[s->size - 1] - 1; \ 1761 \ 1762 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \ 1763 \ 1764 speed_operand_src (s, ap, size1); \ 1765 speed_operand_dst (s, tp, size1); \ 1766 speed_operand_src (s, dp, s->size); \ 1767 speed_operand_dst (s, qp, size1 - s->size); \ 1768 speed_cache_fill (s); \ 1769 \ 1770 speed_starttime (); \ 1771 i = s->reps; \ 1772 do { \ 1773 MPN_COPY (tp, ap, size1); \ 1774 function (qp, tp, size1, dp, s->size, INV); \ 1775 } while (--i != 0); \ 1776 t = speed_endtime (); \ 1777 \ 1778 TMP_FREE; \ 1779 return t; \ 1780 } 1781 #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \ 1782 { \ 1783 unsigned i; \ 1784 mp_ptr dp, tp, qp, scratch; \ 1785 double t; \ 1786 mp_size_t itch; \ 1787 TMP_DECL; \ 1788 \ 1789 SPEED_RESTRICT_COND (s->size >= 2); \ 1790 \ 1791 itch = itchfn (2 * s->size, s->size, 0); \ 1792 TMP_MARK; \ 1793 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1794 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1795 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 1796 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1797 \ 1798 MPN_COPY (tp, s->xp, s->size); \ 1799 MPN_COPY (tp+s->size, s->xp, s->size); \ 1800 \ 1801 /* normalize the data */ \ 1802 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1803 tp[2*s->size-1] = 
dp[s->size-1] - 1; \ 1804 \ 1805 speed_operand_dst (s, qp, s->size); \ 1806 speed_operand_src (s, tp, 2 * s->size); \ 1807 speed_operand_src (s, dp, s->size); \ 1808 speed_operand_dst (s, scratch, itch); \ 1809 speed_cache_fill (s); \ 1810 \ 1811 speed_starttime (); \ 1812 i = s->reps; \ 1813 do { \ 1814 function (qp, tp, 2 * s->size, dp, s->size, scratch); \ 1815 } while (--i != 0); \ 1816 t = speed_endtime (); \ 1817 \ 1818 TMP_FREE; \ 1819 return t; \ 1820 } 1821 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \ 1822 { \ 1823 unsigned i; \ 1824 mp_ptr dp, tp, qp, rp, scratch; \ 1825 double t; \ 1826 mp_size_t size1, itch; \ 1827 TMP_DECL; \ 1828 \ 1829 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1830 \ 1831 SPEED_RESTRICT_COND (s->size >= 2); \ 1832 SPEED_RESTRICT_COND (size1 >= s->size); \ 1833 \ 1834 itch = itchfn (size1, s->size, 0); \ 1835 TMP_MARK; \ 1836 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1837 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1838 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1839 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1840 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? 
*/ \ 1841 \ 1842 /* we don't fill in dividend completely when size1 > s->size */ \ 1843 MPN_COPY (tp, s->xp, s->size); \ 1844 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1845 \ 1846 MPN_COPY (dp, s->yp, s->size); \ 1847 \ 1848 /* normalize the data */ \ 1849 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1850 tp[size1 - 1] = dp[s->size - 1] - 1; \ 1851 \ 1852 speed_operand_dst (s, qp, size1 - s->size); \ 1853 speed_operand_dst (s, rp, s->size); \ 1854 speed_operand_src (s, tp, size1); \ 1855 speed_operand_src (s, dp, s->size); \ 1856 speed_operand_dst (s, scratch, itch); \ 1857 speed_cache_fill (s); \ 1858 \ 1859 speed_starttime (); \ 1860 i = s->reps; \ 1861 do { \ 1862 function (qp, rp, tp, size1, dp, s->size, scratch); \ 1863 } while (--i != 0); \ 1864 t = speed_endtime (); \ 1865 \ 1866 TMP_FREE; \ 1867 return t; \ 1868 } 1869 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \ 1870 { \ 1871 unsigned i; \ 1872 mp_ptr dp, tp, qp, rp, ip, scratch, tmp; \ 1873 double t; \ 1874 mp_size_t size1, itch; \ 1875 TMP_DECL; \ 1876 \ 1877 size1 = (s->r == 0 ? 2 * s->size : s->r); \ 1878 \ 1879 SPEED_RESTRICT_COND (s->size >= 2); \ 1880 SPEED_RESTRICT_COND (size1 >= s->size); \ 1881 \ 1882 itch = itchfn (size1, s->size, s->size); \ 1883 TMP_MARK; \ 1884 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1885 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \ 1886 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \ 1887 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 1888 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 1889 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? 
*/ \ 1890 \ 1891 /* we don't fill in dividend completely when size1 > s->size */ \ 1892 MPN_COPY (tp, s->xp, s->size); \ 1893 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \ 1894 \ 1895 MPN_COPY (dp, s->yp, s->size); \ 1896 \ 1897 /* normalize the data */ \ 1898 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \ 1899 tp[size1 - 1] = dp[s->size-1] - 1; \ 1900 \ 1901 tmp = TMP_ALLOC_LIMBS (mpn_invert_itch (s->size)); \ 1902 mpn_invert (ip, dp, s->size, tmp); \ 1903 \ 1904 speed_operand_dst (s, qp, size1 - s->size); \ 1905 speed_operand_dst (s, rp, s->size); \ 1906 speed_operand_src (s, tp, size1); \ 1907 speed_operand_src (s, dp, s->size); \ 1908 speed_operand_src (s, ip, s->size); \ 1909 speed_operand_dst (s, scratch, itch); \ 1910 speed_cache_fill (s); \ 1911 \ 1912 speed_starttime (); \ 1913 i = s->reps; \ 1914 do { \ 1915 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \ 1916 } while (--i != 0); \ 1917 t = speed_endtime (); \ 1918 \ 1919 TMP_FREE; \ 1920 return t; \ 1921 } 1922 1923 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \ 1924 { \ 1925 unsigned i; \ 1926 mp_ptr dp, tp, ap, qp; \ 1927 mp_limb_t inv; \ 1928 double t; \ 1929 TMP_DECL; \ 1930 \ 1931 SPEED_RESTRICT_COND (s->size >= 1); \ 1932 \ 1933 TMP_MARK; \ 1934 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \ 1935 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1936 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1937 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \ 1938 \ 1939 MPN_COPY (ap, s->xp, s->size); \ 1940 MPN_COPY (ap+s->size, s->xp, s->size); \ 1941 \ 1942 /* divisor must be odd */ \ 1943 MPN_COPY (dp, s->yp, s->size); \ 1944 dp[0] |= 1; \ 1945 binvert_limb (inv, dp[0]); \ 1946 inv = -inv; \ 1947 \ 1948 speed_operand_src (s, ap, 2*s->size); \ 1949 speed_operand_dst (s, tp, 2*s->size); \ 1950 speed_operand_src (s, dp, s->size); \ 1951 speed_operand_dst (s, qp, s->size); \ 1952 speed_cache_fill (s); \ 1953 \ 1954 speed_starttime (); \ 1955 i = s->reps; \ 1956 do { \ 
1957 MPN_COPY (tp, ap, 2*s->size); \ 1958 function (qp, tp, 2*s->size, dp, s->size, inv); \ 1959 } while (--i != 0); \ 1960 t = speed_endtime (); \ 1961 \ 1962 TMP_FREE; \ 1963 return t; \ 1964 } 1965 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \ 1966 { \ 1967 unsigned i; \ 1968 mp_ptr dp, tp, qp; \ 1969 mp_limb_t inv; \ 1970 double t; \ 1971 TMP_DECL; \ 1972 \ 1973 SPEED_RESTRICT_COND (s->size >= 1); \ 1974 \ 1975 TMP_MARK; \ 1976 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 1977 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 1978 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \ 1979 \ 1980 /* divisor must be odd */ \ 1981 MPN_COPY (dp, s->yp, s->size); \ 1982 dp[0] |= 1; \ 1983 binvert_limb (inv, dp[0]); \ 1984 inv = -inv; \ 1985 \ 1986 speed_operand_src (s, s->xp, s->size); \ 1987 speed_operand_dst (s, tp, s->size); \ 1988 speed_operand_src (s, dp, s->size); \ 1989 speed_operand_dst (s, qp, s->size); \ 1990 speed_cache_fill (s); \ 1991 \ 1992 speed_starttime (); \ 1993 i = s->reps; \ 1994 do { \ 1995 MPN_COPY (tp, s->xp, s->size); \ 1996 function (qp, tp, s->size, dp, s->size, inv); \ 1997 } while (--i != 0); \ 1998 t = speed_endtime (); \ 1999 \ 2000 TMP_FREE; \ 2001 return t; \ 2002 } 2003 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \ 2004 { \ 2005 unsigned i; \ 2006 mp_ptr dp, qp, scratch; \ 2007 double t; \ 2008 mp_size_t itch; \ 2009 TMP_DECL; \ 2010 \ 2011 SPEED_RESTRICT_COND (s->size >= 2); \ 2012 \ 2013 itch = itchfn (s->size, s->size); \ 2014 TMP_MARK; \ 2015 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2016 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2017 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2018 \ 2019 /* divisor must be odd */ \ 2020 MPN_COPY (dp, s->yp, s->size); \ 2021 dp[0] |= 1; \ 2022 \ 2023 speed_operand_dst (s, qp, s->size); \ 2024 speed_operand_src (s, s->xp, s->size); \ 2025 speed_operand_src (s, dp, s->size); \ 2026 speed_operand_dst (s, scratch, itch); \ 2027 speed_cache_fill 
(s); \ 2028 \ 2029 speed_starttime (); \ 2030 i = s->reps; \ 2031 do { \ 2032 function (qp, s->xp, s->size, dp, s->size, scratch); \ 2033 } while (--i != 0); \ 2034 t = speed_endtime (); \ 2035 \ 2036 TMP_FREE; \ 2037 return t; \ 2038 } 2039 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \ 2040 { \ 2041 unsigned i; \ 2042 mp_ptr dp, tp, qp, rp, scratch; \ 2043 double t; \ 2044 mp_size_t itch; \ 2045 TMP_DECL; \ 2046 \ 2047 SPEED_RESTRICT_COND (s->size >= 2); \ 2048 \ 2049 itch = itchfn (2 * s->size, s->size); \ 2050 TMP_MARK; \ 2051 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \ 2052 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \ 2053 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \ 2054 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \ 2055 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \ 2056 \ 2057 MPN_COPY (tp, s->xp, s->size); \ 2058 MPN_COPY (tp+s->size, s->xp, s->size); \ 2059 \ 2060 /* divisor must be odd */ \ 2061 MPN_COPY (dp, s->yp, s->size); \ 2062 dp[0] |= 1; \ 2063 \ 2064 speed_operand_dst (s, qp, s->size); \ 2065 speed_operand_dst (s, rp, s->size); \ 2066 speed_operand_src (s, tp, 2 * s->size); \ 2067 speed_operand_src (s, dp, s->size); \ 2068 speed_operand_dst (s, scratch, itch); \ 2069 speed_cache_fill (s); \ 2070 \ 2071 speed_starttime (); \ 2072 i = s->reps; \ 2073 do { \ 2074 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \ 2075 } while (--i != 0); \ 2076 t = speed_endtime (); \ 2077 \ 2078 TMP_FREE; \ 2079 return t; \ 2080 } 2081 2082 #define SPEED_ROUTINE_MPN_BROOT(function) \ 2083 { \ 2084 SPEED_RESTRICT_COND (s->r & 1); \ 2085 s->xp[0] |= 1; \ 2086 SPEED_ROUTINE_MPN_UNARY_1_CALL \ 2087 ((*function) (wp, s->xp, s->size, s->r)); \ 2088 } 2089 2090 #define SPEED_ROUTINE_MPN_BROOTINV(function, itch) \ 2091 { \ 2092 mp_ptr wp, tp; \ 2093 unsigned i; \ 2094 double t; \ 2095 TMP_DECL; \ 2096 TMP_MARK; \ 2097 SPEED_RESTRICT_COND (s->size >= 1); \ 2098 SPEED_RESTRICT_COND (s->r & 1); \ 2099 
wp = TMP_ALLOC_LIMBS (s->size); \ 2100 tp = TMP_ALLOC_LIMBS ( (itch)); \ 2101 s->xp[0] |= 1; \ 2102 \ 2103 speed_operand_src (s, s->xp, s->size); \ 2104 speed_operand_dst (s, wp, s->size); \ 2105 speed_cache_fill (s); \ 2106 \ 2107 speed_starttime (); \ 2108 i = s->reps; \ 2109 do \ 2110 (*function) (wp, s->xp, s->size, s->r, tp); \ 2111 while (--i != 0); \ 2112 t = speed_endtime (); \ 2113 \ 2114 TMP_FREE; \ 2115 return t; \ 2116 } 2117 2118 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \ 2119 { \ 2120 long i; \ 2121 mp_ptr up, tp, ip; \ 2122 double t; \ 2123 TMP_DECL; \ 2124 \ 2125 SPEED_RESTRICT_COND (s->size >= 1); \ 2126 \ 2127 TMP_MARK; \ 2128 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2129 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2130 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2131 \ 2132 MPN_COPY (up, s->xp, s->size); \ 2133 \ 2134 /* normalize the data */ \ 2135 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2136 \ 2137 speed_operand_src (s, up, s->size); \ 2138 speed_operand_dst (s, tp, s->size); \ 2139 speed_operand_dst (s, ip, s->size); \ 2140 speed_cache_fill (s); \ 2141 \ 2142 speed_starttime (); \ 2143 i = s->reps; \ 2144 do \ 2145 function (ip, up, s->size, tp); \ 2146 while (--i != 0); \ 2147 t = speed_endtime (); \ 2148 \ 2149 TMP_FREE; \ 2150 return t; \ 2151 } 2152 2153 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \ 2154 { \ 2155 long i; \ 2156 mp_ptr up, tp, ip; \ 2157 double t; \ 2158 TMP_DECL; \ 2159 \ 2160 SPEED_RESTRICT_COND (s->size >= 1); \ 2161 \ 2162 TMP_MARK; \ 2163 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2164 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2165 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2166 \ 2167 MPN_COPY (up, s->xp, s->size); \ 2168 \ 2169 /* normalize the data */ \ 2170 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2171 \ 2172 speed_operand_src (s, up, s->size); \ 2173 speed_operand_dst (s, tp, s->size); \ 2174 speed_operand_dst (s, ip, s->size); \ 
2175 speed_cache_fill (s); \ 2176 \ 2177 speed_starttime (); \ 2178 i = s->reps; \ 2179 do \ 2180 function (ip, up, s->size, tp); \ 2181 while (--i != 0); \ 2182 t = speed_endtime (); \ 2183 \ 2184 TMP_FREE; \ 2185 return t; \ 2186 } 2187 2188 #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \ 2189 { \ 2190 long i; \ 2191 mp_ptr up, tp, ip; \ 2192 double t; \ 2193 TMP_DECL; \ 2194 \ 2195 SPEED_RESTRICT_COND (s->size >= 3); \ 2196 \ 2197 TMP_MARK; \ 2198 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2199 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2200 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2201 \ 2202 MPN_COPY (up, s->xp, s->size); \ 2203 \ 2204 /* normalize the data */ \ 2205 up[s->size-1] |= GMP_NUMB_HIGHBIT; \ 2206 \ 2207 speed_operand_src (s, up, s->size); \ 2208 speed_operand_dst (s, tp, s->size); \ 2209 speed_operand_dst (s, ip, s->size); \ 2210 speed_cache_fill (s); \ 2211 \ 2212 speed_starttime (); \ 2213 i = s->reps; \ 2214 do \ 2215 function (ip, up, s->size, tp); \ 2216 while (--i != 0); \ 2217 t = speed_endtime (); \ 2218 \ 2219 TMP_FREE; \ 2220 return t; \ 2221 } 2222 2223 #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \ 2224 { \ 2225 long i; \ 2226 mp_ptr up, tp, ip; \ 2227 double t; \ 2228 TMP_DECL; \ 2229 \ 2230 SPEED_RESTRICT_COND (s->size >= 1); \ 2231 \ 2232 TMP_MARK; \ 2233 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \ 2234 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \ 2235 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \ 2236 \ 2237 MPN_COPY (up, s->xp, s->size); \ 2238 \ 2239 /* normalize the data */ \ 2240 up[0] |= 1; \ 2241 \ 2242 speed_operand_src (s, up, s->size); \ 2243 speed_operand_dst (s, tp, s->size); \ 2244 speed_operand_dst (s, ip, s->size); \ 2245 speed_cache_fill (s); \ 2246 \ 2247 speed_starttime (); \ 2248 i = s->reps; \ 2249 do \ 2250 function (ip, up, s->size, tp); \ 2251 while (--i != 0); \ 2252 t = speed_endtime (); \ 2253 \ 2254 TMP_FREE; \ 2255 return 
t; \ 2256 } 2257 2258 #define SPEED_ROUTINE_REDC_1(function) \ 2259 { \ 2260 unsigned i; \ 2261 mp_ptr cp, mp, tp, ap; \ 2262 mp_limb_t inv; \ 2263 double t; \ 2264 TMP_DECL; \ 2265 \ 2266 SPEED_RESTRICT_COND (s->size >= 1); \ 2267 \ 2268 TMP_MARK; \ 2269 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2270 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2271 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2272 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2273 \ 2274 MPN_COPY (ap, s->xp, s->size); \ 2275 MPN_COPY (ap+s->size, s->xp, s->size); \ 2276 \ 2277 /* modulus must be odd */ \ 2278 MPN_COPY (mp, s->yp, s->size); \ 2279 mp[0] |= 1; \ 2280 binvert_limb (inv, mp[0]); \ 2281 inv = -inv; \ 2282 \ 2283 speed_operand_src (s, ap, 2*s->size+1); \ 2284 speed_operand_dst (s, tp, 2*s->size+1); \ 2285 speed_operand_src (s, mp, s->size); \ 2286 speed_operand_dst (s, cp, s->size); \ 2287 speed_cache_fill (s); \ 2288 \ 2289 speed_starttime (); \ 2290 i = s->reps; \ 2291 do { \ 2292 MPN_COPY (tp, ap, 2*s->size); \ 2293 function (cp, tp, mp, s->size, inv); \ 2294 } while (--i != 0); \ 2295 t = speed_endtime (); \ 2296 \ 2297 TMP_FREE; \ 2298 return t; \ 2299 } 2300 #define SPEED_ROUTINE_REDC_2(function) \ 2301 { \ 2302 unsigned i; \ 2303 mp_ptr cp, mp, tp, ap; \ 2304 mp_limb_t invp[2]; \ 2305 double t; \ 2306 TMP_DECL; \ 2307 \ 2308 SPEED_RESTRICT_COND (s->size >= 1); \ 2309 \ 2310 TMP_MARK; \ 2311 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2312 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2313 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2314 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2315 \ 2316 MPN_COPY (ap, s->xp, s->size); \ 2317 MPN_COPY (ap+s->size, s->xp, s->size); \ 2318 \ 2319 /* modulus must be odd */ \ 2320 MPN_COPY (mp, s->yp, s->size); \ 2321 mp[0] |= 1; \ 2322 mpn_binvert (invp, mp, 2, tp); \ 2323 invp[0] = -invp[0]; invp[1] = ~invp[1]; \ 2324 \ 2325 speed_operand_src (s, ap, 2*s->size+1); \ 
2326 speed_operand_dst (s, tp, 2*s->size+1); \ 2327 speed_operand_src (s, mp, s->size); \ 2328 speed_operand_dst (s, cp, s->size); \ 2329 speed_cache_fill (s); \ 2330 \ 2331 speed_starttime (); \ 2332 i = s->reps; \ 2333 do { \ 2334 MPN_COPY (tp, ap, 2*s->size); \ 2335 function (cp, tp, mp, s->size, invp); \ 2336 } while (--i != 0); \ 2337 t = speed_endtime (); \ 2338 \ 2339 TMP_FREE; \ 2340 return t; \ 2341 } 2342 #define SPEED_ROUTINE_REDC_N(function) \ 2343 { \ 2344 unsigned i; \ 2345 mp_ptr cp, mp, tp, ap, invp; \ 2346 double t; \ 2347 TMP_DECL; \ 2348 \ 2349 SPEED_RESTRICT_COND (s->size > 8); \ 2350 \ 2351 TMP_MARK; \ 2352 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \ 2353 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \ 2354 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \ 2355 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \ 2356 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? */ \ 2357 \ 2358 MPN_COPY (ap, s->xp, s->size); \ 2359 MPN_COPY (ap+s->size, s->xp, s->size); \ 2360 \ 2361 /* modulus must be odd */ \ 2362 MPN_COPY (mp, s->yp, s->size); \ 2363 mp[0] |= 1; \ 2364 mpn_binvert (invp, mp, s->size, tp); \ 2365 \ 2366 speed_operand_src (s, ap, 2*s->size+1); \ 2367 speed_operand_dst (s, tp, 2*s->size+1); \ 2368 speed_operand_src (s, mp, s->size); \ 2369 speed_operand_dst (s, cp, s->size); \ 2370 speed_cache_fill (s); \ 2371 \ 2372 speed_starttime (); \ 2373 i = s->reps; \ 2374 do { \ 2375 MPN_COPY (tp, ap, 2*s->size); \ 2376 function (cp, tp, mp, s->size, invp); \ 2377 } while (--i != 0); \ 2378 t = speed_endtime (); \ 2379 \ 2380 TMP_FREE; \ 2381 return t; \ 2382 } 2383 2384 2385 #define SPEED_ROUTINE_MPN_POPCOUNT(function) \ 2386 { \ 2387 unsigned i; \ 2388 \ 2389 SPEED_RESTRICT_COND (s->size >= 1); \ 2390 \ 2391 speed_operand_src (s, s->xp, s->size); \ 2392 speed_cache_fill (s); \ 2393 \ 2394 speed_starttime (); \ 2395 i = s->reps; \ 2396 do \ 2397 function (s->xp, s->size); \ 2398 while (--i != 0); \ 2399 \ 2400 
return speed_endtime (); \ 2401 } 2402 2403 #define SPEED_ROUTINE_MPN_HAMDIST(function) \ 2404 { \ 2405 unsigned i; \ 2406 \ 2407 SPEED_RESTRICT_COND (s->size >= 1); \ 2408 \ 2409 speed_operand_src (s, s->xp, s->size); \ 2410 speed_operand_src (s, s->yp, s->size); \ 2411 speed_cache_fill (s); \ 2412 \ 2413 speed_starttime (); \ 2414 i = s->reps; \ 2415 do \ 2416 function (s->xp, s->yp, s->size); \ 2417 while (--i != 0); \ 2418 \ 2419 return speed_endtime (); \ 2420 } 2421 2422 2423 #define SPEED_ROUTINE_MPZ_UI(function) \ 2424 { \ 2425 mpz_t z; \ 2426 unsigned i; \ 2427 double t; \ 2428 \ 2429 SPEED_RESTRICT_COND (s->size >= 0); \ 2430 \ 2431 mpz_init (z); \ 2432 \ 2433 speed_starttime (); \ 2434 i = s->reps; \ 2435 do \ 2436 function (z, s->size); \ 2437 while (--i != 0); \ 2438 t = speed_endtime (); \ 2439 \ 2440 mpz_clear (z); \ 2441 return t; \ 2442 } 2443 2444 #define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2445 #define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2446 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function) 2447 2448 2449 #define SPEED_ROUTINE_MPZ_2_UI(function) \ 2450 { \ 2451 mpz_t z, z2; \ 2452 unsigned i; \ 2453 double t; \ 2454 \ 2455 SPEED_RESTRICT_COND (s->size >= 0); \ 2456 \ 2457 mpz_init (z); \ 2458 mpz_init (z2); \ 2459 \ 2460 speed_starttime (); \ 2461 i = s->reps; \ 2462 do \ 2463 function (z, z2, s->size); \ 2464 while (--i != 0); \ 2465 t = speed_endtime (); \ 2466 \ 2467 mpz_clear (z); \ 2468 mpz_clear (z2); \ 2469 return t; \ 2470 } 2471 2472 #define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2473 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function) 2474 2475 2476 #define SPEED_ROUTINE_MPN_FIB2_UI(function) \ 2477 { \ 2478 mp_ptr fp, f1p; \ 2479 mp_size_t alloc; \ 2480 unsigned i; \ 2481 double t; \ 2482 TMP_DECL; \ 2483 \ 2484 SPEED_RESTRICT_COND (s->size >= 0); \ 2485 \ 2486 TMP_MARK; \ 2487 alloc = 
MPN_FIB2_SIZE (s->size); \ 2488 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \ 2489 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \ 2490 \ 2491 speed_starttime (); \ 2492 i = s->reps; \ 2493 do \ 2494 function (fp, f1p, s->size); \ 2495 while (--i != 0); \ 2496 t = speed_endtime (); \ 2497 \ 2498 TMP_FREE; \ 2499 return t; \ 2500 } 2501 2502 2503 2504 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6 2505 limbs. m is forced to odd so that redc can be used. e is limited in 2506 size so the calculation doesn't take too long. */ 2507 #define SPEED_ROUTINE_MPZ_POWM(function) \ 2508 { \ 2509 mpz_t r, b, e, m; \ 2510 unsigned i; \ 2511 double t; \ 2512 \ 2513 SPEED_RESTRICT_COND (s->size >= 1); \ 2514 \ 2515 mpz_init (r); \ 2516 mpz_init_set_n (b, s->xp, s->size); \ 2517 mpz_init_set_n (m, s->yp, s->size); \ 2518 mpz_setbit (m, 0); /* force m to odd */ \ 2519 mpz_init_set_n (e, s->xp_block, 6); \ 2520 \ 2521 speed_starttime (); \ 2522 i = s->reps; \ 2523 do \ 2524 function (r, b, e, m); \ 2525 while (--i != 0); \ 2526 t = speed_endtime (); \ 2527 \ 2528 mpz_clear (r); \ 2529 mpz_clear (b); \ 2530 mpz_clear (e); \ 2531 mpz_clear (m); \ 2532 return t; \ 2533 } 2534 2535 /* (m-2)^0xAAAAAAAA mod m */ 2536 #define SPEED_ROUTINE_MPZ_POWM_UI(function) \ 2537 { \ 2538 mpz_t r, b, m; \ 2539 unsigned long e; \ 2540 unsigned i; \ 2541 double t; \ 2542 \ 2543 SPEED_RESTRICT_COND (s->size >= 1); \ 2544 \ 2545 mpz_init (r); \ 2546 \ 2547 /* force m to odd */ \ 2548 mpz_init (m); \ 2549 mpz_set_n (m, s->xp, s->size); \ 2550 PTR(m)[0] |= 1; \ 2551 \ 2552 e = (~ (unsigned long) 0) / 3; \ 2553 if (s->r != 0) \ 2554 e = s->r; \ 2555 \ 2556 mpz_init_set (b, m); \ 2557 mpz_sub_ui (b, b, 2); \ 2558 /* printf ("%X\n", mpz_get_ui(m)); */ \ 2559 i = s->reps; \ 2560 speed_starttime (); \ 2561 do \ 2562 function (r, b, e, m); \ 2563 while (--i != 0); \ 2564 t = speed_endtime (); \ 2565 \ 2566 mpz_clear (r); \ 2567 mpz_clear (b); \ 2568 mpz_clear (m); \ 2569 return 
t; \ 2570 } 2571 2572 2573 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \ 2574 { \ 2575 mp_ptr wp, wp2, xp, yp; \ 2576 unsigned i; \ 2577 double t; \ 2578 TMP_DECL; \ 2579 \ 2580 SPEED_RESTRICT_COND (s->size >= 0); \ 2581 \ 2582 TMP_MARK; \ 2583 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 2584 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 2585 xp = s->xp; \ 2586 yp = s->yp; \ 2587 \ 2588 if (s->r == 0) ; \ 2589 else if (s->r == 1) { xp = wp; } \ 2590 else if (s->r == 2) { yp = wp2; } \ 2591 else if (s->r == 3) { xp = wp; yp = wp2; } \ 2592 else if (s->r == 4) { xp = wp2; yp = wp; } \ 2593 else { \ 2594 TMP_FREE; \ 2595 return -1.0; \ 2596 } \ 2597 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \ 2598 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \ 2599 \ 2600 speed_operand_src (s, xp, s->size); \ 2601 speed_operand_src (s, yp, s->size); \ 2602 speed_operand_dst (s, wp, s->size); \ 2603 speed_operand_dst (s, wp2, s->size); \ 2604 speed_cache_fill (s); \ 2605 \ 2606 speed_starttime (); \ 2607 i = s->reps; \ 2608 do \ 2609 call; \ 2610 while (--i != 0); \ 2611 t = speed_endtime (); \ 2612 \ 2613 TMP_FREE; \ 2614 return t; \ 2615 } 2616 2617 #define SPEED_ROUTINE_MPN_ADDSUB_N(function) \ 2618 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2619 (function (wp, wp2, xp, yp, s->size)); 2620 2621 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \ 2622 SPEED_ROUTINE_MPN_ADDSUB_CALL \ 2623 (function (wp, wp2, xp, yp, s->size, 0)); 2624 2625 2626 /* Doing an Nx1 gcd with the given r. 
*/
/* function (xp, s->size, s->r) is timed, where xp is a private copy of
   s->xp whose low limb is set to 1 if the whole operand would otherwise be
   zero.  s->r must be non-zero.  */
#define SPEED_ROUTINE_MPN_GCD_1N(function) \
  { \
    mp_ptr xp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
    MPN_COPY (xp, s->xp, s->size); \
    xp[0] |= refmpn_zero_p (xp, s->size); \
    \
    /* register the copy, since that's what the timed call reads */ \
    speed_operand_src (s, xp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (xp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }


/* SPEED_BLOCK_SIZE many single-limb GCDs.  The x values are masked to
   s->size bits, the y values to s->r bits, or to s->size bits when s->r is
   0.  Zero values are replaced by 1.

   "setup" is run once per data pair, with index i, before timing.  "call"
   is the timed operation and uses index j-1, with j running from
   SPEED_BLOCK_SIZE down to 1.  s->time_divisor is set to SPEED_BLOCK_SIZE
   since each rep makes that many calls.  */

#define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
  { \
    unsigned i, j; \
    mp_ptr px, py; \
    mp_limb_t x_mask, y_mask; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \
    MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
    MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
    \
    x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
    y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
    for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
      { \
        px[i] &= x_mask; px[i] += (px[i] == 0); \
        py[i] &= y_mask; py[i] += (py[i] == 0); \
        setup; \
      } \
    \
    speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
    speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            call; \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return t; \
  }

/* function (&x, 1, y) on each data pair, no extra setup */
#define SPEED_ROUTINE_MPN_GCD_1(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))

/* function (x, y, 0) on each data pair, with the data adjusted in the
   setup step as noted below */
#define SPEED_ROUTINE_MPN_JACBASE(function) \
  SPEED_ROUTINE_MPN_GCD_1_CALL \
    ({ \
       /* require x<y, y odd, y!=1 */ \
       px[i] %= py[i]; \
       px[i] |= 1; \
       py[i] |= 1; \
       if (py[i]==1) py[i]=3; \
     }, \
     function (px[j-1], py[j-1], 0))


#define SPEED_ROUTINE_MPN_HGCD_CALL(func, itchfunc) \
  { \
    mp_size_t hgcd_init_itch, hgcd_itch; \
    mp_ptr ap, bp, wp, tmp1; \
    struct hgcd_matrix hgcd; \
    int res; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    if (s->size < 2) \
      return -1; \
    \
    TMP_MARK; \
    \
    SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \
    SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \
    \
    /* note this modifies the shared s->xp and s->yp data in place */ \
    s->xp[s->size - 1] |= 1; \
    s->yp[s->size - 1] |= 1; \
    \
    hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \
    hgcd_itch = itchfunc (s->size); \
    \
    SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (wp, hgcd_itch, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, s->yp, s->size); \
    speed_operand_dst (s, ap, s->size + 1); \
    speed_operand_dst \
(s, bp, s->size + 1); \ 2755 speed_operand_dst (s, wp, hgcd_itch); \ 2756 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2757 speed_cache_fill (s); \ 2758 \ 2759 speed_starttime (); \ 2760 i = s->reps; \ 2761 do \ 2762 { \ 2763 MPN_COPY (ap, s->xp, s->size); \ 2764 MPN_COPY (bp, s->yp, s->size); \ 2765 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2766 res = func (ap, bp, s->size, &hgcd, wp); \ 2767 } \ 2768 while (--i != 0); \ 2769 t = speed_endtime (); \ 2770 TMP_FREE; \ 2771 return t; \ 2772 } 2773 2774 #define SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL(func, itchfunc) \ 2775 { \ 2776 mp_size_t hgcd_init_itch, hgcd_step_itch; \ 2777 mp_ptr ap, bp, wp, tmp1; \ 2778 struct hgcd_matrix hgcd; \ 2779 mp_size_t p = s->size/2; \ 2780 int res; \ 2781 unsigned i; \ 2782 double t; \ 2783 TMP_DECL; \ 2784 \ 2785 if (s->size < 2) \ 2786 return -1; \ 2787 \ 2788 TMP_MARK; \ 2789 \ 2790 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp); \ 2791 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp); \ 2792 \ 2793 s->xp[s->size - 1] |= 1; \ 2794 s->yp[s->size - 1] |= 1; \ 2795 \ 2796 hgcd_init_itch = MPN_HGCD_MATRIX_INIT_ITCH (s->size); \ 2797 hgcd_step_itch = itchfunc (s->size, p); \ 2798 \ 2799 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_itch, s->align_wp); \ 2800 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_step_itch, s->align_wp); \ 2801 \ 2802 speed_operand_src (s, s->xp, s->size); \ 2803 speed_operand_src (s, s->yp, s->size); \ 2804 speed_operand_dst (s, ap, s->size + 1); \ 2805 speed_operand_dst (s, bp, s->size + 1); \ 2806 speed_operand_dst (s, wp, hgcd_step_itch); \ 2807 speed_operand_dst (s, tmp1, hgcd_init_itch); \ 2808 speed_cache_fill (s); \ 2809 \ 2810 speed_starttime (); \ 2811 i = s->reps; \ 2812 do \ 2813 { \ 2814 MPN_COPY (ap, s->xp, s->size); \ 2815 MPN_COPY (bp, s->yp, s->size); \ 2816 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1); \ 2817 res = func (&hgcd, ap, bp, s->size, p, wp); \ 2818 } \ 2819 while (--i != 0); \ 2820 t = speed_endtime (); \ 2821 TMP_FREE; \ 2822 return t; \ 2823 
} 2824 2825 /* Run some GCDs of s->size limbs each. The number of different data values 2826 is decreased as s->size**2, since GCD is a quadratic algorithm. 2827 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT 2828 though, because the plain gcd is about twice as fast as gcdext. */ 2829 2830 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \ 2831 { \ 2832 unsigned i; \ 2833 mp_size_t j, pieces, psize; \ 2834 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2835 double t; \ 2836 TMP_DECL; \ 2837 \ 2838 SPEED_RESTRICT_COND (s->size >= 1); \ 2839 \ 2840 TMP_MARK; \ 2841 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2842 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2843 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \ 2844 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2845 \ 2846 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \ 2847 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \ 2848 pieces = MAX (pieces, 1); \ 2849 \ 2850 psize = pieces * s->size; \ 2851 px = TMP_ALLOC_LIMBS (psize); \ 2852 py = TMP_ALLOC_LIMBS (psize); \ 2853 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 2854 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 2855 \ 2856 /* Requirements: x >= y, y must be odd, high limbs != 0. \ 2857 No need to ensure random numbers are really great. 
*/ \ 2858 for (j = 0; j < pieces; j++) \ 2859 { \ 2860 mp_ptr x = px + j * s->size; \ 2861 mp_ptr y = py + j * s->size; \ 2862 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \ 2863 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \ 2864 \ 2865 if (x[s->size - 1] < y[s->size - 1]) \ 2866 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \ 2867 else if (x[s->size - 1] == y[s->size - 1]) \ 2868 { \ 2869 x[s->size - 1] = 2; \ 2870 y[s->size - 1] = 1; \ 2871 } \ 2872 y[0] |= 1; \ 2873 } \ 2874 \ 2875 speed_operand_src (s, px, psize); \ 2876 speed_operand_src (s, py, psize); \ 2877 speed_operand_dst (s, xtmp, s->size); \ 2878 speed_operand_dst (s, ytmp, s->size); \ 2879 speed_operand_dst (s, wp, s->size); \ 2880 speed_cache_fill (s); \ 2881 \ 2882 speed_starttime (); \ 2883 i = s->reps; \ 2884 do \ 2885 { \ 2886 j = pieces; \ 2887 do \ 2888 { \ 2889 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \ 2890 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \ 2891 call; \ 2892 } \ 2893 while (--j != 0); \ 2894 } \ 2895 while (--i != 0); \ 2896 t = speed_endtime (); \ 2897 \ 2898 TMP_FREE; \ 2899 \ 2900 s->time_divisor = pieces; \ 2901 return t; \ 2902 } 2903 2904 #define SPEED_ROUTINE_MPN_GCD(function) \ 2905 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size)) 2906 2907 #define SPEED_ROUTINE_MPN_GCDEXT(function) \ 2908 SPEED_ROUTINE_MPN_GCD_CALL \ 2909 (4, { mp_size_t wp2size; \ 2910 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); }) 2911 2912 2913 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \ 2914 { \ 2915 unsigned i; \ 2916 mp_size_t j, pieces, psize, wp2size; \ 2917 mp_ptr wp, wp2, xtmp, ytmp, px, py; \ 2918 double t; \ 2919 TMP_DECL; \ 2920 \ 2921 SPEED_RESTRICT_COND (s->size >= 1); \ 2922 \ 2923 TMP_MARK; \ 2924 \ 2925 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \ 2926 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \ 2927 MPN_COPY (xtmp, s->xp, s->size); \ 2928 MPN_COPY (ytmp, s->yp, s->size); \ 2929 \ 2930 SPEED_TMP_ALLOC_LIMBS 
(wp, s->size+1, s->align_wp); \ 2931 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \ 2932 \ 2933 pieces = SPEED_BLOCK_SIZE / 3; \ 2934 psize = 3 * pieces; \ 2935 px = TMP_ALLOC_LIMBS (psize); \ 2936 py = TMP_ALLOC_LIMBS (psize); \ 2937 MPN_COPY (px, s->xp_block, psize); \ 2938 MPN_COPY (py, s->yp_block, psize); \ 2939 \ 2940 /* x must have at least as many bits as y, \ 2941 high limbs must be non-zero */ \ 2942 for (j = 0; j < pieces; j++) \ 2943 { \ 2944 mp_ptr x = px+3*j; \ 2945 mp_ptr y = py+3*j; \ 2946 x[2] += (x[2] == 0); \ 2947 y[2] += (y[2] == 0); \ 2948 if (x[2] < y[2]) \ 2949 MP_LIMB_T_SWAP (x[2], y[2]); \ 2950 } \ 2951 \ 2952 speed_operand_src (s, px, psize); \ 2953 speed_operand_src (s, py, psize); \ 2954 speed_operand_dst (s, xtmp, s->size); \ 2955 speed_operand_dst (s, ytmp, s->size); \ 2956 speed_operand_dst (s, wp, s->size); \ 2957 speed_cache_fill (s); \ 2958 \ 2959 speed_starttime (); \ 2960 i = s->reps; \ 2961 do \ 2962 { \ 2963 mp_ptr x = px; \ 2964 mp_ptr y = py; \ 2965 mp_ptr xth = &xtmp[s->size-3]; \ 2966 mp_ptr yth = &ytmp[s->size-3]; \ 2967 j = pieces; \ 2968 do \ 2969 { \ 2970 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \ 2971 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \ 2972 \ 2973 ytmp[0] |= 1; /* y must be odd, */ \ 2974 \ 2975 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \ 2976 \ 2977 x += 3; \ 2978 y += 3; \ 2979 } \ 2980 while (--j != 0); \ 2981 } \ 2982 while (--i != 0); \ 2983 t = speed_endtime (); \ 2984 \ 2985 TMP_FREE; \ 2986 \ 2987 s->time_divisor = pieces; \ 2988 return t; \ 2989 } 2990 2991 #define SPEED_ROUTINE_MPZ_JACOBI(function) \ 2992 { \ 2993 mpz_t a, b; \ 2994 unsigned i; \ 2995 mp_size_t j, pieces, psize; \ 2996 mp_ptr px, py; \ 2997 double t; \ 2998 TMP_DECL; \ 2999 \ 3000 TMP_MARK; \ 3001 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \ 3002 pieces = MAX (pieces, 1); \ 3003 s->time_divisor = pieces; \ 3004 \ 3005 psize = pieces * s->size; \ 3006 px = TMP_ALLOC_LIMBS (psize); \ 3007 py = 
TMP_ALLOC_LIMBS (psize); \ 3008 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \ 3009 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \ 3010 \ 3011 for (j = 0; j < pieces; j++) \ 3012 { \ 3013 mp_ptr x = px+j*s->size; \ 3014 mp_ptr y = py+j*s->size; \ 3015 \ 3016 /* y odd */ \ 3017 y[0] |= 1; \ 3018 \ 3019 /* high limbs non-zero */ \ 3020 if (x[s->size-1] == 0) x[s->size-1] = 1; \ 3021 if (y[s->size-1] == 0) y[s->size-1] = 1; \ 3022 } \ 3023 \ 3024 SIZ(a) = s->size; \ 3025 SIZ(b) = s->size; \ 3026 \ 3027 speed_operand_src (s, px, psize); \ 3028 speed_operand_src (s, py, psize); \ 3029 speed_cache_fill (s); \ 3030 \ 3031 speed_starttime (); \ 3032 i = s->reps; \ 3033 do \ 3034 { \ 3035 j = pieces; \ 3036 do \ 3037 { \ 3038 PTR(a) = px+(j-1)*s->size; \ 3039 PTR(b) = py+(j-1)*s->size; \ 3040 function (a, b); \ 3041 } \ 3042 while (--j != 0); \ 3043 } \ 3044 while (--i != 0); \ 3045 t = speed_endtime (); \ 3046 \ 3047 TMP_FREE; \ 3048 return t; \ 3049 } 3050 3051 #define SPEED_ROUTINE_MPN_DIVREM_2(function) \ 3052 { \ 3053 mp_ptr wp, xp; \ 3054 mp_limb_t yp[2]; \ 3055 unsigned i; \ 3056 double t; \ 3057 TMP_DECL; \ 3058 \ 3059 SPEED_RESTRICT_COND (s->size >= 2); \ 3060 \ 3061 TMP_MARK; \ 3062 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \ 3063 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3064 \ 3065 /* source is destroyed */ \ 3066 MPN_COPY (xp, s->xp, s->size); \ 3067 \ 3068 /* divisor must be normalized */ \ 3069 MPN_COPY (yp, s->yp_block, 2); \ 3070 yp[1] |= GMP_NUMB_HIGHBIT; \ 3071 \ 3072 speed_operand_src (s, xp, s->size); \ 3073 speed_operand_src (s, yp, 2); \ 3074 speed_operand_dst (s, wp, s->size); \ 3075 speed_cache_fill (s); \ 3076 \ 3077 speed_starttime (); \ 3078 i = s->reps; \ 3079 do \ 3080 function (wp, 0, xp, s->size, yp); \ 3081 while (--i != 0); \ 3082 t = speed_endtime (); \ 3083 \ 3084 TMP_FREE; \ 3085 return t; \ 3086 } 3087 3088 #define SPEED_ROUTINE_MPN_DIV_QR_2(function, norm) \ 3089 { \ 3090 mp_ptr wp, xp; \ 3091 
mp_limb_t yp[2]; \ 3092 mp_limb_t rp[2]; \ 3093 unsigned i; \ 3094 double t; \ 3095 TMP_DECL; \ 3096 \ 3097 SPEED_RESTRICT_COND (s->size >= 2); \ 3098 \ 3099 TMP_MARK; \ 3100 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3101 \ 3102 /* divisor must be normalized */ \ 3103 MPN_COPY (yp, s->yp_block, 2); \ 3104 if (norm) \ 3105 yp[1] |= GMP_NUMB_HIGHBIT; \ 3106 else \ 3107 { \ 3108 yp[1] &= ~GMP_NUMB_HIGHBIT; \ 3109 if (yp[1] == 0) \ 3110 yp[1] = 1; \ 3111 } \ 3112 speed_operand_src (s, s->xp, s->size); \ 3113 speed_operand_src (s, yp, 2); \ 3114 speed_operand_dst (s, wp, s->size); \ 3115 speed_operand_dst (s, rp, 2); \ 3116 speed_cache_fill (s); \ 3117 \ 3118 speed_starttime (); \ 3119 i = s->reps; \ 3120 do \ 3121 function (wp, rp, s->xp, s->size, yp); \ 3122 while (--i != 0); \ 3123 t = speed_endtime (); \ 3124 \ 3125 TMP_FREE; \ 3126 return t; \ 3127 } 3128 3129 #define SPEED_ROUTINE_MODLIMB_INVERT(function) \ 3130 { \ 3131 unsigned i, j; \ 3132 mp_ptr xp; \ 3133 mp_limb_t n = 1; \ 3134 double t; \ 3135 \ 3136 xp = s->xp_block-1; \ 3137 \ 3138 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \ 3139 speed_cache_fill (s); \ 3140 \ 3141 speed_starttime (); \ 3142 i = s->reps; \ 3143 do \ 3144 { \ 3145 j = SPEED_BLOCK_SIZE; \ 3146 do \ 3147 { \ 3148 /* randomized but successively dependent */ \ 3149 n += (xp[j] << 1); \ 3150 \ 3151 function (n, n); \ 3152 } \ 3153 while (--j != 0); \ 3154 } \ 3155 while (--i != 0); \ 3156 t = speed_endtime (); \ 3157 \ 3158 /* make sure the compiler won't optimize away n */ \ 3159 noop_1 (n); \ 3160 \ 3161 s->time_divisor = SPEED_BLOCK_SIZE; \ 3162 return t; \ 3163 } 3164 3165 3166 #define SPEED_ROUTINE_MPN_SQRTREM(function) \ 3167 { \ 3168 mp_ptr wp, wp2; \ 3169 unsigned i; \ 3170 double t; \ 3171 TMP_DECL; \ 3172 \ 3173 SPEED_RESTRICT_COND (s->size >= 1); \ 3174 \ 3175 TMP_MARK; \ 3176 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3177 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 3178 \ 3179 
speed_operand_src (s, s->xp, s->size); \ 3180 speed_operand_dst (s, wp, s->size); \ 3181 speed_operand_dst (s, wp2, s->size); \ 3182 speed_cache_fill (s); \ 3183 \ 3184 speed_starttime (); \ 3185 i = s->reps; \ 3186 do \ 3187 function (wp, wp2, s->xp, s->size); \ 3188 while (--i != 0); \ 3189 t = speed_endtime (); \ 3190 \ 3191 TMP_FREE; \ 3192 return t; \ 3193 } 3194 3195 #define SPEED_ROUTINE_MPN_ROOTREM(function) \ 3196 { \ 3197 mp_ptr wp, wp2; \ 3198 unsigned i; \ 3199 double t; \ 3200 TMP_DECL; \ 3201 \ 3202 SPEED_RESTRICT_COND (s->size >= 1); \ 3203 \ 3204 TMP_MARK; \ 3205 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3206 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \ 3207 \ 3208 speed_operand_src (s, s->xp, s->size); \ 3209 speed_operand_dst (s, wp, s->size); \ 3210 speed_operand_dst (s, wp2, s->size); \ 3211 speed_cache_fill (s); \ 3212 \ 3213 speed_starttime (); \ 3214 i = s->reps; \ 3215 do \ 3216 function (wp, wp2, s->xp, s->size, s->r); \ 3217 while (--i != 0); \ 3218 t = speed_endtime (); \ 3219 \ 3220 TMP_FREE; \ 3221 return t; \ 3222 } 3223 3224 3225 /* s->size controls the number of limbs in the input, s->r is the base, or 3226 decimal by default. */ 3227 #define SPEED_ROUTINE_MPN_GET_STR(function) \ 3228 { \ 3229 unsigned char *wp; \ 3230 mp_size_t wn; \ 3231 mp_ptr xp; \ 3232 int base; \ 3233 unsigned i; \ 3234 double t; \ 3235 TMP_DECL; \ 3236 \ 3237 SPEED_RESTRICT_COND (s->size >= 1); \ 3238 \ 3239 base = s->r == 0 ? 
10 : s->r; \ 3240 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3241 \ 3242 TMP_MARK; \ 3243 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \ 3244 \ 3245 MPN_SIZEINBASE (wn, s->xp, s->size, base); \ 3246 wp = TMP_ALLOC (wn); \ 3247 \ 3248 /* use this during development to guard against overflowing wp */ \ 3249 /* \ 3250 MPN_COPY (xp, s->xp, s->size); \ 3251 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \ 3252 */ \ 3253 \ 3254 speed_operand_src (s, s->xp, s->size); \ 3255 speed_operand_dst (s, xp, s->size); \ 3256 speed_operand_dst (s, (mp_ptr) wp, wn/BYTES_PER_MP_LIMB); \ 3257 speed_cache_fill (s); \ 3258 \ 3259 speed_starttime (); \ 3260 i = s->reps; \ 3261 do \ 3262 { \ 3263 MPN_COPY (xp, s->xp, s->size); \ 3264 function (wp, base, xp, s->size); \ 3265 } \ 3266 while (--i != 0); \ 3267 t = speed_endtime (); \ 3268 \ 3269 TMP_FREE; \ 3270 return t; \ 3271 } 3272 3273 /* s->size controls the number of digits in the input, s->r is the base, or 3274 decimal by default. */ 3275 #define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \ 3276 { \ 3277 unsigned char *xp; \ 3278 mp_ptr wp; \ 3279 mp_size_t wn; \ 3280 unsigned i; \ 3281 int base; \ 3282 double t; \ 3283 TMP_DECL; \ 3284 \ 3285 SPEED_RESTRICT_COND (s->size >= 1); \ 3286 \ 3287 base = s->r == 0 ? 
10 : s->r; \ 3288 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \ 3289 \ 3290 TMP_MARK; \ 3291 \ 3292 xp = TMP_ALLOC (s->size); \ 3293 for (i = 0; i < s->size; i++) \ 3294 xp[i] = s->xp[i] % base; \ 3295 \ 3296 LIMBS_PER_DIGIT_IN_BASE (wn, s->size, base); \ 3297 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \ 3298 \ 3299 /* use this during development to check wn is big enough */ \ 3300 /* \ 3301 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \ 3302 */ \ 3303 \ 3304 speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \ 3305 speed_operand_dst (s, wp, wn); \ 3306 speed_cache_fill (s); \ 3307 \ 3308 speed_starttime (); \ 3309 i = s->reps; \ 3310 do \ 3311 call; \ 3312 while (--i != 0); \ 3313 t = speed_endtime (); \ 3314 \ 3315 TMP_FREE; \ 3316 return t; \ 3317 } 3318 3319 3320 /* Run an accel gcd find_a() function over various data values. A set of 3321 values is used in case some run particularly fast or slow. The size 3322 parameter is ignored, the amount of data tested is fixed. */ 3323 3324 #define SPEED_ROUTINE_MPN_GCD_FINDA(function) \ 3325 { \ 3326 unsigned i, j; \ 3327 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \ 3328 double t; \ 3329 TMP_DECL; \ 3330 \ 3331 TMP_MARK; \ 3332 \ 3333 /* low must be odd, high must be non-zero */ \ 3334 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3335 { \ 3336 cp[i][0] = s->xp_block[i] | 1; \ 3337 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \ 3338 } \ 3339 \ 3340 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \ 3341 speed_cache_fill (s); \ 3342 \ 3343 speed_starttime (); \ 3344 i = s->reps; \ 3345 do \ 3346 { \ 3347 j = SPEED_BLOCK_SIZE; \ 3348 do \ 3349 { \ 3350 function (cp[j-1]); \ 3351 } \ 3352 while (--j != 0); \ 3353 } \ 3354 while (--i != 0); \ 3355 t = speed_endtime (); \ 3356 \ 3357 TMP_FREE; \ 3358 \ 3359 s->time_divisor = SPEED_BLOCK_SIZE; \ 3360 return t; \ 3361 } 3362 3363 3364 /* "call" should do "count_foo_zeros(c,n)". 3365 Give leading=1 if foo is leading zeros, leading=0 for trailing. 
3366 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */ 3367 3368 #define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \ 3369 { \ 3370 mp_ptr xp; \ 3371 int i, c; \ 3372 unsigned j; \ 3373 mp_limb_t n; \ 3374 double t; \ 3375 TMP_DECL; \ 3376 \ 3377 TMP_MARK; \ 3378 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \ 3379 \ 3380 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \ 3381 return -1.0; \ 3382 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \ 3383 speed_cache_fill (s); \ 3384 \ 3385 c = 0; \ 3386 speed_starttime (); \ 3387 j = s->reps; \ 3388 do { \ 3389 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \ 3390 { \ 3391 n = xp[i]; \ 3392 n ^= c; \ 3393 3394 #define SPEED_ROUTINE_COUNT_ZEROS_B() \ 3395 } \ 3396 } while (--j != 0); \ 3397 t = speed_endtime (); \ 3398 \ 3399 /* don't let c go dead */ \ 3400 noop_1 (c); \ 3401 \ 3402 s->time_divisor = SPEED_BLOCK_SIZE; \ 3403 \ 3404 TMP_FREE; \ 3405 return t; \ 3406 } \ 3407 3408 #define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \ 3409 do { \ 3410 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \ 3411 call; \ 3412 SPEED_ROUTINE_COUNT_ZEROS_B (); \ 3413 } while (0) \ 3414 3415 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \ 3416 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero) 3417 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \ 3418 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0) 3419 3420 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \ 3421 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero) 3422 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(call) \ 3423 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0) 3424 3425 3426 #define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \ 3427 { \ 3428 unsigned i, j; \ 3429 mp_limb_t d, dinv=0; \ 3430 mp_ptr xp = s->xp_block - 1; \ 3431 \ 3432 s->time_divisor = SPEED_BLOCK_SIZE; \ 3433 \ 3434 speed_starttime (); \ 3435 i = s->reps; \ 3436 do \ 3437 { \ 3438 j = SPEED_BLOCK_SIZE; \ 3439 do \ 3440 { \ 3441 d = dinv ^ xp[j]; \ 3442 d |= GMP_LIMB_HIGHBIT; \ 3443 do { 
call; } while (0); \ 3444 } \ 3445 while (--j != 0); \ 3446 } \ 3447 while (--i != 0); \ 3448 \ 3449 /* don't let the compiler optimize everything away */ \ 3450 noop_1 (dinv); \ 3451 \ 3452 return speed_endtime(); \ 3453 } 3454 3455 3456 #define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \ 3457 { \ 3458 unsigned i; \ 3459 speed_starttime (); \ 3460 i = s->reps; \ 3461 do \ 3462 function (); \ 3463 while (--i != 0); \ 3464 return speed_endtime (); \ 3465 } 3466 3467 3468 #define SPEED_ROUTINE_MPN_ZERO_CALL(call) \ 3469 { \ 3470 mp_ptr wp; \ 3471 unsigned i; \ 3472 double t; \ 3473 TMP_DECL; \ 3474 \ 3475 SPEED_RESTRICT_COND (s->size >= 0); \ 3476 \ 3477 TMP_MARK; \ 3478 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \ 3479 speed_operand_dst (s, wp, s->size); \ 3480 speed_cache_fill (s); \ 3481 \ 3482 speed_starttime (); \ 3483 i = s->reps; \ 3484 do \ 3485 call; \ 3486 while (--i != 0); \ 3487 t = speed_endtime (); \ 3488 \ 3489 TMP_FREE; \ 3490 return t; \ 3491 } 3492 3493 #define SPEED_ROUTINE_MPN_ZERO(function) \ 3494 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size)) 3495 3496 3497 #endif 3498