1 /* Shared speed subroutines. 2 3 Copyright 1999-2006, 2008-2017, 2019 Free Software Foundation, Inc. 4 5 This file is part of the GNU MP Library. 6 7 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of either: 9 10 * the GNU Lesser General Public License as published by the Free 11 Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14 or 15 16 * the GNU General Public License as published by the Free Software 17 Foundation; either version 2 of the License, or (at your option) any 18 later version. 19 20 or both in parallel, as here. 21 22 The GNU MP Library is distributed in the hope that it will be useful, but 23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 for more details. 26 27 You should have received copies of the GNU General Public License and the 28 GNU Lesser General Public License along with the GNU MP Library. If not, 29 see https://www.gnu.org/licenses/. */ 30 31 #define __GMP_NO_ATTRIBUTE_CONST_PURE 32 33 #include <errno.h> 34 #include <fcntl.h> 35 #include <math.h> 36 #include <stdio.h> 37 #include <stdlib.h> /* for qsort */ 38 #include <string.h> 39 #include <unistd.h> 40 #if 0 41 #include <sys/ioctl.h> 42 #endif 43 44 #include "gmp-impl.h" 45 #include "longlong.h" 46 47 #include "tests.h" 48 #include "speed.h" 49 50 51 int speed_option_addrs = 0; 52 int speed_option_verbose = 0; 53 int speed_option_cycles_broken = 0; 54 55 56 /* Provide __clz_tab even if it's not required, for the benefit of new code 57 being tested with many.pl. 
*/ 58 #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 59 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB 60 #include "mp_clz_tab.c" 61 #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB 62 #endif 63 64 65 void 66 pentium_wbinvd(void) 67 { 68 #if 0 69 { 70 static int fd = -2; 71 72 if (fd == -2) 73 { 74 fd = open ("/dev/wbinvd", O_RDWR); 75 if (fd == -1) 76 perror ("open /dev/wbinvd"); 77 } 78 79 if (fd != -1) 80 ioctl (fd, 0, 0); 81 } 82 #endif 83 84 #if 0 85 #define WBINVDSIZE 1024*1024*2 86 { 87 static char *p = NULL; 88 int i, sum; 89 90 if (p == NULL) 91 p = malloc (WBINVDSIZE); 92 93 #if 0 94 for (i = 0; i < WBINVDSIZE; i++) 95 p[i] = i & 0xFF; 96 #endif 97 98 sum = 0; 99 for (i = 0; i < WBINVDSIZE; i++) 100 sum += p[i]; 101 102 mpn_cache_fill_dummy (sum); 103 } 104 #endif 105 } 106 107 108 int 109 double_cmp_ptr (const double *p, const double *q) 110 { 111 if (*p > *q) return 1; 112 if (*p < *q) return -1; 113 return 0; 114 } 115 116 117 /* Measure the speed of a given routine. 118 119 The routine is run with enough repetitions to make it take at least 120 speed_precision * speed_unittime. This aims to minimize the effects of a 121 limited accuracy time base and the overhead of the measuring itself. 122 123 Measurements are made looking for 4 results within TOLERANCE of each 124 other (or 3 for routines taking longer than 2 seconds). This aims to get 125 an accurate reading even if some runs are bloated by interrupts or task 126 switches or whatever. 127 128 The given (*fun)() is expected to run its function "s->reps" many times 129 and return the total elapsed time measured using speed_starttime() and 130 speed_endtime(). If the function doesn't support the given s->size or 131 s->r, -1.0 should be returned. See the various base routines below. 
*/ 132 133 double 134 speed_measure (double (*fun) (struct speed_params *s), struct speed_params *s) 135 { 136 #define TOLERANCE 1.01 /* 1% */ 137 const int max_zeros = 10; 138 139 struct speed_params s_dummy; 140 int i, j, e; 141 double t[30]; 142 double t_unsorted[30]; 143 double reps_d; 144 int zeros = 0; 145 146 /* Use dummy parameters if caller doesn't provide any. Only a few special 147 "fun"s will cope with this, speed_noop() is one. */ 148 if (s == NULL) 149 { 150 memset (&s_dummy, '\0', sizeof (s_dummy)); 151 s = &s_dummy; 152 } 153 154 s->reps = 1; 155 s->time_divisor = 1.0; 156 for (i = 0; i < numberof (t); i++) 157 { 158 for (;;) 159 { 160 s->src_num = 0; 161 s->dst_num = 0; 162 163 t[i] = (*fun) (s); 164 165 if (speed_option_verbose >= 3) 166 gmp_printf("size=%ld reps=%u r=%Md attempt=%d %.9f\n", 167 (long) s->size, s->reps, s->r, i, t[i]); 168 169 if (t[i] == 0.0) 170 { 171 zeros++; 172 if (zeros > max_zeros) 173 { 174 fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros); 175 abort (); 176 } 177 if (s->reps < 10000) 178 s->reps *= 2; 179 180 continue; 181 } 182 183 if (t[i] == -1.0) 184 return -1.0; 185 186 if (t[i] >= speed_unittime * speed_precision) 187 break; 188 189 /* go to a value of reps to make t[i] >= precision */ 190 reps_d = ceil (1.1 * s->reps 191 * speed_unittime * speed_precision 192 / MAX (t[i], speed_unittime)); 193 if (reps_d > 2e9 || reps_d < 1.0) 194 { 195 fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d); 196 fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n", 197 s->reps, speed_unittime, speed_precision, t[i]); 198 abort (); 199 } 200 s->reps = (unsigned) reps_d; 201 } 202 t[i] /= s->reps; 203 t_unsorted[i] = t[i]; 204 205 if (speed_precision == 0) 206 return t[i]; 207 208 /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */ 209 if (t[0] >= 2.0) 210 e = 3; 211 else 212 e = 4; 213 214 /* Look for e many t[]'s within TOLERANCE of each other to 
consider a 215 valid measurement. Return smallest among them. */ 216 if (i >= e) 217 { 218 qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr); 219 for (j = e-1; j < i; j++) 220 if (t[j] <= t[j-e+1] * TOLERANCE) 221 return t[j-e+1] / s->time_divisor; 222 } 223 } 224 225 fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n", 226 e, (TOLERANCE-1.0)*100.0); 227 fprintf (stderr, " unsorted sorted\n"); 228 fprintf (stderr, " %.12f %.12f is about %.1f%%\n", 229 t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0), 230 100*(TOLERANCE-1.0)); 231 for (i = 0; i < numberof (t); i++) 232 fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]); 233 234 return -1.0; 235 } 236 237 238 /* Read all of ptr,size to get it into the CPU memory cache. 239 240 A call to mpn_cache_fill_dummy() is used to make sure the compiler 241 doesn't optimize away the whole loop. Using "volatile mp_limb_t sum" 242 would work too, but the function call means we don't rely on every 243 compiler actually implementing volatile properly. 244 245 mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking 246 it can inline it. 
*/ 247 248 void 249 mpn_cache_fill (mp_srcptr ptr, mp_size_t size) 250 { 251 mp_limb_t sum = 0; 252 mp_size_t i; 253 254 for (i = 0; i < size; i++) 255 sum += ptr[i]; 256 257 mpn_cache_fill_dummy(sum); 258 } 259 260 261 void 262 mpn_cache_fill_write (mp_ptr ptr, mp_size_t size) 263 { 264 mpn_cache_fill (ptr, size); 265 266 #if 0 267 mpn_random (ptr, size); 268 #endif 269 270 #if 0 271 mp_size_t i; 272 273 for (i = 0; i < size; i++) 274 ptr[i] = i; 275 #endif 276 } 277 278 279 void 280 speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size) 281 { 282 if (s->src_num >= numberof (s->src)) 283 { 284 fprintf (stderr, "speed_operand_src: no room left in s->src[]\n"); 285 abort (); 286 } 287 s->src[s->src_num].ptr = ptr; 288 s->src[s->src_num].size = size; 289 s->src_num++; 290 } 291 292 293 void 294 speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size) 295 { 296 if (s->dst_num >= numberof (s->dst)) 297 { 298 fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n"); 299 abort (); 300 } 301 s->dst[s->dst_num].ptr = ptr; 302 s->dst[s->dst_num].size = size; 303 s->dst_num++; 304 } 305 306 307 void 308 speed_cache_fill (struct speed_params *s) 309 { 310 static struct speed_params prev; 311 int i; 312 313 /* FIXME: need a better way to get the format string for a pointer */ 314 315 if (speed_option_addrs) 316 { 317 int different; 318 319 different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num); 320 for (i = 0; i < s->dst_num; i++) 321 different |= (s->dst[i].ptr != prev.dst[i].ptr); 322 for (i = 0; i < s->src_num; i++) 323 different |= (s->src[i].ptr != prev.src[i].ptr); 324 325 if (different) 326 { 327 if (s->dst_num != 0) 328 { 329 printf ("dst"); 330 for (i = 0; i < s->dst_num; i++) 331 printf (" %08lX", (unsigned long) s->dst[i].ptr); 332 printf (" "); 333 } 334 335 if (s->src_num != 0) 336 { 337 printf ("src"); 338 for (i = 0; i < s->src_num; i++) 339 printf (" %08lX", (unsigned long) s->src[i].ptr); 340 printf (" "); 
341 } 342 printf (" (cf sp approx %08lX)\n", (unsigned long) &different); 343 344 } 345 346 memcpy (&prev, s, sizeof(prev)); 347 } 348 349 switch (s->cache) { 350 case 0: 351 for (i = 0; i < s->dst_num; i++) 352 mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size); 353 for (i = 0; i < s->src_num; i++) 354 mpn_cache_fill (s->src[i].ptr, s->src[i].size); 355 break; 356 case 1: 357 pentium_wbinvd(); 358 break; 359 } 360 } 361 362 363 /* Miscellaneous options accepted by tune and speed programs under -o. */ 364 365 void 366 speed_option_set (const char *s) 367 { 368 int n; 369 370 if (strcmp (s, "addrs") == 0) 371 { 372 speed_option_addrs = 1; 373 } 374 else if (strcmp (s, "verbose") == 0) 375 { 376 speed_option_verbose++; 377 } 378 else if (sscanf (s, "verbose=%d", &n) == 1) 379 { 380 speed_option_verbose = n; 381 } 382 else if (strcmp (s, "cycles-broken") == 0) 383 { 384 speed_option_cycles_broken = 1; 385 } 386 else 387 { 388 printf ("Unrecognised -o option: %s\n", s); 389 exit (1); 390 } 391 } 392 393 394 /* The following are basic speed running routines for various gmp functions. 395 Many are very similar and use speed.h macros. 396 397 Each routine allocates it's own destination space for the result of the 398 function, because only it can know what the function needs. 399 400 speed_starttime() and speed_endtime() are put tight around the code to be 401 measured. Any setups are done outside the timed portion. 402 403 Each routine is responsible for its own cache priming. 404 speed_cache_fill() is a good way to do this, see examples in speed.h. 405 One cache priming possibility, for CPUs with write-allocate cache, and 406 functions that don't take too long, is to do one dummy call before timing 407 so as to cache everything that gets used. But speed_measure() runs a 408 routine at least twice and will take the smaller time, so this might not 409 be necessary. 410 411 Data alignment will be important, for source, destination and temporary 412 workspace. 
A routine can align its destination and workspace. Programs 413 using the routines will ensure s->xp and s->yp are aligned. Aligning 414 onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and 415 s->align_wp2 should be respected where it makes sense to do so. 416 SPEED_TMP_ALLOC_LIMBS is a good way to do this. 417 418 A loop of the following form can be expected to turn into good assembler 419 code on most CPUs, thereby minimizing overhead in the measurement. It 420 can always be assumed s->reps >= 1. 421 422 i = s->reps 423 do 424 foo(); 425 while (--i != 0); 426 427 Additional parameters might be added to "struct speed_params" in the 428 future. Routines should ignore anything they don't use. 429 430 s->size can be used creatively, and s->xp and s->yp can be ignored. For 431 example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is 432 just a user-supplied parameter. speed_mpn_lshift() uses it as a shift, 433 speed_mpn_mul_1() uses it as a multiplier. */ 434 435 436 /* MPN_COPY etc can be macros, so the _CALL forms are necessary */ 437 double 438 speed_MPN_COPY (struct speed_params *s) 439 { 440 SPEED_ROUTINE_MPN_COPY (MPN_COPY); 441 } 442 double 443 speed_MPN_COPY_INCR (struct speed_params *s) 444 { 445 SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR); 446 } 447 double 448 speed_MPN_COPY_DECR (struct speed_params *s) 449 { 450 SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR); 451 } 452 #if HAVE_NATIVE_mpn_copyi 453 double 454 speed_mpn_copyi (struct speed_params *s) 455 { 456 SPEED_ROUTINE_MPN_COPY (mpn_copyi); 457 } 458 #endif 459 #if HAVE_NATIVE_mpn_copyd 460 double 461 speed_mpn_copyd (struct speed_params *s) 462 { 463 SPEED_ROUTINE_MPN_COPY (mpn_copyd); 464 } 465 #endif 466 double 467 speed_memcpy (struct speed_params *s) 468 { 469 SPEED_ROUTINE_MPN_COPY_BYTES (memcpy); 470 } 471 double 472 speed_mpn_com (struct speed_params *s) 473 { 474 SPEED_ROUTINE_MPN_COPY (mpn_com); 475 } 476 double 477 speed_mpn_neg (struct speed_params *s) 478 { 479 
SPEED_ROUTINE_MPN_COPY (mpn_neg); 480 } 481 double 482 speed_mpn_sec_tabselect (struct speed_params *s) 483 { 484 SPEED_ROUTINE_MPN_TABSELECT (mpn_sec_tabselect); 485 } 486 487 488 double 489 speed_mpn_addmul_1 (struct speed_params *s) 490 { 491 SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1); 492 } 493 double 494 speed_mpn_submul_1 (struct speed_params *s) 495 { 496 SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1); 497 } 498 499 #if HAVE_NATIVE_mpn_addmul_2 500 double 501 speed_mpn_addmul_2 (struct speed_params *s) 502 { 503 SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2); 504 } 505 #endif 506 #if HAVE_NATIVE_mpn_addmul_3 507 double 508 speed_mpn_addmul_3 (struct speed_params *s) 509 { 510 SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3); 511 } 512 #endif 513 #if HAVE_NATIVE_mpn_addmul_4 514 double 515 speed_mpn_addmul_4 (struct speed_params *s) 516 { 517 SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4); 518 } 519 #endif 520 #if HAVE_NATIVE_mpn_addmul_5 521 double 522 speed_mpn_addmul_5 (struct speed_params *s) 523 { 524 SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5); 525 } 526 #endif 527 #if HAVE_NATIVE_mpn_addmul_6 528 double 529 speed_mpn_addmul_6 (struct speed_params *s) 530 { 531 SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6); 532 } 533 #endif 534 #if HAVE_NATIVE_mpn_addmul_7 535 double 536 speed_mpn_addmul_7 (struct speed_params *s) 537 { 538 SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7); 539 } 540 #endif 541 #if HAVE_NATIVE_mpn_addmul_8 542 double 543 speed_mpn_addmul_8 (struct speed_params *s) 544 { 545 SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8); 546 } 547 #endif 548 549 double 550 speed_mpn_mul_1 (struct speed_params *s) 551 { 552 SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1); 553 } 554 double 555 speed_mpn_mul_1_inplace (struct speed_params *s) 556 { 557 SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1); 558 } 559 560 #if HAVE_NATIVE_mpn_mul_2 561 double 562 speed_mpn_mul_2 (struct speed_params *s) 563 { 564 SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2); 565 } 566 #endif 567 #if HAVE_NATIVE_mpn_mul_3 568 double 569 
speed_mpn_mul_3 (struct speed_params *s) 570 { 571 SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3); 572 } 573 #endif 574 #if HAVE_NATIVE_mpn_mul_4 575 double 576 speed_mpn_mul_4 (struct speed_params *s) 577 { 578 SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4); 579 } 580 #endif 581 #if HAVE_NATIVE_mpn_mul_5 582 double 583 speed_mpn_mul_5 (struct speed_params *s) 584 { 585 SPEED_ROUTINE_MPN_UNARY_5 (mpn_mul_5); 586 } 587 #endif 588 #if HAVE_NATIVE_mpn_mul_6 589 double 590 speed_mpn_mul_6 (struct speed_params *s) 591 { 592 SPEED_ROUTINE_MPN_UNARY_6 (mpn_mul_6); 593 } 594 #endif 595 596 597 double 598 speed_mpn_lshift (struct speed_params *s) 599 { 600 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift); 601 } 602 double 603 speed_mpn_lshiftc (struct speed_params *s) 604 { 605 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc); 606 } 607 double 608 speed_mpn_rshift (struct speed_params *s) 609 { 610 SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift); 611 } 612 613 614 /* The carry-in variants (if available) are good for measuring because they 615 won't skip a division if high<divisor. Alternately, use -1 as a divisor 616 with the plain _1 forms. 
*/ 617 double 618 speed_mpn_divrem_1 (struct speed_params *s) 619 { 620 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1); 621 } 622 double 623 speed_mpn_divrem_1f (struct speed_params *s) 624 { 625 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1); 626 } 627 #if HAVE_NATIVE_mpn_divrem_1c 628 double 629 speed_mpn_divrem_1c (struct speed_params *s) 630 { 631 SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c); 632 } 633 double 634 speed_mpn_divrem_1cf (struct speed_params *s) 635 { 636 SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c); 637 } 638 #endif 639 640 double 641 speed_mpn_divrem_1_div (struct speed_params *s) 642 { 643 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div); 644 } 645 double 646 speed_mpn_divrem_1f_div (struct speed_params *s) 647 { 648 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div); 649 } 650 double 651 speed_mpn_divrem_1_inv (struct speed_params *s) 652 { 653 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv); 654 } 655 double 656 speed_mpn_divrem_1f_inv (struct speed_params *s) 657 { 658 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv); 659 } 660 double 661 speed_mpn_mod_1_div (struct speed_params *s) 662 { 663 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div); 664 } 665 double 666 speed_mpn_mod_1_inv (struct speed_params *s) 667 { 668 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv); 669 } 670 671 double 672 speed_mpn_preinv_divrem_1 (struct speed_params *s) 673 { 674 SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1); 675 } 676 double 677 speed_mpn_preinv_divrem_1f (struct speed_params *s) 678 { 679 SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1); 680 } 681 682 #if GMP_NUMB_BITS % 4 == 0 683 double 684 speed_mpn_mod_34lsub1 (struct speed_params *s) 685 { 686 SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1); 687 } 688 #endif 689 690 double 691 speed_mpn_divrem_2 (struct speed_params *s) 692 { 693 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2); 694 } 695 double 696 speed_mpn_divrem_2_div (struct speed_params *s) 697 { 698 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div); 699 } 700 double 701 
speed_mpn_divrem_2_inv (struct speed_params *s) 702 { 703 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv); 704 } 705 706 double 707 speed_mpn_div_qr_1n_pi1 (struct speed_params *s) 708 { 709 SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1); 710 } 711 double 712 speed_mpn_div_qr_1n_pi1_1 (struct speed_params *s) 713 { 714 SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_1); 715 } 716 double 717 speed_mpn_div_qr_1n_pi1_2 (struct speed_params *s) 718 { 719 SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2); 720 } 721 722 double 723 speed_mpn_div_qr_1 (struct speed_params *s) 724 { 725 SPEED_ROUTINE_MPN_DIV_QR_1 (mpn_div_qr_1); 726 } 727 728 double 729 speed_mpn_div_qr_2n (struct speed_params *s) 730 { 731 SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 1); 732 } 733 double 734 speed_mpn_div_qr_2u (struct speed_params *s) 735 { 736 SPEED_ROUTINE_MPN_DIV_QR_2 (mpn_div_qr_2, 0); 737 } 738 739 double 740 speed_mpn_mod_1 (struct speed_params *s) 741 { 742 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1); 743 } 744 #if HAVE_NATIVE_mpn_mod_1c 745 double 746 speed_mpn_mod_1c (struct speed_params *s) 747 { 748 SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c); 749 } 750 #endif 751 double 752 speed_mpn_preinv_mod_1 (struct speed_params *s) 753 { 754 SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1); 755 } 756 double 757 speed_mpn_mod_1_1 (struct speed_params *s) 758 { 759 SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps); 760 } 761 double 762 speed_mpn_mod_1_1_1 (struct speed_params *s) 763 { 764 SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_1,mpn_mod_1_1p_cps_1); 765 } 766 double 767 speed_mpn_mod_1_1_2 (struct speed_params *s) 768 { 769 SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p_2,mpn_mod_1_1p_cps_2); 770 } 771 double 772 speed_mpn_mod_1_2 (struct speed_params *s) 773 { 774 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2); 775 } 776 double 777 speed_mpn_mod_1_3 (struct speed_params *s) 778 { 779 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3); 780 } 781 double 782 
speed_mpn_mod_1_4 (struct speed_params *s) 783 { 784 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4); 785 } 786 787 double 788 speed_mpn_divexact_1 (struct speed_params *s) 789 { 790 SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1); 791 } 792 793 double 794 speed_mpn_divexact_by3 (struct speed_params *s) 795 { 796 SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3); 797 } 798 799 double 800 speed_mpn_bdiv_dbm1c (struct speed_params *s) 801 { 802 SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c); 803 } 804 805 double 806 speed_mpn_bdiv_q_1 (struct speed_params *s) 807 { 808 SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1); 809 } 810 811 double 812 speed_mpn_pi1_bdiv_q_1 (struct speed_params *s) 813 { 814 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1); 815 } 816 817 #if HAVE_NATIVE_mpn_modexact_1_odd 818 double 819 speed_mpn_modexact_1_odd (struct speed_params *s) 820 { 821 SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd); 822 } 823 #endif 824 825 double 826 speed_mpn_modexact_1c_odd (struct speed_params *s) 827 { 828 SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd); 829 } 830 831 double 832 speed_mpz_mod (struct speed_params *s) 833 { 834 SPEED_ROUTINE_MPZ_MOD (mpz_mod); 835 } 836 837 double 838 speed_mpn_sbpi1_div_qr (struct speed_params *s) 839 { 840 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0); 841 } 842 double 843 speed_mpn_dcpi1_div_qr (struct speed_params *s) 844 { 845 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3); 846 } 847 double 848 speed_mpn_sbpi1_divappr_q (struct speed_params *s) 849 { 850 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0); 851 } 852 double 853 speed_mpn_dcpi1_divappr_q (struct speed_params *s) 854 { 855 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3); 856 } 857 double 858 speed_mpn_mu_div_qr (struct speed_params *s) 859 { 860 SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch); 861 } 862 double 863 speed_mpn_mu_divappr_q (struct speed_params *s) 864 { 865 
SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch); 866 } 867 double 868 speed_mpn_mu_div_q (struct speed_params *s) 869 { 870 SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch); 871 } 872 double 873 speed_mpn_mupi_div_qr (struct speed_params *s) 874 { 875 SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_preinv_mu_div_qr_itch); 876 } 877 878 double 879 speed_mpn_sbpi1_bdiv_qr (struct speed_params *s) 880 { 881 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr); 882 } 883 double 884 speed_mpn_dcpi1_bdiv_qr (struct speed_params *s) 885 { 886 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr); 887 } 888 double 889 speed_mpn_sbpi1_bdiv_q (struct speed_params *s) 890 { 891 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q); 892 } 893 double 894 speed_mpn_dcpi1_bdiv_q (struct speed_params *s) 895 { 896 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q); 897 } 898 double 899 speed_mpn_sbpi1_bdiv_r (struct speed_params *s) 900 { 901 SPEED_ROUTINE_MPN_PI1_BDIV_R (mpn_sbpi1_bdiv_r); 902 } 903 double 904 speed_mpn_mu_bdiv_q (struct speed_params *s) 905 { 906 SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch); 907 } 908 double 909 speed_mpn_mu_bdiv_qr (struct speed_params *s) 910 { 911 SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch); 912 } 913 914 double 915 speed_mpn_broot (struct speed_params *s) 916 { 917 SPEED_ROUTINE_MPN_BROOT (mpn_broot); 918 } 919 double 920 speed_mpn_broot_invm1 (struct speed_params *s) 921 { 922 SPEED_ROUTINE_MPN_BROOT (mpn_broot_invm1); 923 } 924 double 925 speed_mpn_brootinv (struct speed_params *s) 926 { 927 SPEED_ROUTINE_MPN_BROOTINV (mpn_brootinv, 5*s->size); 928 } 929 930 double 931 speed_mpn_binvert (struct speed_params *s) 932 { 933 SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch); 934 } 935 936 double 937 speed_mpn_invert (struct speed_params *s) 938 { 939 SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch); 940 } 941 942 double 943 speed_mpn_invertappr (struct 
speed_params *s) 944 { 945 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch); 946 } 947 948 double 949 speed_mpn_ni_invertappr (struct speed_params *s) 950 { 951 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch); 952 } 953 954 double 955 speed_mpn_sec_invert (struct speed_params *s) 956 { 957 SPEED_ROUTINE_MPN_SEC_INVERT (mpn_sec_invert, mpn_sec_invert_itch); 958 } 959 960 double 961 speed_mpn_redc_1 (struct speed_params *s) 962 { 963 SPEED_ROUTINE_REDC_1 (mpn_redc_1); 964 } 965 double 966 speed_mpn_redc_2 (struct speed_params *s) 967 { 968 SPEED_ROUTINE_REDC_2 (mpn_redc_2); 969 } 970 double 971 speed_mpn_redc_n (struct speed_params *s) 972 { 973 SPEED_ROUTINE_REDC_N (mpn_redc_n); 974 } 975 976 977 double 978 speed_mpn_popcount (struct speed_params *s) 979 { 980 SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount); 981 } 982 double 983 speed_mpn_hamdist (struct speed_params *s) 984 { 985 SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist); 986 } 987 988 989 double 990 speed_mpn_add_n (struct speed_params *s) 991 { 992 SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n); 993 } 994 double 995 speed_mpn_sub_n (struct speed_params *s) 996 { 997 SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n); 998 } 999 double 1000 speed_mpn_add_1 (struct speed_params *s) 1001 { 1002 SPEED_ROUTINE_MPN_UNARY_1 (mpn_add_1); 1003 } 1004 double 1005 speed_mpn_add_1_inplace (struct speed_params *s) 1006 { 1007 SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_add_1); 1008 } 1009 double 1010 speed_mpn_sub_1 (struct speed_params *s) 1011 { 1012 SPEED_ROUTINE_MPN_UNARY_1 (mpn_sub_1); 1013 } 1014 double 1015 speed_mpn_sub_1_inplace (struct speed_params *s) 1016 { 1017 SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_sub_1); 1018 } 1019 1020 double 1021 speed_mpn_add_err1_n (struct speed_params *s) 1022 { 1023 SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_add_err1_n); 1024 } 1025 double 1026 speed_mpn_sub_err1_n (struct speed_params *s) 1027 { 1028 SPEED_ROUTINE_MPN_BINARY_ERR1_N (mpn_sub_err1_n); 1029 } 1030 double 1031 
speed_mpn_add_err2_n (struct speed_params *s) 1032 { 1033 SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_add_err2_n); 1034 } 1035 double 1036 speed_mpn_sub_err2_n (struct speed_params *s) 1037 { 1038 SPEED_ROUTINE_MPN_BINARY_ERR2_N (mpn_sub_err2_n); 1039 } 1040 double 1041 speed_mpn_add_err3_n (struct speed_params *s) 1042 { 1043 SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_add_err3_n); 1044 } 1045 double 1046 speed_mpn_sub_err3_n (struct speed_params *s) 1047 { 1048 SPEED_ROUTINE_MPN_BINARY_ERR3_N (mpn_sub_err3_n); 1049 } 1050 1051 1052 #if HAVE_NATIVE_mpn_add_n_sub_n 1053 double 1054 speed_mpn_add_n_sub_n (struct speed_params *s) 1055 { 1056 SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size)); 1057 } 1058 #endif 1059 1060 #if HAVE_NATIVE_mpn_addlsh1_n == 1 1061 double 1062 speed_mpn_addlsh1_n (struct speed_params *s) 1063 { 1064 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n); 1065 } 1066 #endif 1067 #if HAVE_NATIVE_mpn_sublsh1_n == 1 1068 double 1069 speed_mpn_sublsh1_n (struct speed_params *s) 1070 { 1071 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n); 1072 } 1073 #endif 1074 #if HAVE_NATIVE_mpn_addlsh1_n_ip1 1075 double 1076 speed_mpn_addlsh1_n_ip1 (struct speed_params *s) 1077 { 1078 SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip1); 1079 } 1080 #endif 1081 #if HAVE_NATIVE_mpn_addlsh1_n_ip2 1082 double 1083 speed_mpn_addlsh1_n_ip2 (struct speed_params *s) 1084 { 1085 SPEED_ROUTINE_MPN_COPY (mpn_addlsh1_n_ip2); 1086 } 1087 #endif 1088 #if HAVE_NATIVE_mpn_sublsh1_n_ip1 1089 double 1090 speed_mpn_sublsh1_n_ip1 (struct speed_params *s) 1091 { 1092 SPEED_ROUTINE_MPN_COPY (mpn_sublsh1_n_ip1); 1093 } 1094 #endif 1095 #if HAVE_NATIVE_mpn_rsblsh1_n == 1 1096 double 1097 speed_mpn_rsblsh1_n (struct speed_params *s) 1098 { 1099 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n); 1100 } 1101 #endif 1102 #if HAVE_NATIVE_mpn_addlsh2_n == 1 1103 double 1104 speed_mpn_addlsh2_n (struct speed_params *s) 1105 { 1106 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n); 1107 } 1108 #endif 1109 #if 
HAVE_NATIVE_mpn_sublsh2_n == 1 1110 double 1111 speed_mpn_sublsh2_n (struct speed_params *s) 1112 { 1113 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n); 1114 } 1115 #endif 1116 #if HAVE_NATIVE_mpn_addlsh2_n_ip1 1117 double 1118 speed_mpn_addlsh2_n_ip1 (struct speed_params *s) 1119 { 1120 SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip1); 1121 } 1122 #endif 1123 #if HAVE_NATIVE_mpn_addlsh2_n_ip2 1124 double 1125 speed_mpn_addlsh2_n_ip2 (struct speed_params *s) 1126 { 1127 SPEED_ROUTINE_MPN_COPY (mpn_addlsh2_n_ip2); 1128 } 1129 #endif 1130 #if HAVE_NATIVE_mpn_sublsh2_n_ip1 1131 double 1132 speed_mpn_sublsh2_n_ip1 (struct speed_params *s) 1133 { 1134 SPEED_ROUTINE_MPN_COPY (mpn_sublsh2_n_ip1); 1135 } 1136 #endif 1137 #if HAVE_NATIVE_mpn_rsblsh2_n == 1 1138 double 1139 speed_mpn_rsblsh2_n (struct speed_params *s) 1140 { 1141 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n); 1142 } 1143 #endif 1144 #if HAVE_NATIVE_mpn_addlsh_n 1145 double 1146 speed_mpn_addlsh_n (struct speed_params *s) 1147 { 1148 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_addlsh_n (wp, xp, yp, s->size, 7)); 1149 } 1150 #endif 1151 #if HAVE_NATIVE_mpn_sublsh_n 1152 double 1153 speed_mpn_sublsh_n (struct speed_params *s) 1154 { 1155 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_sublsh_n (wp, xp, yp, s->size, 7)); 1156 } 1157 #endif 1158 #if HAVE_NATIVE_mpn_addlsh_n_ip1 1159 double 1160 speed_mpn_addlsh_n_ip1 (struct speed_params *s) 1161 { 1162 SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip1 (wp, s->xp, s->size, 7)); 1163 } 1164 #endif 1165 #if HAVE_NATIVE_mpn_addlsh_n_ip2 1166 double 1167 speed_mpn_addlsh_n_ip2 (struct speed_params *s) 1168 { 1169 SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_addlsh_n_ip2 (wp, s->xp, s->size, 7)); 1170 } 1171 #endif 1172 #if HAVE_NATIVE_mpn_sublsh_n_ip1 1173 double 1174 speed_mpn_sublsh_n_ip1 (struct speed_params *s) 1175 { 1176 SPEED_ROUTINE_MPN_UNARY_1_CALL (mpn_sublsh_n_ip1 (wp, s->xp, s->size, 7)); 1177 } 1178 #endif 1179 #if HAVE_NATIVE_mpn_rsblsh_n 1180 double 1181 speed_mpn_rsblsh_n (struct 
speed_params *s) 1182 { 1183 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_rsblsh_n (wp, xp, yp, s->size, 7)); 1184 } 1185 #endif 1186 #if HAVE_NATIVE_mpn_rsh1add_n 1187 double 1188 speed_mpn_rsh1add_n (struct speed_params *s) 1189 { 1190 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n); 1191 } 1192 #endif 1193 #if HAVE_NATIVE_mpn_rsh1sub_n 1194 double 1195 speed_mpn_rsh1sub_n (struct speed_params *s) 1196 { 1197 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n); 1198 } 1199 #endif 1200 1201 double 1202 speed_mpn_cnd_add_n (struct speed_params *s) 1203 { 1204 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_add_n (1, wp, xp, yp, s->size)); 1205 } 1206 double 1207 speed_mpn_cnd_sub_n (struct speed_params *s) 1208 { 1209 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_cnd_sub_n (1, wp, xp, yp, s->size)); 1210 } 1211 1212 /* mpn_and_n etc can be macros and so have to be handled with 1213 SPEED_ROUTINE_MPN_BINARY_N_CALL forms */ 1214 double 1215 speed_mpn_and_n (struct speed_params *s) 1216 { 1217 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, xp, yp, s->size)); 1218 } 1219 double 1220 speed_mpn_andn_n (struct speed_params *s) 1221 { 1222 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, xp, yp, s->size)); 1223 } 1224 double 1225 speed_mpn_nand_n (struct speed_params *s) 1226 { 1227 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, xp, yp, s->size)); 1228 } 1229 double 1230 speed_mpn_ior_n (struct speed_params *s) 1231 { 1232 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, xp, yp, s->size)); 1233 } 1234 double 1235 speed_mpn_iorn_n (struct speed_params *s) 1236 { 1237 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, xp, yp, s->size)); 1238 } 1239 double 1240 speed_mpn_nior_n (struct speed_params *s) 1241 { 1242 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, xp, yp, s->size)); 1243 } 1244 double 1245 speed_mpn_xor_n (struct speed_params *s) 1246 { 1247 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, xp, yp, s->size)); 1248 } 1249 double 1250 speed_mpn_xnor_n (struct speed_params *s) 1251 { 1252 
SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, xp, yp, s->size)); 1253 } 1254 1255 1256 double 1257 speed_mpn_mul_n (struct speed_params *s) 1258 { 1259 SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n); 1260 } 1261 double 1262 speed_mpn_sqr (struct speed_params *s) 1263 { 1264 SPEED_ROUTINE_MPN_SQR (mpn_sqr); 1265 } 1266 double 1267 speed_mpn_mul_n_sqr (struct speed_params *s) 1268 { 1269 SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size)); 1270 } 1271 1272 double 1273 speed_mpn_mul_basecase (struct speed_params *s) 1274 { 1275 SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase); 1276 } 1277 double 1278 speed_mpn_mul (struct speed_params *s) 1279 { 1280 SPEED_ROUTINE_MPN_MUL(mpn_mul); 1281 } 1282 double 1283 speed_mpn_sqr_basecase (struct speed_params *s) 1284 { 1285 /* FIXME: size restrictions on some versions of sqr_basecase */ 1286 SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase); 1287 } 1288 1289 #if HAVE_NATIVE_mpn_sqr_diagonal 1290 double 1291 speed_mpn_sqr_diagonal (struct speed_params *s) 1292 { 1293 SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal); 1294 } 1295 #endif 1296 1297 #if HAVE_NATIVE_mpn_sqr_diag_addlsh1 1298 double 1299 speed_mpn_sqr_diag_addlsh1 (struct speed_params *s) 1300 { 1301 SPEED_ROUTINE_MPN_SQR_DIAG_ADDLSH1_CALL (mpn_sqr_diag_addlsh1 (wp, tp, s->xp, s->size)); 1302 } 1303 #endif 1304 1305 double 1306 speed_mpn_toom2_sqr (struct speed_params *s) 1307 { 1308 SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr); 1309 } 1310 double 1311 speed_mpn_toom3_sqr (struct speed_params *s) 1312 { 1313 SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr); 1314 } 1315 double 1316 speed_mpn_toom4_sqr (struct speed_params *s) 1317 { 1318 SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr); 1319 } 1320 double 1321 speed_mpn_toom6_sqr (struct speed_params *s) 1322 { 1323 SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr); 1324 } 1325 double 1326 speed_mpn_toom8_sqr (struct speed_params *s) 1327 { 1328 SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr); 1329 } 1330 double 1331 speed_mpn_toom22_mul (struct speed_params 
*s) 1332 { 1333 SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul); 1334 } 1335 double 1336 speed_mpn_toom33_mul (struct speed_params *s) 1337 { 1338 SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul); 1339 } 1340 double 1341 speed_mpn_toom44_mul (struct speed_params *s) 1342 { 1343 SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul); 1344 } 1345 double 1346 speed_mpn_toom6h_mul (struct speed_params *s) 1347 { 1348 SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul); 1349 } 1350 double 1351 speed_mpn_toom8h_mul (struct speed_params *s) 1352 { 1353 SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul); 1354 } 1355 /* The timers below use macros without the _N suffix; presumably these are the unequal operand size variants -- confirm against the macro definitions, which are not in this part of the file. */ 1356 double 1357 speed_mpn_toom32_mul (struct speed_params *s) 1358 { 1359 SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul); 1360 } 1361 double 1362 speed_mpn_toom42_mul (struct speed_params *s) 1363 { 1364 SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul); 1365 } 1366 double 1367 speed_mpn_toom43_mul (struct speed_params *s) 1368 { 1369 SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul); 1370 } 1371 double 1372 speed_mpn_toom63_mul (struct speed_params *s) 1373 { 1374 SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul); 1375 } 1376 /* Each X_for_Y timer runs routine X through a macro named for the X/Y pairing. Presumably the macro picks operand sizes on which X and Y can be compared directly -- confirm in the macro definitions. */ double 1377 speed_mpn_toom32_for_toom43_mul (struct speed_params *s) 1378 { 1379 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul); 1380 } 1381 double 1382 speed_mpn_toom43_for_toom32_mul (struct speed_params *s) 1383 { 1384 SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul); 1385 } 1386 double 1387 speed_mpn_toom32_for_toom53_mul (struct speed_params *s) 1388 { 1389 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul); 1390 } 1391 double 1392 speed_mpn_toom53_for_toom32_mul (struct speed_params *s) 1393 { 1394 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul); 1395 } 1396 double 1397 speed_mpn_toom42_for_toom53_mul (struct speed_params *s) 1398 { 1399 SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul); 1400 } 1401 double 1402 speed_mpn_toom53_for_toom42_mul (struct speed_params *s) 1403 { 1404 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL 
(mpn_toom53_mul); 1405 } 1406 double 1407 speed_mpn_toom43_for_toom54_mul (struct speed_params *s) 1408 { 1409 SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM54_MUL (mpn_toom43_mul); 1410 } 1411 double 1412 speed_mpn_toom54_for_toom43_mul (struct speed_params *s) 1413 { 1414 SPEED_ROUTINE_MPN_TOOM54_FOR_TOOM43_MUL (mpn_toom54_mul); 1415 } 1416 /* mpn_nussbaumer_mul on two s->size limb operands. The _sqr timer passes s->xp for both operands and uses the SQR_CALL macro instead of MUL_N_CALL. */ 1417 double 1418 speed_mpn_nussbaumer_mul (struct speed_params *s) 1419 { 1420 SPEED_ROUTINE_MPN_MUL_N_CALL 1421 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size)); 1422 } 1423 double 1424 speed_mpn_nussbaumer_mul_sqr (struct speed_params *s) 1425 { 1426 SPEED_ROUTINE_MPN_SQR_CALL 1427 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size)); 1428 } 1429 /* mpn_mul_fft_full timers, built only when WANT_OLD_FFT_FULL is non-zero. Same multiply/square pairing as for mpn_nussbaumer_mul. */ 1430 #if WANT_OLD_FFT_FULL 1431 double 1432 speed_mpn_mul_fft_full (struct speed_params *s) 1433 { 1434 SPEED_ROUTINE_MPN_MUL_N_CALL 1435 (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size)); 1436 } 1437 double 1438 speed_mpn_mul_fft_full_sqr (struct speed_params *s) 1439 { 1440 SPEED_ROUTINE_MPN_SQR_CALL 1441 (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size)); 1442 } 1443 #endif 1444 1445 /* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's 1446 used as k, otherwise the best k for the size is used. If s->size isn't a 1447 multiple of 2^k it's rounded up to make the effective operation size. 
*/ 1448 /* Macro parameters: "call" is the expression executed s->reps times inside the timed loop; it may refer to the locals wp, pl and k declared here. "sqr" is passed to mpn_fft_best_k when s->r is 0 and, when non-zero, stops s->yp being registered as a source operand. The destination wp gets pl+1 limbs, with pl = mpn_fft_next_size (s->size, k). Sizes below 1 are refused via SPEED_RESTRICT_COND. */ 1449 #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \ 1450 { \ 1451 mp_ptr wp; \ 1452 mp_size_t pl; \ 1453 int k; \ 1454 unsigned i; \ 1455 double t; \ 1456 TMP_DECL; \ 1457 \ 1458 SPEED_RESTRICT_COND (s->size >= 1); \ 1459 \ 1460 if (s->r != 0) \ 1461 k = s->r; \ 1462 else \ 1463 k = mpn_fft_best_k (s->size, sqr); \ 1464 \ 1465 TMP_MARK; \ 1466 pl = mpn_fft_next_size (s->size, k); \ 1467 SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \ 1468 \ 1469 speed_operand_src (s, s->xp, s->size); \ 1470 if (!sqr) \ 1471 speed_operand_src (s, s->yp, s->size); \ 1472 speed_operand_dst (s, wp, pl+1); \ 1473 speed_cache_fill (s); \ 1474 \ 1475 speed_starttime (); \ 1476 i = s->reps; \ 1477 do \ 1478 call; \ 1479 while (--i != 0); \ 1480 t = speed_endtime (); \ 1481 \ 1482 TMP_FREE; \ 1483 return t; \ 1484 } 1485 /* Multiply: x times y, sqr argument 0. */ 1486 double 1487 speed_mpn_mul_fft (struct speed_params *s) 1488 { 1489 SPEED_ROUTINE_MPN_MUL_FFT_CALL 1490 (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0); 1491 } 1492 /* Square: x times x, sqr argument 1. */ 1493 double 1494 speed_mpn_mul_fft_sqr (struct speed_params *s) 1495 { 1496 SPEED_ROUTINE_MPN_MUL_FFT_CALL 1497 (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1); 1498 } 1499 1500 double 1501 speed_mpn_fft_mul (struct speed_params *s) 1502 { 1503 SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size)); 1504 } 1505 /* Squaring through mpn_fft_mul with s->xp as both operands. */ 1506 double 1507 speed_mpn_fft_sqr (struct speed_params *s) 1508 { 1509 SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size)); 1510 } 1511 1512 double 1513 speed_mpn_sqrlo (struct speed_params *s) 1514 { 1515 SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo); 1516 } 1517 /* Only sizes satisfying both threshold tests below are measured; what SPEED_RESTRICT_COND does on failure is defined elsewhere (presumably it returns -1.0 -- confirm). */ double 1518 speed_mpn_sqrlo_basecase (struct speed_params *s) 1519 { 1520 SPEED_RESTRICT_COND (ABOVE_THRESHOLD (s->size, MIN (3, SQRLO_BASECASE_THRESHOLD)) 1521 && BELOW_THRESHOLD (s->size, SQRLO_DC_THRESHOLD)); 1522 SPEED_ROUTINE_MPN_SQRLO (mpn_sqrlo_basecase); 1523 } 1524 double 1525 speed_mpn_mullo_n (struct speed_params *s) 1526 { 1527 
SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n); 1528 } 1529 double 1530 speed_mpn_mullo_basecase (struct speed_params *s) 1531 { 1532 SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase); 1533 } 1534 /* Middle product timers; each body is a single macro invocation naming the routine. */ 1535 double 1536 speed_mpn_mulmid_basecase (struct speed_params *s) 1537 { 1538 SPEED_ROUTINE_MPN_MULMID (mpn_mulmid_basecase); 1539 } 1540 1541 double 1542 speed_mpn_mulmid (struct speed_params *s) 1543 { 1544 SPEED_ROUTINE_MPN_MULMID (mpn_mulmid); 1545 } 1546 1547 double 1548 speed_mpn_mulmid_n (struct speed_params *s) 1549 { 1550 SPEED_ROUTINE_MPN_MULMID_N (mpn_mulmid_n); 1551 } 1552 1553 double 1554 speed_mpn_toom42_mulmid (struct speed_params *s) 1555 { 1556 SPEED_ROUTINE_MPN_TOOM42_MULMID (mpn_toom42_mulmid); 1557 } 1558 /* mulmod/sqrmod B^n-1 timers. The _CALL forms pass s->size for every size argument; wp and tp are locals supplied by the macro (not visible here). */ 1559 double 1560 speed_mpn_mulmod_bnm1 (struct speed_params *s) 1561 { 1562 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp)); 1563 } 1564 1565 double 1566 speed_mpn_bc_mulmod_bnm1 (struct speed_params *s) 1567 { 1568 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp)); 1569 } 1570 /* Same routine as speed_mpn_mulmod_bnm1 but through the _ROUNDED macro; presumably the size is rounded before the call -- confirm in the macro definition. */ 1571 double 1572 speed_mpn_mulmod_bnm1_rounded (struct speed_params *s) 1573 { 1574 SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1); 1575 } 1576 1577 double 1578 speed_mpn_sqrmod_bnm1 (struct speed_params *s) 1579 { 1580 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp)); 1581 } 1582 1583 double 1584 speed_mpn_matrix22_mul (struct speed_params *s) 1585 { 1586 /* Speed params only includes 2 inputs, so we have to invent the 1587 other 6. 
*/ 1588 1589 mp_ptr a; 1590 mp_ptr r; 1591 mp_ptr b; 1592 mp_ptr tp; 1593 mp_size_t itch; 1594 unsigned i; 1595 double t; 1596 TMP_DECL; 1597 1598 TMP_MARK; 1599 SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp); 1600 SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp); 1601 SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp); 1602 /* a and b each hold four s->size limb blocks: the first is copied from s->xp (resp. s->yp), the other three are filled by mpn_random. */ 1603 MPN_COPY (a, s->xp, s->size); 1604 mpn_random (a + s->size, 3 * s->size); 1605 MPN_COPY (b, s->yp, s->size); 1606 mpn_random (b + s->size, 3 * s->size); 1607 /* Scratch space of the size reported by mpn_matrix22_mul_itch. */ 1608 itch = mpn_matrix22_mul_itch (s->size, s->size); 1609 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); 1610 1611 speed_operand_src (s, a, 4 * s->size); 1612 speed_operand_src (s, b, 4 * s->size); 1613 speed_operand_dst (s, r, 8 * s->size + 4); 1614 speed_operand_dst (s, tp, itch); 1615 speed_cache_fill (s); 1616 1617 speed_starttime (); 1618 i = s->reps; 1619 do 1620 { 1621 mp_size_t sz = s->size; /* r is reloaded from a on every repetition, presumably because mpn_matrix22_mul overwrites it -- confirm. The four blocks sit at offsets 0, 2*sz+1, 4*sz+2 and 6*sz+3 of r. Note the copies are inside the timed region. */ 1622 MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz); 1623 MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz); 1624 MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz); 1625 MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz); 1626 mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz, 1627 b, b + 1 * sz, b + 2 * sz, b + 3 * sz, sz, 1628 tp); 1629 } 1630 while (--i != 0); 1631 t = speed_endtime(); 1632 TMP_FREE; 1633 return t; 1634 } 1635 /* hgcd2 timers: the same macro applied to mpn_hgcd2 and to the numbered variants mpn_hgcd2_1 .. mpn_hgcd2_5. */ 1636 double 1637 speed_mpn_hgcd2 (struct speed_params *s) 1638 { 1639 SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2); 1640 } 1641 double 1642 speed_mpn_hgcd2_1 (struct speed_params *s) 1643 { 1644 SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_1); 1645 } 1646 double 1647 speed_mpn_hgcd2_2 (struct speed_params *s) 1648 { 1649 SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_2); 1650 } 1651 double 1652 speed_mpn_hgcd2_3 (struct speed_params *s) 1653 { 1654 SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_3); 1655 } 1656 double 1657 speed_mpn_hgcd2_4 (struct speed_params *s) 1658 { 1659 SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_4); 1660 } 1661 double 1662 speed_mpn_hgcd2_5 (struct speed_params *s) 1663 { 1664 
SPEED_ROUTINE_MPN_HGCD2 (mpn_hgcd2_5); 1665 } 1666 /* hgcd timers: each macro invocation names the routine and the matching *_itch function (presumably used to size its scratch space -- confirm in the macro). */ 1667 double 1668 speed_mpn_hgcd (struct speed_params *s) 1669 { 1670 SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd, mpn_hgcd_itch); 1671 } 1672 1673 double 1674 speed_mpn_hgcd_lehmer (struct speed_params *s) 1675 { 1676 SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_lehmer, mpn_hgcd_lehmer_itch); 1677 } 1678 1679 double 1680 speed_mpn_hgcd_appr (struct speed_params *s) 1681 { 1682 SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr, mpn_hgcd_appr_itch); 1683 } 1684 1685 double 1686 speed_mpn_hgcd_appr_lehmer (struct speed_params *s) 1687 { 1688 SPEED_ROUTINE_MPN_HGCD_CALL (mpn_hgcd_appr_lehmer, mpn_hgcd_appr_lehmer_itch); 1689 } 1690 /* hgcd_reduce timers, same routine/itch pairing through the _REDUCE_CALL macro. */ 1691 double 1692 speed_mpn_hgcd_reduce (struct speed_params *s) 1693 { 1694 SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce, mpn_hgcd_reduce_itch); 1695 } 1696 double 1697 speed_mpn_hgcd_reduce_1 (struct speed_params *s) 1698 { 1699 SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_1, mpn_hgcd_reduce_1_itch); 1700 } 1701 double 1702 speed_mpn_hgcd_reduce_2 (struct speed_params *s) 1703 { 1704 SPEED_ROUTINE_MPN_HGCD_REDUCE_CALL (mpn_hgcd_reduce_2, mpn_hgcd_reduce_2_itch); 1705 } 1706 1707 double 1708 speed_mpn_gcd (struct speed_params *s) 1709 { 1710 SPEED_ROUTINE_MPN_GCD (mpn_gcd); 1711 } 1712 1713 double 1714 speed_mpn_gcdext (struct speed_params *s) 1715 { 1716 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext); 1717 } /* speed_mpn_gcdext_lehmer is disabled by the #if 0 below. */ 1718 #if 0 1719 double 1720 speed_mpn_gcdext_lehmer (struct speed_params *s) 1721 { 1722 SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer); 1723 } 1724 #endif 1725 double 1726 speed_mpn_gcdext_single (struct speed_params *s) 1727 { 1728 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single); 1729 } 1730 double 1731 speed_mpn_gcdext_double (struct speed_params *s) 1732 { 1733 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double); 1734 } 1735 double 1736 speed_mpn_gcdext_one_single (struct speed_params *s) 1737 { 1738 SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single); 1739 } 1740 double 1741 speed_mpn_gcdext_one_double 
(struct speed_params *s) 1742 { 1743 SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double); 1744 } 1745 double 1746 speed_mpn_gcd_1 (struct speed_params *s) 1747 { 1748 SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1); 1749 } 1750 double 1751 speed_mpn_gcd_11 (struct speed_params *s) 1752 { 1753 SPEED_ROUTINE_MPN_GCD_11 (mpn_gcd_11); 1754 } 1755 /* Times the same routine as speed_mpn_gcd_1 (mpn_gcd_1) but through the GCD_1N macro, which presumably sets up different operands -- confirm in the macro definition. */ double 1756 speed_mpn_gcd_1N (struct speed_params *s) 1757 { 1758 SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1); 1759 } 1760 double 1761 speed_mpn_gcd_22 (struct speed_params *s) 1762 { 1763 SPEED_ROUTINE_MPN_GCD_22 (mpn_gcd_22); 1764 } 1765 1766 double 1767 speed_mpz_nextprime (struct speed_params *s) 1768 { 1769 SPEED_ROUTINE_MPZ_NEXTPRIME (mpz_nextprime); 1770 } 1771 1772 double 1773 speed_mpz_jacobi (struct speed_params *s) 1774 { 1775 SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi); 1776 } /* Jacobi base timers: one macro applied to mpn_jacobi_base and to the numbered variants _1 .. _4. */ 1777 double 1778 speed_mpn_jacobi_base (struct speed_params *s) 1779 { 1780 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base); 1781 } 1782 double 1783 speed_mpn_jacobi_base_1 (struct speed_params *s) 1784 { 1785 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1); 1786 } 1787 double 1788 speed_mpn_jacobi_base_2 (struct speed_params *s) 1789 { 1790 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2); 1791 } 1792 double 1793 speed_mpn_jacobi_base_3 (struct speed_params *s) 1794 { 1795 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3); 1796 } 1797 double 1798 speed_mpn_jacobi_base_4 (struct speed_params *s) 1799 { 1800 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_4); 1801 } 1802 1803 /* Square root and k-th root timers. The "rem" forms pass wp2 as the second argument, the plain forms pass NULL in that position. The root forms pass s->r as the final argument. wp and wp2 are locals supplied by the macro. */ 1804 double 1805 speed_mpn_sqrtrem (struct speed_params *s) 1806 { 1807 SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, wp2, s->xp, s->size)); 1808 } 1809 1810 double 1811 speed_mpn_sqrt (struct speed_params *s) 1812 { 1813 SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_sqrtrem (wp, NULL, s->xp, s->size)); 1814 } 1815 1816 double 1817 speed_mpn_rootrem (struct speed_params *s) 1818 { 1819 SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, wp2, s->xp, s->size, s->r)); 1820 } 1821 1822 double 1823 speed_mpn_root (struct 
speed_params *s) 1824 { 1825 SPEED_ROUTINE_MPN_SQRTROOT_CALL (mpn_rootrem (wp, NULL, s->xp, s->size, s->r)); 1826 } 1827 1828 /* Perfect power / perfect square predicates. */ 1829 double 1830 speed_mpn_perfect_power_p (struct speed_params *s) 1831 { 1832 SPEED_ROUTINE_MPN_PERFECT_POWER (mpn_perfect_power_p); 1833 } 1834 1835 double 1836 speed_mpn_perfect_square_p (struct speed_params *s) 1837 { 1838 SPEED_ROUTINE_MPN_PERFECT_SQUARE (mpn_perfect_square_p); 1839 } 1840 1841 /* Factorial-like functions of an unsigned long. mpz_2fac_ui and mpz_primorial_ui share the generic SPEED_ROUTINE_MPZ_UI macro; mpz_fac_ui has its own. */ 1842 double 1843 speed_mpz_fac_ui (struct speed_params *s) 1844 { 1845 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui); 1846 } 1847 1848 double 1849 speed_mpz_2fac_ui (struct speed_params *s) 1850 { 1851 SPEED_ROUTINE_MPZ_UI (mpz_2fac_ui); 1852 } 1853 1854 double 1855 speed_mpz_primorial_ui (struct speed_params *s) 1856 { 1857 SPEED_ROUTINE_MPZ_UI (mpz_primorial_ui); 1858 } 1859 1860 /* Fibonacci and Lucas number timers, each through a macro of the matching name. */ 1861 double 1862 speed_mpn_fib2_ui (struct speed_params *s) 1863 { 1864 SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui); 1865 } 1866 double 1867 speed_mpz_fib_ui (struct speed_params *s) 1868 { 1869 SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui); 1870 } 1871 double 1872 speed_mpz_fib2_ui (struct speed_params *s) 1873 { 1874 SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui); 1875 } 1876 double 1877 speed_mpz_lucnum_ui (struct speed_params *s) 1878 { 1879 SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui); 1880 } 1881 double 1882 speed_mpz_lucnum2_ui (struct speed_params *s) 1883 { 1884 SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui); 1885 } 1886 1887 /* Modular exponentiation: four routines share SPEED_ROUTINE_MPZ_POWM; mpz_powm_ui uses its own macro. */ 1888 double 1889 speed_mpz_powm (struct speed_params *s) 1890 { 1891 SPEED_ROUTINE_MPZ_POWM (mpz_powm); 1892 } 1893 double 1894 speed_mpz_powm_mod (struct speed_params *s) 1895 { 1896 SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod); 1897 } 1898 double 1899 speed_mpz_powm_redc (struct speed_params *s) 1900 { 1901 SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc); 1902 } 1903 double 1904 speed_mpz_powm_sec (struct speed_params *s) 1905 { 1906 SPEED_ROUTINE_MPZ_POWM (mpz_powm_sec); 1907 } 1908 double 1909 speed_mpz_powm_ui (struct speed_params *s) 1910 { 1911 SPEED_ROUTINE_MPZ_POWM_UI 
(mpz_powm_ui); 1912 } 1913 1914 1915 double 1916 speed_binvert_limb (struct speed_params *s) 1917 { 1918 SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb); 1919 } 1920 1921 /* Time s->reps calls of noop () and return the elapsed time from speed_endtime (). */ 1922 double 1923 speed_noop (struct speed_params *s) 1924 { 1925 unsigned i; 1926 1927 speed_starttime (); 1928 i = s->reps; 1929 do 1930 noop (); 1931 while (--i != 0); 1932 return speed_endtime (); 1933 } 1934 /* Time s->reps calls of noop_wxs (wp, s->xp, s->size), where wp is a one limb temporary. */ 1935 double 1936 speed_noop_wxs (struct speed_params *s) 1937 { 1938 mp_ptr wp; 1939 unsigned i; 1940 double t; 1941 TMP_DECL; 1942 1943 TMP_MARK; 1944 wp = TMP_ALLOC_LIMBS (1); 1945 1946 speed_starttime (); 1947 i = s->reps; 1948 do 1949 noop_wxs (wp, s->xp, s->size); 1950 while (--i != 0); 1951 t = speed_endtime (); 1952 1953 TMP_FREE; 1954 return t; 1955 } 1956 /* As speed_noop_wxs, but calling noop_wxys, which additionally takes s->yp. */ 1957 double 1958 speed_noop_wxys (struct speed_params *s) 1959 { 1960 mp_ptr wp; 1961 unsigned i; 1962 double t; 1963 TMP_DECL; 1964 1965 TMP_MARK; 1966 wp = TMP_ALLOC_LIMBS (1); 1967 1968 speed_starttime (); 1969 i = s->reps; 1970 do 1971 noop_wxys (wp, s->xp, s->yp, s->size); 1972 while (--i != 0); 1973 t = speed_endtime (); 1974 1975 TMP_FREE; 1976 return t; 1977 } 1978 1979 /* Function body for allocate/free style timers. "variables" is a declaration placed ahead of the timed loop; "calls" is the statement sequence executed s->reps times between speed_starttime () and speed_endtime (). Nothing is done with the results of the calls. */ 1980 #define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \ 1981 { \ 1982 unsigned i; \ 1983 variables; \ 1984 \ 1985 speed_starttime (); \ 1986 i = s->reps; \ 1987 do \ 1988 { \ 1989 calls; \ 1990 } \ 1991 while (--i != 0); \ 1992 return speed_endtime (); \ 1993 } 1994 1995 1996 /* Compare these to see how much malloc/free costs and then how much 1997 __gmp_default_allocate/free and mpz_init/clear add. mpz_init/clear or 1998 mpq_init/clear will be doing a 1 limb allocate, so use that as the size 1999 when including them in comparisons. 
*/ 2000 /* malloc then free of s->size limbs' worth of bytes. The malloc result is not checked. */ 2001 double 2002 speed_malloc_free (struct speed_params *s) 2003 { 2004 size_t bytes = s->size * GMP_LIMB_BYTES; 2005 SPEED_ROUTINE_ALLOC_FREE (void *p, 2006 p = malloc (bytes); 2007 free (p)); 2008 } 2009 /* malloc of one limb, realloc to s->size limbs, then free. Results are not checked. */ 2010 double 2011 speed_malloc_realloc_free (struct speed_params *s) 2012 { 2013 size_t bytes = s->size * GMP_LIMB_BYTES; 2014 SPEED_ROUTINE_ALLOC_FREE (void *p, 2015 p = malloc (GMP_LIMB_BYTES); 2016 p = realloc (p, bytes); 2017 free (p)); 2018 } 2019 /* The same allocate/free sequence through the __gmp_allocate_func and __gmp_free_func pointers. */ 2020 double 2021 speed_gmp_allocate_free (struct speed_params *s) 2022 { 2023 size_t bytes = s->size * GMP_LIMB_BYTES; 2024 SPEED_ROUTINE_ALLOC_FREE (void *p, 2025 p = (*__gmp_allocate_func) (bytes); 2026 (*__gmp_free_func) (p, bytes)); 2027 } 2028 /* Allocate one limb, reallocate to s->size limbs, free; all through the __gmp_*_func pointers. NOTE(review): the reallocate call passes (p, bytes, GMP_LIMB_BYTES) although the block being resized is GMP_LIMB_BYTES long and the target is bytes -- confirm the argument order against the reallocate function's contract. */ 2029 double 2030 speed_gmp_allocate_reallocate_free (struct speed_params *s) 2031 { 2032 size_t bytes = s->size * GMP_LIMB_BYTES; 2033 SPEED_ROUTINE_ALLOC_FREE 2034 (void *p, 2035 p = (*__gmp_allocate_func) (GMP_LIMB_BYTES); 2036 p = (*__gmp_reallocate_func) (p, bytes, GMP_LIMB_BYTES); 2037 (*__gmp_free_func) (p, bytes)); 2038 } 2039 /* init/clear pairs for mpz, mpq and mpf. Only the _realloc_ variant depends on s->size. */ 2040 double 2041 speed_mpz_init_clear (struct speed_params *s) 2042 { 2043 SPEED_ROUTINE_ALLOC_FREE (mpz_t z, 2044 mpz_init (z); 2045 mpz_clear (z)); 2046 } 2047 2048 double 2049 speed_mpz_init_realloc_clear (struct speed_params *s) 2050 { 2051 SPEED_ROUTINE_ALLOC_FREE (mpz_t z, 2052 mpz_init (z); 2053 _mpz_realloc (z, s->size); 2054 mpz_clear (z)); 2055 } 2056 2057 double 2058 speed_mpq_init_clear (struct speed_params *s) 2059 { 2060 SPEED_ROUTINE_ALLOC_FREE (mpq_t q, 2061 mpq_init (q); 2062 mpq_clear (q)); 2063 } 2064 2065 double 2066 speed_mpf_init_clear (struct speed_params *s) 2067 { 2068 SPEED_ROUTINE_ALLOC_FREE (mpf_t f, 2069 mpf_init (f); 2070 mpf_clear (f)); 2071 } 2072 2073 2074 /* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note 2075 that repeatedly calling mpz_add with the same data gives branch prediction 2076 in it an advantage. 
*/ 2077 2078 double 2079 speed_mpz_add (struct speed_params *s) 2080 { 2081 mpz_t w, x, y; 2082 unsigned i; 2083 double t; 2084 2085 mpz_init (w); 2086 mpz_init (x); 2087 mpz_init (y); 2088 2089 mpz_set_n (x, s->xp, s->size); 2090 mpz_set_n (y, s->yp, s->size); /* One untimed addition first, presumably so w already has its final allocation before timing starts -- confirm. */ 2091 mpz_add (w, x, y); 2092 2093 speed_starttime (); 2094 i = s->reps; 2095 do 2096 { 2097 mpz_add (w, x, y); 2098 } 2099 while (--i != 0); 2100 t = speed_endtime (); 2101 2102 mpz_clear (w); 2103 mpz_clear (x); 2104 mpz_clear (y); 2105 return t; 2106 } 2107 2108 2109 /* Time mpz_invert of a k limb operand modulo an s->size limb modulus. 2110 k is s->size/2 when s->r is 0, s->r itself when s->r is below GMP_LIMB_HIGHBIT, and otherwise s->size - (-s->r), i.e. s->r is treated as a negative offset from s->size. Values of k outside 1..s->size are refused via SPEED_RESTRICT_COND. */ 2111 double 2112 speed_mpz_invert (struct speed_params *s) 2113 { 2114 mpz_t a, m, r; 2115 mp_size_t k; 2116 unsigned i; 2117 double t; 2118 2119 if (s->r == 0) 2120 k = s->size/2; 2121 else if (s->r < GMP_LIMB_HIGHBIT) 2122 k = s->r; 2123 else /* s->r < 0 */ 2124 k = s->size - (-s->r); 2125 2126 SPEED_RESTRICT_COND (k > 0 && k <= s->size); 2127 2128 mpz_init_set_n (m, s->yp, s->size); 2129 mpz_setbit (m, 0); /* force m to odd */ 2130 2131 mpz_init_set_n (a, s->xp, k); 2132 2133 mpz_init (r); /* Untimed: step a upwards until mpz_invert reports success (non-zero return), so the timed loop measures a successful inversion. */ 2134 while (mpz_invert (r, a, m) == 0) 2135 mpz_add_ui (a, a, 1); 2136 2137 speed_starttime (); 2138 i = s->reps; 2139 do 2140 mpz_invert (r, a, m); 2141 while (--i != 0); 2142 t = speed_endtime (); 2143 2144 mpz_clear (r); 2145 mpz_clear (a); 2146 mpz_clear (m); 2147 return t; 2148 } 2149 2150 /* If r==0, calculate binomial(size,size/2), 2151 otherwise calculate binomial(size,r). 
*/ 2152 2153 double 2154 speed_mpz_bin_uiui (struct speed_params *s) 2155 { 2156 mpz_t w; 2157 unsigned long k; 2158 unsigned i; 2159 double t; 2160 2161 mpz_init (w); 2162 if (s->r != 0) 2163 k = s->r; 2164 else 2165 k = s->size/2; 2166 2167 speed_starttime (); 2168 i = s->reps; 2169 do 2170 { 2171 mpz_bin_uiui (w, s->size, k); 2172 } 2173 while (--i != 0); 2174 t = speed_endtime (); 2175 2176 mpz_clear (w); 2177 return t; 2178 } 2179 2180 /* If r==0, calculate binomial(2^size,size), 2181 otherwise calculate binomial(2^size,r). */ 2182 2183 double 2184 speed_mpz_bin_ui (struct speed_params *s) 2185 { 2186 mpz_t w, x; 2187 unsigned long k; 2188 unsigned i; 2189 double t; 2190 2191 mpz_init (w); 2192 mpz_init_set_ui (x, 0); 2193 /* x = 2^size: start from 0 and set bit number s->size. */ 2194 mpz_setbit (x, s->size); 2195 2196 if (s->r != 0) 2197 k = s->r; 2198 else 2199 k = s->size; 2200 2201 speed_starttime (); 2202 i = s->reps; 2203 do 2204 { 2205 mpz_bin_ui (w, x, k); 2206 } 2207 while (--i != 0); 2208 t = speed_endtime (); 2209 2210 mpz_clear (w); 2211 mpz_clear (x); 2212 return t; 2213 } 2214 2215 /* If r==0, calculate mfac(size,log(size)), 2216 otherwise calculate mfac(size,r). */ 2217 2218 double 2219 speed_mpz_mfac_uiui (struct speed_params *s) 2220 { 2221 mpz_t w; 2222 unsigned long k; 2223 unsigned i; 2224 double t; 2225 2226 mpz_init (w); 2227 if (s->r != 0) 2228 k = s->r; 2229 else /* The empty-bodied loop finds the smallest k >= 1 for which s->size >> k is zero. */ 2230 for (k = 1; s->size >> k; ++k); 2231 2232 speed_starttime (); 2233 i = s->reps; 2234 do 2235 { 2236 mpz_mfac_uiui (w, s->size, k); 2237 } 2238 while (--i != 0); 2239 t = speed_endtime (); 2240 2241 mpz_clear (w); 2242 return t; 2243 } 2244 2245 /* The multiplies are successively dependent so the latency is measured, not 2246 the issue rate. There's only 10 per loop so the code doesn't get too big 2247 since umul_ppmm is several instructions on some cpus. 2248 2249 Putting the arguments as "h,l,l,h" gets slightly better code from gcc 2250 2.95.2 on x86, it puts only one mov between each mul, not two. 
That mov 2251 will probably show up as a bogus extra cycle though. 2252 2253 The measuring function macros are split into three parts to avoid overflowing 2254 preprocessor expansion space if umul_ppmm is big. The caller writes the s->r == 1 loop body between parts A and B, and the default loop body between parts B and C. 2255 2256 Limitations: 2257 2258 The default umul_ppmm doing h*l will be getting increasing numbers of 2259 high zero bits in the calculation. CPUs with data-dependent multipliers 2260 will want to use umul_ppmm.1 to get some randomization into the 2261 calculation. The extra xors and fetches will be a slowdown of course. */ 2262 2263 #define SPEED_MACRO_UMUL_PPMM_A \ 2264 { \ 2265 mp_limb_t h, l; \ 2266 unsigned i; \ 2267 double t; \ 2268 \ 2269 s->time_divisor = 10; \ 2270 \ 2271 h = s->xp[0]; \ 2272 l = s->yp[0]; \ 2273 \ 2274 if (s->r == 1) \ 2275 { \ 2276 speed_starttime (); \ 2277 i = s->reps; \ 2278 do \ 2279 { 2280 2281 #define SPEED_MACRO_UMUL_PPMM_B \ 2282 } \ 2283 while (--i != 0); \ 2284 t = speed_endtime (); \ 2285 } \ 2286 else \ 2287 { \ 2288 speed_starttime (); \ 2289 i = s->reps; \ 2290 do \ 2291 { 2292 2293 #define SPEED_MACRO_UMUL_PPMM_C \ 2294 } \ 2295 while (--i != 0); \ 2296 t = speed_endtime (); \ 2297 } \ 2298 \ 2299 /* stop the compiler optimizing away the whole calculation! 
*/ \ 2300 noop_1 (h); \ 2301 noop_1 (l); \ 2302 \ 2303 return t; \ 2304 } 2305 2306 /* umul_ppmm latency. The first braced group is the loop body used when s->r == 1: each of the ten multiplies is followed by xoring entries of s->xp_block and s->yp_block into h and l. The second braced group is the default loop body with ten bare multiplies. */ 2307 double 2308 speed_umul_ppmm (struct speed_params *s) 2309 { 2310 SPEED_MACRO_UMUL_PPMM_A; 2311 { 2312 umul_ppmm (h, l, l, h); h ^= s->xp_block[0]; l ^= s->yp_block[0]; 2313 umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1]; 2314 umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2]; 2315 umul_ppmm (h, l, l, h); h ^= s->xp_block[3]; l ^= s->yp_block[3]; 2316 umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4]; 2317 umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5]; 2318 umul_ppmm (h, l, l, h); h ^= s->xp_block[6]; l ^= s->yp_block[6]; 2319 umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7]; 2320 umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8]; 2321 umul_ppmm (h, l, l, h); h ^= s->xp_block[9]; l ^= s->yp_block[9]; 2322 } 2323 SPEED_MACRO_UMUL_PPMM_B; 2324 { 2325 umul_ppmm (h, l, l, h); 2326 umul_ppmm (h, l, l, h); 2327 umul_ppmm (h, l, l, h); 2328 umul_ppmm (h, l, l, h); 2329 umul_ppmm (h, l, l, h); 2330 umul_ppmm (h, l, l, h); 2331 umul_ppmm (h, l, l, h); 2332 umul_ppmm (h, l, l, h); 2333 umul_ppmm (h, l, l, h); 2334 umul_ppmm (h, l, l, h); 2335 } 2336 SPEED_MACRO_UMUL_PPMM_C; 2337 } 2338 2339 /* The same measurement for the function form mpn_umul_ppmm, which returns h and stores l through its first argument. Built only when HAVE_NATIVE_mpn_umul_ppmm is non-zero. */ 2340 #if HAVE_NATIVE_mpn_umul_ppmm 2341 double 2342 speed_mpn_umul_ppmm (struct speed_params *s) 2343 { 2344 SPEED_MACRO_UMUL_PPMM_A; 2345 { 2346 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[0]; l ^= s->yp_block[0]; 2347 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1]; 2348 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2]; 2349 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[3]; l ^= s->yp_block[3]; 2350 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4]; 2351 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5]; 2352 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[6]; l ^= s->yp_block[6]; 2353 h = mpn_umul_ppmm (&l, h, l); 
h ^= s->xp_block[7]; l ^= s->yp_block[7]; 2354 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8]; 2355 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[9]; l ^= s->yp_block[9]; 2356 } 2357 SPEED_MACRO_UMUL_PPMM_B; 2358 { 2359 h = mpn_umul_ppmm (&l, h, l); 2360 h = mpn_umul_ppmm (&l, h, l); 2361 h = mpn_umul_ppmm (&l, h, l); 2362 h = mpn_umul_ppmm (&l, h, l); 2363 h = mpn_umul_ppmm (&l, h, l); 2364 h = mpn_umul_ppmm (&l, h, l); 2365 h = mpn_umul_ppmm (&l, h, l); 2366 h = mpn_umul_ppmm (&l, h, l); 2367 h = mpn_umul_ppmm (&l, h, l); 2368 h = mpn_umul_ppmm (&l, h, l); 2369 } 2370 SPEED_MACRO_UMUL_PPMM_C; 2371 } 2372 #endif 2373 /* The same measurement for mpn_umul_ppmm_r, which takes the pointer argument last instead of first. Built only when HAVE_NATIVE_mpn_umul_ppmm_r is non-zero. As in the other umul timers, the first braced group is the s->r == 1 body with xors and the second is the default body. */ 2374 #if HAVE_NATIVE_mpn_umul_ppmm_r 2375 double 2376 speed_mpn_umul_ppmm_r (struct speed_params *s) 2377 { 2378 SPEED_MACRO_UMUL_PPMM_A; 2379 { 2380 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[0]; l ^= s->yp_block[0]; 2381 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1]; 2382 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2]; 2383 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[3]; l ^= s->yp_block[3]; 2384 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4]; 2385 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5]; 2386 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[6]; l ^= s->yp_block[6]; 2387 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7]; 2388 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8]; 2389 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[9]; l ^= s->yp_block[9]; 2390 } 2391 SPEED_MACRO_UMUL_PPMM_B; 2392 { 2393 h = mpn_umul_ppmm_r (h, l, &l); 2394 h = mpn_umul_ppmm_r (h, l, &l); 2395 h = mpn_umul_ppmm_r (h, l, &l); 2396 h = mpn_umul_ppmm_r (h, l, &l); 2397 h = mpn_umul_ppmm_r (h, l, &l); 2398 h = mpn_umul_ppmm_r (h, l, &l); 2399 h = mpn_umul_ppmm_r (h, l, &l); 2400 h = mpn_umul_ppmm_r (h, l, &l); 2401 h = mpn_umul_ppmm_r (h, l, &l); 2402 h = mpn_umul_ppmm_r (h, l, &l); 
2403 } 2404 SPEED_MACRO_UMUL_PPMM_C; 2405 } 2406 #endif 2407 2408 2409 /* The divisions are successively dependent so latency is measured, not 2410 issue rate. There's only 10 per loop so the code doesn't get too big, 2411 especially for udiv_qrnnd_preinv and preinv2norm, which are several 2412 instructions each. 2413 2414 Note that it's only the division which is measured here, there's no data 2415 fetching and no shifting if the divisor gets normalized. 2416 2417 In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d" 2418 generate x86 div instructions with nothing in between. 2419 2420 The measuring function macros are in two parts to avoid overflowing 2421 preprocessor expansion space if udiv_qrnnd etc are big. 2422 2423 Limitations: 2424 2425 Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code 2426 generated first. 2427 2428 CPUs with data-dependent divisions may want more attention paid to the 2429 randomness of the data used. Probably the measurement wanted is over 2430 uniformly distributed numbers, but what's here might not be giving that. */ 2431 /* Part A declares q, r, d and dinv, sets s->time_divisor to 10 and opens the timed loop; the caller supplies the loop body and then part B. The divisor d is s->r, or mp_bases[10].big_base when s->r is 0. When "normalize" is non-zero, d is shifted left by its leading zero count and dinv is set by invert_limb; otherwise dinv is left unset. r starts as s->yp[0] % d, so it is below d. */ 2432 #define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \ 2433 { \ 2434 double t; \ 2435 unsigned i; \ 2436 mp_limb_t q, r, d; \ 2437 mp_limb_t dinv; \ 2438 \ 2439 s->time_divisor = 10; \ 2440 \ 2441 /* divisor from "r" parameter, or a default */ \ 2442 d = s->r; \ 2443 if (d == 0) \ 2444 d = mp_bases[10].big_base; \ 2445 \ 2446 if (normalize) \ 2447 { \ 2448 unsigned norm; \ 2449 count_leading_zeros (norm, d); \ 2450 d <<= norm; \ 2451 invert_limb (dinv, d); \ 2452 } \ 2453 \ 2454 q = s->xp[0]; \ 2455 r = s->yp[0] % d; \ 2456 \ 2457 speed_starttime (); \ 2458 i = s->reps; \ 2459 do \ 2460 { 2461 2462 #define SPEED_ROUTINE_UDIV_QRNND_B \ 2463 } \ 2464 while (--i != 0); \ 2465 t = speed_endtime (); \ 2466 \ 2467 /* stop the compiler optimizing away the whole calculation! 
*/ \ 2468 noop_1 (q); \ 2469 noop_1 (r); \ 2470 \ 2471 return t; \ 2472 } 2473 /* udiv_qrnnd latency: ten dependent divisions per loop iteration. The divisor is normalized only when UDIV_NEEDS_NORMALIZATION is non-zero; the three variants that follow always normalize (argument 1). */ 2474 double 2475 speed_udiv_qrnnd (struct speed_params *s) 2476 { 2477 SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION); 2478 { 2479 udiv_qrnnd (q, r, r, q, d); 2480 udiv_qrnnd (q, r, r, q, d); 2481 udiv_qrnnd (q, r, r, q, d); 2482 udiv_qrnnd (q, r, r, q, d); 2483 udiv_qrnnd (q, r, r, q, d); 2484 udiv_qrnnd (q, r, r, q, d); 2485 udiv_qrnnd (q, r, r, q, d); 2486 udiv_qrnnd (q, r, r, q, d); 2487 udiv_qrnnd (q, r, r, q, d); 2488 udiv_qrnnd (q, r, r, q, d); 2489 } 2490 SPEED_ROUTINE_UDIV_QRNND_B; 2491 } 2492 /* The same measurement for __udiv_qrnnd_c. */ 2493 double 2494 speed_udiv_qrnnd_c (struct speed_params *s) 2495 { 2496 SPEED_ROUTINE_UDIV_QRNND_A (1); 2497 { 2498 __udiv_qrnnd_c (q, r, r, q, d); 2499 __udiv_qrnnd_c (q, r, r, q, d); 2500 __udiv_qrnnd_c (q, r, r, q, d); 2501 __udiv_qrnnd_c (q, r, r, q, d); 2502 __udiv_qrnnd_c (q, r, r, q, d); 2503 __udiv_qrnnd_c (q, r, r, q, d); 2504 __udiv_qrnnd_c (q, r, r, q, d); 2505 __udiv_qrnnd_c (q, r, r, q, d); 2506 __udiv_qrnnd_c (q, r, r, q, d); 2507 __udiv_qrnnd_c (q, r, r, q, d); 2508 } 2509 SPEED_ROUTINE_UDIV_QRNND_B; 2510 } 2511 /* Function form mpn_udiv_qrnnd: returns q and stores r through its first argument. Built only when HAVE_NATIVE_mpn_udiv_qrnnd is non-zero. */ 2512 #if HAVE_NATIVE_mpn_udiv_qrnnd 2513 double 2514 speed_mpn_udiv_qrnnd (struct speed_params *s) 2515 { 2516 SPEED_ROUTINE_UDIV_QRNND_A (1); 2517 { 2518 q = mpn_udiv_qrnnd (&r, r, q, d); 2519 q = mpn_udiv_qrnnd (&r, r, q, d); 2520 q = mpn_udiv_qrnnd (&r, r, q, d); 2521 q = mpn_udiv_qrnnd (&r, r, q, d); 2522 q = mpn_udiv_qrnnd (&r, r, q, d); 2523 q = mpn_udiv_qrnnd (&r, r, q, d); 2524 q = mpn_udiv_qrnnd (&r, r, q, d); 2525 q = mpn_udiv_qrnnd (&r, r, q, d); 2526 q = mpn_udiv_qrnnd (&r, r, q, d); 2527 q = mpn_udiv_qrnnd (&r, r, q, d); 2528 } 2529 SPEED_ROUTINE_UDIV_QRNND_B; 2530 } 2531 #endif 2532 /* Function form mpn_udiv_qrnnd_r, with the remainder pointer as the last argument. Built only when HAVE_NATIVE_mpn_udiv_qrnnd_r is non-zero. */ 2533 #if HAVE_NATIVE_mpn_udiv_qrnnd_r 2534 double 2535 speed_mpn_udiv_qrnnd_r (struct speed_params *s) 2536 { 2537 SPEED_ROUTINE_UDIV_QRNND_A (1); 2538 { 2539 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2540 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2541 q = mpn_udiv_qrnnd_r (r, 
q, d, &r); 2542 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2543 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2544 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2545 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2546 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2547 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2548 q = mpn_udiv_qrnnd_r (r, q, d, &r); 2549 } 2550 SPEED_ROUTINE_UDIV_QRNND_B; 2551 } 2552 #endif 2553 2554 /* invert_limb timing; dinv and d are locals supplied by the macro (not visible here). */ 2555 double 2556 speed_invert_limb (struct speed_params *s) 2557 { 2558 SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d)); 2559 } 2560 2561 2562 /* xp[0] might not be particularly random, but should give an indication how 2563 "/" runs. Same for speed_operator_mod below. */ 2564 double 2565 speed_operator_div (struct speed_params *s) 2566 { 2567 double t; 2568 unsigned i; 2569 mp_limb_t x, q, d; 2570 /* Ten divisions are done per loop iteration. */ 2571 s->time_divisor = 10; 2572 2573 /* divisor from "r" parameter, or a default */ 2574 d = s->r; 2575 if (d == 0) 2576 d = mp_bases[10].big_base; 2577 2578 x = s->xp[0]; 2579 q = 0; 2580 2581 speed_starttime (); 2582 i = s->reps; 2583 do 2584 { /* Each step xors x into q before dividing, so every division depends on the previous result; presumably this also keeps q from settling at zero -- confirm intent. */ 2585 q ^= x; q /= d; 2586 q ^= x; q /= d; 2587 q ^= x; q /= d; 2588 q ^= x; q /= d; 2589 q ^= x; q /= d; 2590 q ^= x; q /= d; 2591 q ^= x; q /= d; 2592 q ^= x; q /= d; 2593 q ^= x; q /= d; 2594 q ^= x; q /= d; 2595 } 2596 while (--i != 0); 2597 t = speed_endtime (); 2598 2599 /* stop the compiler optimizing away the whole calculation! 
*/ 2600 noop_1 (q); 2601 2602 return t; 2603 } 2604 2605 double 2606 speed_operator_mod (struct speed_params *s) 2607 { 2608 double t; 2609 unsigned i; 2610 mp_limb_t x, r, d; 2611 2612 s->time_divisor = 10; 2613 2614 /* divisor from "r" parameter, or a default */ 2615 d = s->r; 2616 if (d == 0) 2617 d = mp_bases[10].big_base; 2618 2619 x = s->xp[0]; 2620 r = 0; 2621 2622 speed_starttime (); 2623 i = s->reps; 2624 do 2625 { 2626 r ^= x; r %= d; 2627 r ^= x; r %= d; 2628 r ^= x; r %= d; 2629 r ^= x; r %= d; 2630 r ^= x; r %= d; 2631 r ^= x; r %= d; 2632 r ^= x; r %= d; 2633 r ^= x; r %= d; 2634 r ^= x; r %= d; 2635 r ^= x; r %= d; 2636 } 2637 while (--i != 0); 2638 t = speed_endtime (); 2639 2640 /* stop the compiler optimizing away the whole calculation! */ 2641 noop_1 (r); 2642 2643 return t; 2644 } 2645 2646 2647 /* r==0 measures on data with the values uniformly distributed. This will 2648 be typical for count_trailing_zeros in a GCD etc. 2649 2650 r==1 measures on data with the resultant count uniformly distributed 2651 between 0 and GMP_LIMB_BITS-1. This is probably sensible for 2652 count_leading_zeros on the high limbs of divisors. */ 2653 2654 int 2655 speed_routine_count_zeros_setup (struct speed_params *s, 2656 mp_ptr xp, int leading, int zero) 2657 { 2658 int i, c; 2659 mp_limb_t n; 2660 2661 if (s->r == 0) 2662 { 2663 /* Make uniformly distributed data. If zero isn't allowed then change 2664 it to 1 for leading, or 0x800..00 for trailing. */ 2665 MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE); 2666 if (! zero) 2667 for (i = 0; i < SPEED_BLOCK_SIZE; i++) 2668 if (xp[i] == 0) 2669 xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT; 2670 } 2671 else if (s->r == 1) 2672 { 2673 /* Make counts uniformly distributed. A randomly chosen bit is set, and 2674 for leading the rest above it are cleared, or for trailing then the 2675 rest below. 
*/ 2676 for (i = 0; i < SPEED_BLOCK_SIZE; i++) 2677 { 2678 mp_limb_t set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS); 2679 mp_limb_t keep_below = set-1; 2680 mp_limb_t keep_above = MP_LIMB_T_MAX ^ keep_below; 2681 mp_limb_t keep = (leading ? keep_below : keep_above); 2682 xp[i] = (s->xp_block[i] & keep) | set; 2683 } 2684 } 2685 else 2686 { 2687 return 0; 2688 } 2689 2690 /* Account for the effect of n^=c. */ 2691 c = 0; 2692 for (i = 0; i < SPEED_BLOCK_SIZE; i++) 2693 { 2694 n = xp[i]; 2695 xp[i] ^= c; 2696 2697 if (leading) 2698 count_leading_zeros (c, n); 2699 else 2700 count_trailing_zeros (c, n); 2701 } 2702 2703 return 1; 2704 } 2705 2706 double 2707 speed_count_leading_zeros (struct speed_params *s) 2708 { 2709 #ifdef COUNT_LEADING_ZEROS_0 2710 #define COUNT_LEADING_ZEROS_0_ALLOWED 1 2711 #else 2712 #define COUNT_LEADING_ZEROS_0_ALLOWED 0 2713 #endif 2714 2715 SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED); 2716 count_leading_zeros (c, n); 2717 SPEED_ROUTINE_COUNT_ZEROS_B (); 2718 } 2719 double 2720 speed_count_trailing_zeros (struct speed_params *s) 2721 { 2722 SPEED_ROUTINE_COUNT_ZEROS_A (0, 0); 2723 count_trailing_zeros (c, n); 2724 SPEED_ROUTINE_COUNT_ZEROS_B (); 2725 } 2726 2727 2728 double 2729 speed_mpn_get_str (struct speed_params *s) 2730 { 2731 SPEED_ROUTINE_MPN_GET_STR (mpn_get_str); 2732 } 2733 2734 double 2735 speed_mpn_set_str (struct speed_params *s) 2736 { 2737 SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base)); 2738 } 2739 double 2740 speed_mpn_bc_set_str (struct speed_params *s) 2741 { 2742 SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base)); 2743 } 2744 2745 double 2746 speed_MPN_ZERO (struct speed_params *s) 2747 { 2748 SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size)); 2749 } 2750 2751 2752 int 2753 speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate) 2754 { 2755 if (s->r == 0) 2756 gmp_randinit_default (rstate); 2757 else if (s->r == 1) 2758 gmp_randinit_mt 
(rstate); 2759 else 2760 { 2761 return gmp_randinit_lc_2exp_size (rstate, s->r); 2762 } 2763 return 1; 2764 } 2765 2766 double 2767 speed_gmp_randseed (struct speed_params *s) 2768 { 2769 gmp_randstate_t rstate; 2770 unsigned i; 2771 double t; 2772 mpz_t x; 2773 2774 SPEED_RESTRICT_COND (s->size >= 1); 2775 SPEED_RESTRICT_COND (speed_randinit (s, rstate)); 2776 2777 /* s->size bits of seed */ 2778 mpz_init_set_n (x, s->xp, s->size); 2779 mpz_fdiv_r_2exp (x, x, (unsigned long) s->size); 2780 2781 /* cache priming */ 2782 gmp_randseed (rstate, x); 2783 2784 speed_starttime (); 2785 i = s->reps; 2786 do 2787 gmp_randseed (rstate, x); 2788 while (--i != 0); 2789 t = speed_endtime (); 2790 2791 gmp_randclear (rstate); 2792 mpz_clear (x); 2793 return t; 2794 } 2795 2796 double 2797 speed_gmp_randseed_ui (struct speed_params *s) 2798 { 2799 gmp_randstate_t rstate; 2800 unsigned i, j; 2801 double t; 2802 2803 SPEED_RESTRICT_COND (speed_randinit (s, rstate)); 2804 2805 /* cache priming */ 2806 gmp_randseed_ui (rstate, 123L); 2807 2808 speed_starttime (); 2809 i = s->reps; 2810 j = 0; 2811 do 2812 { 2813 gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]); 2814 j++; 2815 if (j >= SPEED_BLOCK_SIZE) 2816 j = 0; 2817 } 2818 while (--i != 0); 2819 t = speed_endtime (); 2820 2821 gmp_randclear (rstate); 2822 return t; 2823 } 2824 2825 double 2826 speed_mpz_urandomb (struct speed_params *s) 2827 { 2828 gmp_randstate_t rstate; 2829 mpz_t z; 2830 unsigned i; 2831 double t; 2832 2833 SPEED_RESTRICT_COND (s->size >= 0); 2834 SPEED_RESTRICT_COND (speed_randinit (s, rstate)); 2835 2836 mpz_init (z); 2837 2838 /* cache priming */ 2839 mpz_urandomb (z, rstate, (unsigned long) s->size); 2840 mpz_urandomb (z, rstate, (unsigned long) s->size); 2841 2842 speed_starttime (); 2843 i = s->reps; 2844 do 2845 mpz_urandomb (z, rstate, (unsigned long) s->size); 2846 while (--i != 0); 2847 t = speed_endtime (); 2848 2849 mpz_clear (z); 2850 gmp_randclear (rstate); 2851 return t; 2852 } 2853