xref: /netbsd-src/external/lgpl3/gmp/dist/mpz/bin_uiui.c (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1 /* mpz_bin_uiui - compute n over k.
2 
3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
4 
5 Copyright 2010-2012, 2015-2018 Free Software Foundation, Inc.
6 
7 This file is part of the GNU MP Library.
8 
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of either:
11 
12   * the GNU Lesser General Public License as published by the Free
13     Software Foundation; either version 3 of the License, or (at your
14     option) any later version.
15 
16 or
17 
18   * the GNU General Public License as published by the Free Software
19     Foundation; either version 2 of the License, or (at your option) any
20     later version.
21 
22 or both in parallel, as here.
23 
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27 for more details.
28 
29 You should have received copies of the GNU General Public License and the
30 GNU Lesser General Public License along with the GNU MP Library.  If not,
31 see https://www.gnu.org/licenses/.  */
32 
33 #include "gmp-impl.h"
34 #include "longlong.h"
35 
36 #ifndef BIN_GOETGHELUCK_THRESHOLD
37 #define BIN_GOETGHELUCK_THRESHOLD  512
38 #endif
39 #ifndef BIN_UIUI_ENABLE_SMALLDC
40 #define BIN_UIUI_ENABLE_SMALLDC    1
41 #endif
42 #ifndef BIN_UIUI_RECURSIVE_SMALLDC
43 #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
44 #endif
45 
46 /* Algorithm:
47 
48    Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
49    which are then accumulated into mpn numbers.  The first inner loop
50    accumulates divisor factors, the 2nd inner loop accumulates exactly the same
51    number of dividend factors.  We avoid accumulating more for the divisor,
52    even with its smaller factors, since we else cannot guarantee divisibility.
53 
54    Since we know each division will yield an integer, we compute the quotient
55    using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
56    2^t.
57 
58    Improvements:
59 
60    (1) An obvious improvement to this code would be to compute mod 2^t
61    everywhere.  Unfortunately, we cannot determine t beforehand, unless we
62    invoke some approximation, such as Stirling's formula.  Of course, we don't
63    need t to be tight.  However, it is not clear that this would help much,
64    our numbers are kept reasonably small already.
65 
66    (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
67    Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
68    would make it both reasonably accurate and fast.  (We could use a table
69    stored into a limb, perhaps.)  The table should take the removed factors of
70    2 into account (those done on-the-fly in mulN).
71 
72    (3) The first time in the loop we compute the odd part of a
73    factorial in kp, we might use oddfac_1 for this task.
74  */
75 
76 /* This threshold determines how large divisor to accumulate before we call
77    bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
78    since we are just basecase code anyway?  Presumably, this depends on the
79    relative speed of the asymptotically fast code and this code.  */
80 #define SOME_THRESHOLD 20
81 
82 /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
83    All versions of MAXFACS don't take this 2 removal into account now, meaning
84    that then, shifting just adds some overhead.  (We remove factors from the
85    completed limb anyway.)  */
86 
87 static mp_limb_t
mul1(mp_limb_t m)88 mul1 (mp_limb_t m)
89 {
90   return m;
91 }
92 
93 static mp_limb_t
mul2(mp_limb_t m)94 mul2 (mp_limb_t m)
95 {
96   /* We need to shift before multiplying, to avoid an overflow. */
97   mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
98   return m01;
99 }
100 
101 static mp_limb_t
mul3(mp_limb_t m)102 mul3 (mp_limb_t m)
103 {
104   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
105   mp_limb_t m2 = (m + 2);
106   return m01 * m2;
107 }
108 
109 static mp_limb_t
mul4(mp_limb_t m)110 mul4 (mp_limb_t m)
111 {
112   mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
113   return m03 * (m03 + 1); /* mul2 (m03) ? */
114 }
115 
116 static mp_limb_t
mul5(mp_limb_t m)117 mul5 (mp_limb_t m)
118 {
119   mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
120   mp_limb_t m034 = m03 * (m + 4);
121   return (m03 + 1) * m034;
122 }
123 
124 static mp_limb_t
mul6(mp_limb_t m)125 mul6 (mp_limb_t m)
126 {
127   mp_limb_t m05 = (m + 0) * (m + 5);
128   mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
129   return m1234 * (m05 >> 1);
130 }
131 
132 static mp_limb_t
mul7(mp_limb_t m)133 mul7 (mp_limb_t m)
134 {
135   mp_limb_t m05 = (m + 0) * (m + 5);
136   mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
137   mp_limb_t m056 = m05 * (m + 6) >> 1;
138   return m1234 * m056;
139 }
140 
141 static mp_limb_t
mul8(mp_limb_t m)142 mul8 (mp_limb_t m)
143 {
144   mp_limb_t m07 = (m + 0) * (m + 7);
145   mp_limb_t m0257 = m07 * (m07 + 10) >> 3;
146   mp_limb_t m1346 = m07 + 9 + m0257;
147   return m0257 * m1346;
148 }
149 
150 /*
151 static mp_limb_t
152 mul9 (mp_limb_t m)
153 {
154   return (m + 8) * mul8 (m) ;
155 }
156 
157 static mp_limb_t
158 mul10 (mp_limb_t m)
159 {
160   mp_limb_t m09 = (m + 0) * (m + 9);
161   mp_limb_t m18 = (m09 >> 1) + 4;
162   mp_limb_t m0369 = m09 * (m09 + 18) >> 3;
163   mp_limb_t m2457 = m09 * 2 + 35 + m0369;
164   return ((m0369 * m2457) >> 1) * m18;
165 }
166 */
167 
168 typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
169 
170 static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */};
171 #define M (numberof(mulfunc))
172 
173 /* Number of factors-of-2 removed by the corresponding mulN function.  */
174 static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /*,6,8*/};
175 
176 #if 1
177 /* This variant is inaccurate but share the code with other functions.  */
178 #define MAXFACS(max,l)							\
179   do {									\
180     (max) = log_n_max (l);						\
181   } while (0)
182 #else
183 
184 /* This variant is exact(?) but uses a loop.  It takes the 2 removal
185  of mulN into account.  */
186 static const unsigned long ftab[] =
187 #if GMP_NUMB_BITS == 64
188   /* 1 to 8 factors per iteration */
189   {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /*,0xe9,0x8e */};
190 #endif
191 #if GMP_NUMB_BITS == 32
192   /* 1 to 7 factors per iteration */
193   {0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */};
194 #endif
195 
196 #define MAXFACS(max,l)							\
197   do {									\
198     int __i;								\
199     for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
200       ;									\
201     (max) = __i + 1;							\
202   } while (0)
203 #endif
204 
205 /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
206    is an odd integer. */
207 static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
208 
209 static void
mpz_bdiv_bin_uiui(mpz_ptr r,unsigned long int n,unsigned long int k)210 mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
211 {
212   unsigned nmax, kmax, nmaxnow, numfac;
213   mp_ptr np, kp;
214   mp_size_t nn, kn, alloc;
215   mp_limb_t i, j, t, iii, jjj, cy, dinv;
216   int cnt;
217   mp_size_t maxn;
218   TMP_DECL;
219 
220   ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
221   TMP_MARK;
222 
223   maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
224 
225   /* FIXME: This allocation might be insufficient, but is usually way too
226      large.  */
227   alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
228   alloc = MIN (alloc, (mp_size_t) k) + 1;
229   TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1);
230 
231   MAXFACS (nmax, n);
232   ASSERT (nmax <= M);
233   MAXFACS (kmax, k);
234   ASSERT (kmax <= M);
235   ASSERT (k >= M);
236 
237   i = n - k + 1;
238 
239   np[0] = 1; nn = 1;
240 
241   numfac = 1;
242   j = ODD_FACTORIAL_TABLE_LIMIT + 1;
243   jjj = ODD_FACTORIAL_TABLE_MAX;
244   ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
245 
246   while (1)
247     {
248       kp[0] = jjj;				/* store new factors */
249       kn = 1;
250       t = k - j + 1;
251       kmax = MIN (kmax, t);
252 
253       while (kmax != 0 && kn < SOME_THRESHOLD)
254 	{
255 	  jjj = mulfunc[kmax - 1] (j);
256 	  j += kmax;				/* number of factors used */
257 	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
258 	  jjj >>= cnt;				/* remove remaining low zeros */
259 	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
260 	  kp[kn] = cy;
261 	  kn += cy != 0;
262 	  t = k - j + 1;
263 	  kmax = MIN (kmax, t);
264 	}
265       numfac = j - numfac;
266 
267       while (numfac != 0)
268 	{
269 	  nmaxnow = MIN (nmax, numfac);
270 	  iii = mulfunc[nmaxnow - 1] (i);
271 	  i += nmaxnow;				/* number of factors used */
272 	  count_trailing_zeros (cnt, iii);	/* count low zeros */
273 	  iii >>= cnt;				/* remove remaining low zeros */
274 	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
275 	  np[nn] = cy;
276 	  nn += cy != 0;
277 	  numfac -= nmaxnow;
278 	}
279 
280       ASSERT (nn < alloc);
281 
282       binvert_limb (dinv, kp[0]);
283       nn += (np[nn - 1] >= kp[kn - 1]);
284       nn -= kn;
285       mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
286       mpn_neg (np, np, nn);
287 
288       if (kmax == 0)
289 	break;
290       numfac = j;
291 
292       jjj = mulfunc[kmax - 1] (j);
293       j += kmax;				/* number of factors used */
294       count_trailing_zeros (cnt, jjj);		/* count low zeros */
295       jjj >>= cnt;				/* remove remaining low zeros */
296     }
297 
298   /* Put back the right number of factors of 2.  */
299   popc_limb (cnt, n - k);
300   popc_limb (j, k);
301   cnt += j;
302   popc_limb (j, n);
303   cnt -= j;
304   if (cnt != 0)
305     {
306       ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
307       cy = mpn_lshift (np, np, nn, cnt);
308       np[nn] = cy;
309       nn += cy != 0;
310     }
311 
312   nn -= np[nn - 1] == 0;	/* normalisation */
313 
314   kp = MPZ_NEWALLOC (r, nn);
315   SIZ(r) = nn;
316   MPN_COPY (kp, np, nn);
317   TMP_FREE;
318 }
319 
320 static void
mpz_smallk_bin_uiui(mpz_ptr r,unsigned long int n,unsigned long int k)321 mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
322 {
323   unsigned nmax, numfac;
324   mp_ptr rp;
325   mp_size_t rn, alloc;
326   mp_limb_t i, iii, cy;
327   unsigned i2cnt, cnt;
328 
329   MAXFACS (nmax, n);
330   nmax = MIN (nmax, M);
331 
332   i = n - k + 1;
333 
334   i2cnt = __gmp_fac2cnt_table[k / 2 - 1];		/* low zeros count */
335   if (nmax >= k)
336     {
337       MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >>
338 	(i2cnt - tcnttab[k - 1]);
339       SIZ(r) = 1;
340       return;
341     }
342 
343   count_leading_zeros (cnt, (mp_limb_t) n);
344   cnt = GMP_LIMB_BITS - cnt;
345   alloc = cnt * k / GMP_NUMB_BITS + 3;	/* FIXME: ensure rounding is enough. */
346   rp = MPZ_NEWALLOC (r, alloc);
347 
348   rp[0] = mulfunc[nmax - 1] (i);
349   rn = 1;
350   i += nmax;				/* number of factors used */
351   i2cnt -= tcnttab[nmax - 1];		/* low zeros count */
352   numfac = k - nmax;
353   do
354     {
355       nmax = MIN (nmax, numfac);
356       iii = mulfunc[nmax - 1] (i);
357       i += nmax;			/* number of factors used */
358       i2cnt -= tcnttab[nmax - 1];	/* update low zeros count */
359       cy = mpn_mul_1 (rp, rp, rn, iii);	/* accumulate new factors */
360       rp[rn] = cy;
361       rn += cy != 0;
362       numfac -= nmax;
363     } while (numfac != 0);
364 
365   ASSERT (rn < alloc);
366 
367   mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt);
368   /* A two-fold, branch-free normalisation is possible :*/
369   /* rn -= rp[rn - 1] == 0; */
370   /* rn -= rp[rn - 1] == 0; */
371   MPN_NORMALIZE_NOT_ZERO (rp, rn);
372 
373   SIZ(r) = rn;
374 }
375 
376 /* Algorithm:
377 
378    Plain and simply multiply things together.
379 
380    We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
381    that k!/2^t is odd).
382 
383 */
384 
385 static mp_limb_t
bc_bin_uiui(unsigned int n,unsigned int k)386 bc_bin_uiui (unsigned int n, unsigned int k)
387 {
388   return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
389     << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
390     & GMP_NUMB_MASK;
391 }
392 
393 /* Algorithm:
394 
395    Recursively exploit the relation
396    bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
397 
398    Values for binomial(k,k>>1) that fit in a limb are precomputed
399    (with inverses).
400 */
401 
402 /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
403    binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
404 static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
405 
406 /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
407 static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
408 
409 /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
410 static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
411 
412 static void
mpz_smallkdc_bin_uiui(mpz_ptr r,unsigned long int n,unsigned long int k)413 mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
414 {
415   mp_ptr rp;
416   mp_size_t rn;
417   unsigned long int hk;
418 
419   hk = k >> 1;
420 
421   if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
422     mpz_smallk_bin_uiui (r, n, hk);
423   else
424     mpz_smallkdc_bin_uiui (r, n, hk);
425   k -= hk;
426   n -= hk;
427   if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
428     mp_limb_t cy;
429     rn = SIZ (r);
430     rp = MPZ_REALLOC (r, rn + 1);
431     cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
432     rp [rn] = cy;
433     rn += cy != 0;
434   } else {
435     mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
436     mpz_t t;
437 
438     ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
439     PTR (t) = buffer;
440     if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
441       mpz_smallk_bin_uiui (t, n, k);
442     else
443       mpz_smallkdc_bin_uiui (t, n, k);
444     mpz_mul (r, r, t);
445     rp = PTR (r);
446     rn = SIZ (r);
447   }
448 
449   mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
450 		    bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
451 		    fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
452   /* A two-fold, branch-free normalisation is possible :*/
453   /* rn -= rp[rn - 1] == 0; */
454   /* rn -= rp[rn - 1] == 0; */
455   MPN_NORMALIZE_NOT_ZERO (rp, rn);
456 
457   SIZ(r) = rn;
458 }
459 
460 /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
461  *
462  * Contributed to the GNU project by Marco Bodrato.
463  *
464  * Implementation of the algorithm by P. Goetgheluck, "Computing
465  * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
466  * No. 4 (April 1987), pp. 360-365.
467  *
468  * Acknowledgment: Peter Luschny did spot the slowness of the previous
469  * code and suggested the reference.
470  */
471 
472 /* TODO: Remove duplicated constants / macros / static functions...
473  */
474 
475 /*************************************************************/
476 /* Section macros: common macros, for swing/fac/bin (&sieve) */
477 /*************************************************************/
478 
479 #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)			\
480   if ((PR) > (MAX_PR)) {					\
481     (VEC)[(I)++] = (PR);					\
482     (PR) = 1;							\
483   }
484 
485 #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)		\
486   do {								\
487     if ((PR) > (MAX_PR)) {					\
488       (VEC)[(I)++] = (PR);					\
489       (PR) = (P);						\
490     } else							\
491       (PR) *= (P);						\
492   } while (0)
493 
494 #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)			\
495     __max_i = (end);						\
496 								\
497     do {							\
498       ++__i;							\
499       if (((sieve)[__index] & __mask) == 0)			\
500 	{							\
501 	  mp_limb_t prime;					\
502 	  prime = id_to_n(__i)
503 
504 #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)		\
505   do {								\
506     mp_limb_t __mask, __index, __max_i, __i;			\
507 								\
508     __i = (start)-(off);					\
509     __index = __i / GMP_LIMB_BITS;				\
510     __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);		\
511     __i += (off);						\
512 								\
513     LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
514 
515 #define LOOP_ON_SIEVE_STOP					\
516 	}							\
517       __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);	\
518       __index += __mask & 1;					\
519     }  while (__i <= __max_i)
520 
521 #define LOOP_ON_SIEVE_END					\
522     LOOP_ON_SIEVE_STOP;						\
523   } while (0)
524 
525 /*********************************************************/
526 /* Section sieve: sieving functions and tools for primes */
527 /*********************************************************/
528 
529 #if WANT_ASSERT
530 static mp_limb_t
bit_to_n(mp_limb_t bit)531 bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
532 #endif
533 
534 /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
535 static mp_limb_t
id_to_n(mp_limb_t id)536 id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); }
537 
538 /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
539 static mp_limb_t
n_to_bit(mp_limb_t n)540 n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
541 
542 static mp_size_t
primesieve_size(mp_limb_t n)543 primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
544 
545 /*********************************************************/
546 /* Section binomial: fast binomial implementation        */
547 /*********************************************************/
548 
549 #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
550   do {							\
551     mp_limb_t __a, __b, __prime, __ma,__mb;		\
552     __prime = (P);					\
553     __a = (N); __b = (K); __mb = 0;			\
554     FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);		\
555     do {						\
556       __mb += __b % __prime; __b /= __prime;		\
557       __ma = __a % __prime; __a /= __prime;		\
558       if (__ma < __mb) {				\
559         __mb = 1; (PR) *= __prime;			\
560       } else  __mb = 0;					\
561     } while (__a >= __prime);				\
562   } while (0)
563 
564 #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
565   do {							\
566     mp_limb_t __prime;					\
567     __prime = (P);					\
568     if (((N) % __prime) < ((K) % __prime)) {		\
569       FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);	\
570     }							\
571   } while (0)
572 
573 /* Returns an approximation of the sqare root of x.
574  * It gives:
575  *   limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2
576  * or
577  *   x <= limb_apprsqrt (x) ^ 2 <= x * 9/8
578  */
579 static mp_limb_t
limb_apprsqrt(mp_limb_t x)580 limb_apprsqrt (mp_limb_t x)
581 {
582   int s;
583 
584   ASSERT (x > 2);
585   count_leading_zeros (s, x);
586   s = (GMP_LIMB_BITS - s) >> 1;
587   return ((CNST_LIMB(1) << s) + (x >> s)) >> 1;
588 }
589 
590 static void
mpz_goetgheluck_bin_uiui(mpz_ptr r,unsigned long int n,unsigned long int k)591 mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
592 {
593   mp_limb_t *sieve, *factors, count;
594   mp_limb_t prod, max_prod;
595   mp_size_t j;
596   TMP_DECL;
597 
598   ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
599   ASSERT (n >= 25);
600 
601   TMP_MARK;
602   sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
603 
604   count = gmp_primesieve (sieve, n) + 1;
605   factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
606 
607   max_prod = GMP_NUMB_MAX / n;
608 
609   /* Handle primes = 2, 3 separately. */
610   popc_limb (count, n - k);
611   popc_limb (j, k);
612   count += j;
613   popc_limb (j, n);
614   count -= j;
615   prod = CNST_LIMB(1) << count;
616 
617   j = 0;
618   COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
619 
620   /* Accumulate prime factors from 5 to n/2 */
621     {
622       mp_limb_t s;
623 
624       s = limb_apprsqrt(n);
625       s = n_to_bit (s);
626       ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n);
627       ASSERT (s <= n_to_bit (n >> 1));
628       LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
629       COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
630       LOOP_ON_SIEVE_STOP;
631 
632       ASSERT (max_prod <= GMP_NUMB_MAX / 2);
633       max_prod <<= 1;
634 
635       LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1),sieve);
636       SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
637       LOOP_ON_SIEVE_END;
638 
639       max_prod >>= 1;
640     }
641 
642   /* Store primes from (n-k)+1 to n */
643   ASSERT (n_to_bit (n - k) < n_to_bit (n));
644 
645   LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
646   FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
647   LOOP_ON_SIEVE_END;
648 
649   if (LIKELY (j != 0))
650     {
651       factors[j++] = prod;
652       mpz_prodlimbs (r, factors, j);
653     }
654   else
655     {
656       MPZ_NEWALLOC (r, 1)[0] = prod;
657       SIZ (r) = 1;
658     }
659   TMP_FREE;
660 }
661 
662 #undef COUNT_A_PRIME
663 #undef SH_COUNT_A_PRIME
664 #undef LOOP_ON_SIEVE_END
665 #undef LOOP_ON_SIEVE_STOP
666 #undef LOOP_ON_SIEVE_BEGIN
667 #undef LOOP_ON_SIEVE_CONTINUE
668 
669 /*********************************************************/
670 /* End of implementation of Goetgheluck's algorithm      */
671 /*********************************************************/
672 
673 void
mpz_bin_uiui(mpz_ptr r,unsigned long int n,unsigned long int k)674 mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
675 {
676   if (UNLIKELY (n < k)) {
677     SIZ (r) = 0;
678 #if BITS_PER_ULONG > GMP_NUMB_BITS
679   } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
680     mpz_t tmp;
681 
682     mpz_init_set_ui (tmp, n);
683     mpz_bin_ui (r, tmp, k);
684     mpz_clear (tmp);
685 #endif
686   } else {
687     ASSERT (n <= GMP_NUMB_MAX);
688     /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
689     k = MIN (k, n - k);
690     if (k < 2) {
691       MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
692       SIZ(r) = 1;
693     } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
694       MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k);
695       SIZ(r) = 1;
696     } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
697       mpz_smallk_bin_uiui (r, n, k);
698     else if (BIN_UIUI_ENABLE_SMALLDC &&
699 	     k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
700       mpz_smallkdc_bin_uiui (r, n, k);
701     else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
702 	     k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
703       mpz_goetgheluck_bin_uiui (r, n, k);
704     else
705       mpz_bdiv_bin_uiui (r, n, k);
706   }
707 }
708