xref: /netbsd-src/external/lgpl3/gmp/dist/mpz/bin_uiui.c (revision c3ab26950fe8540fb553d1d1dcae454bc98e5a25)
1 /* mpz_bin_uiui - compute n over k.
2 
3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
4 
5 Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
6 
7 This file is part of the GNU MP Library.
8 
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of the GNU Lesser General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or (at your
12 option) any later version.
13 
14 The GNU MP Library is distributed in the hope that it will be useful, but
15 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17 License for more details.
18 
19 You should have received a copy of the GNU Lesser General Public License
20 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
21 
22 #include "gmp.h"
23 #include "gmp-impl.h"
24 #include "longlong.h"
25 
26 #ifndef BIN_GOETGHELUCK_THRESHOLD
27 #define BIN_GOETGHELUCK_THRESHOLD  1000
28 #endif
29 #ifndef BIN_UIUI_ENABLE_SMALLDC
30 #define BIN_UIUI_ENABLE_SMALLDC    1
31 #endif
32 #ifndef BIN_UIUI_RECURSIVE_SMALLDC
33 #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
34 #endif
35 
36 /* Algorithm:
37 
38    Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
39    which are then accumulated into mpn numbers.  The first inner loop
40    accumulates divisor factors, the 2nd inner loop accumulates exactly the same
41    number of dividend factors.  We avoid accumulating more for the divisor,
42    even with its smaller factors, since we else cannot guarantee divisibility.
43 
44    Since we know each division will yield an integer, we compute the quotient
45    using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
46    2^t.
47 
48    Improvements:
49 
50    (1) An obvious improvement to this code would be to compute mod 2^t
51    everywhere.  Unfortunately, we cannot determine t beforehand, unless we
52    invoke some approximation, such as Stirling's formula.  Of course, we don't
53    need t to be tight.  However, it is not clear that this would help much,
54    our numbers are kept reasonably small already.
55 
56    (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
57    Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
58    would make it both reasonably accurate and fast.  (We could use a table
59    stored into a limb, perhaps.)  The table should take the removed factors of
60    2 into account (those done on-the-fly in mulN).
61 
62    (3) The first time in the loop we compute the odd part of a
63    factorial in kp, we might use oddfac_1 for this task.
64  */
65 
66 /* This threshold determines how large divisor to accumulate before we call
67    bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
68    since we are just basecase code anyway?  Presumably, this depends on the
69    relative speed of the asymptotically fast code and this code.  */
70 #define SOME_THRESHOLD 20
71 
72 /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
73    All versions of MAXFACS don't take this 2 removal into account now, meaning
74    that then, shifting just adds some overhead.  (We remove factors from the
75    completed limb anyway.)  */
76 
77 static mp_limb_t
78 mul1 (mp_limb_t m)
79 {
80   return m;
81 }
82 
83 static mp_limb_t
84 mul2 (mp_limb_t m)
85 {
86   /* We need to shift before multiplying, to avoid an overflow. */
87   mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
88   return m01;
89 }
90 
91 static mp_limb_t
92 mul3 (mp_limb_t m)
93 {
94   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
95   mp_limb_t m2 = (m + 2);
96   return m01 * m2;
97 }
98 
99 static mp_limb_t
100 mul4 (mp_limb_t m)
101 {
102   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
103   mp_limb_t m23 = (m + 2) * (m + 3) >> 1;
104   return m01 * m23;
105 }
106 
107 static mp_limb_t
108 mul5 (mp_limb_t m)
109 {
110   mp_limb_t m012 = (m + 0) * (m + 1) * (m + 2) >> 1;
111   mp_limb_t m34 = (m + 3) * (m + 4) >> 1;
112   return m012 * m34;
113 }
114 
115 static mp_limb_t
116 mul6 (mp_limb_t m)
117 {
118   mp_limb_t m01 = (m + 0) * (m + 1);
119   mp_limb_t m23 = (m + 2) * (m + 3);
120   mp_limb_t m45 = (m + 4) * (m + 5) >> 1;
121   mp_limb_t m0123 = m01 * m23 >> 3;
122   return m0123 * m45;
123 }
124 
125 static mp_limb_t
126 mul7 (mp_limb_t m)
127 {
128   mp_limb_t m01 = (m + 0) * (m + 1);
129   mp_limb_t m23 = (m + 2) * (m + 3);
130   mp_limb_t m456 = (m + 4) * (m + 5) * (m + 6) >> 1;
131   mp_limb_t m0123 = m01 * m23 >> 3;
132   return m0123 * m456;
133 }
134 
135 static mp_limb_t
136 mul8 (mp_limb_t m)
137 {
138   mp_limb_t m01 = (m + 0) * (m + 1);
139   mp_limb_t m23 = (m + 2) * (m + 3);
140   mp_limb_t m45 = (m + 4) * (m + 5);
141   mp_limb_t m67 = (m + 6) * (m + 7);
142   mp_limb_t m0123 = m01 * m23 >> 3;
143   mp_limb_t m4567 = m45 * m67 >> 3;
144   return m0123 * m4567;
145 }
146 
147 typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
148 
149 static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8};
150 #define M (numberof(mulfunc))
151 
152 /* Number of factors-of-2 removed by the corresponding mulN functon.  */
153 static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6};
154 
155 #if 1
156 /* This variant is inaccurate but share the code with other functions.  */
157 #define MAXFACS(max,l)							\
158   do {									\
159     (max) = log_n_max (l);						\
160   } while (0)
161 #else
162 
163 /* This variant is exact(?) but uses a loop.  It takes the 2 removal
164  of mulN into account.  */
165 static const unsigned long ftab[] =
166 #if GMP_NUMB_BITS == 64
167   /* 1 to 8 factors per iteration */
168   {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x100000000),0x32cbfe,0x16a0b,0x24c4,0xa16,0x34b,0x1b2 /*,0xdf,0x8d */};
169 #endif
170 #if GMP_NUMB_BITS == 32
171   /* 1 to 7 factors per iteration */
172   {0xffffffff,0x10000,0x801,0x16b,0x71,0x42,0x26 /* ,0x1e */};
173 #endif
174 
175 #define MAXFACS(max,l)							\
176   do {									\
177     int __i;								\
178     for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
179       ;									\
180     (max) = __i + 1;							\
181   } while (0)
182 #endif
183 
184 /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
185    is an odd integer. */
186 static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
187 
188 static void
189 mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
190 {
191   int nmax, kmax, nmaxnow, numfac;
192   mp_ptr np, kp;
193   mp_size_t nn, kn, alloc;
194   mp_limb_t i, j, t, iii, jjj, cy, dinv;
195   mp_bitcnt_t i2cnt, j2cnt;
196   int cnt;
197   mp_size_t maxn;
198   TMP_DECL;
199 
200   ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
201   TMP_MARK;
202 
203   maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
204 
205   /* FIXME: This allocation might be insufficient, but is usually way too
206      large.  */
207   alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
208   alloc = MIN (alloc, k) + 1;
209   np = TMP_ALLOC_LIMBS (alloc);
210   kp = TMP_ALLOC_LIMBS (SOME_THRESHOLD + 1);
211 
212   MAXFACS (nmax, n);
213   ASSERT (nmax <= M);
214   MAXFACS (kmax, k);
215   ASSERT (kmax <= M);
216   ASSERT (k >= M);
217 
218   i = n - k + 1;
219 
220   np[0] = 1; nn = 1;
221 
222   i2cnt = 0;				/* total low zeros in dividend */
223   j2cnt = __gmp_fac2cnt_table[ODD_FACTORIAL_TABLE_LIMIT / 2 - 1];
224 					/* total low zeros in divisor */
225 
226   numfac = 1;
227   j = ODD_FACTORIAL_TABLE_LIMIT + 1;
228   jjj = ODD_FACTORIAL_TABLE_MAX;
229   ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
230 
231   while (1)
232     {
233       kp[0] = jjj;				/* store new factors */
234       kn = 1;
235       t = k - j + 1;
236       kmax = MIN (kmax, t);
237 
238       while (kmax != 0 && kn < SOME_THRESHOLD)
239 	{
240 	  jjj = mulfunc[kmax - 1] (j);
241 	  j += kmax;				/* number of factors used */
242 	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
243 	  jjj >>= cnt;				/* remove remaining low zeros */
244 	  j2cnt += tcnttab[kmax - 1] + cnt;	/* update low zeros count */
245 	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
246 	  kp[kn] = cy;
247 	  kn += cy != 0;
248 	  t = k - j + 1;
249 	  kmax = MIN (kmax, t);
250 	}
251       numfac = j - numfac;
252 
253       while (numfac != 0)
254 	{
255 	  nmaxnow = MIN (nmax, numfac);
256 	  iii = mulfunc[nmaxnow - 1] (i);
257 	  i += nmaxnow;				/* number of factors used */
258 	  count_trailing_zeros (cnt, iii);	/* count low zeros */
259 	  iii >>= cnt;				/* remove remaining low zeros */
260 	  i2cnt += tcnttab[nmaxnow - 1] + cnt;	/* update low zeros count */
261 	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
262 	  np[nn] = cy;
263 	  nn += cy != 0;
264 	  numfac -= nmaxnow;
265 	}
266 
267       ASSERT (nn < alloc);
268 
269       binvert_limb (dinv, kp[0]);
270       nn += (np[nn - 1] >= kp[kn - 1]);
271       nn -= kn;
272       mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
273 
274       if (kmax == 0)
275 	break;
276       numfac = j;
277 
278       jjj = mulfunc[kmax - 1] (j);
279       j += kmax;				/* number of factors used */
280       count_trailing_zeros (cnt, jjj);		/* count low zeros */
281       jjj >>= cnt;				/* remove remaining low zeros */
282       j2cnt += tcnttab[kmax - 1] + cnt;		/* update low zeros count */
283     }
284 
285   /* Put back the right number of factors of 2.  */
286   cnt = i2cnt - j2cnt;
287   if (cnt != 0)
288     {
289       ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
290       cy = mpn_lshift (np, np, nn, cnt);
291       np[nn] = cy;
292       nn += cy != 0;
293     }
294 
295   nn -= np[nn - 1] == 0;	/* normalisation */
296 
297   kp = MPZ_NEWALLOC (r, nn);
298   SIZ(r) = nn;
299   MPN_COPY (kp, np, nn);
300   TMP_FREE;
301 }
302 
303 static void
304 mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
305 {
306   int nmax, numfac;
307   mp_ptr rp;
308   mp_size_t rn, alloc;
309   mp_limb_t i, iii, cy;
310   mp_bitcnt_t i2cnt, cnt;
311 
312   count_leading_zeros (cnt, (mp_limb_t) n);
313   cnt = GMP_LIMB_BITS - cnt;
314   alloc = cnt * k / GMP_NUMB_BITS + 3;	/* FIXME: ensure rounding is enough. */
315   rp = MPZ_NEWALLOC (r, alloc);
316 
317   MAXFACS (nmax, n);
318   nmax = MIN (nmax, M);
319 
320   i = n - k + 1;
321 
322   nmax = MIN (nmax, k);
323   rp[0] = mulfunc[nmax - 1] (i);
324   rn = 1;
325   i += nmax;				/* number of factors used */
326   i2cnt = tcnttab[nmax - 1];		/* low zeros count */
327   numfac = k - nmax;
328   while (numfac != 0)
329     {
330       nmax = MIN (nmax, numfac);
331       iii = mulfunc[nmax - 1] (i);
332       i += nmax;			/* number of factors used */
333       i2cnt += tcnttab[nmax - 1];	/* update low zeros count */
334       cy = mpn_mul_1 (rp, rp, rn, iii);	/* accumulate new factors */
335       rp[rn] = cy;
336       rn += cy != 0;
337       numfac -= nmax;
338     }
339 
340   ASSERT (rn < alloc);
341 
342   mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2],
343 		    __gmp_fac2cnt_table[k / 2 - 1] - i2cnt);
344   /* A two-fold, branch-free normalisation is possible :*/
345   /* rn -= rp[rn - 1] == 0; */
346   /* rn -= rp[rn - 1] == 0; */
347   MPN_NORMALIZE_NOT_ZERO (rp, rn);
348 
349   SIZ(r) = rn;
350 }
351 
352 /* Algorithm:
353 
354    Plain and simply multiply things together.
355 
356    We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
357    that k!/2^t is odd).
358 
359 */
360 
361 static mp_limb_t
362 bc_bin_uiui (unsigned int n, unsigned int k)
363 {
364   return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
365     << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
366     & GMP_NUMB_MASK;
367 }
368 
369 /* Algorithm:
370 
371    Recursively exploit the relation
372    bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
373 
374    Values for binomial(k,k>>1) that fit in a limb are precomputed
375    (with inverses).
376 */
377 
378 /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
379    binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
380 static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
381 
382 /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
383 static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
384 
385 /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
386 static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
387 
388 static void
389 mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
390 {
391   mp_ptr rp;
392   mp_size_t rn;
393   unsigned long int hk;
394 
395   hk = k >> 1;
396 
397   if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
398     mpz_smallk_bin_uiui (r, n, hk);
399   else
400     mpz_smallkdc_bin_uiui (r, n, hk);
401   k -= hk;
402   n -= hk;
403   if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
404     mp_limb_t cy;
405     rn = SIZ (r);
406     rp = MPZ_REALLOC (r, rn + 1);
407     cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
408     rp [rn] = cy;
409     rn += cy != 0;
410   } else {
411     mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
412     mpz_t t;
413 
414     ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
415     PTR (t) = buffer;
416     if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
417       mpz_smallk_bin_uiui (t, n, k);
418     else
419       mpz_smallkdc_bin_uiui (t, n, k);
420     mpz_mul (r, r, t);
421     rp = PTR (r);
422     rn = SIZ (r);
423   }
424 
425   mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
426 		    bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
427 		    fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
428   /* A two-fold, branch-free normalisation is possible :*/
429   /* rn -= rp[rn - 1] == 0; */
430   /* rn -= rp[rn - 1] == 0; */
431   MPN_NORMALIZE_NOT_ZERO (rp, rn);
432 
433   SIZ(r) = rn;
434 }
435 
436 /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
437  *
438  * Contributed to the GNU project by Marco Bodrato.
439  *
440  * Implementation of the algorithm by P. Goetgheluck, "Computing
441  * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
442  * No. 4 (April 1987), pp. 360-365.
443  *
444  * Acknowledgment: Peter Luschny did spot the slowness of the previous
445  * code and suggested the reference.
446  */
447 
448 /* TODO: Remove duplicated constants / macros / static functions...
449  */
450 
451 /*************************************************************/
452 /* Section macros: common macros, for swing/fac/bin (&sieve) */
453 /*************************************************************/
454 
455 #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)			\
456   if ((PR) > (MAX_PR)) {					\
457     (VEC)[(I)++] = (PR);					\
458     (PR) = 1;							\
459   }
460 
461 #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)		\
462   do {								\
463     if ((PR) > (MAX_PR)) {					\
464       (VEC)[(I)++] = (PR);					\
465       (PR) = (P);						\
466     } else							\
467       (PR) *= (P);						\
468   } while (0)
469 
470 #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)			\
471     __max_i = (end);						\
472 								\
473     do {							\
474       ++__i;							\
475       if (((sieve)[__index] & __mask) == 0)			\
476 	{							\
477 	  (prime) = id_to_n(__i)
478 
479 #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)		\
480   do {								\
481     mp_limb_t __mask, __index, __max_i, __i;			\
482 								\
483     __i = (start)-(off);					\
484     __index = __i / GMP_LIMB_BITS;				\
485     __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);		\
486     __i += (off);						\
487 								\
488     LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)
489 
490 #define LOOP_ON_SIEVE_STOP					\
491 	}							\
492       __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);	\
493       __index += __mask & 1;					\
494     }  while (__i <= __max_i)					\
495 
496 #define LOOP_ON_SIEVE_END					\
497     LOOP_ON_SIEVE_STOP;						\
498   } while (0)
499 
500 /*********************************************************/
501 /* Section sieve: sieving functions and tools for primes */
502 /*********************************************************/
503 
504 #if WANT_ASSERT
505 static mp_limb_t
506 bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
507 #endif
508 
509 /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
510 static mp_limb_t
511 id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); }
512 
513 /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
514 static mp_limb_t
515 n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
516 
517 static mp_size_t
518 primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
519 
520 /*********************************************************/
521 /* Section binomial: fast binomial implementation        */
522 /*********************************************************/
523 
524 #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
525   do {							\
526     mp_limb_t __a, __b, __prime, __ma,__mb;		\
527     __prime = (P);					\
528     __a = (N); __b = (K); __mb = 0;			\
529     FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);		\
530     do {						\
531       __mb += __b % __prime; __b /= __prime;		\
532       __ma = __a % __prime; __a /= __prime;		\
533       if (__ma < __mb) {				\
534         __mb = 1; (PR) *= __prime;			\
535       } else  __mb = 0;					\
536     } while (__a >= __prime);				\
537   } while (0)
538 
539 #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
540   do {							\
541     mp_limb_t __prime;					\
542     __prime = (P);					\
543     if (((N) % __prime) < ((K) % __prime)) {		\
544       FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);	\
545     }							\
546   } while (0)
547 
548 /* Returns an approximation of the sqare root of x.  *
549  * It gives: x <= limb_apprsqrt (x) ^ 2 < x * 9/4    */
550 static mp_limb_t
551 limb_apprsqrt (mp_limb_t x)
552 {
553   int s;
554 
555   ASSERT (x > 2);
556   count_leading_zeros (s, x - 1);
557   s = GMP_LIMB_BITS - 1 - s;
558   return (CNST_LIMB(1) << (s >> 1)) + (CNST_LIMB(1) << ((s - 1) >> 1));
559 }
560 
561 static void
562 mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
563 {
564   mp_limb_t *sieve, *factors, count;
565   mp_limb_t prod, max_prod, j;
566   TMP_DECL;
567 
568   ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
569   ASSERT (n >= 25);
570 
571   TMP_MARK;
572   sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
573 
574   count = gmp_primesieve (sieve, n) + 1;
575   factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
576 
577   max_prod = GMP_NUMB_MAX / n;
578 
579   /* Handle primes = 2, 3 separately. */
580   popc_limb (count, n - k);
581   popc_limb (j, k);
582   count += j;
583   popc_limb (j, n);
584   count -= j;
585   prod = CNST_LIMB(1) << count;
586 
587   j = 0;
588   COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
589 
590   /* Accumulate prime factors from 5 to n/2 */
591     {
592       mp_limb_t s;
593 
594       {
595 	mp_limb_t prime;
596 	s = limb_apprsqrt(n);
597 	s = n_to_bit (s);
598 	LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
599 	COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
600 	LOOP_ON_SIEVE_END;
601 	s++;
602       }
603 
604       ASSERT (max_prod <= GMP_NUMB_MAX / 2);
605       max_prod <<= 1;
606       ASSERT (bit_to_n (s) * bit_to_n (s) > n);
607       ASSERT (s <= n_to_bit (n >> 1));
608       {
609 	mp_limb_t prime;
610 
611 	LOOP_ON_SIEVE_BEGIN (prime, s, n_to_bit (n >> 1), 0,sieve);
612 	SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
613 	LOOP_ON_SIEVE_END;
614       }
615       max_prod >>= 1;
616     }
617 
618   /* Store primes from (n-k)+1 to n */
619   ASSERT (n_to_bit (n - k) < n_to_bit (n));
620     {
621       mp_limb_t prime;
622       LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
623       FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
624       LOOP_ON_SIEVE_END;
625     }
626 
627   if (LIKELY (j != 0))
628     {
629       factors[j++] = prod;
630       mpz_prodlimbs (r, factors, j);
631     }
632   else
633     {
634       PTR (r)[0] = prod;
635       SIZ (r) = 1;
636     }
637   TMP_FREE;
638 }
639 
640 #undef COUNT_A_PRIME
641 #undef SH_COUNT_A_PRIME
642 #undef LOOP_ON_SIEVE_END
643 #undef LOOP_ON_SIEVE_STOP
644 #undef LOOP_ON_SIEVE_BEGIN
645 #undef LOOP_ON_SIEVE_CONTINUE
646 
647 /*********************************************************/
648 /* End of implementation of Goetgheluck's algorithm      */
649 /*********************************************************/
650 
651 void
652 mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
653 {
654   if (UNLIKELY (n < k)) {
655     SIZ (r) = 0;
656 #if BITS_PER_ULONG > GMP_NUMB_BITS
657   } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
658     mpz_t tmp;
659 
660     mpz_init_set_ui (tmp, n);
661     mpz_bin_ui (r, tmp, k);
662     mpz_clear (tmp);
663 #endif
664   } else {
665     ASSERT (n <= GMP_NUMB_MAX);
666     /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
667     k = MIN (k, n - k);
668     if (k < 2) {
669       PTR(r)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
670       SIZ(r) = 1;
671     } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
672       PTR(r)[0] = bc_bin_uiui (n, k);
673       SIZ(r) = 1;
674     } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
675       mpz_smallk_bin_uiui (r, n, k);
676     else if (BIN_UIUI_ENABLE_SMALLDC &&
677 	     k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
678       mpz_smallkdc_bin_uiui (r, n, k);
679     else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
680 	     k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
681       mpz_goetgheluck_bin_uiui (r, n, k);
682     else
683       mpz_bdiv_bin_uiui (r, n, k);
684   }
685 }
686