186d7f5d3SJohn Marino /* mpz_n_pow_ui -- mpn raised to ulong.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
486d7f5d3SJohn Marino CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
586d7f5d3SJohn Marino FUTURE GNU MP RELEASES.
686d7f5d3SJohn Marino
786d7f5d3SJohn Marino Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino This file is part of the GNU MP Library.
1086d7f5d3SJohn Marino
1186d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1286d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1386d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1486d7f5d3SJohn Marino option) any later version.
1586d7f5d3SJohn Marino
1686d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1786d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1886d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1986d7f5d3SJohn Marino License for more details.
2086d7f5d3SJohn Marino
2186d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2286d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2386d7f5d3SJohn Marino
2486d7f5d3SJohn Marino #include "gmp.h"
2586d7f5d3SJohn Marino #include "gmp-impl.h"
2686d7f5d3SJohn Marino #include "longlong.h"
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino
2986d7f5d3SJohn Marino /* Change this to "#define TRACE(x) x" for some traces. */
3086d7f5d3SJohn Marino #define TRACE(x)
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino /* Use this to test the mul_2 code on a CPU without a native version of that
3486d7f5d3SJohn Marino routine. */
3586d7f5d3SJohn Marino #if 0
3686d7f5d3SJohn Marino #define mpn_mul_2 refmpn_mul_2
3786d7f5d3SJohn Marino #define HAVE_NATIVE_mpn_mul_2 1
3886d7f5d3SJohn Marino #endif
3986d7f5d3SJohn Marino
4086d7f5d3SJohn Marino
4186d7f5d3SJohn Marino /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
4286d7f5d3SJohn Marino ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
4386d7f5d3SJohn Marino bsize==2 or >2, but separating that isn't easy because there's shared
4486d7f5d3SJohn Marino code both before and after (the size calculations and the powers of 2
4586d7f5d3SJohn Marino handling).
4686d7f5d3SJohn Marino
4786d7f5d3SJohn Marino Alternatives:
4886d7f5d3SJohn Marino
4986d7f5d3SJohn Marino It would work to just use the mpn_mul powering loop for 1 and 2 limb
5086d7f5d3SJohn Marino bases, but the current separate loop allows mul_1 and mul_2 to be done
5186d7f5d3SJohn Marino in-place, which might help cache locality a bit. If mpn_mul was relaxed
5286d7f5d3SJohn Marino to allow source==dest when vn==1 or 2 then some pointer twiddling might
5386d7f5d3SJohn Marino let us get the same effect in one loop.
5486d7f5d3SJohn Marino
5586d7f5d3SJohn Marino The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
5686d7f5d3SJohn Marino form the biggest possible power of b that fits, only the biggest power of
5786d7f5d3SJohn Marino 2 power, ie. b^(2^n). It'd be possible to choose a bigger power, perhaps
5886d7f5d3SJohn Marino using mp_bases[b].big_base for small b, and thereby get better value
5986d7f5d3SJohn Marino from mpn_mul_1 or mpn_mul_2 in the bignum powering. It's felt that doing
6086d7f5d3SJohn Marino so would be more complicated than it's worth, and could well end up being
6186d7f5d3SJohn Marino a slowdown for small e. For big e on the other hand the algorithm is
   dominated by mpn_sqr so there wouldn't be much of a saving.  The current
6386d7f5d3SJohn Marino code can be viewed as simply doing the first few steps of the powering in
6486d7f5d3SJohn Marino a single or double limb where possible.
6586d7f5d3SJohn Marino
6686d7f5d3SJohn Marino If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
6786d7f5d3SJohn Marino copy made of b is unnecessary. We could just use the old alloc'ed block
6886d7f5d3SJohn Marino and free it at the end. But arranging this seems like a lot more trouble
6986d7f5d3SJohn Marino than it's worth. */
7086d7f5d3SJohn Marino
7186d7f5d3SJohn Marino
7286d7f5d3SJohn Marino /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
7386d7f5d3SJohn Marino a limb without overflowing.
7486d7f5d3SJohn Marino FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
7586d7f5d3SJohn Marino
7686d7f5d3SJohn Marino #define GMP_NUMB_HALFMAX (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
7786d7f5d3SJohn Marino
7886d7f5d3SJohn Marino
7986d7f5d3SJohn Marino /* The following are for convenience, they update the size and check the
8086d7f5d3SJohn Marino alloc. */
8186d7f5d3SJohn Marino
/* Square {src,size} into dst, then update size to the size of the square,
   trimming the possible zero high limb.  Needs 2*size limbs at dst, which
   the caller guarantees via alloc.  */
#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    /* high limb of a square can be zero */     \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)
8986d7f5d3SJohn Marino
/* dst = {src,size} * {src2,size2}, updating size to the product size.
   mpn_mul returns the most significant limb of the product; when that limb
   is zero the product is one limb shorter than size+size2.  */
#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)
9786d7f5d3SJohn Marino
/* {ptr,size} *= {mult,2}, in place, updating size.  mpn_mul_2 writes
   size+1 low limbs to ptr and returns the top (size+2 nd) limb, which is
   stored only if non-zero.  Note the store index uses the already
   incremented size, i.e. ptr[old size + 1].  */
#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)
10786d7f5d3SJohn Marino
/* {ptr,size} *= limb, in place, updating size.  The carry out of mpn_mul_1
   becomes a new high limb when non-zero.  */
#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
11686d7f5d3SJohn Marino
/* {ptr,size} <<= shift bits, in place, updating size.  shift must be
   1..GMP_NUMB_BITS-1; the bits shifted out the top become a new high limb
   when non-zero.  */
#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)
12586d7f5d3SJohn Marino
/* dst = {src,size} >> shift, updating size.  shift==0 degenerates to a
   plain copy since mpn_rshift requires 1 <= shift < GMP_NUMB_BITS.  The
   caller must ensure the shifted-out low bits are zero (here they always
   are, having been counted into rtwos_bits), so only the high limb can
   shorten.  */
#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
13686d7f5d3SJohn Marino
13786d7f5d3SJohn Marino
13886d7f5d3SJohn Marino /* ralloc and talloc are only wanted for ASSERTs, after the initial space
13986d7f5d3SJohn Marino allocations. Avoid writing values to them in a normal build, to ensure
14086d7f5d3SJohn Marino the compiler lets them go dead. gcc already figures this out itself
14186d7f5d3SJohn Marino actually. */
14286d7f5d3SJohn Marino
/* Exchange the result and temporary areas.  The allocs are swapped too,
   but only under ASSERT_CODE since they're used solely in assertions after
   the initial allocation.  */
#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)
14886d7f5d3SJohn Marino
14986d7f5d3SJohn Marino
/* Set r = {bp,bsize}^e, where a negative bsize indicates a negative base
   (so the result is negative when e is odd).  Internal routine shared by
   mpz_pow_ui and mpz_ui_pow_ui; {bp,bsize} may overlap r.

   Strategy: factors of 2 in b are stripped first and reinstated at the end
   as whole zero limbs (rtwos_limbs) plus a sub-limb shift (rtwos_bits).
   For bsize==1 (and bsize==2 with native mul_2) the base is first powered
   up inside one (or two) limbs, with any odd-bit residue accumulating in
   rl (and rl_high); the remaining exponent is then applied by a
   binary-exponentiation loop of MPN_SQR plus mul_1/mul_2/mpn_mul steps,
   ping-ponging between rp and tp so the final result lands in r.  */
void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
		 PTR(r), bp, bsize, e, e);
	 mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      PTR(r)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b.  Each stripped limb contributes e whole
     zero limbs to the result.  The loop must terminate since the high
     limb of b is non-zero (asserted above).  */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b.  e*btwos result bits of zeros are split
     into whole limbs and a final sub-limb shift.  */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
		 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  /* rl (and rl_high) accumulate the product of the odd-exponent-bit
     factors consumed while powering within a limb; applied at the end.  */
  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
	 but if e is small then we might reach e==0 and the whole b^e in rl.
	 Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
	{
	  TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
			 e, blimb, rl));
	  ASSERT (e != 0);
	  if ((e & 1) != 0)
	    rl *= blimb;
	  e >>= 1;
	  if (e == 0)
	    goto got_rl;
	  blimb *= blimb;
	}

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low.  The nail shifts keep
	 the products within the numb parts of the limbs.  */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
	{
	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
	  rl >>= GMP_NAIL_BITS;
	}
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
		     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
	 final mul_1 or mul_2 rather than a separate lshift.
	 - rl_high:rl mustn't be 1 (since then there's no final mul)
	 - rl_high mustn't overflow
	 - rl_high mustn't change to non-zero, since mul_1+lshift is
	   probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
	  && ! (rl_high == 0 && rl == 1)
	  && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
	    | (rl >> (GMP_NUMB_BITS-rtwos_bits));
	  if (! (rl_high == 0 && new_rl_high != 0))
	    {
	      rl_high = new_rl_high;
	      rl <<= rtwos_bits;
	      rtwos_bits = 0;
	      TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
			     rl_high, rl));
	    }
	}
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
		     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
	 mul_1 rather than a separate lshift.
	 - rl mustn't be 1 (since then there's no final mul)
	 - rl mustn't overflow  */

      if (rtwos_bits != 0
	  && rl != 1
	  && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
	{
	  rl <<= rtwos_bits;
	  rtwos_bits = 0;
	  TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
	}
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t  bsecond = bp[1];
      /* Fold the stripped low bits of the second limb down into blimb.  */
      if (btwos != 0)
	blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
	{
	  /* Two limbs became one after rshift. */
	  bsize = 1;
	  goto bsize_1;
	}

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      /* No native mul_2: fall through to the general mpn_mul loop, using
	 a stack copy of the shifted base.  */
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      /* A temp copy of b is needed when r overlaps it (r gets clobbered
	 by the powering loop) or when low zero bits must be shifted off.  */
      if (r_bp_overlap || btwos != 0)
	{
	  mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
	  MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
	  bp = tp;
	  TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
	}
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
	     mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16 =
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
		 ralloc, bsize, blimb, cnt));
  MPZ_REALLOC (r, ralloc + rtwos_limbs);
  rp = PTR(r);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
	 start.  The whole power is already in rl (and rl_high).  */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr     tp;
      mp_size_t  talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
	 low bit of e is zero, tp only has to hold the second last power
	 step, which is half the size of the final result.  There's no need
	 to round up the divide by 2, since ralloc includes a +2 for rl
	 which not needed by tp.  In the mpn_mul loop when the low bit of e
	 is 1, tp must hold nearly the full result, so just size it the same
	 as rp.  */

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
	talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
	talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
	 the bit below the highest 1 (which will mean i==-1 if e==1).  */
      count_leading_zeros (cnt, e);
      i = GMP_LIMB_BITS - cnt - 2;

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
	{
	  mp_limb_t  mult[2];

	  /* Any bsize==1 will have been powered above to be two limbs. */
	  ASSERT (bsize == 2);
	  ASSERT (blimb != 0);

	  /* Arrange the final result ends up in r, not in the temp space.
	     Each loop iteration does one SWAP_RP_TP, so parity of the
	     iteration count decides which buffer to start in.  */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb_low;
	  rp[1] = blimb;
	  rsize = 2;

	  mult[0] = blimb_low;
	  mult[1] = blimb;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }

	  TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
	  /* Apply the accumulated single/double limb factor rl_high:rl.  */
	  if (rl_high != 0)
	    {
	      mult[0] = rl;
	      mult[1] = rl_high;
	      MPN_MUL_2 (rp, rsize, ralloc, mult);
	    }
	  else if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#else
      if (bsize == 1)
	{
	  /* Arrange the final result ends up in r, not in the temp space */
	  if ((i & 1) == 0)
	    SWAP_RP_TP;

	  rp[0] = blimb;
	  rsize = 1;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		MPN_MUL_1 (rp, rsize, ralloc, blimb);
	    }

	  TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
	  if (rl != 1)
	    MPN_MUL_1 (rp, rsize, ralloc, rl);
	}
#endif
      else
	{
	  int  parity;

	  /* Arrange the final result ends up in r, not in the temp space.
	     Here both the squaring and the conditional multiply swap
	     buffers, so the swap count is i+1 plus popcount(e)-1, whose
	     parity is tested below.  */
	  ULONG_PARITY (parity, e);
	  if (((parity ^ i) & 1) != 0)
	    SWAP_RP_TP;

	  MPN_COPY (rp, bp, bsize);
	  rsize = bsize;

	  for ( ; i >= 0; i--)
	    {
	      TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
			     i, e, rsize, ralloc, talloc);
		     mpn_trace ("r", rp, rsize));

	      MPN_SQR (tp, talloc, rp, rsize);
	      SWAP_RP_TP;
	      if ((e & (1L << i)) != 0)
		{
		  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
		  SWAP_RP_TP;
		}
	    }
	}
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}
524