xref: /dflybsd-src/contrib/gmp/mpz/n_pow_ui.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* mpz_n_pow_ui -- mpn raised to ulong.
286d7f5d3SJohn Marino 
386d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
486d7f5d3SJohn Marino    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
586d7f5d3SJohn Marino    FUTURE GNU MP RELEASES.
686d7f5d3SJohn Marino 
786d7f5d3SJohn Marino Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
886d7f5d3SJohn Marino 
986d7f5d3SJohn Marino This file is part of the GNU MP Library.
1086d7f5d3SJohn Marino 
1186d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1286d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1386d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1486d7f5d3SJohn Marino option) any later version.
1586d7f5d3SJohn Marino 
1686d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1786d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1886d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1986d7f5d3SJohn Marino License for more details.
2086d7f5d3SJohn Marino 
2186d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2286d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2386d7f5d3SJohn Marino 
2486d7f5d3SJohn Marino #include "gmp.h"
2586d7f5d3SJohn Marino #include "gmp-impl.h"
2686d7f5d3SJohn Marino #include "longlong.h"
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino 
2986d7f5d3SJohn Marino /* Change this to "#define TRACE(x) x" for some traces. */
3086d7f5d3SJohn Marino #define TRACE(x)
3186d7f5d3SJohn Marino 
3286d7f5d3SJohn Marino 
3386d7f5d3SJohn Marino /* Use this to test the mul_2 code on a CPU without a native version of that
3486d7f5d3SJohn Marino    routine.  */
3586d7f5d3SJohn Marino #if 0
3686d7f5d3SJohn Marino #define mpn_mul_2  refmpn_mul_2
3786d7f5d3SJohn Marino #define HAVE_NATIVE_mpn_mul_2  1
3886d7f5d3SJohn Marino #endif
3986d7f5d3SJohn Marino 
4086d7f5d3SJohn Marino 
4186d7f5d3SJohn Marino /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
4286d7f5d3SJohn Marino    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
4386d7f5d3SJohn Marino    bsize==2 or >2, but separating that isn't easy because there's shared
4486d7f5d3SJohn Marino    code both before and after (the size calculations and the powers of 2
4586d7f5d3SJohn Marino    handling).
4686d7f5d3SJohn Marino 
4786d7f5d3SJohn Marino    Alternatives:
4886d7f5d3SJohn Marino 
4986d7f5d3SJohn Marino    It would work to just use the mpn_mul powering loop for 1 and 2 limb
5086d7f5d3SJohn Marino    bases, but the current separate loop allows mul_1 and mul_2 to be done
5186d7f5d3SJohn Marino    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
5286d7f5d3SJohn Marino    to allow source==dest when vn==1 or 2 then some pointer twiddling might
5386d7f5d3SJohn Marino    let us get the same effect in one loop.
5486d7f5d3SJohn Marino 
5586d7f5d3SJohn Marino    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
5686d7f5d3SJohn Marino    form the biggest possible power of b that fits, only the biggest power of
5786d7f5d3SJohn Marino    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
5886d7f5d3SJohn Marino    using mp_bases[b].big_base for small b, and thereby get better value
5986d7f5d3SJohn Marino    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
6086d7f5d3SJohn Marino    so would be more complicated than it's worth, and could well end up being
6186d7f5d3SJohn Marino    a slowdown for small e.  For big e on the other hand the algorithm is
6286d7f5d3SJohn Marino    dominated by mpn_sqr so there wouldn't much of a saving.  The current
6386d7f5d3SJohn Marino    code can be viewed as simply doing the first few steps of the powering in
6486d7f5d3SJohn Marino    a single or double limb where possible.
6586d7f5d3SJohn Marino 
6686d7f5d3SJohn Marino    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
6786d7f5d3SJohn Marino    copy made of b is unnecessary.  We could just use the old alloc'ed block
6886d7f5d3SJohn Marino    and free it at the end.  But arranging this seems like a lot more trouble
6986d7f5d3SJohn Marino    than it's worth.  */
7086d7f5d3SJohn Marino 
7186d7f5d3SJohn Marino 
7286d7f5d3SJohn Marino /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
7386d7f5d3SJohn Marino    a limb without overflowing.
7486d7f5d3SJohn Marino    FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
7586d7f5d3SJohn Marino 
7686d7f5d3SJohn Marino #define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
7786d7f5d3SJohn Marino 
7886d7f5d3SJohn Marino 
7986d7f5d3SJohn Marino /* The following are for convenience, they update the size and check the
8086d7f5d3SJohn Marino    alloc.  */
8186d7f5d3SJohn Marino 
8286d7f5d3SJohn Marino #define MPN_SQR(dst, alloc, src, size)          \
8386d7f5d3SJohn Marino   do {                                          \
8486d7f5d3SJohn Marino     ASSERT (2*(size) <= (alloc));               \
8586d7f5d3SJohn Marino     mpn_sqr (dst, src, size);                   \
8686d7f5d3SJohn Marino     (size) *= 2;                                \
8786d7f5d3SJohn Marino     (size) -= ((dst)[(size)-1] == 0);           \
8886d7f5d3SJohn Marino   } while (0)
8986d7f5d3SJohn Marino 
9086d7f5d3SJohn Marino #define MPN_MUL(dst, alloc, src, size, src2, size2)     \
9186d7f5d3SJohn Marino   do {                                                  \
9286d7f5d3SJohn Marino     mp_limb_t  cy;                                      \
9386d7f5d3SJohn Marino     ASSERT ((size) + (size2) <= (alloc));               \
9486d7f5d3SJohn Marino     cy = mpn_mul (dst, src, size, src2, size2);         \
9586d7f5d3SJohn Marino     (size) += (size2) - (cy == 0);                      \
9686d7f5d3SJohn Marino   } while (0)
9786d7f5d3SJohn Marino 
9886d7f5d3SJohn Marino #define MPN_MUL_2(ptr, size, alloc, mult)       \
9986d7f5d3SJohn Marino   do {                                          \
10086d7f5d3SJohn Marino     mp_limb_t  cy;                              \
10186d7f5d3SJohn Marino     ASSERT ((size)+2 <= (alloc));               \
10286d7f5d3SJohn Marino     cy = mpn_mul_2 (ptr, ptr, size, mult);      \
10386d7f5d3SJohn Marino     (size)++;                                   \
10486d7f5d3SJohn Marino     (ptr)[(size)] = cy;                         \
10586d7f5d3SJohn Marino     (size) += (cy != 0);                        \
10686d7f5d3SJohn Marino   } while (0)
10786d7f5d3SJohn Marino 
10886d7f5d3SJohn Marino #define MPN_MUL_1(ptr, size, alloc, limb)       \
10986d7f5d3SJohn Marino   do {                                          \
11086d7f5d3SJohn Marino     mp_limb_t  cy;                              \
11186d7f5d3SJohn Marino     ASSERT ((size)+1 <= (alloc));               \
11286d7f5d3SJohn Marino     cy = mpn_mul_1 (ptr, ptr, size, limb);      \
11386d7f5d3SJohn Marino     (ptr)[size] = cy;                           \
11486d7f5d3SJohn Marino     (size) += (cy != 0);                        \
11586d7f5d3SJohn Marino   } while (0)
11686d7f5d3SJohn Marino 
11786d7f5d3SJohn Marino #define MPN_LSHIFT(ptr, size, alloc, shift)     \
11886d7f5d3SJohn Marino   do {                                          \
11986d7f5d3SJohn Marino     mp_limb_t  cy;                              \
12086d7f5d3SJohn Marino     ASSERT ((size)+1 <= (alloc));               \
12186d7f5d3SJohn Marino     cy = mpn_lshift (ptr, ptr, size, shift);    \
12286d7f5d3SJohn Marino     (ptr)[size] = cy;                           \
12386d7f5d3SJohn Marino     (size) += (cy != 0);                        \
12486d7f5d3SJohn Marino   } while (0)
12586d7f5d3SJohn Marino 
12686d7f5d3SJohn Marino #define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
12786d7f5d3SJohn Marino   do {                                                  \
12886d7f5d3SJohn Marino     if ((shift) == 0)                                   \
12986d7f5d3SJohn Marino       MPN_COPY (dst, src, size);                        \
13086d7f5d3SJohn Marino     else                                                \
13186d7f5d3SJohn Marino       {                                                 \
13286d7f5d3SJohn Marino         mpn_rshift (dst, src, size, shift);             \
13386d7f5d3SJohn Marino         (size) -= ((dst)[(size)-1] == 0);               \
13486d7f5d3SJohn Marino       }                                                 \
13586d7f5d3SJohn Marino   } while (0)
13686d7f5d3SJohn Marino 
13786d7f5d3SJohn Marino 
13886d7f5d3SJohn Marino /* ralloc and talloc are only wanted for ASSERTs, after the initial space
13986d7f5d3SJohn Marino    allocations.  Avoid writing values to them in a normal build, to ensure
14086d7f5d3SJohn Marino    the compiler lets them go dead.  gcc already figures this out itself
14186d7f5d3SJohn Marino    actually.  */
14286d7f5d3SJohn Marino 
14386d7f5d3SJohn Marino #define SWAP_RP_TP                                      \
14486d7f5d3SJohn Marino   do {                                                  \
14586d7f5d3SJohn Marino     MP_PTR_SWAP (rp, tp);                               \
14686d7f5d3SJohn Marino     ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
14786d7f5d3SJohn Marino   } while (0)
14886d7f5d3SJohn Marino 
14986d7f5d3SJohn Marino 
15086d7f5d3SJohn Marino void
mpz_n_pow_ui(mpz_ptr r,mp_srcptr bp,mp_size_t bsize,unsigned long int e)15186d7f5d3SJohn Marino mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
15286d7f5d3SJohn Marino {
15386d7f5d3SJohn Marino   mp_ptr         rp;
15486d7f5d3SJohn Marino   mp_size_t      rtwos_limbs, ralloc, rsize;
15586d7f5d3SJohn Marino   int            rneg, i, cnt, btwos, r_bp_overlap;
15686d7f5d3SJohn Marino   mp_limb_t      blimb, rl;
15786d7f5d3SJohn Marino   mp_bitcnt_t    rtwos_bits;
15886d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
15986d7f5d3SJohn Marino   mp_limb_t      blimb_low, rl_high;
16086d7f5d3SJohn Marino #else
16186d7f5d3SJohn Marino   mp_limb_t      b_twolimbs[2];
16286d7f5d3SJohn Marino #endif
16386d7f5d3SJohn Marino   TMP_DECL;
16486d7f5d3SJohn Marino 
16586d7f5d3SJohn Marino   TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
16686d7f5d3SJohn Marino                  PTR(r), bp, bsize, e, e);
16786d7f5d3SJohn Marino          mpn_trace ("b", bp, bsize));
16886d7f5d3SJohn Marino 
16986d7f5d3SJohn Marino   ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
17086d7f5d3SJohn Marino   ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));
17186d7f5d3SJohn Marino 
17286d7f5d3SJohn Marino   /* b^0 == 1, including 0^0 == 1 */
17386d7f5d3SJohn Marino   if (e == 0)
17486d7f5d3SJohn Marino     {
17586d7f5d3SJohn Marino       PTR(r)[0] = 1;
17686d7f5d3SJohn Marino       SIZ(r) = 1;
17786d7f5d3SJohn Marino       return;
17886d7f5d3SJohn Marino     }
17986d7f5d3SJohn Marino 
18086d7f5d3SJohn Marino   /* 0^e == 0 apart from 0^0 above */
18186d7f5d3SJohn Marino   if (bsize == 0)
18286d7f5d3SJohn Marino     {
18386d7f5d3SJohn Marino       SIZ(r) = 0;
18486d7f5d3SJohn Marino       return;
18586d7f5d3SJohn Marino     }
18686d7f5d3SJohn Marino 
18786d7f5d3SJohn Marino   /* Sign of the final result. */
18886d7f5d3SJohn Marino   rneg = (bsize < 0 && (e & 1) != 0);
18986d7f5d3SJohn Marino   bsize = ABS (bsize);
19086d7f5d3SJohn Marino   TRACE (printf ("rneg %d\n", rneg));
19186d7f5d3SJohn Marino 
19286d7f5d3SJohn Marino   r_bp_overlap = (PTR(r) == bp);
19386d7f5d3SJohn Marino 
19486d7f5d3SJohn Marino   /* Strip low zero limbs from b. */
19586d7f5d3SJohn Marino   rtwos_limbs = 0;
19686d7f5d3SJohn Marino   for (blimb = *bp; blimb == 0; blimb = *++bp)
19786d7f5d3SJohn Marino     {
19886d7f5d3SJohn Marino       rtwos_limbs += e;
19986d7f5d3SJohn Marino       bsize--; ASSERT (bsize >= 1);
20086d7f5d3SJohn Marino     }
20186d7f5d3SJohn Marino   TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino   /* Strip low zero bits from b. */
20486d7f5d3SJohn Marino   count_trailing_zeros (btwos, blimb);
20586d7f5d3SJohn Marino   blimb >>= btwos;
20686d7f5d3SJohn Marino   rtwos_bits = e * btwos;
20786d7f5d3SJohn Marino   rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
20886d7f5d3SJohn Marino   rtwos_bits %= GMP_NUMB_BITS;
20986d7f5d3SJohn Marino   TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
21086d7f5d3SJohn Marino                  btwos, rtwos_limbs, rtwos_bits));
21186d7f5d3SJohn Marino 
21286d7f5d3SJohn Marino   TMP_MARK;
21386d7f5d3SJohn Marino 
21486d7f5d3SJohn Marino   rl = 1;
21586d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
21686d7f5d3SJohn Marino   rl_high = 0;
21786d7f5d3SJohn Marino #endif
21886d7f5d3SJohn Marino 
21986d7f5d3SJohn Marino   if (bsize == 1)
22086d7f5d3SJohn Marino     {
22186d7f5d3SJohn Marino     bsize_1:
22286d7f5d3SJohn Marino       /* Power up as far as possible within blimb.  We start here with e!=0,
22386d7f5d3SJohn Marino          but if e is small then we might reach e==0 and the whole b^e in rl.
22486d7f5d3SJohn Marino          Notice this code works when blimb==1 too, reaching e==0.  */
22586d7f5d3SJohn Marino 
22686d7f5d3SJohn Marino       while (blimb <= GMP_NUMB_HALFMAX)
22786d7f5d3SJohn Marino         {
22886d7f5d3SJohn Marino           TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
22986d7f5d3SJohn Marino                          e, blimb, rl));
23086d7f5d3SJohn Marino           ASSERT (e != 0);
23186d7f5d3SJohn Marino           if ((e & 1) != 0)
23286d7f5d3SJohn Marino             rl *= blimb;
23386d7f5d3SJohn Marino           e >>= 1;
23486d7f5d3SJohn Marino           if (e == 0)
23586d7f5d3SJohn Marino             goto got_rl;
23686d7f5d3SJohn Marino           blimb *= blimb;
23786d7f5d3SJohn Marino         }
23886d7f5d3SJohn Marino 
23986d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
24086d7f5d3SJohn Marino       TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
24186d7f5d3SJohn Marino                      e, blimb, rl));
24286d7f5d3SJohn Marino 
24386d7f5d3SJohn Marino       /* Can power b once more into blimb:blimb_low */
24486d7f5d3SJohn Marino       bsize = 2;
24586d7f5d3SJohn Marino       ASSERT (e != 0);
24686d7f5d3SJohn Marino       if ((e & 1) != 0)
24786d7f5d3SJohn Marino 	{
24886d7f5d3SJohn Marino 	  umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
24986d7f5d3SJohn Marino 	  rl >>= GMP_NAIL_BITS;
25086d7f5d3SJohn Marino 	}
25186d7f5d3SJohn Marino       e >>= 1;
25286d7f5d3SJohn Marino       umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
25386d7f5d3SJohn Marino       blimb_low >>= GMP_NAIL_BITS;
25486d7f5d3SJohn Marino 
25586d7f5d3SJohn Marino     got_rl:
25686d7f5d3SJohn Marino       TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
25786d7f5d3SJohn Marino                      e, blimb, blimb_low, rl_high, rl));
25886d7f5d3SJohn Marino 
25986d7f5d3SJohn Marino       /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
26086d7f5d3SJohn Marino          final mul_1 or mul_2 rather than a separate lshift.
26186d7f5d3SJohn Marino          - rl_high:rl mustn't be 1 (since then there's no final mul)
26286d7f5d3SJohn Marino          - rl_high mustn't overflow
26386d7f5d3SJohn Marino          - rl_high mustn't change to non-zero, since mul_1+lshift is
26486d7f5d3SJohn Marino          probably faster than mul_2 (FIXME: is this true?)  */
26586d7f5d3SJohn Marino 
26686d7f5d3SJohn Marino       if (rtwos_bits != 0
26786d7f5d3SJohn Marino           && ! (rl_high == 0 && rl == 1)
26886d7f5d3SJohn Marino           && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
26986d7f5d3SJohn Marino         {
27086d7f5d3SJohn Marino           mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
27186d7f5d3SJohn Marino             | (rl >> (GMP_NUMB_BITS-rtwos_bits));
27286d7f5d3SJohn Marino           if (! (rl_high == 0 && new_rl_high != 0))
27386d7f5d3SJohn Marino             {
27486d7f5d3SJohn Marino               rl_high = new_rl_high;
27586d7f5d3SJohn Marino               rl <<= rtwos_bits;
27686d7f5d3SJohn Marino               rtwos_bits = 0;
27786d7f5d3SJohn Marino               TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
27886d7f5d3SJohn Marino                              rl_high, rl));
27986d7f5d3SJohn Marino             }
28086d7f5d3SJohn Marino         }
28186d7f5d3SJohn Marino #else
28286d7f5d3SJohn Marino     got_rl:
28386d7f5d3SJohn Marino       TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
28486d7f5d3SJohn Marino                      e, blimb, rl));
28586d7f5d3SJohn Marino 
28686d7f5d3SJohn Marino       /* Combine left-over rtwos_bits into rl to be handled by the final
28786d7f5d3SJohn Marino          mul_1 rather than a separate lshift.
28886d7f5d3SJohn Marino          - rl mustn't be 1 (since then there's no final mul)
28986d7f5d3SJohn Marino          - rl mustn't overflow  */
29086d7f5d3SJohn Marino 
29186d7f5d3SJohn Marino       if (rtwos_bits != 0
29286d7f5d3SJohn Marino           && rl != 1
29386d7f5d3SJohn Marino           && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
29486d7f5d3SJohn Marino         {
29586d7f5d3SJohn Marino           rl <<= rtwos_bits;
29686d7f5d3SJohn Marino           rtwos_bits = 0;
29786d7f5d3SJohn Marino           TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
29886d7f5d3SJohn Marino         }
29986d7f5d3SJohn Marino #endif
30086d7f5d3SJohn Marino     }
30186d7f5d3SJohn Marino   else if (bsize == 2)
30286d7f5d3SJohn Marino     {
30386d7f5d3SJohn Marino       mp_limb_t  bsecond = bp[1];
30486d7f5d3SJohn Marino       if (btwos != 0)
30586d7f5d3SJohn Marino         blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
30686d7f5d3SJohn Marino       bsecond >>= btwos;
30786d7f5d3SJohn Marino       if (bsecond == 0)
30886d7f5d3SJohn Marino         {
30986d7f5d3SJohn Marino           /* Two limbs became one after rshift. */
31086d7f5d3SJohn Marino           bsize = 1;
31186d7f5d3SJohn Marino           goto bsize_1;
31286d7f5d3SJohn Marino         }
31386d7f5d3SJohn Marino 
31486d7f5d3SJohn Marino       TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
31586d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
31686d7f5d3SJohn Marino       blimb_low = blimb;
31786d7f5d3SJohn Marino #else
31886d7f5d3SJohn Marino       bp = b_twolimbs;
31986d7f5d3SJohn Marino       b_twolimbs[0] = blimb;
32086d7f5d3SJohn Marino       b_twolimbs[1] = bsecond;
32186d7f5d3SJohn Marino #endif
32286d7f5d3SJohn Marino       blimb = bsecond;
32386d7f5d3SJohn Marino     }
32486d7f5d3SJohn Marino   else
32586d7f5d3SJohn Marino     {
32686d7f5d3SJohn Marino       if (r_bp_overlap || btwos != 0)
32786d7f5d3SJohn Marino         {
32886d7f5d3SJohn Marino           mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
32986d7f5d3SJohn Marino           MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
33086d7f5d3SJohn Marino           bp = tp;
33186d7f5d3SJohn Marino           TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
33286d7f5d3SJohn Marino         }
33386d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
33486d7f5d3SJohn Marino       /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
33586d7f5d3SJohn Marino       blimb_low = bp[0];
33686d7f5d3SJohn Marino #endif
33786d7f5d3SJohn Marino       blimb = bp[bsize-1];
33886d7f5d3SJohn Marino 
33986d7f5d3SJohn Marino       TRACE (printf ("big bsize=%ld  ", bsize);
34086d7f5d3SJohn Marino              mpn_trace ("b", bp, bsize));
34186d7f5d3SJohn Marino     }
34286d7f5d3SJohn Marino 
34386d7f5d3SJohn Marino   /* At this point blimb is the most significant limb of the base to use.
34486d7f5d3SJohn Marino 
34586d7f5d3SJohn Marino      Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
34686d7f5d3SJohn Marino      limb to round up the division; +1 for multiplies all using an extra
34786d7f5d3SJohn Marino      limb over the true size; +2 for rl at the end; +1 for lshift at the
34886d7f5d3SJohn Marino      end.
34986d7f5d3SJohn Marino 
35086d7f5d3SJohn Marino      The size calculation here is reasonably accurate.  The base is at least
35186d7f5d3SJohn Marino      half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
35286d7f5d3SJohn Marino      when it will power up as just over 16, an overestimate of 17/16 =
35386d7f5d3SJohn Marino      6.25%.  For a 64-bit limb it's half that.
35486d7f5d3SJohn Marino 
35586d7f5d3SJohn Marino      If e==0 then blimb won't be anything useful (though it will be
35686d7f5d3SJohn Marino      non-zero), but that doesn't matter since we just end up with ralloc==5,
35786d7f5d3SJohn Marino      and that's fine for 2 limbs of rl and 1 of lshift.  */
35886d7f5d3SJohn Marino 
35986d7f5d3SJohn Marino   ASSERT (blimb != 0);
36086d7f5d3SJohn Marino   count_leading_zeros (cnt, blimb);
36186d7f5d3SJohn Marino   ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
36286d7f5d3SJohn Marino   TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
36386d7f5d3SJohn Marino                  ralloc, bsize, blimb, cnt));
36486d7f5d3SJohn Marino   MPZ_REALLOC (r, ralloc + rtwos_limbs);
36586d7f5d3SJohn Marino   rp = PTR(r);
36686d7f5d3SJohn Marino 
36786d7f5d3SJohn Marino   /* Low zero limbs resulting from powers of 2. */
36886d7f5d3SJohn Marino   MPN_ZERO (rp, rtwos_limbs);
36986d7f5d3SJohn Marino   rp += rtwos_limbs;
37086d7f5d3SJohn Marino 
37186d7f5d3SJohn Marino   if (e == 0)
37286d7f5d3SJohn Marino     {
37386d7f5d3SJohn Marino       /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
37486d7f5d3SJohn Marino          start. */
37586d7f5d3SJohn Marino       rp[0] = rl;
37686d7f5d3SJohn Marino       rsize = 1;
37786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
37886d7f5d3SJohn Marino       rp[1] = rl_high;
37986d7f5d3SJohn Marino       rsize += (rl_high != 0);
38086d7f5d3SJohn Marino #endif
38186d7f5d3SJohn Marino       ASSERT (rp[rsize-1] != 0);
38286d7f5d3SJohn Marino     }
38386d7f5d3SJohn Marino   else
38486d7f5d3SJohn Marino     {
38586d7f5d3SJohn Marino       mp_ptr     tp;
38686d7f5d3SJohn Marino       mp_size_t  talloc;
38786d7f5d3SJohn Marino 
38886d7f5d3SJohn Marino       /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
38986d7f5d3SJohn Marino          low bit of e is zero, tp only has to hold the second last power
39086d7f5d3SJohn Marino          step, which is half the size of the final result.  There's no need
39186d7f5d3SJohn Marino          to round up the divide by 2, since ralloc includes a +2 for rl
39286d7f5d3SJohn Marino          which not needed by tp.  In the mpn_mul loop when the low bit of e
39386d7f5d3SJohn Marino          is 1, tp must hold nearly the full result, so just size it the same
39486d7f5d3SJohn Marino          as rp.  */
39586d7f5d3SJohn Marino 
39686d7f5d3SJohn Marino       talloc = ralloc;
39786d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
39886d7f5d3SJohn Marino       if (bsize <= 2 || (e & 1) == 0)
39986d7f5d3SJohn Marino         talloc /= 2;
40086d7f5d3SJohn Marino #else
40186d7f5d3SJohn Marino       if (bsize <= 1 || (e & 1) == 0)
40286d7f5d3SJohn Marino         talloc /= 2;
40386d7f5d3SJohn Marino #endif
40486d7f5d3SJohn Marino       TRACE (printf ("talloc %ld\n", talloc));
40586d7f5d3SJohn Marino       tp = TMP_ALLOC_LIMBS (talloc);
40686d7f5d3SJohn Marino 
40786d7f5d3SJohn Marino       /* Go from high to low over the bits of e, starting with i pointing at
40886d7f5d3SJohn Marino          the bit below the highest 1 (which will mean i==-1 if e==1).  */
40986d7f5d3SJohn Marino       count_leading_zeros (cnt, e);
41086d7f5d3SJohn Marino       i = GMP_LIMB_BITS - cnt - 2;
41186d7f5d3SJohn Marino 
41286d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_mul_2
41386d7f5d3SJohn Marino       if (bsize <= 2)
41486d7f5d3SJohn Marino         {
41586d7f5d3SJohn Marino           mp_limb_t  mult[2];
41686d7f5d3SJohn Marino 
41786d7f5d3SJohn Marino           /* Any bsize==1 will have been powered above to be two limbs. */
41886d7f5d3SJohn Marino           ASSERT (bsize == 2);
41986d7f5d3SJohn Marino           ASSERT (blimb != 0);
42086d7f5d3SJohn Marino 
42186d7f5d3SJohn Marino           /* Arrange the final result ends up in r, not in the temp space */
42286d7f5d3SJohn Marino           if ((i & 1) == 0)
42386d7f5d3SJohn Marino             SWAP_RP_TP;
42486d7f5d3SJohn Marino 
42586d7f5d3SJohn Marino           rp[0] = blimb_low;
42686d7f5d3SJohn Marino           rp[1] = blimb;
42786d7f5d3SJohn Marino           rsize = 2;
42886d7f5d3SJohn Marino 
42986d7f5d3SJohn Marino           mult[0] = blimb_low;
43086d7f5d3SJohn Marino           mult[1] = blimb;
43186d7f5d3SJohn Marino 
43286d7f5d3SJohn Marino           for ( ; i >= 0; i--)
43386d7f5d3SJohn Marino             {
43486d7f5d3SJohn Marino               TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
43586d7f5d3SJohn Marino                              i, e, rsize, ralloc, talloc);
43686d7f5d3SJohn Marino                      mpn_trace ("r", rp, rsize));
43786d7f5d3SJohn Marino 
43886d7f5d3SJohn Marino               MPN_SQR (tp, talloc, rp, rsize);
43986d7f5d3SJohn Marino               SWAP_RP_TP;
44086d7f5d3SJohn Marino               if ((e & (1L << i)) != 0)
44186d7f5d3SJohn Marino                 MPN_MUL_2 (rp, rsize, ralloc, mult);
44286d7f5d3SJohn Marino             }
44386d7f5d3SJohn Marino 
44486d7f5d3SJohn Marino           TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
44586d7f5d3SJohn Marino           if (rl_high != 0)
44686d7f5d3SJohn Marino             {
44786d7f5d3SJohn Marino               mult[0] = rl;
44886d7f5d3SJohn Marino               mult[1] = rl_high;
44986d7f5d3SJohn Marino               MPN_MUL_2 (rp, rsize, ralloc, mult);
45086d7f5d3SJohn Marino             }
45186d7f5d3SJohn Marino           else if (rl != 1)
45286d7f5d3SJohn Marino             MPN_MUL_1 (rp, rsize, ralloc, rl);
45386d7f5d3SJohn Marino         }
45486d7f5d3SJohn Marino #else
45586d7f5d3SJohn Marino       if (bsize == 1)
45686d7f5d3SJohn Marino         {
45786d7f5d3SJohn Marino           /* Arrange the final result ends up in r, not in the temp space */
45886d7f5d3SJohn Marino           if ((i & 1) == 0)
45986d7f5d3SJohn Marino             SWAP_RP_TP;
46086d7f5d3SJohn Marino 
46186d7f5d3SJohn Marino           rp[0] = blimb;
46286d7f5d3SJohn Marino           rsize = 1;
46386d7f5d3SJohn Marino 
46486d7f5d3SJohn Marino           for ( ; i >= 0; i--)
46586d7f5d3SJohn Marino             {
46686d7f5d3SJohn Marino               TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
46786d7f5d3SJohn Marino                              i, e, rsize, ralloc, talloc);
46886d7f5d3SJohn Marino                      mpn_trace ("r", rp, rsize));
46986d7f5d3SJohn Marino 
47086d7f5d3SJohn Marino               MPN_SQR (tp, talloc, rp, rsize);
47186d7f5d3SJohn Marino               SWAP_RP_TP;
47286d7f5d3SJohn Marino               if ((e & (1L << i)) != 0)
47386d7f5d3SJohn Marino                 MPN_MUL_1 (rp, rsize, ralloc, blimb);
47486d7f5d3SJohn Marino             }
47586d7f5d3SJohn Marino 
47686d7f5d3SJohn Marino           TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
47786d7f5d3SJohn Marino           if (rl != 1)
47886d7f5d3SJohn Marino             MPN_MUL_1 (rp, rsize, ralloc, rl);
47986d7f5d3SJohn Marino         }
48086d7f5d3SJohn Marino #endif
48186d7f5d3SJohn Marino       else
48286d7f5d3SJohn Marino         {
48386d7f5d3SJohn Marino           int  parity;
48486d7f5d3SJohn Marino 
48586d7f5d3SJohn Marino           /* Arrange the final result ends up in r, not in the temp space */
48686d7f5d3SJohn Marino           ULONG_PARITY (parity, e);
48786d7f5d3SJohn Marino           if (((parity ^ i) & 1) != 0)
48886d7f5d3SJohn Marino             SWAP_RP_TP;
48986d7f5d3SJohn Marino 
49086d7f5d3SJohn Marino           MPN_COPY (rp, bp, bsize);
49186d7f5d3SJohn Marino           rsize = bsize;
49286d7f5d3SJohn Marino 
49386d7f5d3SJohn Marino           for ( ; i >= 0; i--)
49486d7f5d3SJohn Marino             {
49586d7f5d3SJohn Marino               TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
49686d7f5d3SJohn Marino                              i, e, rsize, ralloc, talloc);
49786d7f5d3SJohn Marino                      mpn_trace ("r", rp, rsize));
49886d7f5d3SJohn Marino 
49986d7f5d3SJohn Marino               MPN_SQR (tp, talloc, rp, rsize);
50086d7f5d3SJohn Marino               SWAP_RP_TP;
50186d7f5d3SJohn Marino               if ((e & (1L << i)) != 0)
50286d7f5d3SJohn Marino                 {
50386d7f5d3SJohn Marino                   MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
50486d7f5d3SJohn Marino                   SWAP_RP_TP;
50586d7f5d3SJohn Marino                 }
50686d7f5d3SJohn Marino             }
50786d7f5d3SJohn Marino         }
50886d7f5d3SJohn Marino     }
50986d7f5d3SJohn Marino 
51086d7f5d3SJohn Marino   ASSERT (rp == PTR(r) + rtwos_limbs);
51186d7f5d3SJohn Marino   TRACE (mpn_trace ("end loop r", rp, rsize));
51286d7f5d3SJohn Marino   TMP_FREE;
51386d7f5d3SJohn Marino 
51486d7f5d3SJohn Marino   /* Apply any partial limb factors of 2. */
51586d7f5d3SJohn Marino   if (rtwos_bits != 0)
51686d7f5d3SJohn Marino     {
51786d7f5d3SJohn Marino       MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
51886d7f5d3SJohn Marino       TRACE (mpn_trace ("lshift r", rp, rsize));
51986d7f5d3SJohn Marino     }
52086d7f5d3SJohn Marino 
52186d7f5d3SJohn Marino   rsize += rtwos_limbs;
52286d7f5d3SJohn Marino   SIZ(r) = (rneg ? -rsize : rsize);
52386d7f5d3SJohn Marino }
524