186d7f5d3SJohn Marino /* sqrmod_bnm1.c -- squaring mod B^n-1.
286d7f5d3SJohn Marino
386d7f5d3SJohn Marino Contributed to the GNU project by Niels M�ller, Torbjorn Granlund and
486d7f5d3SJohn Marino Marco Bodrato.
586d7f5d3SJohn Marino
686d7f5d3SJohn Marino THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
786d7f5d3SJohn Marino SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
886d7f5d3SJohn Marino GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
986d7f5d3SJohn Marino
1086d7f5d3SJohn Marino Copyright 2009, 2010 Free Software Foundation, Inc.
1186d7f5d3SJohn Marino
1286d7f5d3SJohn Marino This file is part of the GNU MP Library.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1586d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1686d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1786d7f5d3SJohn Marino option) any later version.
1886d7f5d3SJohn Marino
1986d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2086d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2186d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
2286d7f5d3SJohn Marino License for more details.
2386d7f5d3SJohn Marino
2486d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2586d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino #include "longlong.h"
3186d7f5d3SJohn Marino
3286d7f5d3SJohn Marino /* Input is {ap,rn}; output is {rp,rn}, computation is
3386d7f5d3SJohn Marino mod B^rn - 1, and values are semi-normalised; zero is represented
3486d7f5d3SJohn Marino as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp.
3586d7f5d3SJohn Marino tp==rp is allowed. */
3686d7f5d3SJohn Marino static void
mpn_bc_sqrmod_bnm1(mp_ptr rp,mp_srcptr ap,mp_size_t rn,mp_ptr tp)3786d7f5d3SJohn Marino mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
3886d7f5d3SJohn Marino {
3986d7f5d3SJohn Marino mp_limb_t cy;
4086d7f5d3SJohn Marino
4186d7f5d3SJohn Marino ASSERT (0 < rn);
4286d7f5d3SJohn Marino
4386d7f5d3SJohn Marino mpn_sqr (tp, ap, rn);
4486d7f5d3SJohn Marino cy = mpn_add_n (rp, tp, tp + rn, rn);
4586d7f5d3SJohn Marino /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
4686d7f5d3SJohn Marino * be no overflow when adding in the carry. */
4786d7f5d3SJohn Marino MPN_INCR_U (rp, rn, cy);
4886d7f5d3SJohn Marino }
4986d7f5d3SJohn Marino
5086d7f5d3SJohn Marino
5186d7f5d3SJohn Marino /* Input is {ap,rn+1}; output is {rp,rn+1}, in
5286d7f5d3SJohn Marino semi-normalised representation, computation is mod B^rn + 1. Needs
5386d7f5d3SJohn Marino a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
5486d7f5d3SJohn Marino Output is normalised. */
5586d7f5d3SJohn Marino static void
mpn_bc_sqrmod_bnp1(mp_ptr rp,mp_srcptr ap,mp_size_t rn,mp_ptr tp)5686d7f5d3SJohn Marino mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
5786d7f5d3SJohn Marino {
5886d7f5d3SJohn Marino mp_limb_t cy;
5986d7f5d3SJohn Marino
6086d7f5d3SJohn Marino ASSERT (0 < rn);
6186d7f5d3SJohn Marino
6286d7f5d3SJohn Marino mpn_sqr (tp, ap, rn + 1);
6386d7f5d3SJohn Marino ASSERT (tp[2*rn+1] == 0);
6486d7f5d3SJohn Marino ASSERT (tp[2*rn] < GMP_NUMB_MAX);
6586d7f5d3SJohn Marino cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
6686d7f5d3SJohn Marino rp[rn] = 0;
6786d7f5d3SJohn Marino MPN_INCR_U (rp, rn+1, cy );
6886d7f5d3SJohn Marino }
6986d7f5d3SJohn Marino
7086d7f5d3SJohn Marino
7186d7f5d3SJohn Marino /* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
7286d7f5d3SJohn Marino *
7386d7f5d3SJohn Marino * The result is expected to be ZERO if and only if the operand
7486d7f5d3SJohn Marino * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
7586d7f5d3SJohn Marino * B^rn-1.
7686d7f5d3SJohn Marino * It should not be a problem if sqrmod_bnm1 is used to
7786d7f5d3SJohn Marino * compute the full square with an <= 2*rn, because this condition
7886d7f5d3SJohn Marino * implies (B^an-1)^2 < (B^rn-1) .
7986d7f5d3SJohn Marino *
8086d7f5d3SJohn Marino * Requires rn/4 < an <= rn
8186d7f5d3SJohn Marino * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
8286d7f5d3SJohn Marino *
8386d7f5d3SJohn Marino * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
8486d7f5d3SJohn Marino */
8586d7f5d3SJohn Marino void
mpn_sqrmod_bnm1(mp_ptr rp,mp_size_t rn,mp_srcptr ap,mp_size_t an,mp_ptr tp)8686d7f5d3SJohn Marino mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
8786d7f5d3SJohn Marino {
8886d7f5d3SJohn Marino ASSERT (0 < an);
8986d7f5d3SJohn Marino ASSERT (an <= rn);
9086d7f5d3SJohn Marino
9186d7f5d3SJohn Marino if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
9286d7f5d3SJohn Marino {
9386d7f5d3SJohn Marino if (UNLIKELY (an < rn))
9486d7f5d3SJohn Marino {
9586d7f5d3SJohn Marino if (UNLIKELY (2*an <= rn))
9686d7f5d3SJohn Marino {
9786d7f5d3SJohn Marino mpn_sqr (rp, ap, an);
9886d7f5d3SJohn Marino }
9986d7f5d3SJohn Marino else
10086d7f5d3SJohn Marino {
10186d7f5d3SJohn Marino mp_limb_t cy;
10286d7f5d3SJohn Marino mpn_sqr (tp, ap, an);
10386d7f5d3SJohn Marino cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
10486d7f5d3SJohn Marino MPN_INCR_U (rp, rn, cy);
10586d7f5d3SJohn Marino }
10686d7f5d3SJohn Marino }
10786d7f5d3SJohn Marino else
10886d7f5d3SJohn Marino mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
10986d7f5d3SJohn Marino }
11086d7f5d3SJohn Marino else
11186d7f5d3SJohn Marino {
11286d7f5d3SJohn Marino mp_size_t n;
11386d7f5d3SJohn Marino mp_limb_t cy;
11486d7f5d3SJohn Marino mp_limb_t hi;
11586d7f5d3SJohn Marino
11686d7f5d3SJohn Marino n = rn >> 1;
11786d7f5d3SJohn Marino
11886d7f5d3SJohn Marino ASSERT (2*an > n);
11986d7f5d3SJohn Marino
12086d7f5d3SJohn Marino /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
12186d7f5d3SJohn Marino and crt together as
12286d7f5d3SJohn Marino
12386d7f5d3SJohn Marino x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
12486d7f5d3SJohn Marino */
12586d7f5d3SJohn Marino
12686d7f5d3SJohn Marino #define a0 ap
12786d7f5d3SJohn Marino #define a1 (ap + n)
12886d7f5d3SJohn Marino
12986d7f5d3SJohn Marino #define xp tp /* 2n + 2 */
13086d7f5d3SJohn Marino /* am1 maybe in {xp, n} */
13186d7f5d3SJohn Marino #define sp1 (tp + 2*n + 2)
13286d7f5d3SJohn Marino /* ap1 maybe in {sp1, n + 1} */
13386d7f5d3SJohn Marino
13486d7f5d3SJohn Marino {
13586d7f5d3SJohn Marino mp_srcptr am1;
13686d7f5d3SJohn Marino mp_size_t anm;
13786d7f5d3SJohn Marino mp_ptr so;
13886d7f5d3SJohn Marino
13986d7f5d3SJohn Marino if (LIKELY (an > n))
14086d7f5d3SJohn Marino {
14186d7f5d3SJohn Marino so = xp + n;
14286d7f5d3SJohn Marino am1 = xp;
14386d7f5d3SJohn Marino cy = mpn_add (xp, a0, n, a1, an - n);
14486d7f5d3SJohn Marino MPN_INCR_U (xp, n, cy);
14586d7f5d3SJohn Marino anm = n;
14686d7f5d3SJohn Marino }
14786d7f5d3SJohn Marino else
14886d7f5d3SJohn Marino {
14986d7f5d3SJohn Marino so = xp;
15086d7f5d3SJohn Marino am1 = a0;
15186d7f5d3SJohn Marino anm = an;
15286d7f5d3SJohn Marino }
15386d7f5d3SJohn Marino
15486d7f5d3SJohn Marino mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
15586d7f5d3SJohn Marino }
15686d7f5d3SJohn Marino
15786d7f5d3SJohn Marino {
15886d7f5d3SJohn Marino int k;
15986d7f5d3SJohn Marino mp_srcptr ap1;
16086d7f5d3SJohn Marino mp_size_t anp;
16186d7f5d3SJohn Marino
16286d7f5d3SJohn Marino if (LIKELY (an > n)) {
16386d7f5d3SJohn Marino ap1 = sp1;
16486d7f5d3SJohn Marino cy = mpn_sub (sp1, a0, n, a1, an - n);
16586d7f5d3SJohn Marino sp1[n] = 0;
16686d7f5d3SJohn Marino MPN_INCR_U (sp1, n + 1, cy);
16786d7f5d3SJohn Marino anp = n + ap1[n];
16886d7f5d3SJohn Marino } else {
16986d7f5d3SJohn Marino ap1 = a0;
17086d7f5d3SJohn Marino anp = an;
17186d7f5d3SJohn Marino }
17286d7f5d3SJohn Marino
17386d7f5d3SJohn Marino if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
17486d7f5d3SJohn Marino k=0;
17586d7f5d3SJohn Marino else
17686d7f5d3SJohn Marino {
17786d7f5d3SJohn Marino int mask;
17886d7f5d3SJohn Marino k = mpn_fft_best_k (n, 1);
17986d7f5d3SJohn Marino mask = (1<<k) -1;
18086d7f5d3SJohn Marino while (n & mask) {k--; mask >>=1;};
18186d7f5d3SJohn Marino }
18286d7f5d3SJohn Marino if (k >= FFT_FIRST_K)
18386d7f5d3SJohn Marino xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
18486d7f5d3SJohn Marino else if (UNLIKELY (ap1 == a0))
18586d7f5d3SJohn Marino {
18686d7f5d3SJohn Marino ASSERT (anp <= n);
18786d7f5d3SJohn Marino ASSERT (2*anp > n);
18886d7f5d3SJohn Marino mpn_sqr (xp, a0, an);
18986d7f5d3SJohn Marino anp = 2*an - n;
19086d7f5d3SJohn Marino cy = mpn_sub (xp, xp, n, xp + n, anp);
19186d7f5d3SJohn Marino xp[n] = 0;
19286d7f5d3SJohn Marino MPN_INCR_U (xp, n+1, cy);
19386d7f5d3SJohn Marino }
19486d7f5d3SJohn Marino else
19586d7f5d3SJohn Marino mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
19686d7f5d3SJohn Marino }
19786d7f5d3SJohn Marino
19886d7f5d3SJohn Marino /* Here the CRT recomposition begins.
19986d7f5d3SJohn Marino
20086d7f5d3SJohn Marino xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
20186d7f5d3SJohn Marino Division by 2 is a bitwise rotation.
20286d7f5d3SJohn Marino
20386d7f5d3SJohn Marino Assumes xp normalised mod (B^n+1).
20486d7f5d3SJohn Marino
20586d7f5d3SJohn Marino The residue class [0] is represented by [B^n-1]; except when
20686d7f5d3SJohn Marino both input are ZERO.
20786d7f5d3SJohn Marino */
20886d7f5d3SJohn Marino
20986d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
21086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_nc
21186d7f5d3SJohn Marino cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
21286d7f5d3SJohn Marino hi = cy << (GMP_NUMB_BITS - 1);
21386d7f5d3SJohn Marino cy = 0;
21486d7f5d3SJohn Marino /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
21586d7f5d3SJohn Marino overflows, i.e. a further increment will not overflow again. */
21686d7f5d3SJohn Marino #else /* ! _nc */
21786d7f5d3SJohn Marino cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
21886d7f5d3SJohn Marino hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
21986d7f5d3SJohn Marino cy >>= 1;
22086d7f5d3SJohn Marino /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
22186d7f5d3SJohn Marino the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
22286d7f5d3SJohn Marino #endif
22386d7f5d3SJohn Marino #if GMP_NAIL_BITS == 0
22486d7f5d3SJohn Marino add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
22586d7f5d3SJohn Marino #else
22686d7f5d3SJohn Marino cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
22786d7f5d3SJohn Marino rp[n-1] ^= hi;
22886d7f5d3SJohn Marino #endif
22986d7f5d3SJohn Marino #else /* ! HAVE_NATIVE_mpn_rsh1add_n */
23086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_nc
23186d7f5d3SJohn Marino cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
23286d7f5d3SJohn Marino #else /* ! _nc */
23386d7f5d3SJohn Marino cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
23486d7f5d3SJohn Marino #endif
23586d7f5d3SJohn Marino cy += (rp[0]&1);
23686d7f5d3SJohn Marino mpn_rshift(rp, rp, n, 1);
23786d7f5d3SJohn Marino ASSERT (cy <= 2);
23886d7f5d3SJohn Marino hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
23986d7f5d3SJohn Marino cy >>= 1;
24086d7f5d3SJohn Marino /* We can have cy != 0 only if hi = 0... */
24186d7f5d3SJohn Marino ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
24286d7f5d3SJohn Marino rp[n-1] |= hi;
24386d7f5d3SJohn Marino /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
24486d7f5d3SJohn Marino #endif
24586d7f5d3SJohn Marino ASSERT (cy <= 1);
24686d7f5d3SJohn Marino /* Next increment can not overflow, read the previous comments about cy. */
24786d7f5d3SJohn Marino ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
24886d7f5d3SJohn Marino MPN_INCR_U(rp, n, cy);
24986d7f5d3SJohn Marino
25086d7f5d3SJohn Marino /* Compute the highest half:
25186d7f5d3SJohn Marino ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
25286d7f5d3SJohn Marino */
25386d7f5d3SJohn Marino if (UNLIKELY (2*an < rn))
25486d7f5d3SJohn Marino {
25586d7f5d3SJohn Marino /* Note that in this case, the only way the result can equal
25686d7f5d3SJohn Marino zero mod B^{rn} - 1 is if the input is zero, and
25786d7f5d3SJohn Marino then the output of both the recursive calls and this CRT
25886d7f5d3SJohn Marino reconstruction is zero, not B^{rn} - 1. */
25986d7f5d3SJohn Marino cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
26086d7f5d3SJohn Marino
26186d7f5d3SJohn Marino /* FIXME: This subtraction of the high parts is not really
26286d7f5d3SJohn Marino necessary, we do it to get the carry out, and for sanity
26386d7f5d3SJohn Marino checking. */
26486d7f5d3SJohn Marino cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
26586d7f5d3SJohn Marino xp + 2*an - n, rn - 2*an, cy);
26686d7f5d3SJohn Marino ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
26786d7f5d3SJohn Marino cy = mpn_sub_1 (rp, rp, 2*an, cy);
26886d7f5d3SJohn Marino ASSERT (cy == (xp + 2*an - n)[0]);
26986d7f5d3SJohn Marino }
27086d7f5d3SJohn Marino else
27186d7f5d3SJohn Marino {
27286d7f5d3SJohn Marino cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
27386d7f5d3SJohn Marino /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
27486d7f5d3SJohn Marino DECR will affect _at most_ the lowest n limbs. */
27586d7f5d3SJohn Marino MPN_DECR_U (rp, 2*n, cy);
27686d7f5d3SJohn Marino }
27786d7f5d3SJohn Marino #undef a0
27886d7f5d3SJohn Marino #undef a1
27986d7f5d3SJohn Marino #undef xp
28086d7f5d3SJohn Marino #undef sp1
28186d7f5d3SJohn Marino }
28286d7f5d3SJohn Marino }
28386d7f5d3SJohn Marino
28486d7f5d3SJohn Marino mp_size_t
mpn_sqrmod_bnm1_next_size(mp_size_t n)28586d7f5d3SJohn Marino mpn_sqrmod_bnm1_next_size (mp_size_t n)
28686d7f5d3SJohn Marino {
28786d7f5d3SJohn Marino mp_size_t nh;
28886d7f5d3SJohn Marino
28986d7f5d3SJohn Marino if (BELOW_THRESHOLD (n, SQRMOD_BNM1_THRESHOLD))
29086d7f5d3SJohn Marino return n;
29186d7f5d3SJohn Marino if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
29286d7f5d3SJohn Marino return (n + (2-1)) & (-2);
29386d7f5d3SJohn Marino if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
29486d7f5d3SJohn Marino return (n + (4-1)) & (-4);
29586d7f5d3SJohn Marino
29686d7f5d3SJohn Marino nh = (n + 1) >> 1;
29786d7f5d3SJohn Marino
29886d7f5d3SJohn Marino if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD))
29986d7f5d3SJohn Marino return (n + (8-1)) & (-8);
30086d7f5d3SJohn Marino
30186d7f5d3SJohn Marino return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1));
30286d7f5d3SJohn Marino }
303