xref: /dflybsd-src/contrib/gmp/mpn/generic/sqrmod_bnm1.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* sqrmod_bnm1.c -- squaring mod B^n-1.
286d7f5d3SJohn Marino 
386d7f5d3SJohn Marino    Contributed to the GNU project by Niels M�ller, Torbjorn Granlund and
486d7f5d3SJohn Marino    Marco Bodrato.
586d7f5d3SJohn Marino 
686d7f5d3SJohn Marino    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
786d7f5d3SJohn Marino    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
886d7f5d3SJohn Marino    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
986d7f5d3SJohn Marino 
1086d7f5d3SJohn Marino Copyright 2009, 2010 Free Software Foundation, Inc.
1186d7f5d3SJohn Marino 
1286d7f5d3SJohn Marino This file is part of the GNU MP Library.
1386d7f5d3SJohn Marino 
1486d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1586d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1686d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1786d7f5d3SJohn Marino option) any later version.
1886d7f5d3SJohn Marino 
1986d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
2086d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
2186d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
2286d7f5d3SJohn Marino License for more details.
2386d7f5d3SJohn Marino 
2486d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2586d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2686d7f5d3SJohn Marino 
2786d7f5d3SJohn Marino 
2886d7f5d3SJohn Marino #include "gmp.h"
2986d7f5d3SJohn Marino #include "gmp-impl.h"
3086d7f5d3SJohn Marino #include "longlong.h"
3186d7f5d3SJohn Marino 
3286d7f5d3SJohn Marino /* Input is {ap,rn}; output is {rp,rn}, computation is
3386d7f5d3SJohn Marino    mod B^rn - 1, and values are semi-normalised; zero is represented
3486d7f5d3SJohn Marino    as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
3586d7f5d3SJohn Marino    tp==rp is allowed. */
3686d7f5d3SJohn Marino static void
mpn_bc_sqrmod_bnm1(mp_ptr rp,mp_srcptr ap,mp_size_t rn,mp_ptr tp)3786d7f5d3SJohn Marino mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
3886d7f5d3SJohn Marino {
3986d7f5d3SJohn Marino   mp_limb_t cy;
4086d7f5d3SJohn Marino 
4186d7f5d3SJohn Marino   ASSERT (0 < rn);
4286d7f5d3SJohn Marino 
4386d7f5d3SJohn Marino   mpn_sqr (tp, ap, rn);
4486d7f5d3SJohn Marino   cy = mpn_add_n (rp, tp, tp + rn, rn);
4586d7f5d3SJohn Marino   /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
4686d7f5d3SJohn Marino    * be no overflow when adding in the carry. */
4786d7f5d3SJohn Marino   MPN_INCR_U (rp, rn, cy);
4886d7f5d3SJohn Marino }
4986d7f5d3SJohn Marino 
5086d7f5d3SJohn Marino 
5186d7f5d3SJohn Marino /* Input is {ap,rn+1}; output is {rp,rn+1}, in
5286d7f5d3SJohn Marino    semi-normalised representation, computation is mod B^rn + 1. Needs
5386d7f5d3SJohn Marino    a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
5486d7f5d3SJohn Marino    Output is normalised. */
5586d7f5d3SJohn Marino static void
mpn_bc_sqrmod_bnp1(mp_ptr rp,mp_srcptr ap,mp_size_t rn,mp_ptr tp)5686d7f5d3SJohn Marino mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
5786d7f5d3SJohn Marino {
5886d7f5d3SJohn Marino   mp_limb_t cy;
5986d7f5d3SJohn Marino 
6086d7f5d3SJohn Marino   ASSERT (0 < rn);
6186d7f5d3SJohn Marino 
6286d7f5d3SJohn Marino   mpn_sqr (tp, ap, rn + 1);
6386d7f5d3SJohn Marino   ASSERT (tp[2*rn+1] == 0);
6486d7f5d3SJohn Marino   ASSERT (tp[2*rn] < GMP_NUMB_MAX);
6586d7f5d3SJohn Marino   cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
6686d7f5d3SJohn Marino   rp[rn] = 0;
6786d7f5d3SJohn Marino   MPN_INCR_U (rp, rn+1, cy );
6886d7f5d3SJohn Marino }
6986d7f5d3SJohn Marino 
7086d7f5d3SJohn Marino 
7186d7f5d3SJohn Marino /* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
7286d7f5d3SJohn Marino  *
7386d7f5d3SJohn Marino  * The result is expected to be ZERO if and only if the operand
7486d7f5d3SJohn Marino  * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
7586d7f5d3SJohn Marino  * B^rn-1.
7686d7f5d3SJohn Marino  * It should not be a problem if sqrmod_bnm1 is used to
7786d7f5d3SJohn Marino  * compute the full square with an <= 2*rn, because this condition
7886d7f5d3SJohn Marino  * implies (B^an-1)^2 < (B^rn-1) .
7986d7f5d3SJohn Marino  *
8086d7f5d3SJohn Marino  * Requires rn/4 < an <= rn
8186d7f5d3SJohn Marino  * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
8286d7f5d3SJohn Marino  *
8386d7f5d3SJohn Marino  * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
8486d7f5d3SJohn Marino  */
8586d7f5d3SJohn Marino void
mpn_sqrmod_bnm1(mp_ptr rp,mp_size_t rn,mp_srcptr ap,mp_size_t an,mp_ptr tp)8686d7f5d3SJohn Marino mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
8786d7f5d3SJohn Marino {
8886d7f5d3SJohn Marino   ASSERT (0 < an);
8986d7f5d3SJohn Marino   ASSERT (an <= rn);
9086d7f5d3SJohn Marino 
9186d7f5d3SJohn Marino   if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
9286d7f5d3SJohn Marino     {
9386d7f5d3SJohn Marino       if (UNLIKELY (an < rn))
9486d7f5d3SJohn Marino 	{
9586d7f5d3SJohn Marino 	  if (UNLIKELY (2*an <= rn))
9686d7f5d3SJohn Marino 	    {
9786d7f5d3SJohn Marino 	      mpn_sqr (rp, ap, an);
9886d7f5d3SJohn Marino 	    }
9986d7f5d3SJohn Marino 	  else
10086d7f5d3SJohn Marino 	    {
10186d7f5d3SJohn Marino 	      mp_limb_t cy;
10286d7f5d3SJohn Marino 	      mpn_sqr (tp, ap, an);
10386d7f5d3SJohn Marino 	      cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
10486d7f5d3SJohn Marino 	      MPN_INCR_U (rp, rn, cy);
10586d7f5d3SJohn Marino 	    }
10686d7f5d3SJohn Marino 	}
10786d7f5d3SJohn Marino       else
10886d7f5d3SJohn Marino 	mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
10986d7f5d3SJohn Marino     }
11086d7f5d3SJohn Marino   else
11186d7f5d3SJohn Marino     {
11286d7f5d3SJohn Marino       mp_size_t n;
11386d7f5d3SJohn Marino       mp_limb_t cy;
11486d7f5d3SJohn Marino       mp_limb_t hi;
11586d7f5d3SJohn Marino 
11686d7f5d3SJohn Marino       n = rn >> 1;
11786d7f5d3SJohn Marino 
11886d7f5d3SJohn Marino       ASSERT (2*an > n);
11986d7f5d3SJohn Marino 
12086d7f5d3SJohn Marino       /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
12186d7f5d3SJohn Marino 	 and crt together as
12286d7f5d3SJohn Marino 
12386d7f5d3SJohn Marino 	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
12486d7f5d3SJohn Marino       */
12586d7f5d3SJohn Marino 
12686d7f5d3SJohn Marino #define a0 ap
12786d7f5d3SJohn Marino #define a1 (ap + n)
12886d7f5d3SJohn Marino 
12986d7f5d3SJohn Marino #define xp  tp	/* 2n + 2 */
13086d7f5d3SJohn Marino       /* am1  maybe in {xp, n} */
13186d7f5d3SJohn Marino #define sp1 (tp + 2*n + 2)
13286d7f5d3SJohn Marino       /* ap1  maybe in {sp1, n + 1} */
13386d7f5d3SJohn Marino 
13486d7f5d3SJohn Marino       {
13586d7f5d3SJohn Marino 	mp_srcptr am1;
13686d7f5d3SJohn Marino 	mp_size_t anm;
13786d7f5d3SJohn Marino 	mp_ptr so;
13886d7f5d3SJohn Marino 
13986d7f5d3SJohn Marino 	if (LIKELY (an > n))
14086d7f5d3SJohn Marino 	  {
14186d7f5d3SJohn Marino 	    so = xp + n;
14286d7f5d3SJohn Marino 	    am1 = xp;
14386d7f5d3SJohn Marino 	    cy = mpn_add (xp, a0, n, a1, an - n);
14486d7f5d3SJohn Marino 	    MPN_INCR_U (xp, n, cy);
14586d7f5d3SJohn Marino 	    anm = n;
14686d7f5d3SJohn Marino 	  }
14786d7f5d3SJohn Marino 	else
14886d7f5d3SJohn Marino 	  {
14986d7f5d3SJohn Marino 	    so = xp;
15086d7f5d3SJohn Marino 	    am1 = a0;
15186d7f5d3SJohn Marino 	    anm = an;
15286d7f5d3SJohn Marino 	  }
15386d7f5d3SJohn Marino 
15486d7f5d3SJohn Marino 	mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
15586d7f5d3SJohn Marino       }
15686d7f5d3SJohn Marino 
15786d7f5d3SJohn Marino       {
15886d7f5d3SJohn Marino 	int       k;
15986d7f5d3SJohn Marino 	mp_srcptr ap1;
16086d7f5d3SJohn Marino 	mp_size_t anp;
16186d7f5d3SJohn Marino 
16286d7f5d3SJohn Marino 	if (LIKELY (an > n)) {
16386d7f5d3SJohn Marino 	  ap1 = sp1;
16486d7f5d3SJohn Marino 	  cy = mpn_sub (sp1, a0, n, a1, an - n);
16586d7f5d3SJohn Marino 	  sp1[n] = 0;
16686d7f5d3SJohn Marino 	  MPN_INCR_U (sp1, n + 1, cy);
16786d7f5d3SJohn Marino 	  anp = n + ap1[n];
16886d7f5d3SJohn Marino 	} else {
16986d7f5d3SJohn Marino 	  ap1 = a0;
17086d7f5d3SJohn Marino 	  anp = an;
17186d7f5d3SJohn Marino 	}
17286d7f5d3SJohn Marino 
17386d7f5d3SJohn Marino 	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
17486d7f5d3SJohn Marino 	  k=0;
17586d7f5d3SJohn Marino 	else
17686d7f5d3SJohn Marino 	  {
17786d7f5d3SJohn Marino 	    int mask;
17886d7f5d3SJohn Marino 	    k = mpn_fft_best_k (n, 1);
17986d7f5d3SJohn Marino 	    mask = (1<<k) -1;
18086d7f5d3SJohn Marino 	    while (n & mask) {k--; mask >>=1;};
18186d7f5d3SJohn Marino 	  }
18286d7f5d3SJohn Marino 	if (k >= FFT_FIRST_K)
18386d7f5d3SJohn Marino 	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
18486d7f5d3SJohn Marino 	else if (UNLIKELY (ap1 == a0))
18586d7f5d3SJohn Marino 	  {
18686d7f5d3SJohn Marino 	    ASSERT (anp <= n);
18786d7f5d3SJohn Marino 	    ASSERT (2*anp > n);
18886d7f5d3SJohn Marino 	    mpn_sqr (xp, a0, an);
18986d7f5d3SJohn Marino 	    anp = 2*an - n;
19086d7f5d3SJohn Marino 	    cy = mpn_sub (xp, xp, n, xp + n, anp);
19186d7f5d3SJohn Marino 	    xp[n] = 0;
19286d7f5d3SJohn Marino 	    MPN_INCR_U (xp, n+1, cy);
19386d7f5d3SJohn Marino 	  }
19486d7f5d3SJohn Marino 	else
19586d7f5d3SJohn Marino 	  mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
19686d7f5d3SJohn Marino       }
19786d7f5d3SJohn Marino 
19886d7f5d3SJohn Marino       /* Here the CRT recomposition begins.
19986d7f5d3SJohn Marino 
20086d7f5d3SJohn Marino 	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
20186d7f5d3SJohn Marino 	 Division by 2 is a bitwise rotation.
20286d7f5d3SJohn Marino 
20386d7f5d3SJohn Marino 	 Assumes xp normalised mod (B^n+1).
20486d7f5d3SJohn Marino 
20586d7f5d3SJohn Marino 	 The residue class [0] is represented by [B^n-1]; except when
20686d7f5d3SJohn Marino 	 both input are ZERO.
20786d7f5d3SJohn Marino       */
20886d7f5d3SJohn Marino 
20986d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
21086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_rsh1add_nc
21186d7f5d3SJohn Marino       cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
21286d7f5d3SJohn Marino       hi = cy << (GMP_NUMB_BITS - 1);
21386d7f5d3SJohn Marino       cy = 0;
21486d7f5d3SJohn Marino       /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
21586d7f5d3SJohn Marino 	 overflows, i.e. a further increment will not overflow again. */
21686d7f5d3SJohn Marino #else /* ! _nc */
21786d7f5d3SJohn Marino       cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
21886d7f5d3SJohn Marino       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
21986d7f5d3SJohn Marino       cy >>= 1;
22086d7f5d3SJohn Marino       /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
22186d7f5d3SJohn Marino 	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
22286d7f5d3SJohn Marino #endif
22386d7f5d3SJohn Marino #if GMP_NAIL_BITS == 0
22486d7f5d3SJohn Marino       add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
22586d7f5d3SJohn Marino #else
22686d7f5d3SJohn Marino       cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
22786d7f5d3SJohn Marino       rp[n-1] ^= hi;
22886d7f5d3SJohn Marino #endif
22986d7f5d3SJohn Marino #else /* ! HAVE_NATIVE_mpn_rsh1add_n */
23086d7f5d3SJohn Marino #if HAVE_NATIVE_mpn_add_nc
23186d7f5d3SJohn Marino       cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
23286d7f5d3SJohn Marino #else /* ! _nc */
23386d7f5d3SJohn Marino       cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
23486d7f5d3SJohn Marino #endif
23586d7f5d3SJohn Marino       cy += (rp[0]&1);
23686d7f5d3SJohn Marino       mpn_rshift(rp, rp, n, 1);
23786d7f5d3SJohn Marino       ASSERT (cy <= 2);
23886d7f5d3SJohn Marino       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
23986d7f5d3SJohn Marino       cy >>= 1;
24086d7f5d3SJohn Marino       /* We can have cy != 0 only if hi = 0... */
24186d7f5d3SJohn Marino       ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
24286d7f5d3SJohn Marino       rp[n-1] |= hi;
24386d7f5d3SJohn Marino       /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
24486d7f5d3SJohn Marino #endif
24586d7f5d3SJohn Marino       ASSERT (cy <= 1);
24686d7f5d3SJohn Marino       /* Next increment can not overflow, read the previous comments about cy. */
24786d7f5d3SJohn Marino       ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
24886d7f5d3SJohn Marino       MPN_INCR_U(rp, n, cy);
24986d7f5d3SJohn Marino 
25086d7f5d3SJohn Marino       /* Compute the highest half:
25186d7f5d3SJohn Marino 	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
25286d7f5d3SJohn Marino        */
25386d7f5d3SJohn Marino       if (UNLIKELY (2*an < rn))
25486d7f5d3SJohn Marino 	{
25586d7f5d3SJohn Marino 	  /* Note that in this case, the only way the result can equal
25686d7f5d3SJohn Marino 	     zero mod B^{rn} - 1 is if the input is zero, and
25786d7f5d3SJohn Marino 	     then the output of both the recursive calls and this CRT
25886d7f5d3SJohn Marino 	     reconstruction is zero, not B^{rn} - 1. */
25986d7f5d3SJohn Marino 	  cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
26086d7f5d3SJohn Marino 
26186d7f5d3SJohn Marino 	  /* FIXME: This subtraction of the high parts is not really
26286d7f5d3SJohn Marino 	     necessary, we do it to get the carry out, and for sanity
26386d7f5d3SJohn Marino 	     checking. */
26486d7f5d3SJohn Marino 	  cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
26586d7f5d3SJohn Marino 				   xp + 2*an - n, rn - 2*an, cy);
26686d7f5d3SJohn Marino 	  ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
26786d7f5d3SJohn Marino 	  cy = mpn_sub_1 (rp, rp, 2*an, cy);
26886d7f5d3SJohn Marino 	  ASSERT (cy == (xp + 2*an - n)[0]);
26986d7f5d3SJohn Marino 	}
27086d7f5d3SJohn Marino       else
27186d7f5d3SJohn Marino 	{
27286d7f5d3SJohn Marino 	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
27386d7f5d3SJohn Marino 	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
27486d7f5d3SJohn Marino 	     DECR will affect _at most_ the lowest n limbs. */
27586d7f5d3SJohn Marino 	  MPN_DECR_U (rp, 2*n, cy);
27686d7f5d3SJohn Marino 	}
27786d7f5d3SJohn Marino #undef a0
27886d7f5d3SJohn Marino #undef a1
27986d7f5d3SJohn Marino #undef xp
28086d7f5d3SJohn Marino #undef sp1
28186d7f5d3SJohn Marino     }
28286d7f5d3SJohn Marino }
28386d7f5d3SJohn Marino 
28486d7f5d3SJohn Marino mp_size_t
mpn_sqrmod_bnm1_next_size(mp_size_t n)28586d7f5d3SJohn Marino mpn_sqrmod_bnm1_next_size (mp_size_t n)
28686d7f5d3SJohn Marino {
28786d7f5d3SJohn Marino   mp_size_t nh;
28886d7f5d3SJohn Marino 
28986d7f5d3SJohn Marino   if (BELOW_THRESHOLD (n,     SQRMOD_BNM1_THRESHOLD))
29086d7f5d3SJohn Marino     return n;
29186d7f5d3SJohn Marino   if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
29286d7f5d3SJohn Marino     return (n + (2-1)) & (-2);
29386d7f5d3SJohn Marino   if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
29486d7f5d3SJohn Marino     return (n + (4-1)) & (-4);
29586d7f5d3SJohn Marino 
29686d7f5d3SJohn Marino   nh = (n + 1) >> 1;
29786d7f5d3SJohn Marino 
29886d7f5d3SJohn Marino   if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD))
29986d7f5d3SJohn Marino     return (n + (8-1)) & (-8);
30086d7f5d3SJohn Marino 
30186d7f5d3SJohn Marino   return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1));
30286d7f5d3SJohn Marino }
303