mpn/generic/mode1o.c

86d7f5d3SJohn Marino/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
86d7f5d3SJohn Marino   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
86d7f5d3SJohn Marino   FUTURE GNU MP RELEASES.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoCopyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThis file is part of the GNU MP Library.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThe GNU MP Library is free software; you can redistribute it and/or modify
86d7f5d3SJohn Marinoit under the terms of the GNU Lesser General Public License as published by
86d7f5d3SJohn Marinothe Free Software Foundation; either version 3 of the License, or (at your
86d7f5d3SJohn Marinooption) any later version.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThe GNU MP Library is distributed in the hope that it will be useful, but
86d7f5d3SJohn MarinoWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
86d7f5d3SJohn Marinoor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
86d7f5d3SJohn MarinoLicense for more details.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoYou should have received a copy of the GNU Lesser General Public License
86d7f5d3SJohn Marinoalong with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#include "gmp.h"
86d7f5d3SJohn Marino#include "gmp-impl.h"
86d7f5d3SJohn Marino#include "longlong.h"
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino/* Calculate an r satisfying
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino           r*B^k + a - c == q*d
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
86d7f5d3SJohn Marino   (the caller won't know which), and q is the quotient (discarded).  d must
86d7f5d3SJohn Marino   be odd, c can be any limb value.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   This slightly strange function suits the initial Nx1 reduction for GCDs
86d7f5d3SJohn Marino   or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
86d7f5d3SJohn Marino   -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
86d7f5d3SJohn Marino   ignored, or for the Jacobi symbol it can be accounted for.  The function
86d7f5d3SJohn Marino   also suits divisibility and congruence testing since if r=0 (or r=d) is
86d7f5d3SJohn Marino   obtained then a==c mod d.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   r is a bit like the remainder returned by mpn_divexact_by3c, and is the
86d7f5d3SJohn Marino   sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
86d7f5d3SJohn Marino   represents a borrow, since effectively quotient limbs are chosen so that
86d7f5d3SJohn Marino   subtracting that multiple of d from src at each step will produce a zero
86d7f5d3SJohn Marino   limb.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   A long calculation can be done piece by piece from low to high by passing
86d7f5d3SJohn Marino   the return value from one part as the carry parameter to the next part.
86d7f5d3SJohn Marino   The effective final k becomes anything between size and size-n, if n
86d7f5d3SJohn Marino   pieces are used.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   A similar sort of routine could be constructed based on adding multiples
86d7f5d3SJohn Marino   of d at each limb, much like redc in mpz_powm does.  Subtracting however
86d7f5d3SJohn Marino   has a small advantage that when subtracting to cancel out l there's never
86d7f5d3SJohn Marino   a borrow into h, whereas using an addition would put a carry into h
86d7f5d3SJohn Marino   depending whether l==0 or l!=0.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   In terms of efficiency, this function is similar to a mul-by-inverse
86d7f5d3SJohn Marino   mpn_mod_1.  Both are essentially two multiplies and are best suited to
86d7f5d3SJohn Marino   CPUs with low latency multipliers (in comparison to a divide instruction
86d7f5d3SJohn Marino   at least.)  But modexact has a few less supplementary operations, only
86d7f5d3SJohn Marino   needs low part and high part multiplies, and has fewer working quantities
86d7f5d3SJohn Marino   (helping CPUs with few registers).
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   In the main loop it will be noted that the new carry (call it r) is the
86d7f5d3SJohn Marino   sum of the high product h and any borrow from l=s-c.  If c<d then we will
86d7f5d3SJohn Marino   have r<d too, for the following reasons.  Let q=l*inverse be the quotient
86d7f5d3SJohn Marino   limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
86d7f5d3SJohn Marino   never have h=d-1 and so r=h+borrow <= d-1.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   When c>=d, on the other hand, h=d-1 can certainly occur together with a
86d7f5d3SJohn Marino   borrow, thereby giving only r<=d, as per the function definition above.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   As a design decision it's left to the caller to check for r=d if it might
86d7f5d3SJohn Marino   be passing c>=d.  Several applications have c<d initially so the extra
86d7f5d3SJohn Marino   test is often unnecessary, for example the GCDs or a plain divisibility
86d7f5d3SJohn Marino   d|a test will pass c=0.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   The special case for size==1 is so that it can be assumed c<=d in the
86d7f5d3SJohn Marino   high<=divisor test at the end.  c<=d is only guaranteed after at least
86d7f5d3SJohn Marino   one iteration of the main loop.  There's also a decent chance one % is
86d7f5d3SJohn Marino   faster than a binvert_limb, though that will depend on the processor.
86d7f5d3SJohn Marino
   A CPU specific implementation might want to omit the size==1 code or the
   high<=divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
   useful.  */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marinomp_limb_t
86d7f5d3SJohn Marinompn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
86d7f5d3SJohn Marino                     mp_limb_t orig_c)
86d7f5d3SJohn Marino{
86d7f5d3SJohn Marino  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
86d7f5d3SJohn Marino  mp_limb_t  c = orig_c;
86d7f5d3SJohn Marino  mp_size_t  i;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  ASSERT (size >= 1);
86d7f5d3SJohn Marino  ASSERT (d & 1);
86d7f5d3SJohn Marino  ASSERT_MPN (src, size);
86d7f5d3SJohn Marino  ASSERT_LIMB (d);
86d7f5d3SJohn Marino  ASSERT_LIMB (c);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  if (size == 1)
86d7f5d3SJohn Marino    {
86d7f5d3SJohn Marino      s = src[0];
86d7f5d3SJohn Marino      if (s > c)
86d7f5d3SJohn Marino	{
86d7f5d3SJohn Marino	  l = s-c;
86d7f5d3SJohn Marino	  h = l % d;
86d7f5d3SJohn Marino	  if (h != 0)
86d7f5d3SJohn Marino	    h = d - h;
86d7f5d3SJohn Marino	}
86d7f5d3SJohn Marino      else
86d7f5d3SJohn Marino	{
86d7f5d3SJohn Marino	  l = c-s;
86d7f5d3SJohn Marino	  h = l % d;
86d7f5d3SJohn Marino	}
86d7f5d3SJohn Marino      return h;
86d7f5d3SJohn Marino    }
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  binvert_limb (inverse, d);
86d7f5d3SJohn Marino  dmul = d << GMP_NAIL_BITS;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  i = 0;
86d7f5d3SJohn Marino  do
86d7f5d3SJohn Marino    {
86d7f5d3SJohn Marino      s = src[i];
86d7f5d3SJohn Marino      SUBC_LIMB (c, l, s, c);
86d7f5d3SJohn Marino      l = (l * inverse) & GMP_NUMB_MASK;
86d7f5d3SJohn Marino      umul_ppmm (h, dummy, l, dmul);
86d7f5d3SJohn Marino      c += h;
86d7f5d3SJohn Marino    }
86d7f5d3SJohn Marino  while (++i < size-1);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  s = src[i];
86d7f5d3SJohn Marino  if (s <= d)
86d7f5d3SJohn Marino    {
86d7f5d3SJohn Marino      /* With high<=d the final step can be a subtract and addback.  If c==0
86d7f5d3SJohn Marino	 then the addback will restore to l>=0.  If c==d then will get l==d
86d7f5d3SJohn Marino	 if s==0, but that's ok per the function definition.  */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      l = c - s;
86d7f5d3SJohn Marino      if (c < s)
86d7f5d3SJohn Marino	l += d;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      ret = l;
86d7f5d3SJohn Marino    }
86d7f5d3SJohn Marino  else
86d7f5d3SJohn Marino    {
86d7f5d3SJohn Marino      /* Can't skip a divide, just do the loop code once more. */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      SUBC_LIMB (c, l, s, c);
86d7f5d3SJohn Marino      l = (l * inverse) & GMP_NUMB_MASK;
86d7f5d3SJohn Marino      umul_ppmm (h, dummy, l, dmul);
86d7f5d3SJohn Marino      c += h;
86d7f5d3SJohn Marino      ret = c;
86d7f5d3SJohn Marino    }
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  ASSERT (orig_c < d ? ret < d : ret <= d);
86d7f5d3SJohn Marino  return ret;
86d7f5d3SJohn Marino}
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#if 0
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino/* The following is an alternate form that might shave one cycle on a
86d7f5d3SJohn Marino   superscalar processor since it takes c+=h off the dependent chain,
86d7f5d3SJohn Marino   leaving just a low product, high product, and a subtract.
86d7f5d3SJohn Marino
   This is for CPU specific implementations to consider.  A special case for
   high<=divisor and/or size==1 can be added if desired.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   Notice that c is only ever 0 or 1, since if s-c produces a borrow then
86d7f5d3SJohn Marino   x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
86d7f5d3SJohn Marino   c=(x==0xFF..FF) too, if that helped.  */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marinomp_limb_t
86d7f5d3SJohn Marinompn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
86d7f5d3SJohn Marino{
86d7f5d3SJohn Marino  mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
86d7f5d3SJohn Marino  mp_limb_t  c = 0;
86d7f5d3SJohn Marino  mp_size_t  i;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  ASSERT (size >= 1);
86d7f5d3SJohn Marino  ASSERT (d & 1);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  binvert_limb (inverse, d);
86d7f5d3SJohn Marino  dmul = d << GMP_NAIL_BITS;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  for (i = 0; i < size; i++)
86d7f5d3SJohn Marino    {
86d7f5d3SJohn Marino      ASSERT (c==0 || c==1);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      s = src[i];
86d7f5d3SJohn Marino      SUBC_LIMB (c1, x, s, c);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      SUBC_LIMB (c2, y, x, h);
86d7f5d3SJohn Marino      c = c1 + c2;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino      y = (y * inverse) & GMP_NUMB_MASK;
86d7f5d3SJohn Marino      umul_ppmm (h, dummy, y, dmul);
86d7f5d3SJohn Marino    }
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  h += c;
86d7f5d3SJohn Marino  return h;
86d7f5d3SJohn Marino}
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#endif