mpn/generic/diveby3.c

*86d7f5d3SJohn Marino/* mpn_divexact_by3c -- mpn exact division by 3.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn MarinoCopyright 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn MarinoThis file is part of the GNU MP Library.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn MarinoThe GNU MP Library is free software; you can redistribute it and/or modify
*86d7f5d3SJohn Marinoit under the terms of the GNU Lesser General Public License as published by
*86d7f5d3SJohn Marinothe Free Software Foundation; either version 3 of the License, or (at your
*86d7f5d3SJohn Marinooption) any later version.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn MarinoThe GNU MP Library is distributed in the hope that it will be useful, but
*86d7f5d3SJohn MarinoWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*86d7f5d3SJohn Marinoor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
*86d7f5d3SJohn MarinoLicense for more details.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn MarinoYou should have received a copy of the GNU Lesser General Public License
*86d7f5d3SJohn Marinoalong with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#include "gmp.h"
*86d7f5d3SJohn Marino#include "gmp-impl.h"
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#if DIVEXACT_BY3_METHOD == 0
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marinomp_limb_t
*86d7f5d3SJohn Marinompn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
*86d7f5d3SJohn Marino{
*86d7f5d3SJohn Marino  mp_limb_t r;
*86d7f5d3SJohn Marino  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
*86d7f5d3SJohn Marino     We want to return C.  We compute the remainder mod 4 and notice that the
*86d7f5d3SJohn Marino     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
*86d7f5d3SJohn Marino  return r & 3;
*86d7f5d3SJohn Marino}
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#endif
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#if DIVEXACT_BY3_METHOD == 1
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino/* The algorithm here is basically the same as mpn_divexact_1, as described
*86d7f5d3SJohn Marino   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
*86d7f5d3SJohn Marino   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
*86d7f5d3SJohn Marino   high(divisor*q) can be determined with two comparisons instead of a
*86d7f5d3SJohn Marino   multiply.
*86d7f5d3SJohn Marino
   The "c += ..."s add the high limb of 3*q to c.  That high limb will be 0,
   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
   2.95.2 at least).
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   It will be noted that the new c is formed by adding three values each 0
*86d7f5d3SJohn Marino   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
*86d7f5d3SJohn Marino   causes a borrow, that leaves a limb value of either 0xFF...FF or
*86d7f5d3SJohn Marino   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
*86d7f5d3SJohn Marino   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
*86d7f5d3SJohn Marino   respectively, hence a total of no more than 2.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   Alternatives:
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   This implementation has each multiply on the dependent chain, due to
*86d7f5d3SJohn Marino   "l=s-c".  See below for alternative code which avoids that.  */
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marinomp_limb_t
*86d7f5d3SJohn Marinompn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
*86d7f5d3SJohn Marino{
*86d7f5d3SJohn Marino  mp_limb_t  l, q, s;
*86d7f5d3SJohn Marino  mp_size_t  i;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  ASSERT (un >= 1);
*86d7f5d3SJohn Marino  ASSERT (c == 0 || c == 1 || c == 2);
*86d7f5d3SJohn Marino  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  i = 0;
*86d7f5d3SJohn Marino  do
*86d7f5d3SJohn Marino    {
*86d7f5d3SJohn Marino      s = up[i];
*86d7f5d3SJohn Marino      SUBC_LIMB (c, l, s, c);
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
*86d7f5d3SJohn Marino      rp[i] = q;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
*86d7f5d3SJohn Marino      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
*86d7f5d3SJohn Marino    }
*86d7f5d3SJohn Marino  while (++i < un);
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  ASSERT (c == 0 || c == 1 || c == 2);
*86d7f5d3SJohn Marino  return c;
*86d7f5d3SJohn Marino}
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#endif
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#if DIVEXACT_BY3_METHOD == 2
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino/* The following alternative code re-arranges the quotient calculation from
*86d7f5d3SJohn Marino   (src[i]-c)*inverse to instead
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino       q = src[i]*inverse - c*inverse
*86d7f5d3SJohn Marino
   thereby allowing src[i]*inverse to be scheduled back as far as desired,
   making full use of multiplier throughput and leaving just some carry
   handling on the dependent chain.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   The carry handling consists of determining the c for the next iteration.
*86d7f5d3SJohn Marino   This is the same as described above, namely look for any borrow from
*86d7f5d3SJohn Marino   src[i]-c, and at the high of 3*q.
*86d7f5d3SJohn Marino
   high(3*q) is done with two comparisons as above (in c2 and c3).  The
   borrow from src[i]-c is incorporated into those by noting that if there's
   a carry then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
   enough to make high(3*q) come out 1 bigger, as required.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   l = -c*inverse is calculated at the same time as c, since for most chips
*86d7f5d3SJohn Marino   it can be more conveniently derived from separate c1/c2/c3 values than
*86d7f5d3SJohn Marino   from a combined c equal to 0, 1 or 2.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   The net effect is that with good pipelining this loop should be able to
*86d7f5d3SJohn Marino   run at perhaps 4 cycles/limb, depending on available execute resources
*86d7f5d3SJohn Marino   etc.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   Usage:
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   This code is not used by default, since we really can't rely on the
*86d7f5d3SJohn Marino   compiler generating a good software pipeline, nor on such an approach
*86d7f5d3SJohn Marino   even being worthwhile on all CPUs.
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino   Itanium is one chip where this algorithm helps though, see
*86d7f5d3SJohn Marino   mpn/ia64/diveby3.asm.  */
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marinomp_limb_t
*86d7f5d3SJohn Marinompn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
*86d7f5d3SJohn Marino{
*86d7f5d3SJohn Marino  mp_limb_t  s, sm, cl, q, qx, c2, c3;
*86d7f5d3SJohn Marino  mp_size_t  i;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  ASSERT (un >= 1);
*86d7f5d3SJohn Marino  ASSERT (cy == 0 || cy == 1 || cy == 2);
*86d7f5d3SJohn Marino  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  for (i = 0; i < un; i++)
*86d7f5d3SJohn Marino    {
*86d7f5d3SJohn Marino      s = up[i];
*86d7f5d3SJohn Marino      sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino      q = (cl + sm) & GMP_NUMB_MASK;
*86d7f5d3SJohn Marino      rp[i] = q;
*86d7f5d3SJohn Marino      qx = q + (s < cy);
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino      c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
*86d7f5d3SJohn Marino      c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino      cy = c2 + c3;
*86d7f5d3SJohn Marino      cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
*86d7f5d3SJohn Marino    }
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino  return cy;
*86d7f5d3SJohn Marino}
*86d7f5d3SJohn Marino
*86d7f5d3SJohn Marino#endif