mpn/generic/perfsqr.c

86d7f5d3SJohn Marino/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
86d7f5d3SJohn Marino   zero otherwise.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoCopyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software
86d7f5d3SJohn MarinoFoundation, Inc.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThis file is part of the GNU MP Library.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThe GNU MP Library is free software; you can redistribute it and/or modify
86d7f5d3SJohn Marinoit under the terms of the GNU Lesser General Public License as published by
86d7f5d3SJohn Marinothe Free Software Foundation; either version 3 of the License, or (at your
86d7f5d3SJohn Marinooption) any later version.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoThe GNU MP Library is distributed in the hope that it will be useful, but
86d7f5d3SJohn MarinoWITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
86d7f5d3SJohn Marinoor FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
86d7f5d3SJohn MarinoLicense for more details.
86d7f5d3SJohn Marino
86d7f5d3SJohn MarinoYou should have received a copy of the GNU Lesser General Public License
86d7f5d3SJohn Marinoalong with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#include <stdio.h> /* for NULL */
86d7f5d3SJohn Marino#include "gmp.h"
86d7f5d3SJohn Marino#include "gmp-impl.h"
86d7f5d3SJohn Marino#include "longlong.h"
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#include "perfsqr.h"
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
/* Diagnostic tracing hook.  By default TRACE(x) expands to nothing, so the
   printf and gmp_printf calls wrapped in it throughout this file are
   compiled out.  Change this to "#define TRACE(x) x" for diagnostics.  */
#define TRACE(x)
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino/* PERFSQR_MOD_* detects non-squares using residue tests.
86d7f5d3SJohn Marino
   A macro PERFSQR_MOD_TEST is set up by gen-psqr.c in perfsqr.h.  It takes
   {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
   64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
   or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
   used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
   case too.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
86d7f5d3SJohn Marino   PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
86d7f5d3SJohn Marino   data indicating residues and non-residues modulo those divisors.  The
86d7f5d3SJohn Marino   table data is in 1 or 2 limbs worth of bits respectively, per the size of
86d7f5d3SJohn Marino   each d.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   A "modexact" style remainder is taken to reduce r modulo d.
86d7f5d3SJohn Marino   PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
86d7f5d3SJohn Marino   the table data.  Notice there's just one multiplication by a constant
86d7f5d3SJohn Marino   "inv", for each d.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   The modexact doesn't produce a true r%d remainder, instead idx satisfies
86d7f5d3SJohn Marino   "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
86d7f5d3SJohn Marino   -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
86d7f5d3SJohn Marino   accounted for by having the table data suitably permuted.
86d7f5d3SJohn Marino
   The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
   In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
   each divisor d, meaning the modexact multiply can take place entirely
   within one limb, giving the compiler the chance to optimize it, in a way
   that say umul_ppmm would not give.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   There's no need for the divisors d to be prime, in fact gen-psqr.c makes
86d7f5d3SJohn Marino   a deliberate effort to combine factors so as to reduce the number of
86d7f5d3SJohn Marino   separate tests done on r.  But such combining is limited to d <=
86d7f5d3SJohn Marino   2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   Alternatives:
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   It'd be possible to use bigger divisors d, and more than 2 limbs of table
86d7f5d3SJohn Marino   data, but this doesn't look like it would be of much help to the prime
86d7f5d3SJohn Marino   factors in the usual moduli 2^24-1 or 2^48-1.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
86d7f5d3SJohn Marino   just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
86d7f5d3SJohn Marino   factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
86d7f5d3SJohn Marino   fewer prime factors.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino   The nails case usually ends up using mpn_mod_1, which is a lot slower
86d7f5d3SJohn Marino   than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
86d7f5d3SJohn Marino   for the nails case.  Two-term things like 2^30-2^15-1 might be
86d7f5d3SJohn Marino   candidates.  Or at worst some on-the-fly de-nailing would allow the plain
86d7f5d3SJohn Marino   2^24-1 to be used.  Currently nails are too preliminary to be worried
86d7f5d3SJohn Marino   about.
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino*/
86d7f5d3SJohn Marino
/* Mask selecting the low PERFSQR_MOD_BITS bits of a limb.  PERFSQR_MOD_IDX
   uses it to keep its modexact arithmetic within PERFSQR_MOD_BITS bits.  */
#define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)

/* MOD34_BITS is three quarters of GMP_NUMB_BITS (the division by 4 is done
   first, in integer arithmetic), and MOD34_MASK selects that many low bits.
   PERFSQR_MOD_34 uses the pair to fold the value returned by
   mpn_mod_34lsub1.  */
#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
86d7f5d3SJohn Marino
/* Set r from {up,usize} using mpn_mod_34lsub1, then fold the result once:
   the bits above MOD34_BITS are added onto the low MOD34_BITS bits.  Since
   2^MOD34_BITS == 1 mod 2^MOD34_BITS-1, the fold keeps r congruent to the
   mpn_mod_34lsub1 value modulo 2^MOD34_BITS-1 while making it smaller.

   r is assigned and read several times, so it must be a plain lvalue with
   no side effects.  */
#define PERFSQR_MOD_34(r, up, usize)                            \
  do {                                                          \
    (r) = mpn_mod_34lsub1 (up, usize);                          \
    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);             \
  } while (0)
86d7f5d3SJohn Marino
/* Set r to {up,usize} modulo PERFSQR_PP.

   Below PREINV_MOD_1_TO_MOD_1_THRESHOLD limbs, mpn_preinv_mod_1 is called
   with PERFSQR_PP_NORM and its pre-computed inverse PERFSQR_PP_INVERTED, and
   the result is then reduced with "%= PERFSQR_PP".  At or above the
   threshold a plain mpn_mod_1 by PERFSQR_PP is used instead.

   r is assigned and read more than once, so it must be a plain lvalue.

   FIXME: The %= here isn't good, and might destroy any savings from keeping
   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
   normal case, so let's not worry too much about mod_1.  */
#define PERFSQR_MOD_PP(r, up, usize)                                    \
  do {                                                                  \
    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))       \
      {                                                                 \
        (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,             \
                                PERFSQR_PP_INVERTED);                   \
        (r) %= PERFSQR_PP;                                              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        (r) = mpn_mod_1 (up, usize, PERFSQR_PP);                        \
      }                                                                 \
  } while (0)
86d7f5d3SJohn Marino
/* Compute the table index idx for remainder r and divisor d, where inv is
   the inverse of d modulo 2^PERFSQR_MOD_BITS (this is asserted).

   q is chosen so that q*d == r mod 2^PERFSQR_MOD_BITS.  The low
   PERFSQR_MOD_BITS bits of q*d are therefore r, and the bits above them are
   idx.  So idx satisfies "-(idx<<PERFSQR_MOD_BITS) == r mod d" rather than
   being r%d itself.  The third ASSERT checks that q*d cannot overflow a
   limb.

   Every use of an argument is parenthesized, so an expression such as
   "a & b" can be passed for r without the ASSERTs mis-parsing it.  r, d and
   inv are evaluated more than once and so must not have side effects.  */
#define PERFSQR_MOD_IDX(idx, r, d, inv)                         \
  do {                                                          \
    mp_limb_t  q;                                               \
    ASSERT ((r) <= PERFSQR_MOD_MASK);                           \
    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);           \
    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);           \
                                                                \
    q = ((r) * (inv)) & PERFSQR_MOD_MASK;                       \
    ASSERT ((r) == ((q * (d)) & PERFSQR_MOD_MASK));             \
    (idx) = (q * (d)) >> PERFSQR_MOD_BITS;                      \
  } while (0)
86d7f5d3SJohn Marino
/* Test remainder r against divisor d using one limb of table data.

   PERFSQR_MOD_IDX turns r into an index idx, and bit idx of mask says
   whether that residue can belong to a square.  If the bit is clear this
   macro executes "return 0" in the enclosing function; otherwise execution
   continues after the macro.  d must be at most GMP_LIMB_BITS (asserted).

   The TRACE arguments are parenthesized and cast to the types that the
   "%u" and "%lu" conversions expect, so diagnostics remain well defined
   whatever the type of d and r.  r, d and inv are evaluated more than
   once.  */
#define PERFSQR_MOD_1(r, d, inv, mask)                          \
  do {                                                          \
    unsigned   idx;                                             \
    ASSERT ((d) <= GMP_LIMB_BITS);                              \
    PERFSQR_MOD_IDX (idx, (r), (d), (inv));                     \
    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",       \
                   (unsigned) (d),                              \
                   (unsigned long) ((r) % (d)), idx));          \
    if ((((mask) >> idx) & 1) == 0)                             \
      {                                                         \
        TRACE (printf ("  non-square\n"));                      \
        return 0;                                               \
      }                                                         \
  } while (0)
86d7f5d3SJohn Marino
/* Test remainder r against divisor d using two limbs of table data.

   PERFSQR_MOD_IDX turns r into an index idx.  Indices below GMP_LIMB_BITS
   select a bit of mlo, the rest select a bit of mhi (after reducing idx
   modulo GMP_LIMB_BITS).  If the selected bit is clear this macro executes
   "return 0" in the enclosing function; otherwise execution continues after
   the macro.  d must be at most 2*GMP_LIMB_BITS (asserted).

   The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.

   The TRACE arguments are parenthesized and cast to the types that the
   "%u" and "%lu" conversions expect, so diagnostics remain well defined
   whatever the type of d and r.  r, d and inv are evaluated more than
   once.  */
#define PERFSQR_MOD_2(r, d, inv, mhi, mlo)                      \
  do {                                                          \
    mp_limb_t  m;                                               \
    unsigned   idx;                                             \
    ASSERT ((d) <= 2*GMP_LIMB_BITS);                            \
                                                                \
    PERFSQR_MOD_IDX (idx, (r), (d), (inv));                     \
    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",       \
                   (unsigned) (d),                              \
                   (unsigned long) ((r) % (d)), idx));          \
    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));        \
    idx %= GMP_LIMB_BITS;                                       \
    if (((m >> idx) & 1) == 0)                                  \
      {                                                         \
        TRACE (printf ("  non-square\n"));                      \
        return 0;                                               \
      }                                                         \
  } while (0)
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marinoint
86d7f5d3SJohn Marinompn_perfect_square_p (mp_srcptr up, mp_size_t usize)
86d7f5d3SJohn Marino{
86d7f5d3SJohn Marino  ASSERT (usize >= 1);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
86d7f5d3SJohn Marino     in O(1) time.  */
86d7f5d3SJohn Marino  {
86d7f5d3SJohn Marino    unsigned  idx = up[0] % 0x100;
86d7f5d3SJohn Marino    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
86d7f5d3SJohn Marino	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
86d7f5d3SJohn Marino      return 0;
86d7f5d3SJohn Marino  }
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino#if 0
86d7f5d3SJohn Marino  /* Check that we have even multiplicity of 2, and then check that the rest is
86d7f5d3SJohn Marino     a possible perfect square.  Leave disabled until we can determine this
86d7f5d3SJohn Marino     really is an improvement.  It it is, it could completely replace the
86d7f5d3SJohn Marino     simple probe above, since this should through out more non-squares, but at
86d7f5d3SJohn Marino     the expense of somewhat more cycles.  */
86d7f5d3SJohn Marino  {
86d7f5d3SJohn Marino    mp_limb_t lo;
86d7f5d3SJohn Marino    int cnt;
86d7f5d3SJohn Marino    lo = up[0];
86d7f5d3SJohn Marino    while (lo == 0)
86d7f5d3SJohn Marino      up++, lo = up[0], usize--;
86d7f5d3SJohn Marino    count_trailing_zeros (cnt, lo);
86d7f5d3SJohn Marino    if ((cnt & 1) != 0)
86d7f5d3SJohn Marino      return 0;			/* return of not even multiplicity of 2 */
86d7f5d3SJohn Marino    lo >>= cnt;			/* shift down to align lowest non-zero bit */
86d7f5d3SJohn Marino    lo >>= 1;			/* shift away lowest non-zero bit */
86d7f5d3SJohn Marino    if ((lo & 3) != 0)
86d7f5d3SJohn Marino      return 0;
86d7f5d3SJohn Marino  }
86d7f5d3SJohn Marino#endif
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
86d7f5d3SJohn Marino     according to their residues modulo small primes (or powers of
86d7f5d3SJohn Marino     primes).  See perfsqr.h.  */
86d7f5d3SJohn Marino  PERFSQR_MOD_TEST (up, usize);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino  /* For the third and last test, we finally compute the square root,
86d7f5d3SJohn Marino     to make sure we've really got a perfect square.  */
86d7f5d3SJohn Marino  {
86d7f5d3SJohn Marino    mp_ptr root_ptr;
86d7f5d3SJohn Marino    int res;
86d7f5d3SJohn Marino    TMP_DECL;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino    TMP_MARK;
86d7f5d3SJohn Marino    root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino    /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
86d7f5d3SJohn Marino    res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
86d7f5d3SJohn Marino    TMP_FREE;
86d7f5d3SJohn Marino
86d7f5d3SJohn Marino    return res;
86d7f5d3SJohn Marino  }
86d7f5d3SJohn Marino}