xref: /dflybsd-src/contrib/gmp/mpn/generic/perfsqr.c (revision 86d7f5d305c6adaa56ff4582ece9859d73106103)
186d7f5d3SJohn Marino /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
286d7f5d3SJohn Marino    zero otherwise.
386d7f5d3SJohn Marino 
486d7f5d3SJohn Marino Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software
586d7f5d3SJohn Marino Foundation, Inc.
686d7f5d3SJohn Marino 
786d7f5d3SJohn Marino This file is part of the GNU MP Library.
886d7f5d3SJohn Marino 
986d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1086d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1186d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1286d7f5d3SJohn Marino option) any later version.
1386d7f5d3SJohn Marino 
1486d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1586d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1686d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
1786d7f5d3SJohn Marino License for more details.
1886d7f5d3SJohn Marino 
1986d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2086d7f5d3SJohn Marino along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
2186d7f5d3SJohn Marino 
2286d7f5d3SJohn Marino #include <stdio.h> /* for NULL */
2386d7f5d3SJohn Marino #include "gmp.h"
2486d7f5d3SJohn Marino #include "gmp-impl.h"
2586d7f5d3SJohn Marino #include "longlong.h"
2686d7f5d3SJohn Marino 
2786d7f5d3SJohn Marino #include "perfsqr.h"
2886d7f5d3SJohn Marino 
2986d7f5d3SJohn Marino 
3086d7f5d3SJohn Marino /* change this to "#define TRACE(x) x" for diagnostics */
3186d7f5d3SJohn Marino #define TRACE(x)
3286d7f5d3SJohn Marino 
3386d7f5d3SJohn Marino 
3486d7f5d3SJohn Marino 
3586d7f5d3SJohn Marino /* PERFSQR_MOD_* detects non-squares using residue tests.
3686d7f5d3SJohn Marino 
3786d7f5d3SJohn Marino    A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
3886d7f5d3SJohn Marino    {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
3986d7f5d3SJohn Marino    64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
4086d7f5d3SJohn Marino    or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
4186d7f5d3SJohn Marino    used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
4286d7f5d3SJohn Marino    case too.
4386d7f5d3SJohn Marino 
4486d7f5d3SJohn Marino    PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
4586d7f5d3SJohn Marino    PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
4686d7f5d3SJohn Marino    data indicating residues and non-residues modulo those divisors.  The
4786d7f5d3SJohn Marino    table data is in 1 or 2 limbs worth of bits respectively, per the size of
4886d7f5d3SJohn Marino    each d.
4986d7f5d3SJohn Marino 
5086d7f5d3SJohn Marino    A "modexact" style remainder is taken to reduce r modulo d.
5186d7f5d3SJohn Marino    PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
5286d7f5d3SJohn Marino    the table data.  Notice there's just one multiplication by a constant
5386d7f5d3SJohn Marino    "inv", for each d.
5486d7f5d3SJohn Marino 
5586d7f5d3SJohn Marino    The modexact doesn't produce a true r%d remainder, instead idx satisfies
5686d7f5d3SJohn Marino    "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
5786d7f5d3SJohn Marino    -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
5886d7f5d3SJohn Marino    accounted for by having the table data suitably permuted.
5986d7f5d3SJohn Marino 
6086d7f5d3SJohn Marino    The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
6186d7f5d3SJohn Marino    In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
6286d7f5d3SJohn Marino    each divisor d meaning the modexact multiply can take place entirely
6386d7f5d3SJohn Marino    within one limb, giving the compiler the chance to optimize it, in a way
6486d7f5d3SJohn Marino    that say umul_ppmm would not give.
6586d7f5d3SJohn Marino 
6686d7f5d3SJohn Marino    There's no need for the divisors d to be prime, in fact gen-psqr.c makes
6786d7f5d3SJohn Marino    a deliberate effort to combine factors so as to reduce the number of
6886d7f5d3SJohn Marino    separate tests done on r.  But such combining is limited to d <=
6986d7f5d3SJohn Marino    2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
7086d7f5d3SJohn Marino 
7186d7f5d3SJohn Marino    Alternatives:
7286d7f5d3SJohn Marino 
7386d7f5d3SJohn Marino    It'd be possible to use bigger divisors d, and more than 2 limbs of table
7486d7f5d3SJohn Marino    data, but this doesn't look like it would be of much help to the prime
7586d7f5d3SJohn Marino    factors in the usual moduli 2^24-1 or 2^48-1.
7686d7f5d3SJohn Marino 
7786d7f5d3SJohn Marino    The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
7886d7f5d3SJohn Marino    just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
7986d7f5d3SJohn Marino    factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
8086d7f5d3SJohn Marino    fewer prime factors.
8186d7f5d3SJohn Marino 
8286d7f5d3SJohn Marino    The nails case usually ends up using mpn_mod_1, which is a lot slower
8386d7f5d3SJohn Marino    than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
8486d7f5d3SJohn Marino    for the nails case.  Two-term things like 2^30-2^15-1 might be
8586d7f5d3SJohn Marino    candidates.  Or at worst some on-the-fly de-nailing would allow the plain
8686d7f5d3SJohn Marino    2^24-1 to be used.  Currently nails are too preliminary to be worried
8786d7f5d3SJohn Marino    about.
8886d7f5d3SJohn Marino 
8986d7f5d3SJohn Marino */
9086d7f5d3SJohn Marino 
9186d7f5d3SJohn Marino #define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
9286d7f5d3SJohn Marino 
9386d7f5d3SJohn Marino #define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
9486d7f5d3SJohn Marino #define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
9586d7f5d3SJohn Marino 
9686d7f5d3SJohn Marino #define PERFSQR_MOD_34(r, up, usize)				\
9786d7f5d3SJohn Marino   do {								\
9886d7f5d3SJohn Marino     (r) = mpn_mod_34lsub1 (up, usize);				\
9986d7f5d3SJohn Marino     (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);		\
10086d7f5d3SJohn Marino   } while (0)
10186d7f5d3SJohn Marino 
10286d7f5d3SJohn Marino /* FIXME: The %= here isn't good, and might destroy any savings from keeping
10386d7f5d3SJohn Marino    the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
10486d7f5d3SJohn Marino    Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
10586d7f5d3SJohn Marino    and a shift count, like mpn_preinv_divrem_1.	 But mod_34lsub1 is our
10686d7f5d3SJohn Marino    normal case, so lets not worry too much about mod_1.	 */
10786d7f5d3SJohn Marino #define PERFSQR_MOD_PP(r, up, usize)					\
10886d7f5d3SJohn Marino   do {									\
10986d7f5d3SJohn Marino     if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))	\
11086d7f5d3SJohn Marino       {									\
11186d7f5d3SJohn Marino 	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,		\
11286d7f5d3SJohn Marino 				PERFSQR_PP_INVERTED);			\
11386d7f5d3SJohn Marino 	(r) %= PERFSQR_PP;						\
11486d7f5d3SJohn Marino       }									\
11586d7f5d3SJohn Marino     else								\
11686d7f5d3SJohn Marino       {									\
11786d7f5d3SJohn Marino 	(r) = mpn_mod_1 (up, usize, PERFSQR_PP);			\
11886d7f5d3SJohn Marino       }									\
11986d7f5d3SJohn Marino   } while (0)
12086d7f5d3SJohn Marino 
12186d7f5d3SJohn Marino #define PERFSQR_MOD_IDX(idx, r, d, inv)				\
12286d7f5d3SJohn Marino   do {								\
12386d7f5d3SJohn Marino     mp_limb_t  q;						\
12486d7f5d3SJohn Marino     ASSERT ((r) <= PERFSQR_MOD_MASK);				\
12586d7f5d3SJohn Marino     ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
12686d7f5d3SJohn Marino     ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
12786d7f5d3SJohn Marino 								\
12886d7f5d3SJohn Marino     q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
12986d7f5d3SJohn Marino     ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
13086d7f5d3SJohn Marino     (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
13186d7f5d3SJohn Marino   } while (0)
13286d7f5d3SJohn Marino 
13386d7f5d3SJohn Marino #define PERFSQR_MOD_1(r, d, inv, mask)				\
13486d7f5d3SJohn Marino   do {								\
13586d7f5d3SJohn Marino     unsigned   idx;						\
13686d7f5d3SJohn Marino     ASSERT ((d) <= GMP_LIMB_BITS);				\
13786d7f5d3SJohn Marino     PERFSQR_MOD_IDX(idx, r, d, inv);				\
13886d7f5d3SJohn Marino     TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
13986d7f5d3SJohn Marino 		   d, r%d, idx));				\
14086d7f5d3SJohn Marino     if ((((mask) >> idx) & 1) == 0)				\
14186d7f5d3SJohn Marino       {								\
14286d7f5d3SJohn Marino 	TRACE (printf ("  non-square\n"));			\
14386d7f5d3SJohn Marino 	return 0;						\
14486d7f5d3SJohn Marino       }								\
14586d7f5d3SJohn Marino   } while (0)
14686d7f5d3SJohn Marino 
14786d7f5d3SJohn Marino /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
14886d7f5d3SJohn Marino    sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.	*/
14986d7f5d3SJohn Marino #define PERFSQR_MOD_2(r, d, inv, mhi, mlo)			\
15086d7f5d3SJohn Marino   do {								\
15186d7f5d3SJohn Marino     mp_limb_t  m;						\
15286d7f5d3SJohn Marino     unsigned   idx;						\
15386d7f5d3SJohn Marino     ASSERT ((d) <= 2*GMP_LIMB_BITS);				\
15486d7f5d3SJohn Marino 								\
15586d7f5d3SJohn Marino     PERFSQR_MOD_IDX (idx, r, d, inv);				\
15686d7f5d3SJohn Marino     TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",	\
15786d7f5d3SJohn Marino 		   d, r%d, idx));				\
15886d7f5d3SJohn Marino     m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
15986d7f5d3SJohn Marino     idx %= GMP_LIMB_BITS;					\
16086d7f5d3SJohn Marino     if (((m >> idx) & 1) == 0)					\
16186d7f5d3SJohn Marino       {								\
16286d7f5d3SJohn Marino 	TRACE (printf ("  non-square\n"));			\
16386d7f5d3SJohn Marino 	return 0;						\
16486d7f5d3SJohn Marino       }								\
16586d7f5d3SJohn Marino   } while (0)
16686d7f5d3SJohn Marino 
16786d7f5d3SJohn Marino 
16886d7f5d3SJohn Marino int
mpn_perfect_square_p(mp_srcptr up,mp_size_t usize)16986d7f5d3SJohn Marino mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
17086d7f5d3SJohn Marino {
17186d7f5d3SJohn Marino   ASSERT (usize >= 1);
17286d7f5d3SJohn Marino 
17386d7f5d3SJohn Marino   TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
17486d7f5d3SJohn Marino 
17586d7f5d3SJohn Marino   /* The first test excludes 212/256 (82.8%) of the perfect square candidates
17686d7f5d3SJohn Marino      in O(1) time.  */
17786d7f5d3SJohn Marino   {
17886d7f5d3SJohn Marino     unsigned  idx = up[0] % 0x100;
17986d7f5d3SJohn Marino     if (((sq_res_0x100[idx / GMP_LIMB_BITS]
18086d7f5d3SJohn Marino 	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
18186d7f5d3SJohn Marino       return 0;
18286d7f5d3SJohn Marino   }
18386d7f5d3SJohn Marino 
18486d7f5d3SJohn Marino #if 0
18586d7f5d3SJohn Marino   /* Check that we have even multiplicity of 2, and then check that the rest is
18686d7f5d3SJohn Marino      a possible perfect square.  Leave disabled until we can determine this
18786d7f5d3SJohn Marino      really is an improvement.  It it is, it could completely replace the
18886d7f5d3SJohn Marino      simple probe above, since this should through out more non-squares, but at
18986d7f5d3SJohn Marino      the expense of somewhat more cycles.  */
19086d7f5d3SJohn Marino   {
19186d7f5d3SJohn Marino     mp_limb_t lo;
19286d7f5d3SJohn Marino     int cnt;
19386d7f5d3SJohn Marino     lo = up[0];
19486d7f5d3SJohn Marino     while (lo == 0)
19586d7f5d3SJohn Marino       up++, lo = up[0], usize--;
19686d7f5d3SJohn Marino     count_trailing_zeros (cnt, lo);
19786d7f5d3SJohn Marino     if ((cnt & 1) != 0)
19886d7f5d3SJohn Marino       return 0;			/* return of not even multiplicity of 2 */
19986d7f5d3SJohn Marino     lo >>= cnt;			/* shift down to align lowest non-zero bit */
20086d7f5d3SJohn Marino     lo >>= 1;			/* shift away lowest non-zero bit */
20186d7f5d3SJohn Marino     if ((lo & 3) != 0)
20286d7f5d3SJohn Marino       return 0;
20386d7f5d3SJohn Marino   }
20486d7f5d3SJohn Marino #endif
20586d7f5d3SJohn Marino 
20686d7f5d3SJohn Marino 
20786d7f5d3SJohn Marino   /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
20886d7f5d3SJohn Marino      according to their residues modulo small primes (or powers of
20986d7f5d3SJohn Marino      primes).  See perfsqr.h.  */
21086d7f5d3SJohn Marino   PERFSQR_MOD_TEST (up, usize);
21186d7f5d3SJohn Marino 
21286d7f5d3SJohn Marino 
21386d7f5d3SJohn Marino   /* For the third and last test, we finally compute the square root,
21486d7f5d3SJohn Marino      to make sure we've really got a perfect square.  */
21586d7f5d3SJohn Marino   {
21686d7f5d3SJohn Marino     mp_ptr root_ptr;
21786d7f5d3SJohn Marino     int res;
21886d7f5d3SJohn Marino     TMP_DECL;
21986d7f5d3SJohn Marino 
22086d7f5d3SJohn Marino     TMP_MARK;
22186d7f5d3SJohn Marino     root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
22286d7f5d3SJohn Marino 
22386d7f5d3SJohn Marino     /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
22486d7f5d3SJohn Marino     res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
22586d7f5d3SJohn Marino     TMP_FREE;
22686d7f5d3SJohn Marino 
22786d7f5d3SJohn Marino     return res;
22886d7f5d3SJohn Marino   }
22986d7f5d3SJohn Marino }
230