186d7f5d3SJohn Marino /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
286d7f5d3SJohn Marino zero otherwise.
386d7f5d3SJohn Marino
486d7f5d3SJohn Marino Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software
586d7f5d3SJohn Marino Foundation, Inc.
686d7f5d3SJohn Marino
786d7f5d3SJohn Marino This file is part of the GNU MP Library.
886d7f5d3SJohn Marino
986d7f5d3SJohn Marino The GNU MP Library is free software; you can redistribute it and/or modify
1086d7f5d3SJohn Marino it under the terms of the GNU Lesser General Public License as published by
1186d7f5d3SJohn Marino the Free Software Foundation; either version 3 of the License, or (at your
1286d7f5d3SJohn Marino option) any later version.
1386d7f5d3SJohn Marino
1486d7f5d3SJohn Marino The GNU MP Library is distributed in the hope that it will be useful, but
1586d7f5d3SJohn Marino WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1686d7f5d3SJohn Marino or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
1786d7f5d3SJohn Marino License for more details.
1886d7f5d3SJohn Marino
1986d7f5d3SJohn Marino You should have received a copy of the GNU Lesser General Public License
2086d7f5d3SJohn Marino along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
2186d7f5d3SJohn Marino
2286d7f5d3SJohn Marino #include <stdio.h> /* for NULL */
2386d7f5d3SJohn Marino #include "gmp.h"
2486d7f5d3SJohn Marino #include "gmp-impl.h"
2586d7f5d3SJohn Marino #include "longlong.h"
2686d7f5d3SJohn Marino
2786d7f5d3SJohn Marino #include "perfsqr.h"
2886d7f5d3SJohn Marino
2986d7f5d3SJohn Marino
3086d7f5d3SJohn Marino /* change this to "#define TRACE(x) x" for diagnostics */
3186d7f5d3SJohn Marino #define TRACE(x)
3286d7f5d3SJohn Marino
3386d7f5d3SJohn Marino
3486d7f5d3SJohn Marino
3586d7f5d3SJohn Marino /* PERFSQR_MOD_* detects non-squares using residue tests.
3686d7f5d3SJohn Marino
3786d7f5d3SJohn Marino A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes
3886d7f5d3SJohn Marino {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or
3986d7f5d3SJohn Marino 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
4086d7f5d3SJohn Marino or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
4186d7f5d3SJohn Marino used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
4286d7f5d3SJohn Marino case too.
4386d7f5d3SJohn Marino
4486d7f5d3SJohn Marino PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
4586d7f5d3SJohn Marino PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
4686d7f5d3SJohn Marino data indicating residues and non-residues modulo those divisors. The
4786d7f5d3SJohn Marino table data is in 1 or 2 limbs worth of bits respectively, per the size of
4886d7f5d3SJohn Marino each d.
4986d7f5d3SJohn Marino
5086d7f5d3SJohn Marino A "modexact" style remainder is taken to reduce r modulo d.
5186d7f5d3SJohn Marino PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
5286d7f5d3SJohn Marino the table data. Notice there's just one multiplication by a constant
5386d7f5d3SJohn Marino "inv", for each d.
5486d7f5d3SJohn Marino
5586d7f5d3SJohn Marino The modexact doesn't produce a true r%d remainder, instead idx satisfies
5686d7f5d3SJohn Marino "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor
5786d7f5d3SJohn Marino -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
5886d7f5d3SJohn Marino accounted for by having the table data suitably permuted.
5986d7f5d3SJohn Marino
6086d7f5d3SJohn Marino The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
6186d7f5d3SJohn Marino In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
6286d7f5d3SJohn Marino each divisor d meaning the modexact multiply can take place entirely
6386d7f5d3SJohn Marino within one limb, giving the compiler the chance to optimize it, in a way
6486d7f5d3SJohn Marino that say umul_ppmm would not give.
6586d7f5d3SJohn Marino
6686d7f5d3SJohn Marino There's no need for the divisors d to be prime, in fact gen-psqr.c makes
6786d7f5d3SJohn Marino a deliberate effort to combine factors so as to reduce the number of
6886d7f5d3SJohn Marino separate tests done on r. But such combining is limited to d <=
6986d7f5d3SJohn Marino 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
7086d7f5d3SJohn Marino
7186d7f5d3SJohn Marino Alternatives:
7286d7f5d3SJohn Marino
7386d7f5d3SJohn Marino It'd be possible to use bigger divisors d, and more than 2 limbs of table
7486d7f5d3SJohn Marino data, but this doesn't look like it would be of much help to the prime
7586d7f5d3SJohn Marino factors in the usual moduli 2^24-1 or 2^48-1.
7686d7f5d3SJohn Marino
7786d7f5d3SJohn Marino The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
7886d7f5d3SJohn Marino just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
7986d7f5d3SJohn Marino factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have
8086d7f5d3SJohn Marino fewer prime factors.
8186d7f5d3SJohn Marino
8286d7f5d3SJohn Marino The nails case usually ends up using mpn_mod_1, which is a lot slower
8386d7f5d3SJohn Marino than mpn_mod_34lsub1. Perhaps other such special moduli could be found
8486d7f5d3SJohn Marino for the nails case. Two-term things like 2^30-2^15-1 might be
8586d7f5d3SJohn Marino candidates. Or at worst some on-the-fly de-nailing would allow the plain
8686d7f5d3SJohn Marino 2^24-1 to be used. Currently nails are too preliminary to be worried
8786d7f5d3SJohn Marino about.
8886d7f5d3SJohn Marino
8986d7f5d3SJohn Marino */
9086d7f5d3SJohn Marino
9186d7f5d3SJohn Marino #define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
9286d7f5d3SJohn Marino
9386d7f5d3SJohn Marino #define MOD34_BITS (GMP_NUMB_BITS / 4 * 3)
9486d7f5d3SJohn Marino #define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1)
9586d7f5d3SJohn Marino
9686d7f5d3SJohn Marino #define PERFSQR_MOD_34(r, up, usize) \
9786d7f5d3SJohn Marino do { \
9886d7f5d3SJohn Marino (r) = mpn_mod_34lsub1 (up, usize); \
9986d7f5d3SJohn Marino (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \
10086d7f5d3SJohn Marino } while (0)
10186d7f5d3SJohn Marino
10286d7f5d3SJohn Marino /* FIXME: The %= here isn't good, and might destroy any savings from keeping
10386d7f5d3SJohn Marino the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
10486d7f5d3SJohn Marino Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
10586d7f5d3SJohn Marino and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our
10686d7f5d3SJohn Marino normal case, so lets not worry too much about mod_1. */
10786d7f5d3SJohn Marino #define PERFSQR_MOD_PP(r, up, usize) \
10886d7f5d3SJohn Marino do { \
10986d7f5d3SJohn Marino if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
11086d7f5d3SJohn Marino { \
11186d7f5d3SJohn Marino (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
11286d7f5d3SJohn Marino PERFSQR_PP_INVERTED); \
11386d7f5d3SJohn Marino (r) %= PERFSQR_PP; \
11486d7f5d3SJohn Marino } \
11586d7f5d3SJohn Marino else \
11686d7f5d3SJohn Marino { \
11786d7f5d3SJohn Marino (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
11886d7f5d3SJohn Marino } \
11986d7f5d3SJohn Marino } while (0)
12086d7f5d3SJohn Marino
12186d7f5d3SJohn Marino #define PERFSQR_MOD_IDX(idx, r, d, inv) \
12286d7f5d3SJohn Marino do { \
12386d7f5d3SJohn Marino mp_limb_t q; \
12486d7f5d3SJohn Marino ASSERT ((r) <= PERFSQR_MOD_MASK); \
12586d7f5d3SJohn Marino ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \
12686d7f5d3SJohn Marino ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \
12786d7f5d3SJohn Marino \
12886d7f5d3SJohn Marino q = ((r) * (inv)) & PERFSQR_MOD_MASK; \
12986d7f5d3SJohn Marino ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \
13086d7f5d3SJohn Marino (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \
13186d7f5d3SJohn Marino } while (0)
13286d7f5d3SJohn Marino
13386d7f5d3SJohn Marino #define PERFSQR_MOD_1(r, d, inv, mask) \
13486d7f5d3SJohn Marino do { \
13586d7f5d3SJohn Marino unsigned idx; \
13686d7f5d3SJohn Marino ASSERT ((d) <= GMP_LIMB_BITS); \
13786d7f5d3SJohn Marino PERFSQR_MOD_IDX(idx, r, d, inv); \
13886d7f5d3SJohn Marino TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
13986d7f5d3SJohn Marino d, r%d, idx)); \
14086d7f5d3SJohn Marino if ((((mask) >> idx) & 1) == 0) \
14186d7f5d3SJohn Marino { \
14286d7f5d3SJohn Marino TRACE (printf (" non-square\n")); \
14386d7f5d3SJohn Marino return 0; \
14486d7f5d3SJohn Marino } \
14586d7f5d3SJohn Marino } while (0)
14686d7f5d3SJohn Marino
14786d7f5d3SJohn Marino /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
14886d7f5d3SJohn Marino sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
14986d7f5d3SJohn Marino #define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \
15086d7f5d3SJohn Marino do { \
15186d7f5d3SJohn Marino mp_limb_t m; \
15286d7f5d3SJohn Marino unsigned idx; \
15386d7f5d3SJohn Marino ASSERT ((d) <= 2*GMP_LIMB_BITS); \
15486d7f5d3SJohn Marino \
15586d7f5d3SJohn Marino PERFSQR_MOD_IDX (idx, r, d, inv); \
15686d7f5d3SJohn Marino TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \
15786d7f5d3SJohn Marino d, r%d, idx)); \
15886d7f5d3SJohn Marino m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \
15986d7f5d3SJohn Marino idx %= GMP_LIMB_BITS; \
16086d7f5d3SJohn Marino if (((m >> idx) & 1) == 0) \
16186d7f5d3SJohn Marino { \
16286d7f5d3SJohn Marino TRACE (printf (" non-square\n")); \
16386d7f5d3SJohn Marino return 0; \
16486d7f5d3SJohn Marino } \
16586d7f5d3SJohn Marino } while (0)
16686d7f5d3SJohn Marino
16786d7f5d3SJohn Marino
16886d7f5d3SJohn Marino int
mpn_perfect_square_p(mp_srcptr up,mp_size_t usize)16986d7f5d3SJohn Marino mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
17086d7f5d3SJohn Marino {
17186d7f5d3SJohn Marino ASSERT (usize >= 1);
17286d7f5d3SJohn Marino
17386d7f5d3SJohn Marino TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
17486d7f5d3SJohn Marino
17586d7f5d3SJohn Marino /* The first test excludes 212/256 (82.8%) of the perfect square candidates
17686d7f5d3SJohn Marino in O(1) time. */
17786d7f5d3SJohn Marino {
17886d7f5d3SJohn Marino unsigned idx = up[0] % 0x100;
17986d7f5d3SJohn Marino if (((sq_res_0x100[idx / GMP_LIMB_BITS]
18086d7f5d3SJohn Marino >> (idx % GMP_LIMB_BITS)) & 1) == 0)
18186d7f5d3SJohn Marino return 0;
18286d7f5d3SJohn Marino }
18386d7f5d3SJohn Marino
18486d7f5d3SJohn Marino #if 0
18586d7f5d3SJohn Marino /* Check that we have even multiplicity of 2, and then check that the rest is
18686d7f5d3SJohn Marino a possible perfect square. Leave disabled until we can determine this
18786d7f5d3SJohn Marino really is an improvement. It it is, it could completely replace the
18886d7f5d3SJohn Marino simple probe above, since this should through out more non-squares, but at
18986d7f5d3SJohn Marino the expense of somewhat more cycles. */
19086d7f5d3SJohn Marino {
19186d7f5d3SJohn Marino mp_limb_t lo;
19286d7f5d3SJohn Marino int cnt;
19386d7f5d3SJohn Marino lo = up[0];
19486d7f5d3SJohn Marino while (lo == 0)
19586d7f5d3SJohn Marino up++, lo = up[0], usize--;
19686d7f5d3SJohn Marino count_trailing_zeros (cnt, lo);
19786d7f5d3SJohn Marino if ((cnt & 1) != 0)
19886d7f5d3SJohn Marino return 0; /* return of not even multiplicity of 2 */
19986d7f5d3SJohn Marino lo >>= cnt; /* shift down to align lowest non-zero bit */
20086d7f5d3SJohn Marino lo >>= 1; /* shift away lowest non-zero bit */
20186d7f5d3SJohn Marino if ((lo & 3) != 0)
20286d7f5d3SJohn Marino return 0;
20386d7f5d3SJohn Marino }
20486d7f5d3SJohn Marino #endif
20586d7f5d3SJohn Marino
20686d7f5d3SJohn Marino
20786d7f5d3SJohn Marino /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
20886d7f5d3SJohn Marino according to their residues modulo small primes (or powers of
20986d7f5d3SJohn Marino primes). See perfsqr.h. */
21086d7f5d3SJohn Marino PERFSQR_MOD_TEST (up, usize);
21186d7f5d3SJohn Marino
21286d7f5d3SJohn Marino
21386d7f5d3SJohn Marino /* For the third and last test, we finally compute the square root,
21486d7f5d3SJohn Marino to make sure we've really got a perfect square. */
21586d7f5d3SJohn Marino {
21686d7f5d3SJohn Marino mp_ptr root_ptr;
21786d7f5d3SJohn Marino int res;
21886d7f5d3SJohn Marino TMP_DECL;
21986d7f5d3SJohn Marino
22086d7f5d3SJohn Marino TMP_MARK;
22186d7f5d3SJohn Marino root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
22286d7f5d3SJohn Marino
22386d7f5d3SJohn Marino /* Iff mpn_sqrtrem returns zero, the square is perfect. */
22486d7f5d3SJohn Marino res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
22586d7f5d3SJohn Marino TMP_FREE;
22686d7f5d3SJohn Marino
22786d7f5d3SJohn Marino return res;
22886d7f5d3SJohn Marino }
22986d7f5d3SJohn Marino }
230