xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/generic/perfsqr.c (revision 72c7faa4dbb41dbb0238d6b4a109da0d4b236dd4)
1 /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
2    zero otherwise.
3 
4 Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
5 Foundation, Inc.
6 
7 This file is part of the GNU MP Library.
8 
9 The GNU MP Library is free software; you can redistribute it and/or modify
10 it under the terms of either:
11 
12   * the GNU Lesser General Public License as published by the Free
13     Software Foundation; either version 3 of the License, or (at your
14     option) any later version.
15 
16 or
17 
18   * the GNU General Public License as published by the Free Software
19     Foundation; either version 2 of the License, or (at your option) any
20     later version.
21 
22 or both in parallel, as here.
23 
24 The GNU MP Library is distributed in the hope that it will be useful, but
25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27 for more details.
28 
29 You should have received copies of the GNU General Public License and the
30 GNU Lesser General Public License along with the GNU MP Library.  If not,
31 see https://www.gnu.org/licenses/.  */
32 
33 #include <stdio.h> /* for NULL */
34 #include "gmp-impl.h"
35 #include "longlong.h"
36 
37 #include "perfsqr.h"
38 
39 
40 /* change this to "#define TRACE(x) x" for diagnostics */
41 #define TRACE(x)
42 
43 
44 
45 /* PERFSQR_MOD_* detects non-squares using residue tests.
46 
47    A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
48    {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
49    64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
50    or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
51    used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
52    case too.
53 
54    PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
55    PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
56    data indicating residues and non-residues modulo those divisors.  The
57    table data is in 1 or 2 limbs worth of bits respectively, per the size of
58    each d.
59 
60    A "modexact" style remainder is taken to reduce r modulo d.
61    PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
62    the table data.  Notice there's just one multiplication by a constant
63    "inv", for each d.
64 
65    The modexact doesn't produce a true r%d remainder, instead idx satisfies
66    "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
67    -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
68    accounted for by having the table data suitably permuted.
69 
70    The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
71    In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
72    each divisor d meaning the modexact multiply can take place entirely
73    within one limb, giving the compiler the chance to optimize it, in a way
74    that say umul_ppmm would not give.
75 
76    There's no need for the divisors d to be prime, in fact gen-psqr.c makes
77    a deliberate effort to combine factors so as to reduce the number of
78    separate tests done on r.  But such combining is limited to d <=
79    2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
80 
81    Alternatives:
82 
83    It'd be possible to use bigger divisors d, and more than 2 limbs of table
84    data, but this doesn't look like it would be of much help to the prime
85    factors in the usual moduli 2^24-1 or 2^48-1.
86 
87    The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
88    just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
89    factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
90    fewer prime factors.
91 
92    The nails case usually ends up using mpn_mod_1, which is a lot slower
93    than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
94    for the nails case.  Two-term things like 2^30-2^15-1 might be
95    candidates.  Or at worst some on-the-fly de-nailing would allow the plain
96    2^24-1 to be used.  Currently nails are too preliminary to be worried
97    about.
98 
99 */
100 
101 #define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
102 
103 #define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
104 #define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
105 
106 #define PERFSQR_MOD_34(r, up, usize)				\
107   do {								\
108     (r) = mpn_mod_34lsub1 (up, usize);				\
109     (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);		\
110   } while (0)
111 
112 /* FIXME: The %= here isn't good, and might destroy any savings from keeping
113    the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
114    Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
115    and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
116    normal case, so lets not worry too much about mod_1.  */
117 #define PERFSQR_MOD_PP(r, up, usize)					\
118   do {									\
119     if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))	\
120       {									\
121 	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,		\
122 				PERFSQR_PP_INVERTED);			\
123 	(r) %= PERFSQR_PP;						\
124       }									\
125     else								\
126       {									\
127 	(r) = mpn_mod_1 (up, usize, PERFSQR_PP);			\
128       }									\
129   } while (0)
130 
131 #define PERFSQR_MOD_IDX(idx, r, d, inv)				\
132   do {								\
133     mp_limb_t  q;						\
134     ASSERT ((r) <= PERFSQR_MOD_MASK);				\
135     ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
136     ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
137 								\
138     q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
139     ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
140     (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
141   } while (0)
142 
143 #define PERFSQR_MOD_1(r, d, inv, mask)				\
144   do {								\
145     unsigned   idx;						\
146     ASSERT ((d) <= GMP_LIMB_BITS);				\
147     PERFSQR_MOD_IDX(idx, r, d, inv);				\
148     TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
149 		   d, r%d, idx));				\
150     if ((((mask) >> idx) & 1) == 0)				\
151       {								\
152 	TRACE (printf ("  non-square\n"));			\
153 	return 0;						\
154       }								\
155   } while (0)
156 
157 /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
158    sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
159 #define PERFSQR_MOD_2(r, d, inv, mhi, mlo)			\
160   do {								\
161     mp_limb_t  m;						\
162     unsigned   idx;						\
163     ASSERT ((d) <= 2*GMP_LIMB_BITS);				\
164 								\
165     PERFSQR_MOD_IDX (idx, r, d, inv);				\
166     TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",	\
167 		   d, r%d, idx));				\
168     m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
169     idx %= GMP_LIMB_BITS;					\
170     if (((m >> idx) & 1) == 0)					\
171       {								\
172 	TRACE (printf ("  non-square\n"));			\
173 	return 0;						\
174       }								\
175   } while (0)
176 
177 
178 int
mpn_perfect_square_p(mp_srcptr up,mp_size_t usize)179 mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
180 {
181   ASSERT (usize >= 1);
182 
183   TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
184 
185   /* The first test excludes 212/256 (82.8%) of the perfect square candidates
186      in O(1) time.  */
187   {
188     unsigned  idx = up[0] % 0x100;
189     if (((sq_res_0x100[idx / GMP_LIMB_BITS]
190 	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
191       return 0;
192   }
193 
194 #if 0
195   /* Check that we have even multiplicity of 2, and then check that the rest is
196      a possible perfect square.  Leave disabled until we can determine this
197      really is an improvement.  It it is, it could completely replace the
198      simple probe above, since this should throw out more non-squares, but at
199      the expense of somewhat more cycles.  */
200   {
201     mp_limb_t lo;
202     int cnt;
203     lo = up[0];
204     while (lo == 0)
205       up++, lo = up[0], usize--;
206     count_trailing_zeros (cnt, lo);
207     if ((cnt & 1) != 0)
208       return 0;			/* return of not even multiplicity of 2 */
209     lo >>= cnt;			/* shift down to align lowest non-zero bit */
210     lo >>= 1;			/* shift away lowest non-zero bit */
211     if ((lo & 3) != 0)
212       return 0;
213   }
214 #endif
215 
216 
217   /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
218      according to their residues modulo small primes (or powers of
219      primes).  See perfsqr.h.  */
220   PERFSQR_MOD_TEST (up, usize);
221 
222 
223   /* For the third and last test, we finally compute the square root,
224      to make sure we've really got a perfect square.  */
225   {
226     mp_ptr root_ptr;
227     int res;
228     TMP_DECL;
229 
230     TMP_MARK;
231     root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
232 
233     /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
234     res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
235     TMP_FREE;
236 
237     return res;
238   }
239 }
240