xref: /netbsd-src/external/lgpl3/gmp/dist/gen-psqr.c (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1 /* Generate perfect square testing data.
2 
3 Copyright 2002-2004, 2012, 2014 Free Software Foundation, Inc.
4 
5 This file is part of the GNU MP Library.
6 
7 The GNU MP Library is free software; you can redistribute it and/or modify
8 it under the terms of either:
9 
10   * the GNU Lesser General Public License as published by the Free
11     Software Foundation; either version 3 of the License, or (at your
12     option) any later version.
13 
14 or
15 
16   * the GNU General Public License as published by the Free Software
17     Foundation; either version 2 of the License, or (at your option) any
18     later version.
19 
20 or both in parallel, as here.
21 
22 The GNU MP Library is distributed in the hope that it will be useful, but
23 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 for more details.
26 
27 You should have received copies of the GNU General Public License and the
28 GNU Lesser General Public License along with the GNU MP Library.  If not,
29 see https://www.gnu.org/licenses/.  */
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 
34 #include "bootstrap.c"
35 
36 
37 /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
38    (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
39    residues and non-residues modulo small factors of that modulus.
40 
41    For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
42    function exists specifically because 2^24-1 and 2^48-1 have nice sets of
43    prime factors.  For other limb sizes it's considered, but if it doesn't
44    have good factors then mpn_mod_1 will be used instead.
45 
46    When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
47    selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
48    with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
49    GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
50    calculation within a single limb.
51 
52    In either case primes can be combined to make divisors.  The table data
53    then effectively indicates remainders which are quadratic residues mod
54    all the primes.  This sort of combining reduces the number of steps
55    needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
56    Nothing is gained or lost in terms of detections, the same total fraction
57    of non-residues will be identified.
58 
59    Nothing particularly sophisticated is attempted for combining factors to
60    make divisors.  This is probably a kind of knapsack problem so it'd be
61    too hard to attempt anything completely general.  For the usual 32 and 64
62    bit limbs we get a good enough result just pairing the biggest and
63    smallest which fit together, repeatedly.
64 
65    Another aim is to get powerful combinations, ie. divisors which identify
66    biggest fraction of non-residues, and have those run first.  Again for
67    the usual 32 and 64 bits it seems good enough just to pair for big
68    divisors then sort according to the resulting fraction of non-residues
69    identified.
70 
71    Also in this program, a table sq_res_0x100 of residues modulo 256 is
72    generated.  This simply fills bits into limbs of the appropriate
73    build-time GMP_LIMB_BITS each.
74 
75 */
76 
77 
78 /* Normally we aren't using const in gen*.c programs, so as not to have to
79    bother figuring out if it works, but using it with f_cmp_divisor and
80    f_cmp_fraction avoids warnings from the qsort calls. */
81 
82 /* Same tests as gmp.h. */
83 #if  defined (__STDC__)                                 \
84   || defined (__cplusplus)                              \
85   || defined (_AIX)                                     \
86   || defined (__DECC)                                   \
87   || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
88   || defined (_MSC_VER)                                 \
89   || defined (_WIN32)
90 #define HAVE_CONST        1
91 #endif
92 
93 #if ! HAVE_CONST
94 #define const
95 #endif
96 
97 
98 mpz_t  *sq_res_0x100;          /* table of limbs */
99 int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
100 int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
101 double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
102 
103 int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
104 int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
105 int     max_divisor;       /* all divisors <= max_divisor */
106 int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
107 double  total_fraction;    /* of squares */
108 mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
109 mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
110 mpz_t   pp_inverted;       /* invert_limb style inverse */
111 mpz_t   mod_mask;          /* 2^mod_bits-1 */
112 char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
113 
114 /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
115 struct rawfactor_t {
116   int     divisor;
117   int     multiplicity;
118 };
119 struct rawfactor_t  *rawfactor;
120 int                 nrawfactor;
121 
122 /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
123 struct factor_t {
124   int     divisor;
125   mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
126   mpz_t   mask;      /* indicating squares mod divisor */
127   double  fraction;  /* squares/total */
128 };
129 struct factor_t  *factor;
130 int              nfactor;       /* entries in use in factor array */
131 int              factor_alloc;  /* entries allocated to factor array */
132 
133 
134 int
f_cmp_divisor(const void * parg,const void * qarg)135 f_cmp_divisor (const void *parg, const void *qarg)
136 {
137   const struct factor_t *p, *q;
138   p = (const struct factor_t *) parg;
139   q = (const struct factor_t *) qarg;
140   if (p->divisor > q->divisor)
141     return 1;
142   else if (p->divisor < q->divisor)
143     return -1;
144   else
145     return 0;
146 }
147 
148 int
f_cmp_fraction(const void * parg,const void * qarg)149 f_cmp_fraction (const void *parg, const void *qarg)
150 {
151   const struct factor_t *p, *q;
152   p = (const struct factor_t *) parg;
153   q = (const struct factor_t *) qarg;
154   if (p->fraction > q->fraction)
155     return 1;
156   else if (p->fraction < q->fraction)
157     return -1;
158   else
159     return 0;
160 }
161 
162 /* Remove array[idx] by copying the remainder down, and adjust narray
163    accordingly.  */
164 #define COLLAPSE_ELEMENT(array, idx, narray)                    \
165   do {                                                          \
166     memmove (&(array)[idx],					\
167 	     &(array)[idx+1],					\
168 	     ((narray)-((idx)+1)) * sizeof (array[0]));		\
169     (narray)--;                                                 \
170   } while (0)
171 
172 
173 /* return n*2^p mod m */
174 int
mul_2exp_mod(int n,int p,int m)175 mul_2exp_mod (int n, int p, int m)
176 {
177   while (--p >= 0)
178     n = (2 * n) % m;
179   return n;
180 }
181 
182 /* return -n mod m */
183 int
neg_mod(int n,int m)184 neg_mod (int n, int m)
185 {
186   assert (n >= 0 && n < m);
187   return (n == 0 ? 0 : m-n);
188 }
189 
190 /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
191    "-(idx<<mod_bits)" can be a square modulo m.  */
192 void
square_mask(mpz_t mask,int m)193 square_mask (mpz_t mask, int m)
194 {
195   int    p, i, r, idx;
196 
197   p = mul_2exp_mod (1, mod_bits, m);
198   p = neg_mod (p, m);
199 
200   mpz_set_ui (mask, 0L);
201   for (i = 0; i < m; i++)
202     {
203       r = (i * i) % m;
204       idx = (r * p) % m;
205       mpz_setbit (mask, (unsigned long) idx);
206     }
207 }
208 
209 void
generate_sq_res_0x100(int limb_bits)210 generate_sq_res_0x100 (int limb_bits)
211 {
212   int  i, res;
213 
214   nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
215   sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
216 
217   for (i = 0; i < nsq_res_0x100; i++)
218     mpz_init_set_ui (sq_res_0x100[i], 0L);
219 
220   for (i = 0; i < 0x100; i++)
221     {
222       res = (i * i) % 0x100;
223       mpz_setbit (sq_res_0x100[res / limb_bits],
224                   (unsigned long) (res % limb_bits));
225     }
226 
227   sq_res_0x100_num = 0;
228   for (i = 0; i < nsq_res_0x100; i++)
229     sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
230   sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
231 }
232 
233 void
generate_mod(int limb_bits,int nail_bits)234 generate_mod (int limb_bits, int nail_bits)
235 {
236   int    numb_bits = limb_bits - nail_bits;
237   int    i, divisor;
238 
239   mpz_init_set_ui (pp, 0L);
240   mpz_init_set_ui (pp_norm, 0L);
241   mpz_init_set_ui (pp_inverted, 0L);
242 
243   /* no more than limb_bits many factors in a one limb modulus (and of
244      course in reality nothing like that many) */
245   factor_alloc = limb_bits;
246   factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
247   rawfactor = (struct rawfactor_t *) xmalloc (factor_alloc * sizeof (*rawfactor));
248 
249   if (numb_bits % 4 != 0)
250     {
251       strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
252       goto use_pp;
253     }
254 
255   max_divisor = 2*limb_bits;
256   max_divisor_bits = log2_ceil (max_divisor);
257 
258   if (numb_bits / 4 < max_divisor_bits)
259     {
260       /* Wind back to one limb worth of max_divisor, if that will let us use
261          mpn_mod_34lsub1.  */
262       max_divisor = limb_bits;
263       max_divisor_bits = log2_ceil (max_divisor);
264 
265       if (numb_bits / 4 < max_divisor_bits)
266         {
267           strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
268           goto use_pp;
269         }
270     }
271 
272   {
273     /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
274     mpz_t  m, q, r;
275     int    multiplicity;
276 
277     mod34_bits = (numb_bits / 4) * 3;
278 
279     /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
280        the mod34_bits mark, adding the two halves for a remainder of at most
281        mod34_bits+1 many bits */
282     mod_bits = mod34_bits + 1;
283 
284     mpz_init_set_ui (m, 1L);
285     mpz_mul_2exp (m, m, mod34_bits);
286     mpz_sub_ui (m, m, 1L);
287 
288     mpz_init (q);
289     mpz_init (r);
290 
291     for (i = 3; i <= max_divisor; i+=2)
292       {
293         if (! isprime (i))
294           continue;
295 
296         mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
297         if (mpz_sgn (r) != 0)
298           continue;
299 
300         /* if a repeated prime is found it's used as an i^n in one factor */
301         divisor = 1;
302         multiplicity = 0;
303         do
304           {
305             if (divisor > max_divisor / i)
306               break;
307             multiplicity++;
308             mpz_set (m, q);
309             mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
310           }
311         while (mpz_sgn (r) == 0);
312 
313         assert (nrawfactor < factor_alloc);
314         rawfactor[nrawfactor].divisor = i;
315         rawfactor[nrawfactor].multiplicity = multiplicity;
316         nrawfactor++;
317       }
318 
319     mpz_clear (m);
320     mpz_clear (q);
321     mpz_clear (r);
322   }
323 
324   if (nrawfactor <= 2)
325     {
326       mpz_t  new_pp;
327 
328       sprintf (mod34_excuse, "only %d small factor%s",
329                nrawfactor, nrawfactor == 1 ? "" : "s");
330 
331     use_pp:
332       /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
333          tried with just one */
334       max_divisor = 2*limb_bits;
335       max_divisor_bits = log2_ceil (max_divisor);
336 
337       mpz_init (new_pp);
338       nrawfactor = 0;
339       mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
340 
341       /* one copy of each small prime */
342       mpz_set_ui (pp, 1L);
343       for (i = 3; i <= max_divisor; i+=2)
344         {
345           if (! isprime (i))
346             continue;
347 
348           mpz_mul_ui (new_pp, pp, (unsigned long) i);
349           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
350             break;
351           mpz_set (pp, new_pp);
352 
353           assert (nrawfactor < factor_alloc);
354           rawfactor[nrawfactor].divisor = i;
355           rawfactor[nrawfactor].multiplicity = 1;
356           nrawfactor++;
357         }
358 
359       /* Plus an extra copy of one or more of the primes selected, if that
360          still fits in max_divisor and the total in mod_bits.  Usually only
361          3 or 5 will be candidates */
362       for (i = nrawfactor-1; i >= 0; i--)
363         {
364           if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
365             continue;
366           mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
367           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
368             continue;
369           mpz_set (pp, new_pp);
370 
371           rawfactor[i].multiplicity++;
372         }
373 
374       mod_bits = mpz_sizeinbase (pp, 2);
375 
376       mpz_set (pp_norm, pp);
377       while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
378         mpz_add (pp_norm, pp_norm, pp_norm);
379 
380       mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
381 
382       mpz_clear (new_pp);
383     }
384 
385   /* start the factor array */
386   for (i = 0; i < nrawfactor; i++)
387     {
388       int  j;
389       assert (nfactor < factor_alloc);
390       factor[nfactor].divisor = 1;
391       for (j = 0; j < rawfactor[i].multiplicity; j++)
392         factor[nfactor].divisor *= rawfactor[i].divisor;
393       nfactor++;
394     }
395 
396  combine:
397   /* Combine entries in the factor array.  Combine the smallest entry with
398      the biggest one that will fit with it (ie. under max_divisor), then
399      repeat that with the new smallest entry. */
400   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
401   for (i = nfactor-1; i >= 1; i--)
402     {
403       if (factor[i].divisor <= max_divisor / factor[0].divisor)
404         {
405           factor[0].divisor *= factor[i].divisor;
406           COLLAPSE_ELEMENT (factor, i, nfactor);
407           goto combine;
408         }
409     }
410 
411   total_fraction = 1.0;
412   for (i = 0; i < nfactor; i++)
413     {
414       mpz_init (factor[i].inverse);
415       mpz_invert_ui_2exp (factor[i].inverse,
416                           (unsigned long) factor[i].divisor,
417                           (unsigned long) mod_bits);
418 
419       mpz_init (factor[i].mask);
420       square_mask (factor[i].mask, factor[i].divisor);
421 
422       /* fraction of possible squares */
423       factor[i].fraction = (double) mpz_popcount (factor[i].mask)
424         / factor[i].divisor;
425 
426       /* total fraction of possible squares */
427       total_fraction *= factor[i].fraction;
428     }
429 
430   /* best tests first (ie. smallest fraction) */
431   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
432 }
433 
434 void
print(int limb_bits,int nail_bits)435 print (int limb_bits, int nail_bits)
436 {
437   int    i;
438   mpz_t  mhi, mlo;
439 
440   printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
441   printf ("\n");
442 
443   printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
444           limb_bits, nail_bits);
445   printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
446           limb_bits, nail_bits);
447   printf ("#endif\n");
448   printf ("\n");
449 
450   printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
451   printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
452           (1.0 - sq_res_0x100_fraction) * 100.0,
453           0x100 - sq_res_0x100_num);
454   printf ("static const mp_limb_t\n");
455   printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
456   for (i = 0; i < nsq_res_0x100; i++)
457     {
458       printf ("  CNST_LIMB(0x");
459       mpz_out_str (stdout, 16, sq_res_0x100[i]);
460       printf ("),\n");
461     }
462   printf ("};\n");
463   printf ("\n");
464 
465   if (mpz_sgn (pp) != 0)
466     {
467       printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
468       printf ("/* PERFSQR_PP = ");
469     }
470   else
471     printf ("/* 2^%d-1 = ", mod34_bits);
472   for (i = 0; i < nrawfactor; i++)
473     {
474       if (i != 0)
475         printf (" * ");
476       printf ("%d", rawfactor[i].divisor);
477       if (rawfactor[i].multiplicity != 1)
478         printf ("^%d", rawfactor[i].multiplicity);
479     }
480   printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
481 
482   printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
483   if (mpz_sgn (pp) != 0)
484     {
485       printf ("#define PERFSQR_PP            CNST_LIMB(0x");
486       mpz_out_str (stdout, 16, pp);
487       printf (")\n");
488       printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
489       mpz_out_str (stdout, 16, pp_norm);
490       printf (")\n");
491       printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
492       mpz_out_str (stdout, 16, pp_inverted);
493       printf (")\n");
494     }
495   printf ("\n");
496 
497   mpz_init (mhi);
498   mpz_init (mlo);
499 
500   printf ("/* This test identifies %.2f%% as non-squares. */\n",
501           (1.0 - total_fraction) * 100.0);
502   printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
503   printf ("  do {                              \\\n");
504   printf ("    mp_limb_t  r;                   \\\n");
505   if (mpz_sgn (pp) != 0)
506     printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
507   else
508     printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
509 
510   for (i = 0; i < nfactor; i++)
511     {
512       printf ("                                    \\\n");
513       printf ("    /* %5.2f%% */                    \\\n",
514               (1.0 - factor[i].fraction) * 100.0);
515 
516       printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
517               factor[i].divisor <= limb_bits ? 1 : 2,
518               factor[i].divisor);
519       mpz_out_str (stdout, 16, factor[i].inverse);
520       printf ("), \\\n");
521       printf ("                   CNST_LIMB(0x");
522 
523       if ( factor[i].divisor <= limb_bits)
524         {
525           mpz_out_str (stdout, 16, factor[i].mask);
526         }
527       else
528         {
529           mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
530           mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
531           mpz_out_str (stdout, 16, mhi);
532           printf ("), CNST_LIMB(0x");
533           mpz_out_str (stdout, 16, mlo);
534         }
535       printf (")); \\\n");
536     }
537 
538   printf ("  } while (0)\n");
539   printf ("\n");
540 
541   printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
542           (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
543   printf ("\n");
544 
545   printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
546   printf ("#define PERFSQR_DIVISORS  { 256,");
547   for (i = 0; i < nfactor; i++)
548       printf (" %d,", factor[i].divisor);
549   printf (" }\n");
550 
551 
552   mpz_clear (mhi);
553   mpz_clear (mlo);
554 }
555 
556 int
main(int argc,char * argv[])557 main (int argc, char *argv[])
558 {
559   int  limb_bits, nail_bits;
560 
561   if (argc != 3)
562     {
563       fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
564       exit (1);
565     }
566 
567   limb_bits = atoi (argv[1]);
568   nail_bits = atoi (argv[2]);
569 
570   if (limb_bits <= 0
571       || nail_bits < 0
572       || nail_bits >= limb_bits)
573     {
574       fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
575                limb_bits, nail_bits);
576       exit (1);
577     }
578 
579   generate_sq_res_0x100 (limb_bits);
580   generate_mod (limb_bits, nail_bits);
581 
582   print (limb_bits, nail_bits);
583 
584   return 0;
585 }
586