xref: /netbsd-src/crypto/external/bsd/netpgp/dist/src/netpgpverify/bignum.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*-
2  * Copyright (c) 2012 Alistair Crooks <agc@NetBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 /* LibTomMath, multiple-precision integer library -- Tom St Denis
26  *
27  * LibTomMath is a library that provides multiple-precision
28  * integer arithmetic as well as number theoretic functionality.
29  *
30  * The library was designed directly after the MPI library by
31  * Michael Fromberger but has been written from scratch with
32  * additional optimizations in place.
33  *
34  * The library is free for all purposes without any express
35  * guarantee it works.
36  *
37  * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
38  */
39 #include "config.h"
40 
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <sys/param.h>
44 
45 #ifdef _KERNEL
46 # include <sys/kmem.h>
47 #else
48 # include <arpa/inet.h>
49 # include <ctype.h>
50 # include <inttypes.h>
51 # include <stdarg.h>
52 # include <stdio.h>
53 # include <stdlib.h>
54 # include <string.h>
55 # include <time.h>
56 # include <unistd.h>
57 #endif
58 
59 #include "misc.h"
60 #include "bn.h"
61 #include "digest.h"
62 
63 /**************************************************************************/
64 
65 /* LibTomMath, multiple-precision integer library -- Tom St Denis
66  *
67  * LibTomMath is a library that provides multiple-precision
68  * integer arithmetic as well as number theoretic functionality.
69  *
70  * The library was designed directly after the MPI library by
71  * Michael Fromberger but has been written from scratch with
72  * additional optimizations in place.
73  *
74  * The library is free for all purposes without any express
75  * guarantee it works.
76  *
77  * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
78  */
79 
80 #define MP_PREC		32
81 #define DIGIT_BIT	28
82 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
83 
84 #define MP_WARRAY	/*LINTED*/(1U << (((sizeof(mp_word) * CHAR_BIT) - (2 * DIGIT_BIT) + 1)))
85 
86 #define MP_NO		0
87 #define MP_YES		1
88 
89 #ifndef USE_ARG
90 #define USE_ARG(x)	/*LINTED*/(void)&(x)
91 #endif
92 
93 #ifndef __arraycount
94 #define	__arraycount(__x)	(sizeof(__x) / sizeof(__x[0]))
95 #endif
96 
97 #define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
98 
99 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
100 
101 typedef int           mp_err;
102 
103 static int mp_mul(mp_int * a, mp_int * b, mp_int * c);
104 static int mp_sqr(mp_int * a, mp_int * b);
105 
106 static int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
107 
108 /* set to zero */
109 static void
110 mp_zero(mp_int *a)
111 {
112   int       n;
113   mp_digit *tmp;
114 
115   a->sign = MP_ZPOS;
116   a->used = 0;
117 
118   tmp = a->dp;
119   /* XXX - memset */
120   for (n = 0; n < a->alloc; n++) {
121      *tmp++ = 0;
122   }
123 }
124 
125 /* grow as required */
126 static int
127 mp_grow(mp_int *a, int size)
128 {
129   int     i;
130   mp_digit *tmp;
131 
132   /* if the alloc size is smaller alloc more ram */
133   if (a->alloc < size) {
134     /* ensure there are always at least MP_PREC digits extra on top */
135     size += (MP_PREC * 2) - (size % MP_PREC);
136 
137     /* reallocate the array a->dp
138      *
139      * We store the return in a temporary variable
140      * in case the operation failed we don't want
141      * to overwrite the dp member of a.
142      */
143     tmp = realloc(a->dp, sizeof(*tmp) * size);
144     if (tmp == NULL) {
145       /* reallocation failed but "a" is still valid [can be freed] */
146       return MP_MEM;
147     }
148 
149     /* reallocation succeeded so set a->dp */
150     a->dp = tmp;
151 
152     /* zero excess digits */
153     i        = a->alloc;
154     a->alloc = size;
155     for (; i < a->alloc; i++) {
156       a->dp[i] = 0;
157     }
158   }
159   return MP_OKAY;
160 }
161 
162 /* shift left a certain amount of digits */
163 static int
164 mp_lshd (mp_int * a, int b)
165 {
166   int     x, res;
167 
168   /* if its less than zero return */
169   if (b <= 0) {
170     return MP_OKAY;
171   }
172 
173   /* grow to fit the new digits */
174   if (a->alloc < a->used + b) {
175      if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
176        return res;
177      }
178   }
179 
180   {
181     mp_digit *top, *bottom;
182 
183     /* increment the used by the shift amount then copy upwards */
184     a->used += b;
185 
186     /* top */
187     top = a->dp + a->used - 1;
188 
189     /* base */
190     bottom = a->dp + a->used - 1 - b;
191 
192     /* much like mp_rshd this is implemented using a sliding window
193      * except the window goes the otherway around.  Copying from
194      * the bottom to the top.  see bn_mp_rshd.c for more info.
195      */
196     for (x = a->used - 1; x >= b; x--) {
197       *top-- = *bottom--;
198     }
199 
200     /* zero the lower digits */
201     top = a->dp;
202     for (x = 0; x < b; x++) {
203       *top++ = 0;
204     }
205   }
206   return MP_OKAY;
207 }
208 
209 /* trim unused digits
210  *
211  * This is used to ensure that leading zero digits are
212  * trimed and the leading "used" digit will be non-zero
213  * Typically very fast.  Also fixes the sign if there
214  * are no more leading digits
215  */
216 static void
217 mp_clamp (mp_int * a)
218 {
219   /* decrease used while the most significant digit is
220    * zero.
221    */
222   while (a->used > 0 && a->dp[a->used - 1] == 0) {
223     --(a->used);
224   }
225 
226   /* reset the sign flag if used == 0 */
227   if (a->used == 0) {
228     a->sign = MP_ZPOS;
229   }
230 }
231 
232 /* copy, b = a */
233 static int
234 mp_copy(BIGNUM *a, BIGNUM *b)
235 {
236   int     res, n;
237 
238   /* if dst == src do nothing */
239   if (a == b) {
240     return MP_OKAY;
241   }
242   if (a == NULL || b == NULL) {
243 	return MP_VAL;
244   }
245 
246   /* grow dest */
247   if (b->alloc < a->used) {
248      if ((res = mp_grow (b, a->used)) != MP_OKAY) {
249         return res;
250      }
251   }
252 
253   /* zero b and copy the parameters over */
254   {
255     mp_digit *tmpa, *tmpb;
256 
257     /* pointer aliases */
258 
259     /* source */
260     tmpa = a->dp;
261 
262     /* destination */
263     tmpb = b->dp;
264 
265     /* copy all the digits */
266     for (n = 0; n < a->used; n++) {
267       *tmpb++ = *tmpa++;
268     }
269 
270     /* clear high digits */
271     for (; n < b->used; n++) {
272       *tmpb++ = 0;
273     }
274   }
275 
276   /* copy used count and sign */
277   b->used = a->used;
278   b->sign = a->sign;
279   return MP_OKAY;
280 }
281 
282 /* shift left by a certain bit count */
283 static int
284 mp_mul_2d(mp_int *a, int b, mp_int *c)
285 {
286   mp_digit d;
287   int      res;
288 
289   /* copy */
290   if (a != c) {
291      if ((res = mp_copy (a, c)) != MP_OKAY) {
292        return res;
293      }
294   }
295 
296   if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
297      if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
298        return res;
299      }
300   }
301 
302   /* shift by as many digits in the bit count */
303   if (b >= (int)DIGIT_BIT) {
304     if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
305       return res;
306     }
307   }
308 
309   /* shift any bit count < DIGIT_BIT */
310   d = (mp_digit) (b % DIGIT_BIT);
311   if (d != 0) {
312     mp_digit *tmpc, shift, mask, r, rr;
313     int x;
314 
315     /* bitmask for carries */
316     mask = (((mp_digit)1) << d) - 1;
317 
318     /* shift for msbs */
319     shift = DIGIT_BIT - d;
320 
321     /* alias */
322     tmpc = c->dp;
323 
324     /* carry */
325     r    = 0;
326     for (x = 0; x < c->used; x++) {
327       /* get the higher bits of the current word */
328       rr = (*tmpc >> shift) & mask;
329 
330       /* shift the current word and OR in the carry */
331       *tmpc = ((*tmpc << d) | r) & MP_MASK;
332       ++tmpc;
333 
334       /* set the carry to the carry bits of the current word */
335       r = rr;
336     }
337 
338     /* set final carry */
339     if (r != 0) {
340        c->dp[(c->used)++] = r;
341     }
342   }
343   mp_clamp (c);
344   return MP_OKAY;
345 }
346 
347 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
348 static int
349 mp_read_unsigned_bin(mp_int *a, const uint8_t *b, int c)
350 {
351   int     res;
352 
353   /* make sure there are at least two digits */
354   if (a->alloc < 2) {
355      if ((res = mp_grow(a, 2)) != MP_OKAY) {
356         return res;
357      }
358   }
359 
360   /* zero the int */
361   mp_zero (a);
362 
363   /* read the bytes in */
364   while (c-- > 0) {
365     if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
366       return res;
367     }
368 
369       a->dp[0] |= *b++;
370       a->used += 1;
371   }
372   mp_clamp (a);
373   return MP_OKAY;
374 }
375 
376 /* returns the number of bits in an int */
377 static int
378 mp_count_bits(const mp_int *a)
379 {
380   int     r;
381   mp_digit q;
382 
383   /* shortcut */
384   if (a->used == 0) {
385     return 0;
386   }
387 
388   /* get number of digits and add that */
389   r = (a->used - 1) * DIGIT_BIT;
390 
391   /* take the last digit and count the bits in it */
392   q = a->dp[a->used - 1];
393   while (q > ((mp_digit) 0)) {
394     ++r;
395     q >>= ((mp_digit) 1);
396   }
397   return r;
398 }
399 
400 /* compare maginitude of two ints (unsigned) */
401 static int
402 mp_cmp_mag (mp_int * a, mp_int * b)
403 {
404   int     n;
405   mp_digit *tmpa, *tmpb;
406 
407   /* compare based on # of non-zero digits */
408   if (a->used > b->used) {
409     return MP_GT;
410   }
411 
412   if (a->used < b->used) {
413     return MP_LT;
414   }
415 
416   /* alias for a */
417   tmpa = a->dp + (a->used - 1);
418 
419   /* alias for b */
420   tmpb = b->dp + (a->used - 1);
421 
422   /* compare based on digits  */
423   for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
424     if (*tmpa > *tmpb) {
425       return MP_GT;
426     }
427 
428     if (*tmpa < *tmpb) {
429       return MP_LT;
430     }
431   }
432   return MP_EQ;
433 }
434 
435 /* compare two ints (signed)*/
436 static int
437 mp_cmp (mp_int * a, mp_int * b)
438 {
439   /* compare based on sign */
440   if (a->sign != b->sign) {
441      if (a->sign == MP_NEG) {
442         return MP_LT;
443      } else {
444         return MP_GT;
445      }
446   }
447 
448   /* compare digits */
449   if (a->sign == MP_NEG) {
450      /* if negative compare opposite direction */
451      return mp_cmp_mag(b, a);
452   } else {
453      return mp_cmp_mag(a, b);
454   }
455 }
456 
457 /* get the size for an unsigned equivalent */
458 static int
459 mp_unsigned_bin_size (mp_int * a)
460 {
461   int     size = mp_count_bits (a);
462   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
463 }
464 
465 /* init a new mp_int */
466 static int
467 mp_init (mp_int * a)
468 {
469   int i;
470 
471   /* allocate memory required and clear it */
472   a->dp = netpgp_allocate(1, sizeof (*a->dp) * MP_PREC);
473   if (a->dp == NULL) {
474     return MP_MEM;
475   }
476 
477   /* set the digits to zero */
478   for (i = 0; i < MP_PREC; i++) {
479       a->dp[i] = 0;
480   }
481 
482   /* set the used to zero, allocated digits to the default precision
483    * and sign to positive */
484   a->used  = 0;
485   a->alloc = MP_PREC;
486   a->sign  = MP_ZPOS;
487 
488   return MP_OKAY;
489 }
490 
491 /* clear one (frees)  */
492 static void
493 mp_clear (mp_int * a)
494 {
495   int i;
496 
497   /* only do anything if a hasn't been freed previously */
498   if (a->dp != NULL) {
499     /* first zero the digits */
500     for (i = 0; i < a->used; i++) {
501         a->dp[i] = 0;
502     }
503 
504     /* free ram */
505     netpgp_deallocate(a->dp, (size_t)a->alloc);
506 
507     /* reset members to make debugging easier */
508     a->dp    = NULL;
509     a->alloc = a->used = 0;
510     a->sign  = MP_ZPOS;
511   }
512 }
513 
514 static int
515 mp_init_multi(mp_int *mp, ...)
516 {
517     mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
518     int n = 0;                 /* Number of ok inits */
519     mp_int* cur_arg = mp;
520     va_list args;
521 
522     va_start(args, mp);        /* init args to next argument from caller */
523     while (cur_arg != NULL) {
524         if (mp_init(cur_arg) != MP_OKAY) {
525             /* Oops - error! Back-track and mp_clear what we already
526                succeeded in init-ing, then return error.
527             */
528             va_list clean_args;
529 
530             /* end the current list */
531             va_end(args);
532 
533             /* now start cleaning up */
534             cur_arg = mp;
535             va_start(clean_args, mp);
536             while (n--) {
537                 mp_clear(cur_arg);
538                 cur_arg = va_arg(clean_args, mp_int*);
539             }
540             va_end(clean_args);
541             res = MP_MEM;
542             break;
543         }
544         n++;
545         cur_arg = va_arg(args, mp_int*);
546     }
547     va_end(args);
548     return res;                /* Assumed ok, if error flagged above. */
549 }
550 
551 /* init an mp_init for a given size */
552 static int
553 mp_init_size (mp_int * a, int size)
554 {
555   int x;
556 
557   /* pad size so there are always extra digits */
558   size += (MP_PREC * 2) - (size % MP_PREC);
559 
560   /* alloc mem */
561   a->dp = netpgp_allocate (1, sizeof (*a->dp) * size);
562   if (a->dp == NULL) {
563     return MP_MEM;
564   }
565 
566   /* set the members */
567   a->used  = 0;
568   a->alloc = size;
569   a->sign  = MP_ZPOS;
570 
571   /* zero the digits */
572   for (x = 0; x < size; x++) {
573       a->dp[x] = 0;
574   }
575 
576   return MP_OKAY;
577 }
578 
579 /* creates "a" then copies b into it */
580 static int mp_init_copy (mp_int * a, mp_int * b)
581 {
582   int     res;
583 
584   if ((res = mp_init (a)) != MP_OKAY) {
585     return res;
586   }
587   return mp_copy (b, a);
588 }
589 
590 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
591 static int
592 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
593 {
594   mp_int *x;
595   int     olduse, res, min, max;
596 
597   /* find sizes, we let |a| <= |b| which means we have to sort
598    * them.  "x" will point to the input with the most digits
599    */
600   if (a->used > b->used) {
601     min = b->used;
602     max = a->used;
603     x = a;
604   } else {
605     min = a->used;
606     max = b->used;
607     x = b;
608   }
609 
610   /* init result */
611   if (c->alloc < max + 1) {
612     if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
613       return res;
614     }
615   }
616 
617   /* get old used digit count and set new one */
618   olduse = c->used;
619   c->used = max + 1;
620 
621   {
622     mp_digit u, *tmpa, *tmpb, *tmpc;
623     int i;
624 
625     /* alias for digit pointers */
626 
627     /* first input */
628     tmpa = a->dp;
629 
630     /* second input */
631     tmpb = b->dp;
632 
633     /* destination */
634     tmpc = c->dp;
635 
636     /* zero the carry */
637     u = 0;
638     for (i = 0; i < min; i++) {
639       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
640       *tmpc = *tmpa++ + *tmpb++ + u;
641 
642       /* U = carry bit of T[i] */
643       u = *tmpc >> ((mp_digit)DIGIT_BIT);
644 
645       /* take away carry bit from T[i] */
646       *tmpc++ &= MP_MASK;
647     }
648 
649     /* now copy higher words if any, that is in A+B
650      * if A or B has more digits add those in
651      */
652     if (min != max) {
653       for (; i < max; i++) {
654         /* T[i] = X[i] + U */
655         *tmpc = x->dp[i] + u;
656 
657         /* U = carry bit of T[i] */
658         u = *tmpc >> ((mp_digit)DIGIT_BIT);
659 
660         /* take away carry bit from T[i] */
661         *tmpc++ &= MP_MASK;
662       }
663     }
664 
665     /* add carry */
666     *tmpc++ = u;
667 
668     /* clear digits above oldused */
669     for (i = c->used; i < olduse; i++) {
670       *tmpc++ = 0;
671     }
672   }
673 
674   mp_clamp (c);
675   return MP_OKAY;
676 }
677 
678 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
679 static int
680 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
681 {
682   int     olduse, res, min, max;
683 
684   /* find sizes */
685   min = b->used;
686   max = a->used;
687 
688   /* init result */
689   if (c->alloc < max) {
690     if ((res = mp_grow (c, max)) != MP_OKAY) {
691       return res;
692     }
693   }
694   olduse = c->used;
695   c->used = max;
696 
697   {
698     mp_digit u, *tmpa, *tmpb, *tmpc;
699     int i;
700 
701     /* alias for digit pointers */
702     tmpa = a->dp;
703     tmpb = b->dp;
704     tmpc = c->dp;
705 
706     /* set carry to zero */
707     u = 0;
708     for (i = 0; i < min; i++) {
709       /* T[i] = A[i] - B[i] - U */
710       *tmpc = *tmpa++ - *tmpb++ - u;
711 
712       /* U = carry bit of T[i]
713        * Note this saves performing an AND operation since
714        * if a carry does occur it will propagate all the way to the
715        * MSB.  As a result a single shift is enough to get the carry
716        */
717       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
718 
719       /* Clear carry from T[i] */
720       *tmpc++ &= MP_MASK;
721     }
722 
723     /* now copy higher words if any, e.g. if A has more digits than B  */
724     for (; i < max; i++) {
725       /* T[i] = A[i] - U */
726       *tmpc = *tmpa++ - u;
727 
728       /* U = carry bit of T[i] */
729       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
730 
731       /* Clear carry from T[i] */
732       *tmpc++ &= MP_MASK;
733     }
734 
735     /* clear digits above used (since we may not have grown result above) */
736     for (i = c->used; i < olduse; i++) {
737       *tmpc++ = 0;
738     }
739   }
740 
741   mp_clamp (c);
742   return MP_OKAY;
743 }
744 
745 /* high level subtraction (handles signs) */
746 static int
747 mp_sub (mp_int * a, mp_int * b, mp_int * c)
748 {
749   int     sa, sb, res;
750 
751   sa = a->sign;
752   sb = b->sign;
753 
754   if (sa != sb) {
755     /* subtract a negative from a positive, OR */
756     /* subtract a positive from a negative. */
757     /* In either case, ADD their magnitudes, */
758     /* and use the sign of the first number. */
759     c->sign = sa;
760     res = s_mp_add (a, b, c);
761   } else {
762     /* subtract a positive from a positive, OR */
763     /* subtract a negative from a negative. */
764     /* First, take the difference between their */
765     /* magnitudes, then... */
766     if (mp_cmp_mag (a, b) != MP_LT) {
767       /* Copy the sign from the first */
768       c->sign = sa;
769       /* The first has a larger or equal magnitude */
770       res = s_mp_sub (a, b, c);
771     } else {
772       /* The result has the *opposite* sign from */
773       /* the first number. */
774       c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
775       /* The second has a larger magnitude */
776       res = s_mp_sub (b, a, c);
777     }
778   }
779   return res;
780 }
781 
782 /* shift right a certain amount of digits */
783 static int mp_rshd (mp_int * a, int b)
784 {
785   int     x;
786 
787   /* if b <= 0 then ignore it */
788   if (b <= 0) {
789     return 0;
790   }
791 
792   /* if b > used then simply zero it and return */
793   if (a->used <= b) {
794     mp_zero (a);
795     return 0;
796   }
797 
798   {
799     mp_digit *bottom, *top;
800 
801     /* shift the digits down */
802 
803     /* bottom */
804     bottom = a->dp;
805 
806     /* top [offset into digits] */
807     top = a->dp + b;
808 
809     /* this is implemented as a sliding window where
810      * the window is b-digits long and digits from
811      * the top of the window are copied to the bottom
812      *
813      * e.g.
814 
815      b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
816                  /\                   |      ---->
817                   \-------------------/      ---->
818      */
819     for (x = 0; x < (a->used - b); x++) {
820       *bottom++ = *top++;
821     }
822 
823     /* zero the top digits */
824     for (; x < a->used; x++) {
825       *bottom++ = 0;
826     }
827   }
828 
829   /* remove excess digits */
830   a->used -= b;
831   return 1;
832 }
833 
834 /* multiply by a digit */
835 static int
836 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
837 {
838   mp_digit u, *tmpa, *tmpc;
839   mp_word  r;
840   int      ix, res, olduse;
841 
842   /* make sure c is big enough to hold a*b */
843   if (c->alloc < a->used + 1) {
844     if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
845       return res;
846     }
847   }
848 
849   /* get the original destinations used count */
850   olduse = c->used;
851 
852   /* set the sign */
853   c->sign = a->sign;
854 
855   /* alias for a->dp [source] */
856   tmpa = a->dp;
857 
858   /* alias for c->dp [dest] */
859   tmpc = c->dp;
860 
861   /* zero carry */
862   u = 0;
863 
864   /* compute columns */
865   for (ix = 0; ix < a->used; ix++) {
866     /* compute product and carry sum for this term */
867     r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
868 
869     /* mask off higher bits to get a single digit */
870     *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
871 
872     /* send carry into next iteration */
873     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
874   }
875 
876   /* store final carry [if any] and increment ix offset  */
877   *tmpc++ = u;
878   ++ix;
879 
880   /* now zero digits above the top */
881   while (ix++ < olduse) {
882      *tmpc++ = 0;
883   }
884 
885   /* set used count */
886   c->used = a->used + 1;
887   mp_clamp(c);
888 
889   return MP_OKAY;
890 }
891 
892 /* high level addition (handles signs) */
893 static int mp_add (mp_int * a, mp_int * b, mp_int * c)
894 {
895   int     sa, sb, res;
896 
897   /* get sign of both inputs */
898   sa = a->sign;
899   sb = b->sign;
900 
901   /* handle two cases, not four */
902   if (sa == sb) {
903     /* both positive or both negative */
904     /* add their magnitudes, copy the sign */
905     c->sign = sa;
906     res = s_mp_add (a, b, c);
907   } else {
908     /* one positive, the other negative */
909     /* subtract the one with the greater magnitude from */
910     /* the one of the lesser magnitude.  The result gets */
911     /* the sign of the one with the greater magnitude. */
912     if (mp_cmp_mag (a, b) == MP_LT) {
913       c->sign = sb;
914       res = s_mp_sub (b, a, c);
915     } else {
916       c->sign = sa;
917       res = s_mp_sub (a, b, c);
918     }
919   }
920   return res;
921 }
922 
923 /* swap the elements of two integers, for cases where you can't simply swap the
924  * mp_int pointers around
925  */
926 static void
927 mp_exch(mp_int *a, mp_int *b)
928 {
929   mp_int  t;
930 
931   t  = *a;
932   *a = *b;
933   *b = t;
934 }
935 
936 /* calc a value mod 2**b */
937 static int
938 mp_mod_2d (mp_int * a, int b, mp_int * c)
939 {
940   int     x, res;
941 
942   /* if b is <= 0 then zero the int */
943   if (b <= 0) {
944     mp_zero (c);
945     return MP_OKAY;
946   }
947 
948   /* if the modulus is larger than the value than return */
949   if (b >= (int) (a->used * DIGIT_BIT)) {
950     res = mp_copy (a, c);
951     return res;
952   }
953 
954   /* copy */
955   if ((res = mp_copy (a, c)) != MP_OKAY) {
956     return res;
957   }
958 
959   /* zero digits above the last digit of the modulus */
960   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
961     c->dp[x] = 0;
962   }
963   /* clear the digit that is not completely outside/inside the modulus */
964   c->dp[b / DIGIT_BIT] &=
965     (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
966   mp_clamp (c);
967   return MP_OKAY;
968 }
969 
970 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
971 static int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
972 {
973   mp_digit D, r, rr;
974   int     x, res;
975   mp_int  t;
976 
977 
978   /* if the shift count is <= 0 then we do no work */
979   if (b <= 0) {
980     res = mp_copy (a, c);
981     if (d != NULL) {
982       mp_zero (d);
983     }
984     return res;
985   }
986 
987   if ((res = mp_init (&t)) != MP_OKAY) {
988     return res;
989   }
990 
991   /* get the remainder */
992   if (d != NULL) {
993     if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
994       mp_clear (&t);
995       return res;
996     }
997   }
998 
999   /* copy */
1000   if ((res = mp_copy (a, c)) != MP_OKAY) {
1001     mp_clear (&t);
1002     return res;
1003   }
1004 
1005   /* shift by as many digits in the bit count */
1006   if (b >= (int)DIGIT_BIT) {
1007     mp_rshd (c, b / DIGIT_BIT);
1008   }
1009 
1010   /* shift any bit count < DIGIT_BIT */
1011   D = (mp_digit) (b % DIGIT_BIT);
1012   if (D != 0) {
1013     mp_digit *tmpc, mask, shift;
1014 
1015     /* mask */
1016     mask = (((mp_digit)1) << D) - 1;
1017 
1018     /* shift for lsb */
1019     shift = DIGIT_BIT - D;
1020 
1021     /* alias */
1022     tmpc = c->dp + (c->used - 1);
1023 
1024     /* carry */
1025     r = 0;
1026     for (x = c->used - 1; x >= 0; x--) {
1027       /* get the lower  bits of this word in a temp */
1028       rr = *tmpc & mask;
1029 
1030       /* shift the current word and mix in the carry bits from the previous word */
1031       *tmpc = (*tmpc >> D) | (r << shift);
1032       --tmpc;
1033 
1034       /* set the carry to the carry bits of the current word found above */
1035       r = rr;
1036     }
1037   }
1038   mp_clamp (c);
1039   if (d != NULL) {
1040     mp_exch (&t, d);
1041   }
1042   mp_clear (&t);
1043   return MP_OKAY;
1044 }
1045 
1046 /* integer signed division.
1047  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1048  * HAC pp.598 Algorithm 14.20
1049  *
1050  * Note that the description in HAC is horribly
1051  * incomplete.  For example, it doesn't consider
1052  * the case where digits are removed from 'x' in
1053  * the inner loop.  It also doesn't consider the
1054  * case that y has fewer than three digits, etc..
1055  *
1056  * The overall algorithm is as described as
1057  * 14.20 from HAC but fixed to treat these cases.
1058 */
1059 static int
1060 mp_div(mp_int *c, mp_int *d, mp_int *a, mp_int *b)
1061 {
1062   mp_int  q, x, y, t1, t2;
1063   int     res, n, t, i, norm, neg;
1064 
1065   /* is divisor zero ? */
1066   if (BN_is_zero (b) == 1) {
1067     return MP_VAL;
1068   }
1069 
1070   /* if a < b then q=0, r = a */
1071   if (mp_cmp_mag (a, b) == MP_LT) {
1072     if (d != NULL) {
1073       res = mp_copy (a, d);
1074     } else {
1075       res = MP_OKAY;
1076     }
1077     if (c != NULL) {
1078       mp_zero (c);
1079     }
1080     return res;
1081   }
1082 
1083   if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1084     return res;
1085   }
1086   q.used = a->used + 2;
1087 
1088   if ((res = mp_init (&t1)) != MP_OKAY) {
1089     goto LBL_Q;
1090   }
1091 
1092   if ((res = mp_init (&t2)) != MP_OKAY) {
1093     goto LBL_T1;
1094   }
1095 
1096   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1097     goto LBL_T2;
1098   }
1099 
1100   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1101     goto LBL_X;
1102   }
1103 
1104   /* fix the sign */
1105   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1106   x.sign = y.sign = MP_ZPOS;
1107 
1108   /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1109   norm = mp_count_bits(&y) % DIGIT_BIT;
1110   if (norm < (int)(DIGIT_BIT-1)) {
1111      norm = (DIGIT_BIT-1) - norm;
1112      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1113        goto LBL_Y;
1114      }
1115      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1116        goto LBL_Y;
1117      }
1118   } else {
1119      norm = 0;
1120   }
1121 
1122   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1123   n = x.used - 1;
1124   t = y.used - 1;
1125 
1126   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1127   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1128     goto LBL_Y;
1129   }
1130 
1131   while (mp_cmp (&x, &y) != MP_LT) {
1132     ++(q.dp[n - t]);
1133     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1134       goto LBL_Y;
1135     }
1136   }
1137 
1138   /* reset y by shifting it back down */
1139   mp_rshd (&y, n - t);
1140 
1141   /* step 3. for i from n down to (t + 1) */
1142   for (i = n; i >= (t + 1); i--) {
1143     if (i > x.used) {
1144       continue;
1145     }
1146 
1147     /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1148      * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1149     if (x.dp[i] == y.dp[t]) {
1150       q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1151     } else {
1152       mp_word tmp;
1153       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1154       tmp |= ((mp_word) x.dp[i - 1]);
1155       tmp /= ((mp_word) y.dp[t]);
1156       if (tmp > (mp_word) MP_MASK)
1157         tmp = MP_MASK;
1158       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1159     }
1160 
1161     /* while (q{i-t-1} * (yt * b + y{t-1})) >
1162              xi * b**2 + xi-1 * b + xi-2
1163 
1164        do q{i-t-1} -= 1;
1165     */
1166     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1167     do {
1168       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1169 
1170       /* find left hand */
1171       mp_zero (&t1);
1172       t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1173       t1.dp[1] = y.dp[t];
1174       t1.used = 2;
1175       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1176         goto LBL_Y;
1177       }
1178 
1179       /* find right hand */
1180       t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1181       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1182       t2.dp[2] = x.dp[i];
1183       t2.used = 3;
1184     } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1185 
1186     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1187     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1188       goto LBL_Y;
1189     }
1190 
1191     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1192       goto LBL_Y;
1193     }
1194 
1195     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1196       goto LBL_Y;
1197     }
1198 
1199     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1200     if (x.sign == MP_NEG) {
1201       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1202         goto LBL_Y;
1203       }
1204       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1205         goto LBL_Y;
1206       }
1207       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1208         goto LBL_Y;
1209       }
1210 
1211       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1212     }
1213   }
1214 
1215   /* now q is the quotient and x is the remainder
1216    * [which we have to normalize]
1217    */
1218 
1219   /* get sign before writing to c */
1220   x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1221 
1222   if (c != NULL) {
1223     mp_clamp (&q);
1224     mp_exch (&q, c);
1225     c->sign = neg;
1226   }
1227 
1228   if (d != NULL) {
1229     mp_div_2d (&x, norm, &x, NULL);
1230     mp_exch (&x, d);
1231   }
1232 
1233   res = MP_OKAY;
1234 
1235 LBL_Y:mp_clear (&y);
1236 LBL_X:mp_clear (&x);
1237 LBL_T2:mp_clear (&t2);
1238 LBL_T1:mp_clear (&t1);
1239 LBL_Q:mp_clear (&q);
1240   return res;
1241 }
1242 
1243 /* c = a mod b, 0 <= c < b */
1244 static int
1245 mp_mod (mp_int * a, mp_int * b, mp_int * c)
1246 {
1247   mp_int  t;
1248   int     res;
1249 
1250   if ((res = mp_init (&t)) != MP_OKAY) {
1251     return res;
1252   }
1253 
1254   if ((res = mp_div (NULL, &t, a, b)) != MP_OKAY) {
1255     mp_clear (&t);
1256     return res;
1257   }
1258 
1259   if (t.sign != b->sign) {
1260     res = mp_add (b, &t, c);
1261   } else {
1262     res = MP_OKAY;
1263     mp_exch (&t, c);
1264   }
1265 
1266   mp_clear (&t);
1267   return res;
1268 }
1269 
1270 /* set to a digit */
1271 static void mp_set (mp_int * a, mp_digit b)
1272 {
1273   mp_zero (a);
1274   a->dp[0] = b & MP_MASK;
1275   a->used  = (a->dp[0] != 0) ? 1 : 0;
1276 }
1277 
1278 /* b = a/2 */
1279 static int mp_div_2(mp_int * a, mp_int * b)
1280 {
1281   int     x, res, oldused;
1282 
1283   /* copy */
1284   if (b->alloc < a->used) {
1285     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1286       return res;
1287     }
1288   }
1289 
1290   oldused = b->used;
1291   b->used = a->used;
1292   {
1293     mp_digit r, rr, *tmpa, *tmpb;
1294 
1295     /* source alias */
1296     tmpa = a->dp + b->used - 1;
1297 
1298     /* dest alias */
1299     tmpb = b->dp + b->used - 1;
1300 
1301     /* carry */
1302     r = 0;
1303     for (x = b->used - 1; x >= 0; x--) {
1304       /* get the carry for the next iteration */
1305       rr = *tmpa & 1;
1306 
1307       /* shift the current digit, add in carry and store */
1308       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1309 
1310       /* forward carry to next iteration */
1311       r = rr;
1312     }
1313 
1314     /* zero excess digits */
1315     tmpb = b->dp + b->used;
1316     for (x = b->used; x < oldused; x++) {
1317       *tmpb++ = 0;
1318     }
1319   }
1320   b->sign = a->sign;
1321   mp_clamp (b);
1322   return MP_OKAY;
1323 }
1324 
1325 /* compare a digit */
1326 static int mp_cmp_d(mp_int * a, mp_digit b)
1327 {
1328   /* compare based on sign */
1329   if (a->sign == MP_NEG) {
1330     return MP_LT;
1331   }
1332 
1333   /* compare based on magnitude */
1334   if (a->used > 1) {
1335     return MP_GT;
1336   }
1337 
1338   /* compare the only digit of a to b */
1339   if (a->dp[0] > b) {
1340     return MP_GT;
1341   } else if (a->dp[0] < b) {
1342     return MP_LT;
1343   } else {
1344     return MP_EQ;
1345   }
1346 }
1347 
1348 static void mp_clear_multi(mp_int *mp, ...)
1349 {
1350     mp_int* next_mp = mp;
1351     va_list args;
1352     va_start(args, mp);
1353     while (next_mp != NULL) {
1354         mp_clear(next_mp);
1355         next_mp = va_arg(args, mp_int*);
1356     }
1357     va_end(args);
1358 }
1359 
1360 /* computes the modular inverse via binary extended euclidean algorithm,
1361  * that is c = 1/a mod b
1362  *
1363  * Based on slow invmod except this is optimized for the case where b is
1364  * odd as per HAC Note 14.64 on pp. 610
1365  */
1366 static int
1367 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
1368 {
1369   mp_int  x, y, u, v, B, D;
1370   int     res, neg;
1371 
1372   /* 2. [modified] b must be odd   */
1373   if (BN_is_even (b) == 1) {
1374     return MP_VAL;
1375   }
1376 
1377   /* init all our temps */
1378   if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
1379      return res;
1380   }
1381 
1382   /* x == modulus, y == value to invert */
1383   if ((res = mp_copy (b, &x)) != MP_OKAY) {
1384     goto LBL_ERR;
1385   }
1386 
1387   /* we need y = |a| */
1388   if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
1389     goto LBL_ERR;
1390   }
1391 
1392   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
1393   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
1394     goto LBL_ERR;
1395   }
1396   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
1397     goto LBL_ERR;
1398   }
1399   mp_set (&D, 1);
1400 
1401 top:
1402   /* 4.  while u is even do */
1403   while (BN_is_even (&u) == 1) {
1404     /* 4.1 u = u/2 */
1405     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
1406       goto LBL_ERR;
1407     }
1408     /* 4.2 if B is odd then */
1409     if (BN_is_odd (&B) == 1) {
1410       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
1411         goto LBL_ERR;
1412       }
1413     }
1414     /* B = B/2 */
1415     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
1416       goto LBL_ERR;
1417     }
1418   }
1419 
1420   /* 5.  while v is even do */
1421   while (BN_is_even (&v) == 1) {
1422     /* 5.1 v = v/2 */
1423     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
1424       goto LBL_ERR;
1425     }
1426     /* 5.2 if D is odd then */
1427     if (BN_is_odd (&D) == 1) {
1428       /* D = (D-x)/2 */
1429       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
1430         goto LBL_ERR;
1431       }
1432     }
1433     /* D = D/2 */
1434     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
1435       goto LBL_ERR;
1436     }
1437   }
1438 
1439   /* 6.  if u >= v then */
1440   if (mp_cmp (&u, &v) != MP_LT) {
1441     /* u = u - v, B = B - D */
1442     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
1443       goto LBL_ERR;
1444     }
1445 
1446     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
1447       goto LBL_ERR;
1448     }
1449   } else {
1450     /* v - v - u, D = D - B */
1451     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
1452       goto LBL_ERR;
1453     }
1454 
1455     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
1456       goto LBL_ERR;
1457     }
1458   }
1459 
1460   /* if not zero goto step 4 */
1461   if (BN_is_zero (&u) == 0) {
1462     goto top;
1463   }
1464 
1465   /* now a = C, b = D, gcd == g*v */
1466 
1467   /* if v != 1 then there is no inverse */
1468   if (mp_cmp_d (&v, 1) != MP_EQ) {
1469     res = MP_VAL;
1470     goto LBL_ERR;
1471   }
1472 
1473   /* b is now the inverse */
1474   neg = a->sign;
1475   while (D.sign == MP_NEG) {
1476     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
1477       goto LBL_ERR;
1478     }
1479   }
1480   mp_exch (&D, c);
1481   c->sign = neg;
1482   res = MP_OKAY;
1483 
1484 LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
1485   return res;
1486 }
1487 
1488 /* hac 14.61, pp608 */
1489 static int
1490 mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
1491 {
1492   mp_int  x, y, u, v, A, B, C, D;
1493   int     res;
1494 
1495   /* b cannot be negative */
1496   if (b->sign == MP_NEG || BN_is_zero(b) == 1) {
1497     return MP_VAL;
1498   }
1499 
1500   /* init temps */
1501   if ((res = mp_init_multi(&x, &y, &u, &v,
1502                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
1503      return res;
1504   }
1505 
1506   /* x = a, y = b */
1507   if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
1508       goto LBL_ERR;
1509   }
1510   if ((res = mp_copy (b, &y)) != MP_OKAY) {
1511     goto LBL_ERR;
1512   }
1513 
1514   /* 2. [modified] if x,y are both even then return an error! */
1515   if (BN_is_even (&x) == 1 && BN_is_even (&y) == 1) {
1516     res = MP_VAL;
1517     goto LBL_ERR;
1518   }
1519 
1520   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
1521   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
1522     goto LBL_ERR;
1523   }
1524   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
1525     goto LBL_ERR;
1526   }
1527   mp_set (&A, 1);
1528   mp_set (&D, 1);
1529 
1530 top:
1531   /* 4.  while u is even do */
1532   while (BN_is_even (&u) == 1) {
1533     /* 4.1 u = u/2 */
1534     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
1535       goto LBL_ERR;
1536     }
1537     /* 4.2 if A or B is odd then */
1538     if (BN_is_odd (&A) == 1 || BN_is_odd (&B) == 1) {
1539       /* A = (A+y)/2, B = (B-x)/2 */
1540       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
1541          goto LBL_ERR;
1542       }
1543       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
1544          goto LBL_ERR;
1545       }
1546     }
1547     /* A = A/2, B = B/2 */
1548     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
1549       goto LBL_ERR;
1550     }
1551     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
1552       goto LBL_ERR;
1553     }
1554   }
1555 
1556   /* 5.  while v is even do */
1557   while (BN_is_even (&v) == 1) {
1558     /* 5.1 v = v/2 */
1559     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
1560       goto LBL_ERR;
1561     }
1562     /* 5.2 if C or D is odd then */
1563     if (BN_is_odd (&C) == 1 || BN_is_odd (&D) == 1) {
1564       /* C = (C+y)/2, D = (D-x)/2 */
1565       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
1566          goto LBL_ERR;
1567       }
1568       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
1569          goto LBL_ERR;
1570       }
1571     }
1572     /* C = C/2, D = D/2 */
1573     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
1574       goto LBL_ERR;
1575     }
1576     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
1577       goto LBL_ERR;
1578     }
1579   }
1580 
1581   /* 6.  if u >= v then */
1582   if (mp_cmp (&u, &v) != MP_LT) {
1583     /* u = u - v, A = A - C, B = B - D */
1584     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
1585       goto LBL_ERR;
1586     }
1587 
1588     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
1589       goto LBL_ERR;
1590     }
1591 
1592     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
1593       goto LBL_ERR;
1594     }
1595   } else {
1596     /* v - v - u, C = C - A, D = D - B */
1597     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
1598       goto LBL_ERR;
1599     }
1600 
1601     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
1602       goto LBL_ERR;
1603     }
1604 
1605     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
1606       goto LBL_ERR;
1607     }
1608   }
1609 
1610   /* if not zero goto step 4 */
1611   if (BN_is_zero (&u) == 0)
1612     goto top;
1613 
1614   /* now a = C, b = D, gcd == g*v */
1615 
1616   /* if v != 1 then there is no inverse */
1617   if (mp_cmp_d (&v, 1) != MP_EQ) {
1618     res = MP_VAL;
1619     goto LBL_ERR;
1620   }
1621 
1622   /* if its too low */
1623   while (mp_cmp_d(&C, 0) == MP_LT) {
1624       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
1625          goto LBL_ERR;
1626       }
1627   }
1628 
1629   /* too big */
1630   while (mp_cmp_mag(&C, b) != MP_LT) {
1631       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
1632          goto LBL_ERR;
1633       }
1634   }
1635 
1636   /* C is now the inverse */
1637   mp_exch (&C, c);
1638   res = MP_OKAY;
1639 LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
1640   return res;
1641 }
1642 
1643 static int
1644 mp_invmod(mp_int *c, mp_int *a, mp_int *b)
1645 {
1646   /* b cannot be negative */
1647   if (b->sign == MP_NEG || BN_is_zero(b) == 1) {
1648     return MP_VAL;
1649   }
1650 
1651   /* if the modulus is odd we can use a faster routine instead */
1652   if (BN_is_odd (b) == 1) {
1653     return fast_mp_invmod(a, b, c);
1654   }
1655 
1656   return mp_invmod_slow(a, b, c);
1657 
1658   /*NOTREACHED*/
1659   return MP_VAL;
1660 }
1661 
1662 /* b = |a|
1663  *
1664  * Simple function copies the input and fixes the sign to positive
1665  */
1666 static int
1667 mp_abs (mp_int * a, mp_int * b)
1668 {
1669   int     res;
1670 
1671   /* copy a to b */
1672   if (a != b) {
1673      if ((res = mp_copy (a, b)) != MP_OKAY) {
1674        return res;
1675      }
1676   }
1677 
1678   /* force the sign of b to positive */
1679   b->sign = MP_ZPOS;
1680 
1681   return MP_OKAY;
1682 }
1683 
1684 /* determines if reduce_2k_l can be used */
1685 static int mp_reduce_is_2k_l(mp_int *a)
1686 {
1687    int ix, iy;
1688 
1689    if (a->used == 0) {
1690       return MP_NO;
1691    } else if (a->used == 1) {
1692       return MP_YES;
1693    } else if (a->used > 1) {
1694       /* if more than half of the digits are -1 we're sold */
1695       for (iy = ix = 0; ix < a->used; ix++) {
1696           if (a->dp[ix] == MP_MASK) {
1697               ++iy;
1698           }
1699       }
1700       return (iy >= (a->used/2)) ? MP_YES : MP_NO;
1701 
1702    }
1703    return MP_NO;
1704 }
1705 
1706 /* computes a = 2**b
1707  *
1708  * Simple algorithm which zeroes the int, grows it then just sets one bit
1709  * as required.
1710  */
1711 static int
1712 mp_2expt (mp_int * a, int b)
1713 {
1714   int     res;
1715 
1716   /* zero a as per default */
1717   mp_zero (a);
1718 
1719   /* grow a to accomodate the single bit */
1720   if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
1721     return res;
1722   }
1723 
1724   /* set the used count of where the bit will go */
1725   a->used = b / DIGIT_BIT + 1;
1726 
1727   /* put the single bit in its place */
1728   a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
1729 
1730   return MP_OKAY;
1731 }
1732 
1733 /* pre-calculate the value required for Barrett reduction
1734  * For a given modulus "b" it calulates the value required in "a"
1735  */
1736 static int mp_reduce_setup (mp_int * a, mp_int * b)
1737 {
1738   int     res;
1739 
1740   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
1741     return res;
1742   }
1743   return mp_div (a, NULL, a, b);
1744 }
1745 
1746 /* b = a*2 */
1747 static int mp_mul_2(mp_int * a, mp_int * b)
1748 {
1749   int     x, res, oldused;
1750 
1751   /* grow to accomodate result */
1752   if (b->alloc < a->used + 1) {
1753     if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
1754       return res;
1755     }
1756   }
1757 
1758   oldused = b->used;
1759   b->used = a->used;
1760 
1761   {
1762     mp_digit r, rr, *tmpa, *tmpb;
1763 
1764     /* alias for source */
1765     tmpa = a->dp;
1766 
1767     /* alias for dest */
1768     tmpb = b->dp;
1769 
1770     /* carry */
1771     r = 0;
1772     for (x = 0; x < a->used; x++) {
1773 
1774       /* get what will be the *next* carry bit from the
1775        * MSB of the current digit
1776        */
1777       rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
1778 
1779       /* now shift up this digit, add in the carry [from the previous] */
1780       *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
1781 
1782       /* copy the carry that would be from the source
1783        * digit into the next iteration
1784        */
1785       r = rr;
1786     }
1787 
1788     /* new leading digit? */
1789     if (r != 0) {
1790       /* add a MSB which is always 1 at this point */
1791       *tmpb = 1;
1792       ++(b->used);
1793     }
1794 
1795     /* now zero any excess digits on the destination
1796      * that we didn't write to
1797      */
1798     tmpb = b->dp + b->used;
1799     for (x = b->used; x < oldused; x++) {
1800       *tmpb++ = 0;
1801     }
1802   }
1803   b->sign = a->sign;
1804   return MP_OKAY;
1805 }
1806 
1807 /* divide by three (based on routine from MPI and the GMP manual) */
1808 static int
1809 mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
1810 {
1811   mp_int   q;
1812   mp_word  w, t;
1813   mp_digit b;
1814   int      res, ix;
1815 
1816   /* b = 2**DIGIT_BIT / 3 */
1817   b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
1818 
1819   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1820      return res;
1821   }
1822 
1823   q.used = a->used;
1824   q.sign = a->sign;
1825   w = 0;
1826   for (ix = a->used - 1; ix >= 0; ix--) {
1827      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1828 
1829      if (w >= 3) {
1830         /* multiply w by [1/3] */
1831         t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
1832 
1833         /* now subtract 3 * [w/3] from w, to get the remainder */
1834         w -= t+t+t;
1835 
1836         /* fixup the remainder as required since
1837          * the optimization is not exact.
1838          */
1839         while (w >= 3) {
1840            t += 1;
1841            w -= 3;
1842         }
1843       } else {
1844         t = 0;
1845       }
1846       q.dp[ix] = (mp_digit)t;
1847   }
1848 
1849   /* [optional] store the remainder */
1850   if (d != NULL) {
1851      *d = (mp_digit)w;
1852   }
1853 
1854   /* [optional] store the quotient */
1855   if (c != NULL) {
1856      mp_clamp(&q);
1857      mp_exch(&q, c);
1858   }
1859   mp_clear(&q);
1860 
1861   return res;
1862 }
1863 
1864 /* multiplication using the Toom-Cook 3-way algorithm
1865  *
1866  * Much more complicated than Karatsuba but has a lower
1867  * asymptotic running time of O(N**1.464).  This algorithm is
1868  * only particularly useful on VERY large inputs
1869  * (we're talking 1000s of digits here...).
1870 */
1871 static int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
1872 {
1873     mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
1874     int res, B;
1875 
1876     /* init temps */
1877     if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4,
1878                              &a0, &a1, &a2, &b0, &b1,
1879                              &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
1880        return res;
1881     }
1882 
1883     /* B */
1884     B = MIN(a->used, b->used) / 3;
1885 
1886     /* a = a2 * B**2 + a1 * B + a0 */
1887     if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
1888        goto ERR;
1889     }
1890 
1891     if ((res = mp_copy(a, &a1)) != MP_OKAY) {
1892        goto ERR;
1893     }
1894     mp_rshd(&a1, B);
1895     mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
1896 
1897     if ((res = mp_copy(a, &a2)) != MP_OKAY) {
1898        goto ERR;
1899     }
1900     mp_rshd(&a2, B*2);
1901 
1902     /* b = b2 * B**2 + b1 * B + b0 */
1903     if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
1904        goto ERR;
1905     }
1906 
1907     if ((res = mp_copy(b, &b1)) != MP_OKAY) {
1908        goto ERR;
1909     }
1910     mp_rshd(&b1, B);
1911     mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
1912 
1913     if ((res = mp_copy(b, &b2)) != MP_OKAY) {
1914        goto ERR;
1915     }
1916     mp_rshd(&b2, B*2);
1917 
1918     /* w0 = a0*b0 */
1919     if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
1920        goto ERR;
1921     }
1922 
1923     /* w4 = a2 * b2 */
1924     if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
1925        goto ERR;
1926     }
1927 
1928     /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
1929     if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
1930        goto ERR;
1931     }
1932     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
1933        goto ERR;
1934     }
1935     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
1936        goto ERR;
1937     }
1938     if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
1939        goto ERR;
1940     }
1941 
1942     if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
1943        goto ERR;
1944     }
1945     if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
1946        goto ERR;
1947     }
1948     if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
1949        goto ERR;
1950     }
1951     if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
1952        goto ERR;
1953     }
1954 
1955     if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
1956        goto ERR;
1957     }
1958 
1959     /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
1960     if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
1961        goto ERR;
1962     }
1963     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
1964        goto ERR;
1965     }
1966     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
1967        goto ERR;
1968     }
1969     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
1970        goto ERR;
1971     }
1972 
1973     if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
1974        goto ERR;
1975     }
1976     if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
1977        goto ERR;
1978     }
1979     if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
1980        goto ERR;
1981     }
1982     if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
1983        goto ERR;
1984     }
1985 
1986     if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
1987        goto ERR;
1988     }
1989 
1990 
1991     /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
1992     if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
1993        goto ERR;
1994     }
1995     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
1996        goto ERR;
1997     }
1998     if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
1999        goto ERR;
2000     }
2001     if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
2002        goto ERR;
2003     }
2004     if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
2005        goto ERR;
2006     }
2007 
2008     /* now solve the matrix
2009 
2010        0  0  0  0  1
2011        1  2  4  8  16
2012        1  1  1  1  1
2013        16 8  4  2  1
2014        1  0  0  0  0
2015 
2016        using 12 subtractions, 4 shifts,
2017               2 small divisions and 1 small multiplication
2018      */
2019 
2020      /* r1 - r4 */
2021      if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
2022         goto ERR;
2023      }
2024      /* r3 - r0 */
2025      if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
2026         goto ERR;
2027      }
2028      /* r1/2 */
2029      if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
2030         goto ERR;
2031      }
2032      /* r3/2 */
2033      if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
2034         goto ERR;
2035      }
2036      /* r2 - r0 - r4 */
2037      if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
2038         goto ERR;
2039      }
2040      if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
2041         goto ERR;
2042      }
2043      /* r1 - r2 */
2044      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2045         goto ERR;
2046      }
2047      /* r3 - r2 */
2048      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2049         goto ERR;
2050      }
2051      /* r1 - 8r0 */
2052      if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
2053         goto ERR;
2054      }
2055      if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
2056         goto ERR;
2057      }
2058      /* r3 - 8r4 */
2059      if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
2060         goto ERR;
2061      }
2062      if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
2063         goto ERR;
2064      }
2065      /* 3r2 - r1 - r3 */
2066      if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
2067         goto ERR;
2068      }
2069      if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
2070         goto ERR;
2071      }
2072      if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
2073         goto ERR;
2074      }
2075      /* r1 - r2 */
2076      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2077         goto ERR;
2078      }
2079      /* r3 - r2 */
2080      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2081         goto ERR;
2082      }
2083      /* r1/3 */
2084      if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
2085         goto ERR;
2086      }
2087      /* r3/3 */
2088      if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
2089         goto ERR;
2090      }
2091 
2092      /* at this point shift W[n] by B*n */
2093      if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
2094         goto ERR;
2095      }
2096      if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
2097         goto ERR;
2098      }
2099      if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
2100         goto ERR;
2101      }
2102      if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
2103         goto ERR;
2104      }
2105 
2106      if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
2107         goto ERR;
2108      }
2109      if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
2110         goto ERR;
2111      }
2112      if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
2113         goto ERR;
2114      }
2115      if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
2116         goto ERR;
2117      }
2118 
2119 ERR:
2120      mp_clear_multi(&w0, &w1, &w2, &w3, &w4,
2121                     &a0, &a1, &a2, &b0, &b1,
2122                     &b2, &tmp1, &tmp2, NULL);
2123      return res;
2124 }
2125 
2126 #define TOOM_MUL_CUTOFF	350
2127 #define KARATSUBA_MUL_CUTOFF 80
2128 
2129 /* c = |a| * |b| using Karatsuba Multiplication using
2130  * three half size multiplications
2131  *
2132  * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2133  * let n represent half of the number of digits in
2134  * the min(a,b)
2135  *
2136  * a = a1 * B**n + a0
2137  * b = b1 * B**n + b0
2138  *
2139  * Then, a * b =>
2140    a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
2141  *
2142  * Note that a1b1 and a0b0 are used twice and only need to be
2143  * computed once.  So in total three half size (half # of
2144  * digit) multiplications are performed, a0b0, a1b1 and
2145  * (a1+b1)(a0+b0)
2146  *
2147  * Note that a multiplication of half the digits requires
2148  * 1/4th the number of single precision multiplications so in
2149  * total after one call 25% of the single precision multiplications
2150  * are saved.  Note also that the call to mp_mul can end up back
2151  * in this function if the a0, a1, b0, or b1 are above the threshold.
2152  * This is known as divide-and-conquer and leads to the famous
2153  * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
2154  * the standard O(N**2) that the baseline/comba methods use.
2155  * Generally though the overhead of this method doesn't pay off
2156  * until a certain size (N ~ 80) is reached.
2157  */
2158 static int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
2159 {
2160   mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
2161   int     B;
2162   int     err;
2163 
2164   /* default the return code to an error */
2165   err = MP_MEM;
2166 
2167   /* min # of digits */
2168   B = MIN (a->used, b->used);
2169 
2170   /* now divide in two */
2171   B = (int)((unsigned)B >> 1);
2172 
2173   /* init copy all the temps */
2174   if (mp_init_size (&x0, B) != MP_OKAY)
2175     goto ERR;
2176   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2177     goto X0;
2178   if (mp_init_size (&y0, B) != MP_OKAY)
2179     goto X1;
2180   if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2181     goto Y0;
2182 
2183   /* init temps */
2184   if (mp_init_size (&t1, B * 2) != MP_OKAY)
2185     goto Y1;
2186   if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2187     goto T1;
2188   if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2189     goto X0Y0;
2190 
2191   /* now shift the digits */
2192   x0.used = y0.used = B;
2193   x1.used = a->used - B;
2194   y1.used = b->used - B;
2195 
2196   {
2197     int x;
2198     mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2199 
2200     /* we copy the digits directly instead of using higher level functions
2201      * since we also need to shift the digits
2202      */
2203     tmpa = a->dp;
2204     tmpb = b->dp;
2205 
2206     tmpx = x0.dp;
2207     tmpy = y0.dp;
2208     for (x = 0; x < B; x++) {
2209       *tmpx++ = *tmpa++;
2210       *tmpy++ = *tmpb++;
2211     }
2212 
2213     tmpx = x1.dp;
2214     for (x = B; x < a->used; x++) {
2215       *tmpx++ = *tmpa++;
2216     }
2217 
2218     tmpy = y1.dp;
2219     for (x = B; x < b->used; x++) {
2220       *tmpy++ = *tmpb++;
2221     }
2222   }
2223 
2224   /* only need to clamp the lower words since by definition the
2225    * upper words x1/y1 must have a known number of digits
2226    */
2227   mp_clamp (&x0);
2228   mp_clamp (&y0);
2229 
2230   /* now calc the products x0y0 and x1y1 */
2231   /* after this x0 is no longer required, free temp [x0==t2]! */
2232   if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2233     goto X1Y1;          /* x0y0 = x0*y0 */
2234   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2235     goto X1Y1;          /* x1y1 = x1*y1 */
2236 
2237   /* now calc x1+x0 and y1+y0 */
2238   if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
2239     goto X1Y1;          /* t1 = x1 - x0 */
2240   if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
2241     goto X1Y1;          /* t2 = y1 - y0 */
2242   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2243     goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
2244 
2245   /* add x0y0 */
2246   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2247     goto X1Y1;          /* t2 = x0y0 + x1y1 */
2248   if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
2249     goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
2250 
2251   /* shift by B */
2252   if (mp_lshd (&t1, B) != MP_OKAY)
2253     goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2254   if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2255     goto X1Y1;          /* x1y1 = x1y1 << 2*B */
2256 
2257   if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2258     goto X1Y1;          /* t1 = x0y0 + t1 */
2259   if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2260     goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
2261 
2262   /* Algorithm succeeded set the return code to MP_OKAY */
2263   err = MP_OKAY;
2264 
2265 X1Y1:mp_clear (&x1y1);
2266 X0Y0:mp_clear (&x0y0);
2267 T1:mp_clear (&t1);
2268 Y1:mp_clear (&y1);
2269 Y0:mp_clear (&y0);
2270 X1:mp_clear (&x1);
2271 X0:mp_clear (&x0);
2272 ERR:
2273   return err;
2274 }
2275 
2276 /* Fast (comba) multiplier
2277  *
2278  * This is the fast column-array [comba] multiplier.  It is
2279  * designed to compute the columns of the product first
2280  * then handle the carries afterwards.  This has the effect
2281  * of making the nested loops that compute the columns very
2282  * simple and schedulable on super-scalar processors.
2283  *
2284  * This has been modified to produce a variable number of
2285  * digits of output so if say only a half-product is required
2286  * you don't have to compute the upper half (a feature
2287  * required for fast Barrett reduction).
2288  *
2289  * Based on Algorithm 14.12 on pp.595 of HAC.
2290  *
2291  */
2292 static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2293 {
2294   int     olduse, res, pa, ix, iz;
2295   /*LINTED*/
2296   mp_digit W[MP_WARRAY];
2297   mp_word  _W;
2298 
2299   /* grow the destination as required */
2300   if (c->alloc < digs) {
2301     if ((res = mp_grow (c, digs)) != MP_OKAY) {
2302       return res;
2303     }
2304   }
2305 
2306   /* number of output digits to produce */
2307   pa = MIN(digs, a->used + b->used);
2308 
2309   /* clear the carry */
2310   _W = 0;
2311   for (ix = 0; ix < pa; ix++) {
2312       int      tx, ty;
2313       int      iy;
2314       mp_digit *tmpx, *tmpy;
2315 
2316       /* get offsets into the two bignums */
2317       ty = MIN(b->used-1, ix);
2318       tx = ix - ty;
2319 
2320       /* setup temp aliases */
2321       tmpx = a->dp + tx;
2322       tmpy = b->dp + ty;
2323 
2324       /* this is the number of times the loop will iterrate, essentially
2325          while (tx++ < a->used && ty-- >= 0) { ... }
2326        */
2327       iy = MIN(a->used-tx, ty+1);
2328 
2329       /* execute loop */
2330       for (iz = 0; iz < iy; ++iz) {
2331          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2332 
2333       }
2334 
2335       /* store term */
2336       W[ix] = ((mp_digit)_W) & MP_MASK;
2337 
2338       /* make next carry */
2339       _W = _W >> ((mp_word)DIGIT_BIT);
2340  }
2341 
2342   /* setup dest */
2343   olduse  = c->used;
2344   c->used = pa;
2345 
2346   {
2347     mp_digit *tmpc;
2348     tmpc = c->dp;
2349     for (ix = 0; ix < pa+1; ix++) {
2350       /* now extract the previous digit [below the carry] */
2351       *tmpc++ = W[ix];
2352     }
2353 
2354     /* clear unused digits [that existed in the old copy of c] */
2355     for (; ix < olduse; ix++) {
2356       *tmpc++ = 0;
2357     }
2358   }
2359   mp_clamp (c);
2360   return MP_OKAY;
2361 }
2362 
2363 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_mul_digs.c,v $ */
2364 /* Revision: 1.2 $ */
2365 /* Date: 2011/03/18 16:22:09 $ */
2366 
2367 
2368 /* multiplies |a| * |b| and only computes upto digs digits of result
2369  * HAC pp. 595, Algorithm 14.12  Modified so you can control how
2370  * many digits of output are created.
2371  */
2372 static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2373 {
2374   mp_int  t;
2375   int     res, pa, pb, ix, iy;
2376   mp_digit u;
2377   mp_word r;
2378   mp_digit tmpx, *tmpt, *tmpy;
2379 
2380   /* can we use the fast multiplier? */
2381   if (((unsigned)(digs) < MP_WARRAY) &&
2382       MIN (a->used, b->used) <
2383           (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2384     return fast_s_mp_mul_digs (a, b, c, digs);
2385   }
2386 
2387   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
2388     return res;
2389   }
2390   t.used = digs;
2391 
2392   /* compute the digits of the product directly */
2393   pa = a->used;
2394   for (ix = 0; ix < pa; ix++) {
2395     /* set the carry to zero */
2396     u = 0;
2397 
2398     /* limit ourselves to making digs digits of output */
2399     pb = MIN (b->used, digs - ix);
2400 
2401     /* setup some aliases */
2402     /* copy of the digit from a used within the nested loop */
2403     tmpx = a->dp[ix];
2404 
2405     /* an alias for the destination shifted ix places */
2406     tmpt = t.dp + ix;
2407 
2408     /* an alias for the digits of b */
2409     tmpy = b->dp;
2410 
2411     /* compute the columns of the output and propagate the carry */
2412     for (iy = 0; iy < pb; iy++) {
2413       /* compute the column as a mp_word */
2414       r       = ((mp_word)*tmpt) +
2415                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2416                 ((mp_word) u);
2417 
2418       /* the new column is the lower part of the result */
2419       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2420 
2421       /* get the carry word from the result */
2422       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2423     }
2424     /* set carry if it is placed below digs */
2425     if (ix + iy < digs) {
2426       *tmpt = u;
2427     }
2428   }
2429 
2430   mp_clamp (&t);
2431   mp_exch (&t, c);
2432 
2433   mp_clear (&t);
2434   return MP_OKAY;
2435 }
2436 
2437 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_mul_digs.c,v $ */
2438 /* Revision: 1.3 $ */
2439 /* Date: 2011/03/18 16:43:04 $ */
2440 
2441 /* high level multiplication (handles sign) */
2442 static int
2443 mp_mul(mp_int * a, mp_int * b, mp_int * c)
2444 {
2445   int     res, neg;
2446   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2447 
2448   /* use Toom-Cook? */
2449   if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
2450     res = mp_toom_mul(a, b, c);
2451   } else
2452   /* use Karatsuba? */
2453   if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2454     res = mp_karatsuba_mul (a, b, c);
2455   } else
2456   {
2457     /* can we use the fast multiplier?
2458      *
2459      * The fast multiplier can be used if the output will
2460      * have less than MP_WARRAY digits and the number of
2461      * digits won't affect carry propagation
2462      */
2463     int     digs = a->used + b->used + 1;
2464 
2465     if (((unsigned)digs < MP_WARRAY) &&
2466         MIN(a->used, b->used) <=
2467         (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2468       res = fast_s_mp_mul_digs (a, b, c, digs);
2469     } else
2470       res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2471 
2472   }
2473   c->sign = (c->used > 0) ? neg : MP_ZPOS;
2474   return res;
2475 }
2476 
2477 /* this is a modified version of fast_s_mul_digs that only produces
2478  * output digits *above* digs.  See the comments for fast_s_mul_digs
2479  * to see how it works.
2480  *
2481  * This is used in the Barrett reduction since for one of the multiplications
2482  * only the higher digits were needed.  This essentially halves the work.
2483  *
2484  * Based on Algorithm 14.12 on pp.595 of HAC.
2485  */
2486 static int
2487 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2488 {
2489   int     olduse, res, pa, ix, iz;
2490   mp_digit W[MP_WARRAY];
2491   mp_word  _W;
2492 
2493   /* grow the destination as required */
2494   pa = a->used + b->used;
2495   if (c->alloc < pa) {
2496     if ((res = mp_grow (c, pa)) != MP_OKAY) {
2497       return res;
2498     }
2499   }
2500 
2501   /* number of output digits to produce */
2502   pa = a->used + b->used;
2503   _W = 0;
2504   for (ix = digs; ix < pa; ix++) {
2505       int      tx, ty, iy;
2506       mp_digit *tmpx, *tmpy;
2507 
2508       /* get offsets into the two bignums */
2509       ty = MIN(b->used-1, ix);
2510       tx = ix - ty;
2511 
2512       /* setup temp aliases */
2513       tmpx = a->dp + tx;
2514       tmpy = b->dp + ty;
2515 
2516       /* this is the number of times the loop will iterrate, essentially its
2517          while (tx++ < a->used && ty-- >= 0) { ... }
2518        */
2519       iy = MIN(a->used-tx, ty+1);
2520 
2521       /* execute loop */
2522       for (iz = 0; iz < iy; iz++) {
2523          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2524       }
2525 
2526       /* store term */
2527       W[ix] = ((mp_digit)_W) & MP_MASK;
2528 
2529       /* make next carry */
2530       _W = _W >> ((mp_word)DIGIT_BIT);
2531   }
2532 
2533   /* setup dest */
2534   olduse  = c->used;
2535   c->used = pa;
2536 
2537   {
2538     mp_digit *tmpc;
2539 
2540     tmpc = c->dp + digs;
2541     for (ix = digs; ix < pa; ix++) {
2542       /* now extract the previous digit [below the carry] */
2543       *tmpc++ = W[ix];
2544     }
2545 
2546     /* clear unused digits [that existed in the old copy of c] */
2547     for (; ix < olduse; ix++) {
2548       *tmpc++ = 0;
2549     }
2550   }
2551   mp_clamp (c);
2552   return MP_OKAY;
2553 }
2554 
2555 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_mul_high_digs.c,v $ */
2556 /* Revision: 1.1.1.1 $ */
2557 /* Date: 2011/03/12 22:58:18 $ */
2558 
2559 /* multiplies |a| * |b| and does not compute the lower digs digits
2560  * [meant to get the higher part of the product]
2561  */
2562 static int
2563 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2564 {
2565   mp_int  t;
2566   int     res, pa, pb, ix, iy;
2567   mp_digit u;
2568   mp_word r;
2569   mp_digit tmpx, *tmpt, *tmpy;
2570 
2571   /* can we use the fast multiplier? */
2572   if (((unsigned)(a->used + b->used + 1) < MP_WARRAY)
2573       && MIN (a->used, b->used) < (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2574     return fast_s_mp_mul_high_digs (a, b, c, digs);
2575   }
2576 
2577   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
2578     return res;
2579   }
2580   t.used = a->used + b->used + 1;
2581 
2582   pa = a->used;
2583   pb = b->used;
2584   for (ix = 0; ix < pa; ix++) {
2585     /* clear the carry */
2586     u = 0;
2587 
2588     /* left hand side of A[ix] * B[iy] */
2589     tmpx = a->dp[ix];
2590 
2591     /* alias to the address of where the digits will be stored */
2592     tmpt = &(t.dp[digs]);
2593 
2594     /* alias for where to read the right hand side from */
2595     tmpy = b->dp + (digs - ix);
2596 
2597     for (iy = digs - ix; iy < pb; iy++) {
2598       /* calculate the double precision result */
2599       r       = ((mp_word)*tmpt) +
2600                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2601                 ((mp_word) u);
2602 
2603       /* get the lower part */
2604       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2605 
2606       /* carry the carry */
2607       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2608     }
2609     *tmpt = u;
2610   }
2611   mp_clamp (&t);
2612   mp_exch (&t, c);
2613   mp_clear (&t);
2614   return MP_OKAY;
2615 }
2616 
2617 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_mul_high_digs.c,v $ */
2618 /* Revision: 1.3 $ */
2619 /* Date: 2011/03/18 16:43:04 $ */
2620 
2621 /* reduces x mod m, assumes 0 < x < m**2, mu is
2622  * precomputed via mp_reduce_setup.
2623  * From HAC pp.604 Algorithm 14.42
2624  */
2625 static int
2626 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
2627 {
2628   mp_int  q;
2629   int     res, um = m->used;
2630 
2631   /* q = x */
2632   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
2633     return res;
2634   }
2635 
2636   /* q1 = x / b**(k-1)  */
2637   mp_rshd (&q, um - 1);
2638 
2639   /* according to HAC this optimization is ok */
2640   if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
2641     if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
2642       goto CLEANUP;
2643     }
2644   } else {
2645     if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2646       goto CLEANUP;
2647     }
2648   }
2649 
2650   /* q3 = q2 / b**(k+1) */
2651   mp_rshd (&q, um + 1);
2652 
2653   /* x = x mod b**(k+1), quick (no division) */
2654   if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
2655     goto CLEANUP;
2656   }
2657 
2658   /* q = q * m mod b**(k+1), quick (no division) */
2659   if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
2660     goto CLEANUP;
2661   }
2662 
2663   /* x = x - q */
2664   if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
2665     goto CLEANUP;
2666   }
2667 
2668   /* If x < 0, add b**(k+1) to it */
2669   if (mp_cmp_d (x, 0) == MP_LT) {
2670     mp_set (&q, 1);
2671     if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
2672       goto CLEANUP;
2673     if ((res = mp_add (x, &q, x)) != MP_OKAY)
2674       goto CLEANUP;
2675   }
2676 
2677   /* Back off if it's too big */
2678   while (mp_cmp (x, m) != MP_LT) {
2679     if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
2680       goto CLEANUP;
2681     }
2682   }
2683 
2684 CLEANUP:
2685   mp_clear (&q);
2686 
2687   return res;
2688 }
2689 
2690 /* determines the setup value */
2691 static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
2692 {
2693    int    res;
2694    mp_int tmp;
2695 
2696    if ((res = mp_init(&tmp)) != MP_OKAY) {
2697       return res;
2698    }
2699 
2700    if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
2701       goto ERR;
2702    }
2703 
2704    if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
2705       goto ERR;
2706    }
2707 
2708 ERR:
2709    mp_clear(&tmp);
2710    return res;
2711 }
2712 
2713 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_setup_l.c,v $ */
2714 /* Revision: 1.1.1.1 $ */
2715 /* Date: 2011/03/12 22:58:18 $ */
2716 
2717 /* reduces a modulo n where n is of the form 2**p - d
2718    This differs from reduce_2k since "d" can be larger
2719    than a single digit.
2720 */
2721 static int
2722 mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
2723 {
2724    mp_int q;
2725    int    p, res;
2726 
2727    if ((res = mp_init(&q)) != MP_OKAY) {
2728       return res;
2729    }
2730 
2731    p = mp_count_bits(n);
2732 top:
2733    /* q = a/2**p, a = a mod 2**p */
2734    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
2735       goto ERR;
2736    }
2737 
2738    /* q = q * d */
2739    if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
2740       goto ERR;
2741    }
2742 
2743    /* a = a + q */
2744    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
2745       goto ERR;
2746    }
2747 
2748    if (mp_cmp_mag(a, n) != MP_LT) {
2749       s_mp_sub(a, n, a);
2750       goto top;
2751    }
2752 
2753 ERR:
2754    mp_clear(&q);
2755    return res;
2756 }
2757 
2758 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_l.c,v $ */
2759 /* Revision: 1.1.1.1 $ */
2760 /* Date: 2011/03/12 22:58:18 $ */
2761 
2762 /* squaring using Toom-Cook 3-way algorithm */
2763 static int
2764 mp_toom_sqr(mp_int *a, mp_int *b)
2765 {
2766     mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
2767     int res, B;
2768 
2769     /* init temps */
2770     if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
2771        return res;
2772     }
2773 
2774     /* B */
2775     B = a->used / 3;
2776 
2777     /* a = a2 * B**2 + a1 * B + a0 */
2778     if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
2779        goto ERR;
2780     }
2781 
2782     if ((res = mp_copy(a, &a1)) != MP_OKAY) {
2783        goto ERR;
2784     }
2785     mp_rshd(&a1, B);
2786     mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
2787 
2788     if ((res = mp_copy(a, &a2)) != MP_OKAY) {
2789        goto ERR;
2790     }
2791     mp_rshd(&a2, B*2);
2792 
2793     /* w0 = a0*a0 */
2794     if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
2795        goto ERR;
2796     }
2797 
2798     /* w4 = a2 * a2 */
2799     if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
2800        goto ERR;
2801     }
2802 
2803     /* w1 = (a2 + 2(a1 + 2a0))**2 */
2804     if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
2805        goto ERR;
2806     }
2807     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
2808        goto ERR;
2809     }
2810     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
2811        goto ERR;
2812     }
2813     if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
2814        goto ERR;
2815     }
2816 
2817     if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
2818        goto ERR;
2819     }
2820 
2821     /* w3 = (a0 + 2(a1 + 2a2))**2 */
2822     if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
2823        goto ERR;
2824     }
2825     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
2826        goto ERR;
2827     }
2828     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
2829        goto ERR;
2830     }
2831     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
2832        goto ERR;
2833     }
2834 
2835     if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
2836        goto ERR;
2837     }
2838 
2839 
2840     /* w2 = (a2 + a1 + a0)**2 */
2841     if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
2842        goto ERR;
2843     }
2844     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
2845        goto ERR;
2846     }
2847     if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
2848        goto ERR;
2849     }
2850 
2851     /* now solve the matrix
2852 
2853        0  0  0  0  1
2854        1  2  4  8  16
2855        1  1  1  1  1
2856        16 8  4  2  1
2857        1  0  0  0  0
2858 
2859        using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
2860      */
2861 
2862      /* r1 - r4 */
2863      if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
2864         goto ERR;
2865      }
2866      /* r3 - r0 */
2867      if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
2868         goto ERR;
2869      }
2870      /* r1/2 */
2871      if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
2872         goto ERR;
2873      }
2874      /* r3/2 */
2875      if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
2876         goto ERR;
2877      }
2878      /* r2 - r0 - r4 */
2879      if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
2880         goto ERR;
2881      }
2882      if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
2883         goto ERR;
2884      }
2885      /* r1 - r2 */
2886      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2887         goto ERR;
2888      }
2889      /* r3 - r2 */
2890      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2891         goto ERR;
2892      }
2893      /* r1 - 8r0 */
2894      if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
2895         goto ERR;
2896      }
2897      if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
2898         goto ERR;
2899      }
2900      /* r3 - 8r4 */
2901      if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
2902         goto ERR;
2903      }
2904      if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
2905         goto ERR;
2906      }
2907      /* 3r2 - r1 - r3 */
2908      if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
2909         goto ERR;
2910      }
2911      if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
2912         goto ERR;
2913      }
2914      if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
2915         goto ERR;
2916      }
2917      /* r1 - r2 */
2918      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2919         goto ERR;
2920      }
2921      /* r3 - r2 */
2922      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2923         goto ERR;
2924      }
2925      /* r1/3 */
2926      if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
2927         goto ERR;
2928      }
2929      /* r3/3 */
2930      if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
2931         goto ERR;
2932      }
2933 
2934      /* at this point shift W[n] by B*n */
2935      if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
2936         goto ERR;
2937      }
2938      if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
2939         goto ERR;
2940      }
2941      if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
2942         goto ERR;
2943      }
2944      if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
2945         goto ERR;
2946      }
2947 
2948      if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
2949         goto ERR;
2950      }
2951      if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
2952         goto ERR;
2953      }
2954      if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
2955         goto ERR;
2956      }
2957      if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
2958         goto ERR;
2959      }
2960 
2961 ERR:
2962      mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
2963      return res;
2964 }
2965 
2966 
2967 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_toom_sqr.c,v $ */
2968 /* Revision: 1.1.1.1 $ */
2969 /* Date: 2011/03/12 22:58:18 $ */
2970 
2971 /* Karatsuba squaring, computes b = a*a using three
2972  * half size squarings
2973  *
2974  * See comments of karatsuba_mul for details.  It
2975  * is essentially the same algorithm but merely
2976  * tuned to perform recursive squarings.
2977  */
2978 static int mp_karatsuba_sqr (mp_int * a, mp_int * b)
2979 {
2980   mp_int  x0, x1, t1, t2, x0x0, x1x1;
2981   int     B, err;
2982 
2983   err = MP_MEM;
2984 
2985   /* min # of digits */
2986   B = a->used;
2987 
2988   /* now divide in two */
2989   B = (unsigned)B >> 1;
2990 
2991   /* init copy all the temps */
2992   if (mp_init_size (&x0, B) != MP_OKAY)
2993     goto ERR;
2994   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2995     goto X0;
2996 
2997   /* init temps */
2998   if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2999     goto X1;
3000   if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
3001     goto T1;
3002   if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
3003     goto T2;
3004   if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
3005     goto X0X0;
3006 
3007   {
3008     int x;
3009     mp_digit *dst, *src;
3010 
3011     src = a->dp;
3012 
3013     /* now shift the digits */
3014     dst = x0.dp;
3015     for (x = 0; x < B; x++) {
3016       *dst++ = *src++;
3017     }
3018 
3019     dst = x1.dp;
3020     for (x = B; x < a->used; x++) {
3021       *dst++ = *src++;
3022     }
3023   }
3024 
3025   x0.used = B;
3026   x1.used = a->used - B;
3027 
3028   mp_clamp (&x0);
3029 
3030   /* now calc the products x0*x0 and x1*x1 */
3031   if (mp_sqr (&x0, &x0x0) != MP_OKAY)
3032     goto X1X1;           /* x0x0 = x0*x0 */
3033   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
3034     goto X1X1;           /* x1x1 = x1*x1 */
3035 
3036   /* now calc (x1+x0)**2 */
3037   if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
3038     goto X1X1;           /* t1 = x1 - x0 */
3039   if (mp_sqr (&t1, &t1) != MP_OKAY)
3040     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
3041 
3042   /* add x0y0 */
3043   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
3044     goto X1X1;           /* t2 = x0x0 + x1x1 */
3045   if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
3046     goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
3047 
3048   /* shift by B */
3049   if (mp_lshd (&t1, B) != MP_OKAY)
3050     goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
3051   if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
3052     goto X1X1;           /* x1x1 = x1x1 << 2*B */
3053 
3054   if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
3055     goto X1X1;           /* t1 = x0x0 + t1 */
3056   if (mp_add (&t1, &x1x1, b) != MP_OKAY)
3057     goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
3058 
3059   err = MP_OKAY;
3060 
3061 X1X1:mp_clear (&x1x1);
3062 X0X0:mp_clear (&x0x0);
3063 T2:mp_clear (&t2);
3064 T1:mp_clear (&t1);
3065 X1:mp_clear (&x1);
3066 X0:mp_clear (&x0);
3067 ERR:
3068   return err;
3069 }
3070 
3071 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_karatsuba_sqr.c,v $ */
3072 /* Revision: 1.2 $ */
3073 /* Date: 2011/03/12 23:43:54 $ */
3074 
3075 /* the jist of squaring...
3076  * you do like mult except the offset of the tmpx [one that
3077  * starts closer to zero] can't equal the offset of tmpy.
3078  * So basically you set up iy like before then you min it with
3079  * (ty-tx) so that it never happens.  You double all those
3080  * you add in the inner loop
3081 
3082 After that loop you do the squares and add them in.
3083 */
3084 
3085 static int fast_s_mp_sqr (mp_int * a, mp_int * b)
3086 {
3087   int       olduse, res, pa, ix, iz;
3088   mp_digit   W[MP_WARRAY], *tmpx;
3089   mp_word   W1;
3090 
3091   /* grow the destination as required */
3092   pa = a->used + a->used;
3093   if (b->alloc < pa) {
3094     if ((res = mp_grow (b, pa)) != MP_OKAY) {
3095       return res;
3096     }
3097   }
3098 
3099   /* number of output digits to produce */
3100   W1 = 0;
3101   for (ix = 0; ix < pa; ix++) {
3102       int      tx, ty, iy;
3103       mp_word  _W;
3104       mp_digit *tmpy;
3105 
3106       /* clear counter */
3107       _W = 0;
3108 
3109       /* get offsets into the two bignums */
3110       ty = MIN(a->used-1, ix);
3111       tx = ix - ty;
3112 
3113       /* setup temp aliases */
3114       tmpx = a->dp + tx;
3115       tmpy = a->dp + ty;
3116 
3117       /* this is the number of times the loop will iterrate, essentially
3118          while (tx++ < a->used && ty-- >= 0) { ... }
3119        */
3120       iy = MIN(a->used-tx, ty+1);
3121 
3122       /* now for squaring tx can never equal ty
3123        * we halve the distance since they approach at a rate of 2x
3124        * and we have to round because odd cases need to be executed
3125        */
3126       iy = MIN(iy, (int)((unsigned)(ty-tx+1)>>1));
3127 
3128       /* execute loop */
3129       for (iz = 0; iz < iy; iz++) {
3130          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
3131       }
3132 
3133       /* double the inner product and add carry */
3134       _W = _W + _W + W1;
3135 
3136       /* even columns have the square term in them */
3137       if ((ix&1) == 0) {
3138          _W += ((mp_word)a->dp[(unsigned)ix>>1])*((mp_word)a->dp[(unsigned)ix>>1]);
3139       }
3140 
3141       /* store it */
3142       W[ix] = (mp_digit)(_W & MP_MASK);
3143 
3144       /* make next carry */
3145       W1 = _W >> ((mp_word)DIGIT_BIT);
3146   }
3147 
3148   /* setup dest */
3149   olduse  = b->used;
3150   b->used = a->used+a->used;
3151 
3152   {
3153     mp_digit *tmpb;
3154     tmpb = b->dp;
3155     for (ix = 0; ix < pa; ix++) {
3156       *tmpb++ = W[ix] & MP_MASK;
3157     }
3158 
3159     /* clear unused digits [that existed in the old copy of c] */
3160     for (; ix < olduse; ix++) {
3161       *tmpb++ = 0;
3162     }
3163   }
3164   mp_clamp (b);
3165   return MP_OKAY;
3166 }
3167 
3168 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_sqr.c,v $ */
3169 /* Revision: 1.3 $ */
3170 /* Date: 2011/03/18 16:43:04 $ */
3171 
3172 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
3173 static int
3174 s_mp_sqr (mp_int * a, mp_int * b)
3175 {
3176   mp_int  t;
3177   int     res, ix, iy, pa;
3178   mp_word r;
3179   mp_digit u, tmpx, *tmpt;
3180 
3181   pa = a->used;
3182   if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
3183     return res;
3184   }
3185 
3186   /* default used is maximum possible size */
3187   t.used = 2*pa + 1;
3188 
3189   for (ix = 0; ix < pa; ix++) {
3190     /* first calculate the digit at 2*ix */
3191     /* calculate double precision result */
3192     r = ((mp_word) t.dp[2*ix]) +
3193         ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
3194 
3195     /* store lower part in result */
3196     t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
3197 
3198     /* get the carry */
3199     u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3200 
3201     /* left hand side of A[ix] * A[iy] */
3202     tmpx        = a->dp[ix];
3203 
3204     /* alias for where to store the results */
3205     tmpt        = t.dp + (2*ix + 1);
3206 
3207     for (iy = ix + 1; iy < pa; iy++) {
3208       /* first calculate the product */
3209       r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
3210 
3211       /* now calculate the double precision result, note we use
3212        * addition instead of *2 since it's easier to optimize
3213        */
3214       r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
3215 
3216       /* store lower part */
3217       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
3218 
3219       /* get carry */
3220       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3221     }
3222     /* propagate upwards */
3223     while (u != ((mp_digit) 0)) {
3224       r       = ((mp_word) *tmpt) + ((mp_word) u);
3225       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
3226       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3227     }
3228   }
3229 
3230   mp_clamp (&t);
3231   mp_exch (&t, b);
3232   mp_clear (&t);
3233   return MP_OKAY;
3234 }
3235 
3236 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_sqr.c,v $ */
3237 /* Revision: 1.1.1.1 $ */
3238 /* Date: 2011/03/12 22:58:18 $ */
3239 
3240 #define TOOM_SQR_CUTOFF      400
3241 #define KARATSUBA_SQR_CUTOFF 120
3242 
3243 /* computes b = a*a */
3244 static int
3245 mp_sqr (mp_int * a, mp_int * b)
3246 {
3247   int     res;
3248 
3249   /* use Toom-Cook? */
3250   if (a->used >= TOOM_SQR_CUTOFF) {
3251     res = mp_toom_sqr(a, b);
3252   /* Karatsuba? */
3253   } else
3254 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3255     res = mp_karatsuba_sqr (a, b);
3256   } else
3257   {
3258     /* can we use the fast comba multiplier? */
3259     if (((unsigned)a->used * 2 + 1) < MP_WARRAY &&
3260          a->used <
3261          (1 << (unsigned)(sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3262       res = fast_s_mp_sqr (a, b);
3263     } else
3264       res = s_mp_sqr (a, b);
3265   }
3266   b->sign = MP_ZPOS;
3267   return res;
3268 }
3269 
3270 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_sqr.c,v $ */
3271 /* Revision: 1.3 $ */
3272 /* Date: 2011/03/18 16:43:04 $ */
3273 
3274 #define TAB_SIZE 256
3275 
3276 static int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
3277 {
3278   mp_int  M[TAB_SIZE], res, mu;
3279   mp_digit buf;
3280   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3281   int (*redux)(mp_int*,mp_int*,mp_int*);
3282 
3283   /* find window size */
3284   x = mp_count_bits (X);
3285   if (x <= 7) {
3286     winsize = 2;
3287   } else if (x <= 36) {
3288     winsize = 3;
3289   } else if (x <= 140) {
3290     winsize = 4;
3291   } else if (x <= 450) {
3292     winsize = 5;
3293   } else if (x <= 1303) {
3294     winsize = 6;
3295   } else if (x <= 3529) {
3296     winsize = 7;
3297   } else {
3298     winsize = 8;
3299   }
3300 
3301   /* init M array */
3302   /* init first cell */
3303   if ((err = mp_init(&M[1])) != MP_OKAY) {
3304      return err;
3305   }
3306 
3307   /* now init the second half of the array */
3308   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3309     if ((err = mp_init(&M[x])) != MP_OKAY) {
3310       for (y = 1<<(winsize-1); y < x; y++) {
3311         mp_clear (&M[y]);
3312       }
3313       mp_clear(&M[1]);
3314       return err;
3315     }
3316   }
3317 
3318   /* create mu, used for Barrett reduction */
3319   if ((err = mp_init (&mu)) != MP_OKAY) {
3320     goto LBL_M;
3321   }
3322 
3323   if (redmode == 0) {
3324      if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
3325         goto LBL_MU;
3326      }
3327      redux = mp_reduce;
3328   } else {
3329      if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
3330         goto LBL_MU;
3331      }
3332      redux = mp_reduce_2k_l;
3333   }
3334 
3335   /* create M table
3336    *
3337    * The M table contains powers of the base,
3338    * e.g. M[x] = G**x mod P
3339    *
3340    * The first half of the table is not
3341    * computed though accept for M[0] and M[1]
3342    */
3343   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
3344     goto LBL_MU;
3345   }
3346 
3347   /* compute the value at M[1<<(winsize-1)] by squaring
3348    * M[1] (winsize-1) times
3349    */
3350   if ((err = mp_copy ( &M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
3351     goto LBL_MU;
3352   }
3353 
3354   for (x = 0; x < (winsize - 1); x++) {
3355     /* square it */
3356     if ((err = mp_sqr (&M[1 << (winsize - 1)],
3357                        &M[1 << (winsize - 1)])) != MP_OKAY) {
3358       goto LBL_MU;
3359     }
3360 
3361     /* reduce modulo P */
3362     if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
3363       goto LBL_MU;
3364     }
3365   }
3366 
3367   /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
3368    * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
3369    */
3370   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
3371     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
3372       goto LBL_MU;
3373     }
3374     if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
3375       goto LBL_MU;
3376     }
3377   }
3378 
3379   /* setup result */
3380   if ((err = mp_init (&res)) != MP_OKAY) {
3381     goto LBL_MU;
3382   }
3383   mp_set (&res, 1);
3384 
3385   /* set initial mode and bit cnt */
3386   mode   = 0;
3387   bitcnt = 1;
3388   buf    = 0;
3389   digidx = X->used - 1;
3390   bitcpy = 0;
3391   bitbuf = 0;
3392 
3393   for (;;) {
3394     /* grab next digit as required */
3395     if (--bitcnt == 0) {
3396       /* if digidx == -1 we are out of digits */
3397       if (digidx == -1) {
3398         break;
3399       }
3400       /* read next digit and reset the bitcnt */
3401       buf    = X->dp[digidx--];
3402       bitcnt = (int) DIGIT_BIT;
3403     }
3404 
3405     /* grab the next msb from the exponent */
3406     y     = (unsigned)(buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
3407     buf <<= (mp_digit)1;
3408 
3409     /* if the bit is zero and mode == 0 then we ignore it
3410      * These represent the leading zero bits before the first 1 bit
3411      * in the exponent.  Technically this opt is not required but it
3412      * does lower the # of trivial squaring/reductions used
3413      */
3414     if (mode == 0 && y == 0) {
3415       continue;
3416     }
3417 
3418     /* if the bit is zero and mode == 1 then we square */
3419     if (mode == 1 && y == 0) {
3420       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3421         goto LBL_RES;
3422       }
3423       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3424         goto LBL_RES;
3425       }
3426       continue;
3427     }
3428 
3429     /* else we add it to the window */
3430     bitbuf |= (y << (winsize - ++bitcpy));
3431     mode    = 2;
3432 
3433     if (bitcpy == winsize) {
3434       /* ok window is filled so square as required and multiply  */
3435       /* square first */
3436       for (x = 0; x < winsize; x++) {
3437         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3438           goto LBL_RES;
3439         }
3440         if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3441           goto LBL_RES;
3442         }
3443       }
3444 
3445       /* then multiply */
3446       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
3447         goto LBL_RES;
3448       }
3449       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3450         goto LBL_RES;
3451       }
3452 
3453       /* empty window and reset */
3454       bitcpy = 0;
3455       bitbuf = 0;
3456       mode   = 1;
3457     }
3458   }
3459 
3460   /* if bits remain then square/multiply */
3461   if (mode == 2 && bitcpy > 0) {
3462     /* square then multiply if the bit is set */
3463     for (x = 0; x < bitcpy; x++) {
3464       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3465         goto LBL_RES;
3466       }
3467       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3468         goto LBL_RES;
3469       }
3470 
3471       bitbuf <<= 1;
3472       if ((bitbuf & (1 << winsize)) != 0) {
3473         /* then multiply */
3474         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
3475           goto LBL_RES;
3476         }
3477         if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3478           goto LBL_RES;
3479         }
3480       }
3481     }
3482   }
3483 
3484   mp_exch (&res, Y);
3485   err = MP_OKAY;
3486 LBL_RES:mp_clear (&res);
3487 LBL_MU:mp_clear (&mu);
3488 LBL_M:
3489   mp_clear(&M[1]);
3490   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3491     mp_clear (&M[x]);
3492   }
3493   return err;
3494 }
3495 
3496 /* determines if a number is a valid DR modulus */
3497 static int
3498 mp_dr_is_modulus(mp_int *a)
3499 {
3500    int ix;
3501 
3502    /* must be at least two digits */
3503    if (a->used < 2) {
3504       return 0;
3505    }
3506 
3507    /* must be of the form b**k - a [a <= b] so all
3508     * but the first digit must be equal to -1 (mod b).
3509     */
3510    for (ix = 1; ix < a->used; ix++) {
3511        if (a->dp[ix] != MP_MASK) {
3512           return 0;
3513        }
3514    }
3515    return 1;
3516 }
3517 
3518 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_is_modulus.c,v $ */
3519 /* Revision: 1.1.1.1 $ */
3520 /* Date: 2011/03/12 22:58:18 $ */
3521 
3522 /* determines if mp_reduce_2k can be used */
3523 static int mp_reduce_is_2k(mp_int *a)
3524 {
3525    int ix, iy, iw;
3526    mp_digit iz;
3527 
3528    if (a->used == 0) {
3529       return MP_NO;
3530    } else if (a->used == 1) {
3531       return MP_YES;
3532    } else if (a->used > 1) {
3533       iy = mp_count_bits(a);
3534       iz = 1;
3535       iw = 1;
3536 
3537       /* Test every bit from the second digit up, must be 1 */
3538       for (ix = DIGIT_BIT; ix < iy; ix++) {
3539           if ((a->dp[iw] & iz) == 0) {
3540              return MP_NO;
3541           }
3542           iz <<= 1;
3543           if (iz > (mp_digit)MP_MASK) {
3544              ++iw;
3545              iz = 1;
3546           }
3547       }
3548    }
3549    return MP_YES;
3550 }
3551 
3552 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_is_2k.c,v $ */
3553 /* Revision: 1.1.1.1 $ */
3554 /* Date: 2011/03/12 22:58:18 $ */
3555 
3556 
3557 /* d = a * b (mod c) */
3558 static int
3559 mp_mulmod (mp_int *d, mp_int * a, mp_int * b, mp_int * c)
3560 {
3561   int     res;
3562   mp_int  t;
3563 
3564   if ((res = mp_init (&t)) != MP_OKAY) {
3565     return res;
3566   }
3567 
3568   if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3569     mp_clear (&t);
3570     return res;
3571   }
3572   res = mp_mod (&t, c, d);
3573   mp_clear (&t);
3574   return res;
3575 }
3576 
3577 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_mulmod.c,v $ */
3578 /* Revision: 1.1.1.1 $ */
3579 /* Date: 2011/03/12 22:58:18 $ */
3580 
3581 /* setups the montgomery reduction stuff */
3582 static int
3583 mp_montgomery_setup (mp_int * n, mp_digit * rho)
3584 {
3585   mp_digit x, b;
3586 
3587 /* fast inversion mod 2**k
3588  *
3589  * Based on the fact that
3590  *
3591  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
3592  *                    =>  2*X*A - X*X*A*A = 1
3593  *                    =>  2*(1) - (1)     = 1
3594  */
3595   b = n->dp[0];
3596 
3597   if ((b & 1) == 0) {
3598     return MP_VAL;
3599   }
3600 
3601   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
3602   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
3603   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
3604   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
3605   if (/*CONSTCOND*/sizeof(mp_digit) == 8) {
3606 	  x *= 2 - b * x;	/* here x*a==1 mod 2**64 */
3607   }
3608 
3609   /* rho = -1/m mod b */
3610   *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
3611 
3612   return MP_OKAY;
3613 }
3614 
3615 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_setup.c,v $ */
3616 /* Revision: 1.1.1.1 $ */
3617 /* Date: 2011/03/12 22:58:18 $ */
3618 
3619 /* computes xR**-1 == x (mod N) via Montgomery Reduction
3620  *
3621  * This is an optimized implementation of montgomery_reduce
3622  * which uses the comba method to quickly calculate the columns of the
3623  * reduction.
3624  *
3625  * Based on Algorithm 14.32 on pp.601 of HAC.
3626 */
3627 static int
3628 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
3629 {
3630   int     ix, res, olduse;
3631   /*LINTED*/
3632   mp_word W[MP_WARRAY];
3633 
3634   /* get old used count */
3635   olduse = x->used;
3636 
3637   /* grow a as required */
3638   if (x->alloc < n->used + 1) {
3639     if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
3640       return res;
3641     }
3642   }
3643 
3644   /* first we have to get the digits of the input into
3645    * an array of double precision words W[...]
3646    */
3647   {
3648     mp_word *_W;
3649     mp_digit *tmpx;
3650 
3651     /* alias for the W[] array */
3652     _W   = W;
3653 
3654     /* alias for the digits of  x*/
3655     tmpx = x->dp;
3656 
3657     /* copy the digits of a into W[0..a->used-1] */
3658     for (ix = 0; ix < x->used; ix++) {
3659       *_W++ = *tmpx++;
3660     }
3661 
3662     /* zero the high words of W[a->used..m->used*2] */
3663     for (; ix < n->used * 2 + 1; ix++) {
3664       *_W++ = 0;
3665     }
3666   }
3667 
3668   /* now we proceed to zero successive digits
3669    * from the least significant upwards
3670    */
3671   for (ix = 0; ix < n->used; ix++) {
3672     /* mu = ai * m' mod b
3673      *
3674      * We avoid a double precision multiplication (which isn't required)
3675      * by casting the value down to a mp_digit.  Note this requires
3676      * that W[ix-1] have  the carry cleared (see after the inner loop)
3677      */
3678     mp_digit mu;
3679     mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
3680 
3681     /* a = a + mu * m * b**i
3682      *
3683      * This is computed in place and on the fly.  The multiplication
3684      * by b**i is handled by offseting which columns the results
3685      * are added to.
3686      *
3687      * Note the comba method normally doesn't handle carries in the
3688      * inner loop In this case we fix the carry from the previous
3689      * column since the Montgomery reduction requires digits of the
3690      * result (so far) [see above] to work.  This is
3691      * handled by fixing up one carry after the inner loop.  The
3692      * carry fixups are done in order so after these loops the
3693      * first m->used words of W[] have the carries fixed
3694      */
3695     {
3696       int iy;
3697       mp_digit *tmpn;
3698       mp_word *_W;
3699 
3700       /* alias for the digits of the modulus */
3701       tmpn = n->dp;
3702 
3703       /* Alias for the columns set by an offset of ix */
3704       _W = W + ix;
3705 
3706       /* inner loop */
3707       for (iy = 0; iy < n->used; iy++) {
3708           *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
3709       }
3710     }
3711 
3712     /* now fix carry for next digit, W[ix+1] */
3713     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
3714   }
3715 
3716   /* now we have to propagate the carries and
3717    * shift the words downward [all those least
3718    * significant digits we zeroed].
3719    */
3720   {
3721     mp_digit *tmpx;
3722     mp_word *_W, *_W1;
3723 
3724     /* nox fix rest of carries */
3725 
3726     /* alias for current word */
3727     _W1 = W + ix;
3728 
3729     /* alias for next word, where the carry goes */
3730     _W = W + ++ix;
3731 
3732     for (; ix <= n->used * 2 + 1; ix++) {
3733       *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
3734     }
3735 
3736     /* copy out, A = A/b**n
3737      *
3738      * The result is A/b**n but instead of converting from an
3739      * array of mp_word to mp_digit than calling mp_rshd
3740      * we just copy them in the right order
3741      */
3742 
3743     /* alias for destination word */
3744     tmpx = x->dp;
3745 
3746     /* alias for shifted double precision result */
3747     _W = W + n->used;
3748 
3749     for (ix = 0; ix < n->used + 1; ix++) {
3750       *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
3751     }
3752 
3753     /* zero oldused digits, if the input a was larger than
3754      * m->used+1 we'll have to clear the digits
3755      */
3756     for (; ix < olduse; ix++) {
3757       *tmpx++ = 0;
3758     }
3759   }
3760 
3761   /* set the max used and clamp */
3762   x->used = n->used + 1;
3763   mp_clamp (x);
3764 
3765   /* if A >= m then A = A - m */
3766   if (mp_cmp_mag (x, n) != MP_LT) {
3767     return s_mp_sub (x, n, x);
3768   }
3769   return MP_OKAY;
3770 }
3771 
3772 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_mp_montgomery_reduce.c,v $ */
3773 /* Revision: 1.2 $ */
3774 /* Date: 2011/03/18 16:22:09 $ */
3775 
3776 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
3777 static int
3778 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
3779 {
3780   int     ix, res, digs;
3781   mp_digit mu;
3782 
3783   /* can the fast reduction [comba] method be used?
3784    *
3785    * Note that unlike in mul you're safely allowed *less*
3786    * than the available columns [255 per default] since carries
3787    * are fixed up in the inner loop.
3788    */
3789   digs = n->used * 2 + 1;
3790   if (((unsigned)digs < MP_WARRAY) &&
3791       n->used <
3792       (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3793     return fast_mp_montgomery_reduce (x, n, rho);
3794   }
3795 
3796   /* grow the input as required */
3797   if (x->alloc < digs) {
3798     if ((res = mp_grow (x, digs)) != MP_OKAY) {
3799       return res;
3800     }
3801   }
3802   x->used = digs;
3803 
3804   for (ix = 0; ix < n->used; ix++) {
3805     /* mu = ai * rho mod b
3806      *
3807      * The value of rho must be precalculated via
3808      * montgomery_setup() such that
3809      * it equals -1/n0 mod b this allows the
3810      * following inner loop to reduce the
3811      * input one digit at a time
3812      */
3813     mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
3814 
3815     /* a = a + mu * m * b**i */
3816     {
3817       int iy;
3818       mp_digit *tmpn, *tmpx, u;
3819       mp_word r;
3820 
3821       /* alias for digits of the modulus */
3822       tmpn = n->dp;
3823 
3824       /* alias for the digits of x [the input] */
3825       tmpx = x->dp + ix;
3826 
3827       /* set the carry to zero */
3828       u = 0;
3829 
3830       /* Multiply and add in place */
3831       for (iy = 0; iy < n->used; iy++) {
3832         /* compute product and sum */
3833         r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
3834                   ((mp_word) u) + ((mp_word) * tmpx);
3835 
3836         /* get carry */
3837         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3838 
3839         /* fix digit */
3840         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
3841       }
3842       /* At this point the ix'th digit of x should be zero */
3843 
3844 
3845       /* propagate carries upwards as required*/
3846       while (u) {
3847         *tmpx   += u;
3848         u        = *tmpx >> DIGIT_BIT;
3849         *tmpx++ &= MP_MASK;
3850       }
3851     }
3852   }
3853 
3854   /* at this point the n.used'th least
3855    * significant digits of x are all zero
3856    * which means we can shift x to the
3857    * right by n.used digits and the
3858    * residue is unchanged.
3859    */
3860 
3861   /* x = x/b**n.used */
3862   mp_clamp(x);
3863   mp_rshd (x, n->used);
3864 
3865   /* if x >= n then x = x - n */
3866   if (mp_cmp_mag (x, n) != MP_LT) {
3867     return s_mp_sub (x, n, x);
3868   }
3869 
3870   return MP_OKAY;
3871 }
3872 
3873 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_reduce.c,v $ */
3874 /* Revision: 1.3 $ */
3875 /* Date: 2011/03/18 16:43:04 $ */
3876 
3877 /* determines the setup value */
3878 static void
3879 mp_dr_setup(mp_int *a, mp_digit *d)
3880 {
3881    /* the casts are required if DIGIT_BIT is one less than
3882     * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
3883     */
3884    *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
3885         ((mp_word)a->dp[0]));
3886 }
3887 
3888 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_setup.c,v $ */
3889 /* Revision: 1.1.1.1 $ */
3890 /* Date: 2011/03/12 22:58:18 $ */
3891 
3892 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
3893  *
3894  * Based on algorithm from the paper
3895  *
3896  * "Generating Efficient Primes for Discrete Log Cryptosystems"
3897  *                 Chae Hoon Lim, Pil Joong Lee,
3898  *          POSTECH Information Research Laboratories
3899  *
3900  * The modulus must be of a special format [see manual]
3901  *
3902  * Has been modified to use algorithm 7.10 from the LTM book instead
3903  *
3904  * Input x must be in the range 0 <= x <= (n-1)**2
3905  */
3906 static int
3907 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
3908 {
3909   int      err, i, m;
3910   mp_word  r;
3911   mp_digit mu, *tmpx1, *tmpx2;
3912 
3913   /* m = digits in modulus */
3914   m = n->used;
3915 
3916   /* ensure that "x" has at least 2m digits */
3917   if (x->alloc < m + m) {
3918     if ((err = mp_grow (x, m + m)) != MP_OKAY) {
3919       return err;
3920     }
3921   }
3922 
3923 /* top of loop, this is where the code resumes if
3924  * another reduction pass is required.
3925  */
3926 top:
3927   /* aliases for digits */
3928   /* alias for lower half of x */
3929   tmpx1 = x->dp;
3930 
3931   /* alias for upper half of x, or x/B**m */
3932   tmpx2 = x->dp + m;
3933 
3934   /* set carry to zero */
3935   mu = 0;
3936 
3937   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
3938   for (i = 0; i < m; i++) {
3939       r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
3940       *tmpx1++  = (mp_digit)(r & MP_MASK);
3941       mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
3942   }
3943 
3944   /* set final carry */
3945   *tmpx1++ = mu;
3946 
3947   /* zero words above m */
3948   for (i = m + 1; i < x->used; i++) {
3949       *tmpx1++ = 0;
3950   }
3951 
3952   /* clamp, sub and return */
3953   mp_clamp (x);
3954 
3955   /* if x >= n then subtract and reduce again
3956    * Each successive "recursion" makes the input smaller and smaller.
3957    */
3958   if (mp_cmp_mag (x, n) != MP_LT) {
3959     s_mp_sub(x, n, x);
3960     goto top;
3961   }
3962   return MP_OKAY;
3963 }
3964 
3965 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_reduce.c,v $ */
3966 /* Revision: 1.1.1.1 $ */
3967 /* Date: 2011/03/12 22:58:18 $ */
3968 
3969 /* determines the setup value */
3970 static int
3971 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
3972 {
3973    int res, p;
3974    mp_int tmp;
3975 
3976    if ((res = mp_init(&tmp)) != MP_OKAY) {
3977       return res;
3978    }
3979 
3980    p = mp_count_bits(a);
3981    if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3982       mp_clear(&tmp);
3983       return res;
3984    }
3985 
3986    if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3987       mp_clear(&tmp);
3988       return res;
3989    }
3990 
3991    *d = tmp.dp[0];
3992    mp_clear(&tmp);
3993    return MP_OKAY;
3994 }
3995 
3996 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_setup.c,v $ */
3997 /* Revision: 1.1.1.1 $ */
3998 /* Date: 2011/03/12 22:58:18 $ */
3999 
4000 /* reduces a modulo n where n is of the form 2**p - d */
4001 static int
4002 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
4003 {
4004    mp_int q;
4005    int    p, res;
4006 
4007    if ((res = mp_init(&q)) != MP_OKAY) {
4008       return res;
4009    }
4010 
4011    p = mp_count_bits(n);
4012 top:
4013    /* q = a/2**p, a = a mod 2**p */
4014    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
4015       goto ERR;
4016    }
4017 
4018    if (d != 1) {
4019       /* q = q * d */
4020       if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
4021          goto ERR;
4022       }
4023    }
4024 
4025    /* a = a + q */
4026    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
4027       goto ERR;
4028    }
4029 
4030    if (mp_cmp_mag(a, n) != MP_LT) {
4031       s_mp_sub(a, n, a);
4032       goto top;
4033    }
4034 
4035 ERR:
4036    mp_clear(&q);
4037    return res;
4038 }
4039 
4040 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k.c,v $ */
4041 /* Revision: 1.1.1.1 $ */
4042 /* Date: 2011/03/12 22:58:18 $ */
4043 
4044 /*
4045  * shifts with subtractions when the result is greater than b.
4046  *
4047  * The method is slightly modified to shift B unconditionally upto just under
4048  * the leading bit of b.  This saves alot of multiple precision shifting.
4049  */
4050 static int
4051 mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
4052 {
4053   int     x, bits, res;
4054 
4055   /* how many bits of last digit does b use */
4056   bits = mp_count_bits (b) % DIGIT_BIT;
4057 
4058   if (b->used > 1) {
4059      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
4060         return res;
4061      }
4062   } else {
4063      mp_set(a, 1);
4064      bits = 1;
4065   }
4066 
4067 
4068   /* now compute C = A * B mod b */
4069   for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
4070     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
4071       return res;
4072     }
4073     if (mp_cmp_mag (a, b) != MP_LT) {
4074       if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
4075         return res;
4076       }
4077     }
4078   }
4079 
4080   return MP_OKAY;
4081 }
4082 
4083 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_calc_normalization.c,v $ */
4084 /* Revision: 1.1.1.1 $ */
4085 /* Date: 2011/03/12 22:58:18 $ */
4086 
4087 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
4088  *
4089  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
4090  * The value of k changes based on the size of the exponent.
4091  *
4092  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
4093  */
4094 
4095 #define TAB_SIZE 256
4096 
4097 static int
4098 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
4099 {
4100   mp_int  M[TAB_SIZE], res;
4101   mp_digit buf, mp;
4102   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
4103 
4104   /* use a pointer to the reduction algorithm.  This allows us to use
4105    * one of many reduction algorithms without modding the guts of
4106    * the code with if statements everywhere.
4107    */
4108   int     (*redux)(mp_int*,mp_int*,mp_digit);
4109 
4110   /* find window size */
4111   x = mp_count_bits (X);
4112   if (x <= 7) {
4113     winsize = 2;
4114   } else if (x <= 36) {
4115     winsize = 3;
4116   } else if (x <= 140) {
4117     winsize = 4;
4118   } else if (x <= 450) {
4119     winsize = 5;
4120   } else if (x <= 1303) {
4121     winsize = 6;
4122   } else if (x <= 3529) {
4123     winsize = 7;
4124   } else {
4125     winsize = 8;
4126   }
4127 
4128   /* init M array */
4129   /* init first cell */
4130   if ((err = mp_init(&M[1])) != MP_OKAY) {
4131      return err;
4132   }
4133 
4134   /* now init the second half of the array */
4135   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4136     if ((err = mp_init(&M[x])) != MP_OKAY) {
4137       for (y = 1<<(winsize-1); y < x; y++) {
4138         mp_clear (&M[y]);
4139       }
4140       mp_clear(&M[1]);
4141       return err;
4142     }
4143   }
4144 
4145   /* determine and setup reduction code */
4146   if (redmode == 0) {
4147      /* now setup montgomery  */
4148      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
4149         goto LBL_M;
4150      }
4151 
4152      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
4153      if (((unsigned)(P->used * 2 + 1) < MP_WARRAY) &&
4154           P->used < (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4155         redux = fast_mp_montgomery_reduce;
4156      } else
4157      {
4158         /* use slower baseline Montgomery method */
4159         redux = mp_montgomery_reduce;
4160      }
4161   } else if (redmode == 1) {
4162      /* setup DR reduction for moduli of the form B**k - b */
4163      mp_dr_setup(P, &mp);
4164      redux = mp_dr_reduce;
4165   } else {
4166      /* setup DR reduction for moduli of the form 2**k - b */
4167      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
4168         goto LBL_M;
4169      }
4170      redux = mp_reduce_2k;
4171   }
4172 
4173   /* setup result */
4174   if ((err = mp_init (&res)) != MP_OKAY) {
4175     goto LBL_M;
4176   }
4177 
4178   /* create M table
4179    *
4180 
4181    *
4182    * The first half of the table is not computed though accept for M[0] and M[1]
4183    */
4184 
4185   if (redmode == 0) {
4186      /* now we need R mod m */
4187      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
4188        goto LBL_RES;
4189      }
4190 
4191      /* now set M[1] to G * R mod m */
4192      if ((err = mp_mulmod (&M[1], G, &res, P)) != MP_OKAY) {
4193        goto LBL_RES;
4194      }
4195   } else {
4196      mp_set(&res, 1);
4197      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
4198         goto LBL_RES;
4199      }
4200   }
4201 
4202   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
4203   if ((err = mp_copy ( &M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4204     goto LBL_RES;
4205   }
4206 
4207   for (x = 0; x < (winsize - 1); x++) {
4208     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
4209       goto LBL_RES;
4210     }
4211     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
4212       goto LBL_RES;
4213     }
4214   }
4215 
4216   /* create upper table */
4217   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4218     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4219       goto LBL_RES;
4220     }
4221     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
4222       goto LBL_RES;
4223     }
4224   }
4225 
4226   /* set initial mode and bit cnt */
4227   mode   = 0;
4228   bitcnt = 1;
4229   buf    = 0;
4230   digidx = X->used - 1;
4231   bitcpy = 0;
4232   bitbuf = 0;
4233 
4234   for (;;) {
4235     /* grab next digit as required */
4236     if (--bitcnt == 0) {
4237       /* if digidx == -1 we are out of digits so break */
4238       if (digidx == -1) {
4239         break;
4240       }
4241       /* read next digit and reset bitcnt */
4242       buf    = X->dp[digidx--];
4243       bitcnt = (int)DIGIT_BIT;
4244     }
4245 
4246     /* grab the next msb from the exponent */
4247     y     = (int)(mp_digit)((mp_digit)buf >> (unsigned)(DIGIT_BIT - 1)) & 1;
4248     buf <<= (mp_digit)1;
4249 
4250     /* if the bit is zero and mode == 0 then we ignore it
4251      * These represent the leading zero bits before the first 1 bit
4252      * in the exponent.  Technically this opt is not required but it
4253      * does lower the # of trivial squaring/reductions used
4254      */
4255     if (mode == 0 && y == 0) {
4256       continue;
4257     }
4258 
4259     /* if the bit is zero and mode == 1 then we square */
4260     if (mode == 1 && y == 0) {
4261       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4262         goto LBL_RES;
4263       }
4264       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4265         goto LBL_RES;
4266       }
4267       continue;
4268     }
4269 
4270     /* else we add it to the window */
4271     bitbuf |= (y << (winsize - ++bitcpy));
4272     mode    = 2;
4273 
4274     if (bitcpy == winsize) {
4275       /* ok window is filled so square as required and multiply  */
4276       /* square first */
4277       for (x = 0; x < winsize; x++) {
4278         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4279           goto LBL_RES;
4280         }
4281         if ((err = redux (&res, P, mp)) != MP_OKAY) {
4282           goto LBL_RES;
4283         }
4284       }
4285 
4286       /* then multiply */
4287       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4288         goto LBL_RES;
4289       }
4290       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4291         goto LBL_RES;
4292       }
4293 
4294       /* empty window and reset */
4295       bitcpy = 0;
4296       bitbuf = 0;
4297       mode   = 1;
4298     }
4299   }
4300 
4301   /* if bits remain then square/multiply */
4302   if (mode == 2 && bitcpy > 0) {
4303     /* square then multiply if the bit is set */
4304     for (x = 0; x < bitcpy; x++) {
4305       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4306         goto LBL_RES;
4307       }
4308       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4309         goto LBL_RES;
4310       }
4311 
4312       /* get next bit of the window */
4313       bitbuf <<= 1;
4314       if ((bitbuf & (1 << winsize)) != 0) {
4315         /* then multiply */
4316         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4317           goto LBL_RES;
4318         }
4319         if ((err = redux (&res, P, mp)) != MP_OKAY) {
4320           goto LBL_RES;
4321         }
4322       }
4323     }
4324   }
4325 
4326   if (redmode == 0) {
4327      /* fixup result if Montgomery reduction is used
4328       * recall that any value in a Montgomery system is
4329       * actually multiplied by R mod n.  So we have
4330       * to reduce one more time to cancel out the factor
4331       * of R.
4332       */
4333      if ((err = redux(&res, P, mp)) != MP_OKAY) {
4334        goto LBL_RES;
4335      }
4336   }
4337 
4338   /* swap res with Y */
4339   mp_exch (&res, Y);
4340   err = MP_OKAY;
4341 LBL_RES:mp_clear (&res);
4342 LBL_M:
4343   mp_clear(&M[1]);
4344   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4345     mp_clear (&M[x]);
4346   }
4347   return err;
4348 }
4349 
4350 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_exptmod_fast.c,v $ */
4351 /* Revision: 1.4 $ */
4352 /* Date: 2011/03/18 16:43:04 $ */
4353 
4354 /* this is a shell function that calls either the normal or Montgomery
4355  * exptmod functions.  Originally the call to the montgomery code was
4356  * embedded in the normal function but that wasted alot of stack space
4357  * for nothing (since 99% of the time the Montgomery code would be called)
4358  */
4359 static int
4360 mp_exptmod(mp_int * G, mp_int * X, mp_int * P, mp_int *Y)
4361 {
4362   int dr;
4363 
4364   /* modulus P must be positive */
4365   if (P->sign == MP_NEG) {
4366      return MP_VAL;
4367   }
4368 
4369   /* if exponent X is negative we have to recurse */
4370   if (X->sign == MP_NEG) {
4371      mp_int tmpG, tmpX;
4372      int err;
4373 
4374      /* first compute 1/G mod P */
4375      if ((err = mp_init(&tmpG)) != MP_OKAY) {
4376         return err;
4377      }
4378      if ((err = mp_invmod(&tmpG, G, P)) != MP_OKAY) {
4379         mp_clear(&tmpG);
4380         return err;
4381      }
4382 
4383      /* now get |X| */
4384      if ((err = mp_init(&tmpX)) != MP_OKAY) {
4385         mp_clear(&tmpG);
4386         return err;
4387      }
4388      if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
4389         mp_clear_multi(&tmpG, &tmpX, NULL);
4390         return err;
4391      }
4392 
4393      /* and now compute (1/G)**|X| instead of G**X [X < 0] */
4394      err = mp_exptmod(&tmpG, &tmpX, P, Y);
4395      mp_clear_multi(&tmpG, &tmpX, NULL);
4396      return err;
4397   }
4398 
4399 /* modified diminished radix reduction */
4400   if (mp_reduce_is_2k_l(P) == MP_YES) {
4401      return s_mp_exptmod(G, X, P, Y, 1);
4402   }
4403 
4404   /* is it a DR modulus? */
4405   dr = mp_dr_is_modulus(P);
4406 
4407   /* if not, is it a unrestricted DR modulus? */
4408   if (dr == 0) {
4409      dr = mp_reduce_is_2k(P) << 1;
4410   }
4411 
4412   /* if the modulus is odd or dr != 0 use the montgomery method */
4413   if (BN_is_odd (P) == 1 || dr !=  0) {
4414     return mp_exptmod_fast (G, X, P, Y, dr);
4415   } else {
4416     /* otherwise use the generic Barrett reduction technique */
4417     return s_mp_exptmod (G, X, P, Y, 0);
4418   }
4419 }
4420 
4421 /* reverse an array, used for radix code */
4422 static void
4423 bn_reverse(unsigned char *s, int len)
4424 {
4425   int     ix, iy;
4426   unsigned char t;
4427 
4428   ix = 0;
4429   iy = len - 1;
4430   while (ix < iy) {
4431     t     = s[ix];
4432     s[ix] = s[iy];
4433     s[iy] = t;
4434     ++ix;
4435     --iy;
4436   }
4437 }
4438 
4439 static int
4440 s_is_power_of_two(mp_digit b, int *p)
4441 {
4442    int x;
4443 
4444    /* fast return if no power of two */
4445    if ((b==0) || (b & (b-1))) {
4446       return 0;
4447    }
4448 
4449    for (x = 0; x < DIGIT_BIT; x++) {
4450       if (b == (((mp_digit)1)<<x)) {
4451          *p = x;
4452          return 1;
4453       }
4454    }
4455    return 0;
4456 }
4457 
4458 /* single digit division (based on routine from MPI) */
4459 static int
4460 mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
4461 {
4462   mp_int  q;
4463   mp_word w;
4464   mp_digit t;
4465   int     res, ix;
4466 
4467   /* cannot divide by zero */
4468   if (b == 0) {
4469      return MP_VAL;
4470   }
4471 
4472   /* quick outs */
4473   if (b == 1 || mp_iszero(a) == 1) {
4474      if (d != NULL) {
4475         *d = 0;
4476      }
4477      if (c != NULL) {
4478         return mp_copy(a, c);
4479      }
4480      return MP_OKAY;
4481   }
4482 
4483   /* power of two ? */
4484   if (s_is_power_of_two(b, &ix) == 1) {
4485      if (d != NULL) {
4486         *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
4487      }
4488      if (c != NULL) {
4489         return mp_div_2d(a, ix, c, NULL);
4490      }
4491      return MP_OKAY;
4492   }
4493 
4494 #ifdef BN_MP_DIV_3_C
4495   /* three? */
4496   if (b == 3) {
4497      return mp_div_3(a, c, d);
4498   }
4499 #endif
4500 
4501   /* no easy answer [c'est la vie].  Just division */
4502   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
4503      return res;
4504   }
4505 
4506   q.used = a->used;
4507   q.sign = a->sign;
4508   w = 0;
4509   for (ix = a->used - 1; ix >= 0; ix--) {
4510      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
4511 
4512      if (w >= b) {
4513         t = (mp_digit)(w / b);
4514         w -= ((mp_word)t) * ((mp_word)b);
4515       } else {
4516         t = 0;
4517       }
4518       q.dp[ix] = (mp_digit)t;
4519   }
4520 
4521   if (d != NULL) {
4522      *d = (mp_digit)w;
4523   }
4524 
4525   if (c != NULL) {
4526      mp_clamp(&q);
4527      mp_exch(&q, c);
4528   }
4529   mp_clear(&q);
4530 
4531   return res;
4532 }
4533 
4534 static int
4535 mp_mod_d(mp_int *a, mp_digit b, mp_digit *c)
4536 {
4537   return mp_div_d(a, b, NULL, c);
4538 }
4539 
4540 static const mp_digit ltm_prime_tab[] = {
4541   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
4542   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
4543   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
4544   0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
4545 #ifndef MP_8BIT
4546   0x0083,
4547   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
4548   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
4549   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
4550   0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
4551 
4552   0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
4553   0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
4554   0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
4555   0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
4556   0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
4557   0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
4558   0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
4559   0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
4560 
4561   0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
4562   0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
4563   0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
4564   0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
4565   0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
4566   0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
4567   0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
4568   0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
4569 
4570   0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
4571   0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
4572   0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
4573   0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
4574   0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
4575   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
4576   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
4577   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
4578 #endif
4579 };
4580 
4581 #define PRIME_SIZE	__arraycount(ltm_prime_tab)
4582 
4583 static int
4584 mp_prime_is_divisible(mp_int *a, int *result)
4585 {
4586   int     err, ix;
4587   mp_digit res;
4588 
4589   /* default to not */
4590   *result = MP_NO;
4591 
4592   for (ix = 0; ix < (int)PRIME_SIZE; ix++) {
4593     /* what is a mod LBL_prime_tab[ix] */
4594     if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
4595       return err;
4596     }
4597 
4598     /* is the residue zero? */
4599     if (res == 0) {
4600       *result = MP_YES;
4601       return MP_OKAY;
4602     }
4603   }
4604 
4605   return MP_OKAY;
4606 }
4607 
4608 /* single digit addition */
4609 static int
4610 mp_add_d(mp_int *a, mp_digit b, mp_int *c)
4611 {
4612   int     res, ix, oldused;
4613   mp_digit *tmpa, *tmpc, mu;
4614 
4615   /* grow c as required */
4616   if (c->alloc < a->used + 1) {
4617      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
4618         return res;
4619      }
4620   }
4621 
4622   /* if a is negative and |a| >= b, call c = |a| - b */
4623   if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
4624      /* temporarily fix sign of a */
4625      a->sign = MP_ZPOS;
4626 
4627      /* c = |a| - b */
4628      res = mp_sub_d(a, b, c);
4629 
4630      /* fix sign  */
4631      a->sign = c->sign = MP_NEG;
4632 
4633      /* clamp */
4634      mp_clamp(c);
4635 
4636      return res;
4637   }
4638 
4639   /* old number of used digits in c */
4640   oldused = c->used;
4641 
4642   /* sign always positive */
4643   c->sign = MP_ZPOS;
4644 
4645   /* source alias */
4646   tmpa    = a->dp;
4647 
4648   /* destination alias */
4649   tmpc    = c->dp;
4650 
4651   /* if a is positive */
4652   if (a->sign == MP_ZPOS) {
4653      /* add digit, after this we're propagating
4654       * the carry.
4655       */
4656      *tmpc   = *tmpa++ + b;
4657      mu      = *tmpc >> DIGIT_BIT;
4658      *tmpc++ &= MP_MASK;
4659 
4660      /* now handle rest of the digits */
4661      for (ix = 1; ix < a->used; ix++) {
4662         *tmpc   = *tmpa++ + mu;
4663         mu      = *tmpc >> DIGIT_BIT;
4664         *tmpc++ &= MP_MASK;
4665      }
4666      /* set final carry */
4667      ix++;
4668      *tmpc++  = mu;
4669 
4670      /* setup size */
4671      c->used = a->used + 1;
4672   } else {
4673      /* a was negative and |a| < b */
4674      c->used  = 1;
4675 
4676      /* the result is a single digit */
4677      if (a->used == 1) {
4678         *tmpc++  =  b - a->dp[0];
4679      } else {
4680         *tmpc++  =  b;
4681      }
4682 
4683      /* setup count so the clearing of oldused
4684       * can fall through correctly
4685       */
4686      ix       = 1;
4687   }
4688 
4689   /* now zero to oldused */
4690   while (ix++ < oldused) {
4691      *tmpc++ = 0;
4692   }
4693   mp_clamp(c);
4694 
4695   return MP_OKAY;
4696 }
4697 
4698 /* single digit subtraction */
4699 static int
4700 mp_sub_d(mp_int *a, mp_digit b, mp_int *c)
4701 {
4702   mp_digit *tmpa, *tmpc, mu;
4703   int       res, ix, oldused;
4704 
4705   /* grow c as required */
4706   if (c->alloc < a->used + 1) {
4707      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
4708         return res;
4709      }
4710   }
4711 
4712   /* if a is negative just do an unsigned
4713    * addition [with fudged signs]
4714    */
4715   if (a->sign == MP_NEG) {
4716      a->sign = MP_ZPOS;
4717      res     = mp_add_d(a, b, c);
4718      a->sign = c->sign = MP_NEG;
4719 
4720      /* clamp */
4721      mp_clamp(c);
4722 
4723      return res;
4724   }
4725 
4726   /* setup regs */
4727   oldused = c->used;
4728   tmpa    = a->dp;
4729   tmpc    = c->dp;
4730 
4731   /* if a <= b simply fix the single digit */
4732   if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
4733      if (a->used == 1) {
4734         *tmpc++ = b - *tmpa;
4735      } else {
4736         *tmpc++ = b;
4737      }
4738      ix      = 1;
4739 
4740      /* negative/1digit */
4741      c->sign = MP_NEG;
4742      c->used = 1;
4743   } else {
4744      /* positive/size */
4745      c->sign = MP_ZPOS;
4746      c->used = a->used;
4747 
4748      /* subtract first digit */
4749      *tmpc    = *tmpa++ - b;
4750      mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
4751      *tmpc++ &= MP_MASK;
4752 
4753      /* handle rest of the digits */
4754      for (ix = 1; ix < a->used; ix++) {
4755         *tmpc    = *tmpa++ - mu;
4756         mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
4757         *tmpc++ &= MP_MASK;
4758      }
4759   }
4760 
4761   /* zero excess digits */
4762   while (ix++ < oldused) {
4763      *tmpc++ = 0;
4764   }
4765   mp_clamp(c);
4766   return MP_OKAY;
4767 }
4768 
4769 static const int lnz[16] = {
4770    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
4771 };
4772 
4773 /* Counts the number of lsbs which are zero before the first zero bit */
4774 static int
4775 mp_cnt_lsb(mp_int *a)
4776 {
4777    int x;
4778    mp_digit q, qq;
4779 
4780    /* easy out */
4781    if (mp_iszero(a) == 1) {
4782       return 0;
4783    }
4784 
4785    /* scan lower digits until non-zero */
4786    for (x = 0; x < a->used && a->dp[x] == 0; x++);
4787    q = a->dp[x];
4788    x *= DIGIT_BIT;
4789 
4790    /* now scan this digit until a 1 is found */
4791    if ((q & 1) == 0) {
4792       do {
4793          qq  = q & 15;
4794 	 /* LINTED previous op ensures range of qq */
4795          x  += lnz[qq];
4796          q >>= 4;
4797       } while (qq == 0);
4798    }
4799    return x;
4800 }
4801 
4802 /* c = a * a (mod b) */
4803 static int
4804 mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
4805 {
4806   int     res;
4807   mp_int  t;
4808 
4809   if ((res = mp_init (&t)) != MP_OKAY) {
4810     return res;
4811   }
4812 
4813   if ((res = mp_sqr (a, &t)) != MP_OKAY) {
4814     mp_clear (&t);
4815     return res;
4816   }
4817   res = mp_mod (&t, b, c);
4818   mp_clear (&t);
4819   return res;
4820 }
4821 static int
4822 mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result)
4823 {
4824   mp_int  n1, y, r;
4825   int     s, j, err;
4826 
4827   /* default */
4828   *result = MP_NO;
4829 
4830   /* ensure b > 1 */
4831   if (mp_cmp_d(b, 1) != MP_GT) {
4832      return MP_VAL;
4833   }
4834 
4835   /* get n1 = a - 1 */
4836   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
4837     return err;
4838   }
4839   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
4840     goto LBL_N1;
4841   }
4842 
4843   /* set 2**s * r = n1 */
4844   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
4845     goto LBL_N1;
4846   }
4847 
4848   /* count the number of least significant bits
4849    * which are zero
4850    */
4851   s = mp_cnt_lsb(&r);
4852 
4853   /* now divide n - 1 by 2**s */
4854   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
4855     goto LBL_R;
4856   }
4857 
4858   /* compute y = b**r mod a */
4859   if ((err = mp_init (&y)) != MP_OKAY) {
4860     goto LBL_R;
4861   }
4862   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
4863     goto LBL_Y;
4864   }
4865 
4866   /* if y != 1 and y != n1 do */
4867   if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
4868     j = 1;
4869     /* while j <= s-1 and y != n1 */
4870     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
4871       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
4872          goto LBL_Y;
4873       }
4874 
4875       /* if y == 1 then composite */
4876       if (mp_cmp_d (&y, 1) == MP_EQ) {
4877          goto LBL_Y;
4878       }
4879 
4880       ++j;
4881     }
4882 
4883     /* if y != n1 then composite */
4884     if (mp_cmp (&y, &n1) != MP_EQ) {
4885       goto LBL_Y;
4886     }
4887   }
4888 
4889   /* probably prime now */
4890   *result = MP_YES;
4891 LBL_Y:mp_clear (&y);
4892 LBL_R:mp_clear (&r);
4893 LBL_N1:mp_clear (&n1);
4894   return err;
4895 }
4896 
4897 /* performs a variable number of rounds of Miller-Rabin
4898  *
4899  * Probability of error after t rounds is no more than
4900 
4901  *
4902  * Sets result to 1 if probably prime, 0 otherwise
4903  */
4904 static int
4905 mp_prime_is_prime(mp_int *a, int t, int *result)
4906 {
4907   mp_int  b;
4908   int     ix, err, res;
4909 
4910   /* default to no */
4911   *result = MP_NO;
4912 
4913   /* valid value of t? */
4914   if (t <= 0 || t > (int)PRIME_SIZE) {
4915     return MP_VAL;
4916   }
4917 
4918   /* is the input equal to one of the primes in the table? */
4919   for (ix = 0; ix < (int)PRIME_SIZE; ix++) {
4920       if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
4921          *result = 1;
4922          return MP_OKAY;
4923       }
4924   }
4925 
4926   /* first perform trial division */
4927   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
4928     return err;
4929   }
4930 
4931   /* return if it was trivially divisible */
4932   if (res == MP_YES) {
4933     return MP_OKAY;
4934   }
4935 
4936   /* now perform the miller-rabin rounds */
4937   if ((err = mp_init (&b)) != MP_OKAY) {
4938     return err;
4939   }
4940 
4941   for (ix = 0; ix < t; ix++) {
4942     /* set the prime */
4943     mp_set (&b, ltm_prime_tab[ix]);
4944 
4945     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
4946       goto LBL_B;
4947     }
4948 
4949     if (res == MP_NO) {
4950       goto LBL_B;
4951     }
4952   }
4953 
4954   /* passed the test */
4955   *result = MP_YES;
4956 LBL_B:mp_clear (&b);
4957   return err;
4958 }
4959 
4960 /* returns size of ASCII reprensentation */
4961 static int
4962 mp_radix_size (mp_int *a, int radix, int *size)
4963 {
4964   int     res, digs;
4965   mp_int  t;
4966   mp_digit d;
4967 
4968   *size = 0;
4969 
4970   /* special case for binary */
4971   if (radix == 2) {
4972     *size = mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
4973     return MP_OKAY;
4974   }
4975 
4976   /* make sure the radix is in range */
4977   if (radix < 2 || radix > 64) {
4978     return MP_VAL;
4979   }
4980 
4981   if (mp_iszero(a) == MP_YES) {
4982     *size = 2;
4983     return MP_OKAY;
4984   }
4985 
4986   /* digs is the digit count */
4987   digs = 0;
4988 
4989   /* if it's negative add one for the sign */
4990   if (a->sign == MP_NEG) {
4991     ++digs;
4992   }
4993 
4994   /* init a copy of the input */
4995   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
4996     return res;
4997   }
4998 
4999   /* force temp to positive */
5000   t.sign = MP_ZPOS;
5001 
5002   /* fetch out all of the digits */
5003   while (mp_iszero (&t) == MP_NO) {
5004     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
5005       mp_clear (&t);
5006       return res;
5007     }
5008     ++digs;
5009   }
5010   mp_clear (&t);
5011 
5012   /* return digs + 1, the 1 is for the NULL byte that would be required. */
5013   *size = digs + 1;
5014   return MP_OKAY;
5015 }
5016 
5017 static const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
5018 
5019 /* stores a bignum as a ASCII string in a given radix (2..64)
5020  *
5021  * Stores upto maxlen-1 chars and always a NULL byte
5022  */
5023 static int
5024 mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
5025 {
5026   int     res, digs;
5027   mp_int  t;
5028   mp_digit d;
5029   char   *_s = str;
5030 
5031   /* check range of the maxlen, radix */
5032   if (maxlen < 2 || radix < 2 || radix > 64) {
5033     return MP_VAL;
5034   }
5035 
5036   /* quick out if its zero */
5037   if (mp_iszero(a) == MP_YES) {
5038      *str++ = '0';
5039      *str = '\0';
5040      return MP_OKAY;
5041   }
5042 
5043   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
5044     return res;
5045   }
5046 
5047   /* if it is negative output a - */
5048   if (t.sign == MP_NEG) {
5049     /* we have to reverse our digits later... but not the - sign!! */
5050     ++_s;
5051 
5052     /* store the flag and mark the number as positive */
5053     *str++ = '-';
5054     t.sign = MP_ZPOS;
5055 
5056     /* subtract a char */
5057     --maxlen;
5058   }
5059 
5060   digs = 0;
5061   while (mp_iszero (&t) == 0) {
5062     if (--maxlen < 1) {
5063        /* no more room */
5064        break;
5065     }
5066     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
5067       mp_clear (&t);
5068       return res;
5069     }
5070     /* LINTED -- radix' range is checked above, limits d's range */
5071     *str++ = mp_s_rmap[d];
5072     ++digs;
5073   }
5074 
5075   /* reverse the digits of the string.  In this case _s points
5076    * to the first digit [exluding the sign] of the number
5077    */
5078   bn_reverse ((unsigned char *)_s, digs);
5079 
5080   /* append a NULL so the string is properly terminated */
5081   *str = '\0';
5082 
5083   mp_clear (&t);
5084   return MP_OKAY;
5085 }
5086 
5087 static char *
5088 formatbn(const BIGNUM *a, const int radix)
5089 {
5090 	char	*s;
5091 	int	 len;
5092 
5093 	if (mp_radix_size(__UNCONST(a), radix, &len) != MP_OKAY) {
5094 		return NULL;
5095 	}
5096 	if ((s = netpgp_allocate(1, (size_t)len)) != NULL) {
5097 		if (mp_toradix_n(__UNCONST(a), s, radix, len) != MP_OKAY) {
5098 			netpgp_deallocate(s, (size_t)len);
5099 			return NULL;
5100 		}
5101 	}
5102 	return s;
5103 }
5104 
5105 static int
5106 mp_getradix_num(mp_int *a, int radix, char *s)
5107 {
5108    int err, ch, neg, y;
5109 
5110    /* clear a */
5111    mp_zero(a);
5112 
5113    /* if first digit is - then set negative */
5114    if ((ch = *s++) == '-') {
5115       neg = MP_NEG;
5116       ch = *s++;
5117    } else {
5118       neg = MP_ZPOS;
5119    }
5120 
5121    for (;;) {
5122       /* find y in the radix map */
5123       for (y = 0; y < radix; y++) {
5124           if (mp_s_rmap[y] == ch) {
5125              break;
5126           }
5127       }
5128       if (y == radix) {
5129          break;
5130       }
5131 
5132       /* shift up and add */
5133       if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
5134          return err;
5135       }
5136       if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
5137          return err;
5138       }
5139 
5140       ch = *s++;
5141    }
5142    if (mp_cmp_d(a, 0) != MP_EQ) {
5143       a->sign = neg;
5144    }
5145 
5146    return MP_OKAY;
5147 }
5148 
5149 static int
5150 getbn(BIGNUM **a, const char *str, int radix)
5151 {
5152 	int	len;
5153 
5154 	if (a == NULL || str == NULL || (*a = BN_new()) == NULL) {
5155 		return 0;
5156 	}
5157 	if (mp_getradix_num(*a, radix, __UNCONST(str)) != MP_OKAY) {
5158 		return 0;
5159 	}
5160 	mp_radix_size(__UNCONST(a), radix, &len);
5161 	return len - 1;
5162 }
5163 
5164 /* d = a - b (mod c) */
5165 static int
5166 mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
5167 {
5168   int     res;
5169   mp_int  t;
5170 
5171 
5172   if ((res = mp_init (&t)) != MP_OKAY) {
5173     return res;
5174   }
5175 
5176   if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
5177     mp_clear (&t);
5178     return res;
5179   }
5180   res = mp_mod (&t, c, d);
5181   mp_clear (&t);
5182   return res;
5183 }
5184 
5185 /**************************************************************************/
5186 
5187 /* BIGNUM emulation layer */
5188 
5189 /* essentiually, these are just wrappers around the libtommath functions */
5190 /* usually the order of args changes */
5191 /* the BIGNUM API tends to have more const poisoning */
5192 /* these wrappers also check the arguments passed for sanity */
5193 
5194 BIGNUM *
5195 BN_bin2bn(const uint8_t *data, int len, BIGNUM *ret)
5196 {
5197 	if (data == NULL) {
5198 		return BN_new();
5199 	}
5200 	if (ret == NULL) {
5201 		ret = BN_new();
5202 	}
5203 	return (mp_read_unsigned_bin(ret, data, len) == MP_OKAY) ? ret : NULL;
5204 }
5205 
5206 /* store in unsigned [big endian] format */
5207 int
5208 BN_bn2bin(const BIGNUM *a, unsigned char *b)
5209 {
5210 	BIGNUM	t;
5211 	int    	x;
5212 
5213 	if (a == NULL || b == NULL) {
5214 		return -1;
5215 	}
5216 	if (mp_init_copy (&t, __UNCONST(a)) != MP_OKAY) {
5217 		return -1;
5218 	}
5219 	for (x = 0; !BN_is_zero(&t) ; ) {
5220 		b[x++] = (unsigned char) (t.dp[0] & 0xff);
5221 		if (mp_div_2d (&t, 8, &t, NULL) != MP_OKAY) {
5222 			mp_clear(&t);
5223 			return -1;
5224 		}
5225 	}
5226 	bn_reverse(b, x);
5227 	mp_clear(&t);
5228 	return x;
5229 }
5230 
5231 void
5232 BN_init(BIGNUM *a)
5233 {
5234 	if (a != NULL) {
5235 		mp_init(a);
5236 	}
5237 }
5238 
5239 BIGNUM *
5240 BN_new(void)
5241 {
5242 	BIGNUM	*a;
5243 
5244 	if ((a = netpgp_allocate(1, sizeof(*a))) != NULL) {
5245 		mp_init(a);
5246 	}
5247 	return a;
5248 }
5249 
5250 /* copy, b = a */
5251 int
5252 BN_copy(BIGNUM *b, const BIGNUM *a)
5253 {
5254 	if (a == NULL || b == NULL) {
5255 		return MP_VAL;
5256 	}
5257 	return mp_copy(__UNCONST(a), b);
5258 }
5259 
5260 BIGNUM *
5261 BN_dup(const BIGNUM *a)
5262 {
5263 	BIGNUM	*ret;
5264 
5265 	if (a == NULL) {
5266 		return NULL;
5267 	}
5268 	if ((ret = BN_new()) != NULL) {
5269 		BN_copy(ret, a);
5270 	}
5271 	return ret;
5272 }
5273 
5274 void
5275 BN_swap(BIGNUM *a, BIGNUM *b)
5276 {
5277 	if (a && b) {
5278 		mp_exch(a, b);
5279 	}
5280 }
5281 
5282 int
5283 BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
5284 {
5285 	if (r == NULL || a == NULL || n < 0) {
5286 		return 0;
5287 	}
5288 	BN_copy(r, a);
5289 	return mp_lshd(r, n) == MP_OKAY;
5290 }
5291 
5292 int
5293 BN_lshift1(BIGNUM *r, BIGNUM *a)
5294 {
5295 	if (r == NULL || a == NULL) {
5296 		return 0;
5297 	}
5298 	BN_copy(r, a);
5299 	return mp_lshd(r, 1) == MP_OKAY;
5300 }
5301 
5302 int
5303 BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
5304 {
5305 	if (r == NULL || a == NULL || n < 0) {
5306 		return MP_VAL;
5307 	}
5308 	BN_copy(r, a);
5309 	return mp_rshd(r, n) == MP_OKAY;
5310 }
5311 
5312 int
5313 BN_rshift1(BIGNUM *r, BIGNUM *a)
5314 {
5315 	if (r == NULL || a == NULL) {
5316 		return 0;
5317 	}
5318 	BN_copy(r, a);
5319 	return mp_rshd(r, 1) == MP_OKAY;
5320 }
5321 
5322 int
5323 BN_set_word(BIGNUM *a, BN_ULONG w)
5324 {
5325 	if (a == NULL) {
5326 		return 0;
5327 	}
5328 	mp_set(a, w);
5329 	return 1;
5330 }
5331 
5332 int
5333 BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
5334 {
5335 	if (a == NULL || b == NULL || r == NULL) {
5336 		return 0;
5337 	}
5338 	return mp_add(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5339 }
5340 
5341 int
5342 BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
5343 {
5344 	if (a == NULL || b == NULL || r == NULL) {
5345 		return 0;
5346 	}
5347 	return mp_sub(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5348 }
5349 
5350 int
5351 BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
5352 {
5353 	if (a == NULL || b == NULL || r == NULL) {
5354 		return 0;
5355 	}
5356 	USE_ARG(ctx);
5357 	return mp_mul(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5358 }
5359 
5360 int
5361 BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *a, const BIGNUM *d, BN_CTX *ctx)
5362 {
5363 	if ((dv == NULL && rem == NULL) || a == NULL || d == NULL) {
5364 		return 0;
5365 	}
5366 	USE_ARG(ctx);
5367 	return mp_div(dv, rem, __UNCONST(a), __UNCONST(d)) == MP_OKAY;
5368 }
5369 
5370 void
5371 BN_free(BIGNUM *a)
5372 {
5373 	if (a) {
5374 		mp_clear(a);
5375 	}
5376 }
5377 
5378 void
5379 BN_clear(BIGNUM *a)
5380 {
5381 	if (a) {
5382 		mp_clear(a);
5383 	}
5384 }
5385 
5386 void
5387 BN_clear_free(BIGNUM *a)
5388 {
5389 	if (a) {
5390 		mp_clear(a);
5391 	}
5392 }
5393 
5394 int
5395 BN_num_bytes(const BIGNUM *a)
5396 {
5397 	if (a == NULL) {
5398 		return MP_VAL;
5399 	}
5400 	return mp_unsigned_bin_size(__UNCONST(a));
5401 }
5402 
5403 int
5404 BN_num_bits(const BIGNUM *a)
5405 {
5406 	if (a == NULL) {
5407 		return 0;
5408 	}
5409 	return mp_count_bits(a);
5410 }
5411 
5412 void
5413 BN_set_negative(BIGNUM *a, int n)
5414 {
5415 	if (a) {
5416 		a->sign = (n) ? MP_NEG : 0;
5417 	}
5418 }
5419 
5420 int
5421 BN_cmp(BIGNUM *a, BIGNUM *b)
5422 {
5423 	if (a == NULL || b == NULL) {
5424 		return MP_VAL;
5425 	}
5426 	switch(mp_cmp(a, b)) {
5427 	case MP_LT:
5428 		return -1;
5429 	case MP_GT:
5430 		return 1;
5431 	case MP_EQ:
5432 	default:
5433 		return 0;
5434 	}
5435 }
5436 
5437 int
5438 BN_mod_exp(BIGNUM *Y, BIGNUM *G, BIGNUM *X, BIGNUM *P, BN_CTX *ctx)
5439 {
5440 	if (Y == NULL || G == NULL || X == NULL || P == NULL) {
5441 		return MP_VAL;
5442 	}
5443 	USE_ARG(ctx);
5444 	return mp_exptmod(G, X, P, Y) == MP_OKAY;
5445 }
5446 
5447 BIGNUM *
5448 BN_mod_inverse(BIGNUM *r, BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
5449 {
5450 	USE_ARG(ctx);
5451 	if (r == NULL || a == NULL || n == NULL) {
5452 		return NULL;
5453 	}
5454 	return (mp_invmod(r, a, __UNCONST(n)) == MP_OKAY) ? r : NULL;
5455 }
5456 
5457 int
5458 BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
5459 {
5460 	USE_ARG(ctx);
5461 	if (ret == NULL || a == NULL || b == NULL || m == NULL) {
5462 		return 0;
5463 	}
5464 	return mp_mulmod(ret, a, b, __UNCONST(m)) == MP_OKAY;
5465 }
5466 
5467 BN_CTX *
5468 BN_CTX_new(void)
5469 {
5470 	return netpgp_allocate(1, sizeof(BN_CTX));
5471 }
5472 
5473 void
5474 BN_CTX_init(BN_CTX *c)
5475 {
5476 	if (c != NULL) {
5477 		c->arraysize = 15;
5478 		if ((c->v = netpgp_allocate(sizeof(*c->v), c->arraysize)) == NULL) {
5479 			c->arraysize = 0;
5480 		}
5481 	}
5482 }
5483 
5484 BIGNUM *
5485 BN_CTX_get(BN_CTX *ctx)
5486 {
5487 	if (ctx == NULL || ctx->v == NULL || ctx->arraysize == 0 || ctx->count == ctx->arraysize - 1) {
5488 		return NULL;
5489 	}
5490 	return ctx->v[ctx->count++] = BN_new();
5491 }
5492 
5493 void
5494 BN_CTX_start(BN_CTX *ctx)
5495 {
5496 	BN_CTX_init(ctx);
5497 }
5498 
5499 void
5500 BN_CTX_free(BN_CTX *c)
5501 {
5502 	unsigned	i;
5503 
5504 	if (c != NULL && c->v != NULL) {
5505 		for (i = 0 ; i < c->count ; i++) {
5506 			BN_clear_free(c->v[i]);
5507 		}
5508 		netpgp_deallocate(c->v, sizeof(*c->v) * c->arraysize);
5509 	}
5510 }
5511 
5512 void
5513 BN_CTX_end(BN_CTX *ctx)
5514 {
5515 	BN_CTX_free(ctx);
5516 }
5517 
5518 char *
5519 BN_bn2hex(const BIGNUM *a)
5520 {
5521 	return (a == NULL) ? NULL : formatbn(a, 16);
5522 }
5523 
5524 char *
5525 BN_bn2dec(const BIGNUM *a)
5526 {
5527 	return (a == NULL) ? NULL : formatbn(a, 10);
5528 }
5529 
5530 #ifndef _KERNEL
5531 int
5532 BN_print_fp(FILE *fp, const BIGNUM *a)
5533 {
5534 	char	*s;
5535 	int	 ret;
5536 
5537 	if (fp == NULL || a == NULL) {
5538 		return 0;
5539 	}
5540 	s = BN_bn2hex(a);
5541 	ret = fprintf(fp, "%s", s);
5542 	netpgp_deallocate(s, strlen(s) + 1);
5543 	return ret;
5544 }
5545 #endif
5546 
5547 #ifdef BN_RAND_NEEDED
5548 int
5549 BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
5550 {
5551 	uint64_t	r;
5552 	int		digits;
5553 	int		i;
5554 
5555 	if (rnd == NULL) {
5556 		return 0;
5557 	}
5558 	mp_init_size(rnd, digits = howmany(bits, DIGIT_BIT));
5559 	for (i = 0 ; i < digits ; i++) {
5560 		r = (uint64_t)arc4random();
5561 		r <<= 32;
5562 		r |= arc4random();
5563 		rnd->dp[i] = (r & MP_MASK);
5564 	}
5565 	if (top == 0) {
5566 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)DIGIT_BIT));
5567 	}
5568 	if (top == 1) {
5569 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)DIGIT_BIT));
5570 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)(DIGIT_BIT - 1)));
5571 	}
5572 	if (bottom) {
5573 		rnd->dp[0] |= 0x1;
5574 	}
5575 	return 1;
5576 }
5577 
5578 int
5579 BN_rand_range(BIGNUM *rnd, BIGNUM *range)
5580 {
5581 	if (rnd == NULL || range == NULL || BN_is_zero(range)) {
5582 		return 0;
5583 	}
5584 	BN_rand(rnd, BN_num_bits(range), 1, 0);
5585 	return mp_mod(rnd, range, rnd) == MP_OKAY;
5586 }
5587 #endif
5588 
5589 int
5590 BN_is_prime(const BIGNUM *a, int checks, void (*callback)(int, int, void *), BN_CTX *ctx, void *cb_arg)
5591 {
5592 	int	primality;
5593 
5594 	if (a == NULL) {
5595 		return 0;
5596 	}
5597 	USE_ARG(ctx);
5598 	USE_ARG(cb_arg);
5599 	USE_ARG(callback);
5600 	return (mp_prime_is_prime(__UNCONST(a), checks, &primality) == MP_OKAY) ? primality : 0;
5601 }
5602 
5603 const BIGNUM *
5604 BN_value_one(void)
5605 {
5606 	static mp_digit		digit = 1UL;
5607 	static const BIGNUM	one = { &digit, 1, 1, 0 };
5608 
5609 	return &one;
5610 }
5611 
5612 int
5613 BN_hex2bn(BIGNUM **a, const char *str)
5614 {
5615 	return getbn(a, str, 16);
5616 }
5617 
5618 int
5619 BN_dec2bn(BIGNUM **a, const char *str)
5620 {
5621 	return getbn(a, str, 10);
5622 }
5623 
5624 int
5625 BN_mod_sub(BIGNUM *r, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
5626 {
5627 	USE_ARG(ctx);
5628 	if (r == NULL || a == NULL || b == NULL || m == NULL) {
5629 		return 0;
5630 	}
5631 	return mp_submod(a, b, __UNCONST(m), r) == MP_OKAY;
5632 }
5633 
5634 int
5635 BN_is_bit_set(const BIGNUM *a, int n)
5636 {
5637 	if (a == NULL || n < 0 || n >= a->used * DIGIT_BIT) {
5638 		return 0;
5639 	}
5640 	return (a->dp[n / DIGIT_BIT] & (1 << (n % DIGIT_BIT))) ? 1 : 0;
5641 }
5642