xref: /netbsd-src/crypto/external/bsd/netpgp/dist/src/libbn/bignum.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*-
2  * Copyright (c) 2012 Alistair Crooks <agc@NetBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 /* LibTomMath, multiple-precision integer library -- Tom St Denis
26  *
27  * LibTomMath is a library that provides multiple-precision
28  * integer arithmetic as well as number theoretic functionality.
29  *
30  * The library was designed directly after the MPI library by
31  * Michael Fromberger but has been written from scratch with
32  * additional optimizations in place.
33  *
34  * The library is free for all purposes without any express
35  * guarantee it works.
36  *
37  * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
38  */
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <sys/param.h>
42 
43 #ifdef _KERNEL
44 # include <sys/kmem.h>
45 #else
46 # include <arpa/inet.h>
47 # include <ctype.h>
48 # include <inttypes.h>
49 # include <stdarg.h>
50 # include <stdio.h>
51 # include <stdlib.h>
52 # include <string.h>
53 # include <time.h>
54 # include <unistd.h>
55 #endif
56 
57 #include "misc.h"
58 #include "bn.h"
59 #include "digest.h"
60 
61 /**************************************************************************/
62 
63 /* LibTomMath, multiple-precision integer library -- Tom St Denis
64  *
65  * LibTomMath is a library that provides multiple-precision
66  * integer arithmetic as well as number theoretic functionality.
67  *
68  * The library was designed directly after the MPI library by
69  * Michael Fromberger but has been written from scratch with
70  * additional optimizations in place.
71  *
72  * The library is free for all purposes without any express
73  * guarantee it works.
74  *
75  * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
76  */
77 
78 #define MP_PREC		32
79 #define DIGIT_BIT	28
80 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
81 
82 #define MP_WARRAY	/*LINTED*/(1U << (((sizeof(mp_word) * CHAR_BIT) - (2 * DIGIT_BIT) + 1)))
83 
84 #define MP_NO		0
85 #define MP_YES		1
86 
87 #ifndef USE_ARG
88 #define USE_ARG(x)	/*LINTED*/(void)&(x)
89 #endif
90 
91 #ifndef __arraycount
92 #define	__arraycount(__x)	(sizeof(__x) / sizeof(__x[0]))
93 #endif
94 
95 #define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
96 
97 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
98 
99 typedef int           mp_err;
100 
101 static int mp_mul(mp_int * a, mp_int * b, mp_int * c);
102 static int mp_sqr(mp_int * a, mp_int * b);
103 
104 static int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
105 
106 /* set to zero */
107 static void
108 mp_zero(mp_int *a)
109 {
110   int       n;
111   mp_digit *tmp;
112 
113   a->sign = MP_ZPOS;
114   a->used = 0;
115 
116   tmp = a->dp;
117   /* XXX - memset */
118   for (n = 0; n < a->alloc; n++) {
119      *tmp++ = 0;
120   }
121 }
122 
123 /* grow as required */
124 static int
125 mp_grow(mp_int *a, int size)
126 {
127   int     i;
128   mp_digit *tmp;
129 
130   /* if the alloc size is smaller alloc more ram */
131   if (a->alloc < size) {
132     /* ensure there are always at least MP_PREC digits extra on top */
133     size += (MP_PREC * 2) - (size % MP_PREC);
134 
135     /* reallocate the array a->dp
136      *
137      * We store the return in a temporary variable
138      * in case the operation failed we don't want
139      * to overwrite the dp member of a.
140      */
141     tmp = realloc(a->dp, sizeof(*tmp) * size);
142     if (tmp == NULL) {
143       /* reallocation failed but "a" is still valid [can be freed] */
144       return MP_MEM;
145     }
146 
147     /* reallocation succeeded so set a->dp */
148     a->dp = tmp;
149 
150     /* zero excess digits */
151     i        = a->alloc;
152     a->alloc = size;
153     for (; i < a->alloc; i++) {
154       a->dp[i] = 0;
155     }
156   }
157   return MP_OKAY;
158 }
159 
160 /* shift left a certain amount of digits */
161 static int
162 mp_lshd (mp_int * a, int b)
163 {
164   int     x, res;
165 
166   /* if its less than zero return */
167   if (b <= 0) {
168     return MP_OKAY;
169   }
170 
171   /* grow to fit the new digits */
172   if (a->alloc < a->used + b) {
173      if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
174        return res;
175      }
176   }
177 
178   {
179     mp_digit *top, *bottom;
180 
181     /* increment the used by the shift amount then copy upwards */
182     a->used += b;
183 
184     /* top */
185     top = a->dp + a->used - 1;
186 
187     /* base */
188     bottom = a->dp + a->used - 1 - b;
189 
190     /* much like mp_rshd this is implemented using a sliding window
191      * except the window goes the otherway around.  Copying from
192      * the bottom to the top.  see bn_mp_rshd.c for more info.
193      */
194     for (x = a->used - 1; x >= b; x--) {
195       *top-- = *bottom--;
196     }
197 
198     /* zero the lower digits */
199     top = a->dp;
200     for (x = 0; x < b; x++) {
201       *top++ = 0;
202     }
203   }
204   return MP_OKAY;
205 }
206 
207 /* trim unused digits
208  *
209  * This is used to ensure that leading zero digits are
210  * trimed and the leading "used" digit will be non-zero
211  * Typically very fast.  Also fixes the sign if there
212  * are no more leading digits
213  */
214 static void
215 mp_clamp (mp_int * a)
216 {
217   /* decrease used while the most significant digit is
218    * zero.
219    */
220   while (a->used > 0 && a->dp[a->used - 1] == 0) {
221     --(a->used);
222   }
223 
224   /* reset the sign flag if used == 0 */
225   if (a->used == 0) {
226     a->sign = MP_ZPOS;
227   }
228 }
229 
230 /* copy, b = a */
231 static int
232 mp_copy(BIGNUM *a, BIGNUM *b)
233 {
234   int     res, n;
235 
236   /* if dst == src do nothing */
237   if (a == b) {
238     return MP_OKAY;
239   }
240   if (a == NULL || b == NULL) {
241 	return MP_VAL;
242   }
243 
244   /* grow dest */
245   if (b->alloc < a->used) {
246      if ((res = mp_grow (b, a->used)) != MP_OKAY) {
247         return res;
248      }
249   }
250 
251   /* zero b and copy the parameters over */
252   {
253     mp_digit *tmpa, *tmpb;
254 
255     /* pointer aliases */
256 
257     /* source */
258     tmpa = a->dp;
259 
260     /* destination */
261     tmpb = b->dp;
262 
263     /* copy all the digits */
264     for (n = 0; n < a->used; n++) {
265       *tmpb++ = *tmpa++;
266     }
267 
268     /* clear high digits */
269     for (; n < b->used; n++) {
270       *tmpb++ = 0;
271     }
272   }
273 
274   /* copy used count and sign */
275   b->used = a->used;
276   b->sign = a->sign;
277   return MP_OKAY;
278 }
279 
280 /* shift left by a certain bit count */
281 static int
282 mp_mul_2d(mp_int *a, int b, mp_int *c)
283 {
284   mp_digit d;
285   int      res;
286 
287   /* copy */
288   if (a != c) {
289      if ((res = mp_copy (a, c)) != MP_OKAY) {
290        return res;
291      }
292   }
293 
294   if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
295      if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
296        return res;
297      }
298   }
299 
300   /* shift by as many digits in the bit count */
301   if (b >= (int)DIGIT_BIT) {
302     if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
303       return res;
304     }
305   }
306 
307   /* shift any bit count < DIGIT_BIT */
308   d = (mp_digit) (b % DIGIT_BIT);
309   if (d != 0) {
310     mp_digit *tmpc, shift, mask, r, rr;
311     int x;
312 
313     /* bitmask for carries */
314     mask = (((mp_digit)1) << d) - 1;
315 
316     /* shift for msbs */
317     shift = DIGIT_BIT - d;
318 
319     /* alias */
320     tmpc = c->dp;
321 
322     /* carry */
323     r    = 0;
324     for (x = 0; x < c->used; x++) {
325       /* get the higher bits of the current word */
326       rr = (*tmpc >> shift) & mask;
327 
328       /* shift the current word and OR in the carry */
329       *tmpc = ((*tmpc << d) | r) & MP_MASK;
330       ++tmpc;
331 
332       /* set the carry to the carry bits of the current word */
333       r = rr;
334     }
335 
336     /* set final carry */
337     if (r != 0) {
338        c->dp[(c->used)++] = r;
339     }
340   }
341   mp_clamp (c);
342   return MP_OKAY;
343 }
344 
345 /* reads a unsigned char array, assumes the msb is stored first [big endian] */
346 static int
347 mp_read_unsigned_bin(mp_int *a, const uint8_t *b, int c)
348 {
349   int     res;
350 
351   /* make sure there are at least two digits */
352   if (a->alloc < 2) {
353      if ((res = mp_grow(a, 2)) != MP_OKAY) {
354         return res;
355      }
356   }
357 
358   /* zero the int */
359   mp_zero (a);
360 
361   /* read the bytes in */
362   while (c-- > 0) {
363     if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
364       return res;
365     }
366 
367       a->dp[0] |= *b++;
368       a->used += 1;
369   }
370   mp_clamp (a);
371   return MP_OKAY;
372 }
373 
374 /* returns the number of bits in an int */
375 static int
376 mp_count_bits(const mp_int *a)
377 {
378   int     r;
379   mp_digit q;
380 
381   /* shortcut */
382   if (a->used == 0) {
383     return 0;
384   }
385 
386   /* get number of digits and add that */
387   r = (a->used - 1) * DIGIT_BIT;
388 
389   /* take the last digit and count the bits in it */
390   q = a->dp[a->used - 1];
391   while (q > ((mp_digit) 0)) {
392     ++r;
393     q >>= ((mp_digit) 1);
394   }
395   return r;
396 }
397 
398 /* compare maginitude of two ints (unsigned) */
399 static int
400 mp_cmp_mag (mp_int * a, mp_int * b)
401 {
402   int     n;
403   mp_digit *tmpa, *tmpb;
404 
405   /* compare based on # of non-zero digits */
406   if (a->used > b->used) {
407     return MP_GT;
408   }
409 
410   if (a->used < b->used) {
411     return MP_LT;
412   }
413 
414   /* alias for a */
415   tmpa = a->dp + (a->used - 1);
416 
417   /* alias for b */
418   tmpb = b->dp + (a->used - 1);
419 
420   /* compare based on digits  */
421   for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
422     if (*tmpa > *tmpb) {
423       return MP_GT;
424     }
425 
426     if (*tmpa < *tmpb) {
427       return MP_LT;
428     }
429   }
430   return MP_EQ;
431 }
432 
433 /* compare two ints (signed)*/
434 static int
435 mp_cmp (mp_int * a, mp_int * b)
436 {
437   /* compare based on sign */
438   if (a->sign != b->sign) {
439      if (a->sign == MP_NEG) {
440         return MP_LT;
441      } else {
442         return MP_GT;
443      }
444   }
445 
446   /* compare digits */
447   if (a->sign == MP_NEG) {
448      /* if negative compare opposite direction */
449      return mp_cmp_mag(b, a);
450   } else {
451      return mp_cmp_mag(a, b);
452   }
453 }
454 
455 /* get the size for an unsigned equivalent */
456 static int
457 mp_unsigned_bin_size (mp_int * a)
458 {
459   int     size = mp_count_bits (a);
460   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
461 }
462 
463 /* init a new mp_int */
464 static int
465 mp_init (mp_int * a)
466 {
467   int i;
468 
469   /* allocate memory required and clear it */
470   a->dp = netpgp_allocate(1, sizeof (*a->dp) * MP_PREC);
471   if (a->dp == NULL) {
472     return MP_MEM;
473   }
474 
475   /* set the digits to zero */
476   for (i = 0; i < MP_PREC; i++) {
477       a->dp[i] = 0;
478   }
479 
480   /* set the used to zero, allocated digits to the default precision
481    * and sign to positive */
482   a->used  = 0;
483   a->alloc = MP_PREC;
484   a->sign  = MP_ZPOS;
485 
486   return MP_OKAY;
487 }
488 
489 /* clear one (frees)  */
490 static void
491 mp_clear (mp_int * a)
492 {
493   int i;
494 
495   /* only do anything if a hasn't been freed previously */
496   if (a->dp != NULL) {
497     /* first zero the digits */
498     for (i = 0; i < a->used; i++) {
499         a->dp[i] = 0;
500     }
501 
502     /* free ram */
503     netpgp_deallocate(a->dp, (size_t)a->alloc);
504 
505     /* reset members to make debugging easier */
506     a->dp    = NULL;
507     a->alloc = a->used = 0;
508     a->sign  = MP_ZPOS;
509   }
510 }
511 
512 static int
513 mp_init_multi(mp_int *mp, ...)
514 {
515     mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
516     int n = 0;                 /* Number of ok inits */
517     mp_int* cur_arg = mp;
518     va_list args;
519 
520     va_start(args, mp);        /* init args to next argument from caller */
521     while (cur_arg != NULL) {
522         if (mp_init(cur_arg) != MP_OKAY) {
523             /* Oops - error! Back-track and mp_clear what we already
524                succeeded in init-ing, then return error.
525             */
526             va_list clean_args;
527 
528             /* end the current list */
529             va_end(args);
530 
531             /* now start cleaning up */
532             cur_arg = mp;
533             va_start(clean_args, mp);
534             while (n--) {
535                 mp_clear(cur_arg);
536                 cur_arg = va_arg(clean_args, mp_int*);
537             }
538             va_end(clean_args);
539             res = MP_MEM;
540             break;
541         }
542         n++;
543         cur_arg = va_arg(args, mp_int*);
544     }
545     va_end(args);
546     return res;                /* Assumed ok, if error flagged above. */
547 }
548 
549 /* init an mp_init for a given size */
550 static int
551 mp_init_size (mp_int * a, int size)
552 {
553   int x;
554 
555   /* pad size so there are always extra digits */
556   size += (MP_PREC * 2) - (size % MP_PREC);
557 
558   /* alloc mem */
559   a->dp = netpgp_allocate (1, sizeof (*a->dp) * size);
560   if (a->dp == NULL) {
561     return MP_MEM;
562   }
563 
564   /* set the members */
565   a->used  = 0;
566   a->alloc = size;
567   a->sign  = MP_ZPOS;
568 
569   /* zero the digits */
570   for (x = 0; x < size; x++) {
571       a->dp[x] = 0;
572   }
573 
574   return MP_OKAY;
575 }
576 
577 /* creates "a" then copies b into it */
578 static int mp_init_copy (mp_int * a, mp_int * b)
579 {
580   int     res;
581 
582   if ((res = mp_init (a)) != MP_OKAY) {
583     return res;
584   }
585   return mp_copy (b, a);
586 }
587 
588 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
589 static int
590 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
591 {
592   mp_int *x;
593   int     olduse, res, min, max;
594 
595   /* find sizes, we let |a| <= |b| which means we have to sort
596    * them.  "x" will point to the input with the most digits
597    */
598   if (a->used > b->used) {
599     min = b->used;
600     max = a->used;
601     x = a;
602   } else {
603     min = a->used;
604     max = b->used;
605     x = b;
606   }
607 
608   /* init result */
609   if (c->alloc < max + 1) {
610     if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
611       return res;
612     }
613   }
614 
615   /* get old used digit count and set new one */
616   olduse = c->used;
617   c->used = max + 1;
618 
619   {
620     mp_digit u, *tmpa, *tmpb, *tmpc;
621     int i;
622 
623     /* alias for digit pointers */
624 
625     /* first input */
626     tmpa = a->dp;
627 
628     /* second input */
629     tmpb = b->dp;
630 
631     /* destination */
632     tmpc = c->dp;
633 
634     /* zero the carry */
635     u = 0;
636     for (i = 0; i < min; i++) {
637       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
638       *tmpc = *tmpa++ + *tmpb++ + u;
639 
640       /* U = carry bit of T[i] */
641       u = *tmpc >> ((mp_digit)DIGIT_BIT);
642 
643       /* take away carry bit from T[i] */
644       *tmpc++ &= MP_MASK;
645     }
646 
647     /* now copy higher words if any, that is in A+B
648      * if A or B has more digits add those in
649      */
650     if (min != max) {
651       for (; i < max; i++) {
652         /* T[i] = X[i] + U */
653         *tmpc = x->dp[i] + u;
654 
655         /* U = carry bit of T[i] */
656         u = *tmpc >> ((mp_digit)DIGIT_BIT);
657 
658         /* take away carry bit from T[i] */
659         *tmpc++ &= MP_MASK;
660       }
661     }
662 
663     /* add carry */
664     *tmpc++ = u;
665 
666     /* clear digits above oldused */
667     for (i = c->used; i < olduse; i++) {
668       *tmpc++ = 0;
669     }
670   }
671 
672   mp_clamp (c);
673   return MP_OKAY;
674 }
675 
676 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
677 static int
678 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
679 {
680   int     olduse, res, min, max;
681 
682   /* find sizes */
683   min = b->used;
684   max = a->used;
685 
686   /* init result */
687   if (c->alloc < max) {
688     if ((res = mp_grow (c, max)) != MP_OKAY) {
689       return res;
690     }
691   }
692   olduse = c->used;
693   c->used = max;
694 
695   {
696     mp_digit u, *tmpa, *tmpb, *tmpc;
697     int i;
698 
699     /* alias for digit pointers */
700     tmpa = a->dp;
701     tmpb = b->dp;
702     tmpc = c->dp;
703 
704     /* set carry to zero */
705     u = 0;
706     for (i = 0; i < min; i++) {
707       /* T[i] = A[i] - B[i] - U */
708       *tmpc = *tmpa++ - *tmpb++ - u;
709 
710       /* U = carry bit of T[i]
711        * Note this saves performing an AND operation since
712        * if a carry does occur it will propagate all the way to the
713        * MSB.  As a result a single shift is enough to get the carry
714        */
715       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
716 
717       /* Clear carry from T[i] */
718       *tmpc++ &= MP_MASK;
719     }
720 
721     /* now copy higher words if any, e.g. if A has more digits than B  */
722     for (; i < max; i++) {
723       /* T[i] = A[i] - U */
724       *tmpc = *tmpa++ - u;
725 
726       /* U = carry bit of T[i] */
727       u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
728 
729       /* Clear carry from T[i] */
730       *tmpc++ &= MP_MASK;
731     }
732 
733     /* clear digits above used (since we may not have grown result above) */
734     for (i = c->used; i < olduse; i++) {
735       *tmpc++ = 0;
736     }
737   }
738 
739   mp_clamp (c);
740   return MP_OKAY;
741 }
742 
743 /* high level subtraction (handles signs) */
744 static int
745 mp_sub (mp_int * a, mp_int * b, mp_int * c)
746 {
747   int     sa, sb, res;
748 
749   sa = a->sign;
750   sb = b->sign;
751 
752   if (sa != sb) {
753     /* subtract a negative from a positive, OR */
754     /* subtract a positive from a negative. */
755     /* In either case, ADD their magnitudes, */
756     /* and use the sign of the first number. */
757     c->sign = sa;
758     res = s_mp_add (a, b, c);
759   } else {
760     /* subtract a positive from a positive, OR */
761     /* subtract a negative from a negative. */
762     /* First, take the difference between their */
763     /* magnitudes, then... */
764     if (mp_cmp_mag (a, b) != MP_LT) {
765       /* Copy the sign from the first */
766       c->sign = sa;
767       /* The first has a larger or equal magnitude */
768       res = s_mp_sub (a, b, c);
769     } else {
770       /* The result has the *opposite* sign from */
771       /* the first number. */
772       c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
773       /* The second has a larger magnitude */
774       res = s_mp_sub (b, a, c);
775     }
776   }
777   return res;
778 }
779 
780 /* shift right a certain amount of digits */
781 static int mp_rshd (mp_int * a, int b)
782 {
783   int     x;
784 
785   /* if b <= 0 then ignore it */
786   if (b <= 0) {
787     return 0;
788   }
789 
790   /* if b > used then simply zero it and return */
791   if (a->used <= b) {
792     mp_zero (a);
793     return 0;
794   }
795 
796   {
797     mp_digit *bottom, *top;
798 
799     /* shift the digits down */
800 
801     /* bottom */
802     bottom = a->dp;
803 
804     /* top [offset into digits] */
805     top = a->dp + b;
806 
807     /* this is implemented as a sliding window where
808      * the window is b-digits long and digits from
809      * the top of the window are copied to the bottom
810      *
811      * e.g.
812 
813      b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
814                  /\                   |      ---->
815                   \-------------------/      ---->
816      */
817     for (x = 0; x < (a->used - b); x++) {
818       *bottom++ = *top++;
819     }
820 
821     /* zero the top digits */
822     for (; x < a->used; x++) {
823       *bottom++ = 0;
824     }
825   }
826 
827   /* remove excess digits */
828   a->used -= b;
829   return 1;
830 }
831 
832 /* multiply by a digit */
833 static int
834 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
835 {
836   mp_digit u, *tmpa, *tmpc;
837   mp_word  r;
838   int      ix, res, olduse;
839 
840   /* make sure c is big enough to hold a*b */
841   if (c->alloc < a->used + 1) {
842     if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
843       return res;
844     }
845   }
846 
847   /* get the original destinations used count */
848   olduse = c->used;
849 
850   /* set the sign */
851   c->sign = a->sign;
852 
853   /* alias for a->dp [source] */
854   tmpa = a->dp;
855 
856   /* alias for c->dp [dest] */
857   tmpc = c->dp;
858 
859   /* zero carry */
860   u = 0;
861 
862   /* compute columns */
863   for (ix = 0; ix < a->used; ix++) {
864     /* compute product and carry sum for this term */
865     r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
866 
867     /* mask off higher bits to get a single digit */
868     *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
869 
870     /* send carry into next iteration */
871     u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
872   }
873 
874   /* store final carry [if any] and increment ix offset  */
875   *tmpc++ = u;
876   ++ix;
877 
878   /* now zero digits above the top */
879   while (ix++ < olduse) {
880      *tmpc++ = 0;
881   }
882 
883   /* set used count */
884   c->used = a->used + 1;
885   mp_clamp(c);
886 
887   return MP_OKAY;
888 }
889 
890 /* high level addition (handles signs) */
891 static int mp_add (mp_int * a, mp_int * b, mp_int * c)
892 {
893   int     sa, sb, res;
894 
895   /* get sign of both inputs */
896   sa = a->sign;
897   sb = b->sign;
898 
899   /* handle two cases, not four */
900   if (sa == sb) {
901     /* both positive or both negative */
902     /* add their magnitudes, copy the sign */
903     c->sign = sa;
904     res = s_mp_add (a, b, c);
905   } else {
906     /* one positive, the other negative */
907     /* subtract the one with the greater magnitude from */
908     /* the one of the lesser magnitude.  The result gets */
909     /* the sign of the one with the greater magnitude. */
910     if (mp_cmp_mag (a, b) == MP_LT) {
911       c->sign = sb;
912       res = s_mp_sub (b, a, c);
913     } else {
914       c->sign = sa;
915       res = s_mp_sub (a, b, c);
916     }
917   }
918   return res;
919 }
920 
921 /* swap the elements of two integers, for cases where you can't simply swap the
922  * mp_int pointers around
923  */
924 static void
925 mp_exch(mp_int *a, mp_int *b)
926 {
927   mp_int  t;
928 
929   t  = *a;
930   *a = *b;
931   *b = t;
932 }
933 
934 /* calc a value mod 2**b */
935 static int
936 mp_mod_2d (mp_int * a, int b, mp_int * c)
937 {
938   int     x, res;
939 
940   /* if b is <= 0 then zero the int */
941   if (b <= 0) {
942     mp_zero (c);
943     return MP_OKAY;
944   }
945 
946   /* if the modulus is larger than the value than return */
947   if (b >= (int) (a->used * DIGIT_BIT)) {
948     res = mp_copy (a, c);
949     return res;
950   }
951 
952   /* copy */
953   if ((res = mp_copy (a, c)) != MP_OKAY) {
954     return res;
955   }
956 
957   /* zero digits above the last digit of the modulus */
958   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
959     c->dp[x] = 0;
960   }
961   /* clear the digit that is not completely outside/inside the modulus */
962   c->dp[b / DIGIT_BIT] &=
963     (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
964   mp_clamp (c);
965   return MP_OKAY;
966 }
967 
968 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
969 static int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
970 {
971   mp_digit D, r, rr;
972   int     x, res;
973   mp_int  t;
974 
975 
976   /* if the shift count is <= 0 then we do no work */
977   if (b <= 0) {
978     res = mp_copy (a, c);
979     if (d != NULL) {
980       mp_zero (d);
981     }
982     return res;
983   }
984 
985   if ((res = mp_init (&t)) != MP_OKAY) {
986     return res;
987   }
988 
989   /* get the remainder */
990   if (d != NULL) {
991     if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
992       mp_clear (&t);
993       return res;
994     }
995   }
996 
997   /* copy */
998   if ((res = mp_copy (a, c)) != MP_OKAY) {
999     mp_clear (&t);
1000     return res;
1001   }
1002 
1003   /* shift by as many digits in the bit count */
1004   if (b >= (int)DIGIT_BIT) {
1005     mp_rshd (c, b / DIGIT_BIT);
1006   }
1007 
1008   /* shift any bit count < DIGIT_BIT */
1009   D = (mp_digit) (b % DIGIT_BIT);
1010   if (D != 0) {
1011     mp_digit *tmpc, mask, shift;
1012 
1013     /* mask */
1014     mask = (((mp_digit)1) << D) - 1;
1015 
1016     /* shift for lsb */
1017     shift = DIGIT_BIT - D;
1018 
1019     /* alias */
1020     tmpc = c->dp + (c->used - 1);
1021 
1022     /* carry */
1023     r = 0;
1024     for (x = c->used - 1; x >= 0; x--) {
1025       /* get the lower  bits of this word in a temp */
1026       rr = *tmpc & mask;
1027 
1028       /* shift the current word and mix in the carry bits from the previous word */
1029       *tmpc = (*tmpc >> D) | (r << shift);
1030       --tmpc;
1031 
1032       /* set the carry to the carry bits of the current word found above */
1033       r = rr;
1034     }
1035   }
1036   mp_clamp (c);
1037   if (d != NULL) {
1038     mp_exch (&t, d);
1039   }
1040   mp_clear (&t);
1041   return MP_OKAY;
1042 }
1043 
1044 /* integer signed division.
1045  * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1046  * HAC pp.598 Algorithm 14.20
1047  *
1048  * Note that the description in HAC is horribly
1049  * incomplete.  For example, it doesn't consider
1050  * the case where digits are removed from 'x' in
1051  * the inner loop.  It also doesn't consider the
1052  * case that y has fewer than three digits, etc..
1053  *
1054  * The overall algorithm is as described as
1055  * 14.20 from HAC but fixed to treat these cases.
1056 */
1057 static int
1058 mp_div(mp_int *c, mp_int *d, mp_int *a, mp_int *b)
1059 {
1060   mp_int  q, x, y, t1, t2;
1061   int     res, n, t, i, norm, neg;
1062 
1063   /* is divisor zero ? */
1064   if (BN_is_zero (b) == 1) {
1065     return MP_VAL;
1066   }
1067 
1068   /* if a < b then q=0, r = a */
1069   if (mp_cmp_mag (a, b) == MP_LT) {
1070     if (d != NULL) {
1071       res = mp_copy (a, d);
1072     } else {
1073       res = MP_OKAY;
1074     }
1075     if (c != NULL) {
1076       mp_zero (c);
1077     }
1078     return res;
1079   }
1080 
1081   if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1082     return res;
1083   }
1084   q.used = a->used + 2;
1085 
1086   if ((res = mp_init (&t1)) != MP_OKAY) {
1087     goto LBL_Q;
1088   }
1089 
1090   if ((res = mp_init (&t2)) != MP_OKAY) {
1091     goto LBL_T1;
1092   }
1093 
1094   if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1095     goto LBL_T2;
1096   }
1097 
1098   if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1099     goto LBL_X;
1100   }
1101 
1102   /* fix the sign */
1103   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1104   x.sign = y.sign = MP_ZPOS;
1105 
1106   /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1107   norm = mp_count_bits(&y) % DIGIT_BIT;
1108   if (norm < (int)(DIGIT_BIT-1)) {
1109      norm = (DIGIT_BIT-1) - norm;
1110      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1111        goto LBL_Y;
1112      }
1113      if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1114        goto LBL_Y;
1115      }
1116   } else {
1117      norm = 0;
1118   }
1119 
1120   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1121   n = x.used - 1;
1122   t = y.used - 1;
1123 
1124   /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1125   if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1126     goto LBL_Y;
1127   }
1128 
1129   while (mp_cmp (&x, &y) != MP_LT) {
1130     ++(q.dp[n - t]);
1131     if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1132       goto LBL_Y;
1133     }
1134   }
1135 
1136   /* reset y by shifting it back down */
1137   mp_rshd (&y, n - t);
1138 
1139   /* step 3. for i from n down to (t + 1) */
1140   for (i = n; i >= (t + 1); i--) {
1141     if (i > x.used) {
1142       continue;
1143     }
1144 
1145     /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1146      * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1147     if (x.dp[i] == y.dp[t]) {
1148       q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1149     } else {
1150       mp_word tmp;
1151       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1152       tmp |= ((mp_word) x.dp[i - 1]);
1153       tmp /= ((mp_word) y.dp[t]);
1154       if (tmp > (mp_word) MP_MASK)
1155         tmp = MP_MASK;
1156       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1157     }
1158 
1159     /* while (q{i-t-1} * (yt * b + y{t-1})) >
1160              xi * b**2 + xi-1 * b + xi-2
1161 
1162        do q{i-t-1} -= 1;
1163     */
1164     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1165     do {
1166       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1167 
1168       /* find left hand */
1169       mp_zero (&t1);
1170       t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1171       t1.dp[1] = y.dp[t];
1172       t1.used = 2;
1173       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1174         goto LBL_Y;
1175       }
1176 
1177       /* find right hand */
1178       t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1179       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1180       t2.dp[2] = x.dp[i];
1181       t2.used = 3;
1182     } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1183 
1184     /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1185     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1186       goto LBL_Y;
1187     }
1188 
1189     if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1190       goto LBL_Y;
1191     }
1192 
1193     if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1194       goto LBL_Y;
1195     }
1196 
1197     /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1198     if (x.sign == MP_NEG) {
1199       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1200         goto LBL_Y;
1201       }
1202       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1203         goto LBL_Y;
1204       }
1205       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1206         goto LBL_Y;
1207       }
1208 
1209       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1210     }
1211   }
1212 
1213   /* now q is the quotient and x is the remainder
1214    * [which we have to normalize]
1215    */
1216 
1217   /* get sign before writing to c */
1218   x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1219 
1220   if (c != NULL) {
1221     mp_clamp (&q);
1222     mp_exch (&q, c);
1223     c->sign = neg;
1224   }
1225 
1226   if (d != NULL) {
1227     mp_div_2d (&x, norm, &x, NULL);
1228     mp_exch (&x, d);
1229   }
1230 
1231   res = MP_OKAY;
1232 
1233 LBL_Y:mp_clear (&y);
1234 LBL_X:mp_clear (&x);
1235 LBL_T2:mp_clear (&t2);
1236 LBL_T1:mp_clear (&t1);
1237 LBL_Q:mp_clear (&q);
1238   return res;
1239 }
1240 
1241 /* c = a mod b, 0 <= c < b */
1242 static int
1243 mp_mod (mp_int * a, mp_int * b, mp_int * c)
1244 {
1245   mp_int  t;
1246   int     res;
1247 
1248   if ((res = mp_init (&t)) != MP_OKAY) {
1249     return res;
1250   }
1251 
1252   if ((res = mp_div (NULL, &t, a, b)) != MP_OKAY) {
1253     mp_clear (&t);
1254     return res;
1255   }
1256 
1257   if (t.sign != b->sign) {
1258     res = mp_add (b, &t, c);
1259   } else {
1260     res = MP_OKAY;
1261     mp_exch (&t, c);
1262   }
1263 
1264   mp_clear (&t);
1265   return res;
1266 }
1267 
1268 /* set to a digit */
1269 static void mp_set (mp_int * a, mp_digit b)
1270 {
1271   mp_zero (a);
1272   a->dp[0] = b & MP_MASK;
1273   a->used  = (a->dp[0] != 0) ? 1 : 0;
1274 }
1275 
1276 /* b = a/2 */
1277 static int mp_div_2(mp_int * a, mp_int * b)
1278 {
1279   int     x, res, oldused;
1280 
1281   /* copy */
1282   if (b->alloc < a->used) {
1283     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1284       return res;
1285     }
1286   }
1287 
1288   oldused = b->used;
1289   b->used = a->used;
1290   {
1291     mp_digit r, rr, *tmpa, *tmpb;
1292 
1293     /* source alias */
1294     tmpa = a->dp + b->used - 1;
1295 
1296     /* dest alias */
1297     tmpb = b->dp + b->used - 1;
1298 
1299     /* carry */
1300     r = 0;
1301     for (x = b->used - 1; x >= 0; x--) {
1302       /* get the carry for the next iteration */
1303       rr = *tmpa & 1;
1304 
1305       /* shift the current digit, add in carry and store */
1306       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1307 
1308       /* forward carry to next iteration */
1309       r = rr;
1310     }
1311 
1312     /* zero excess digits */
1313     tmpb = b->dp + b->used;
1314     for (x = b->used; x < oldused; x++) {
1315       *tmpb++ = 0;
1316     }
1317   }
1318   b->sign = a->sign;
1319   mp_clamp (b);
1320   return MP_OKAY;
1321 }
1322 
1323 /* compare a digit */
1324 static int mp_cmp_d(mp_int * a, mp_digit b)
1325 {
1326   /* compare based on sign */
1327   if (a->sign == MP_NEG) {
1328     return MP_LT;
1329   }
1330 
1331   /* compare based on magnitude */
1332   if (a->used > 1) {
1333     return MP_GT;
1334   }
1335 
1336   /* compare the only digit of a to b */
1337   if (a->dp[0] > b) {
1338     return MP_GT;
1339   } else if (a->dp[0] < b) {
1340     return MP_LT;
1341   } else {
1342     return MP_EQ;
1343   }
1344 }
1345 
1346 static void mp_clear_multi(mp_int *mp, ...)
1347 {
1348     mp_int* next_mp = mp;
1349     va_list args;
1350     va_start(args, mp);
1351     while (next_mp != NULL) {
1352         mp_clear(next_mp);
1353         next_mp = va_arg(args, mp_int*);
1354     }
1355     va_end(args);
1356 }
1357 
1358 /* computes the modular inverse via binary extended euclidean algorithm,
1359  * that is c = 1/a mod b
1360  *
1361  * Based on slow invmod except this is optimized for the case where b is
1362  * odd as per HAC Note 14.64 on pp. 610
1363  */
1364 static int
1365 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
1366 {
1367   mp_int  x, y, u, v, B, D;
1368   int     res, neg;
1369 
1370   /* 2. [modified] b must be odd   */
1371   if (BN_is_even (b) == 1) {
1372     return MP_VAL;
1373   }
1374 
1375   /* init all our temps */
1376   if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
1377      return res;
1378   }
1379 
1380   /* x == modulus, y == value to invert */
1381   if ((res = mp_copy (b, &x)) != MP_OKAY) {
1382     goto LBL_ERR;
1383   }
1384 
1385   /* we need y = |a| */
1386   if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
1387     goto LBL_ERR;
1388   }
1389 
1390   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
1391   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
1392     goto LBL_ERR;
1393   }
1394   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
1395     goto LBL_ERR;
1396   }
1397   mp_set (&D, (unsigned long)1);
1398 
1399 top:
1400   /* 4.  while u is even do */
1401   while (BN_is_even (&u) == 1) {
1402     /* 4.1 u = u/2 */
1403     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
1404       goto LBL_ERR;
1405     }
1406     /* 4.2 if B is odd then */
1407     if (BN_is_odd (&B) == 1) {
1408       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
1409         goto LBL_ERR;
1410       }
1411     }
1412     /* B = B/2 */
1413     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
1414       goto LBL_ERR;
1415     }
1416   }
1417 
1418   /* 5.  while v is even do */
1419   while (BN_is_even (&v) == 1) {
1420     /* 5.1 v = v/2 */
1421     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
1422       goto LBL_ERR;
1423     }
1424     /* 5.2 if D is odd then */
1425     if (BN_is_odd (&D) == 1) {
1426       /* D = (D-x)/2 */
1427       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
1428         goto LBL_ERR;
1429       }
1430     }
1431     /* D = D/2 */
1432     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
1433       goto LBL_ERR;
1434     }
1435   }
1436 
1437   /* 6.  if u >= v then */
1438   if (mp_cmp (&u, &v) != MP_LT) {
1439     /* u = u - v, B = B - D */
1440     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
1441       goto LBL_ERR;
1442     }
1443 
1444     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
1445       goto LBL_ERR;
1446     }
1447   } else {
1448     /* v - v - u, D = D - B */
1449     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
1450       goto LBL_ERR;
1451     }
1452 
1453     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
1454       goto LBL_ERR;
1455     }
1456   }
1457 
1458   /* if not zero goto step 4 */
1459   if (BN_is_zero (&u) == 0) {
1460     goto top;
1461   }
1462 
1463   /* now a = C, b = D, gcd == g*v */
1464 
1465   /* if v != 1 then there is no inverse */
1466   if (mp_cmp_d (&v, (unsigned long)1) != MP_EQ) {
1467     res = MP_VAL;
1468     goto LBL_ERR;
1469   }
1470 
1471   /* b is now the inverse */
1472   neg = a->sign;
1473   while (D.sign == MP_NEG) {
1474     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
1475       goto LBL_ERR;
1476     }
1477   }
1478   mp_exch (&D, c);
1479   c->sign = neg;
1480   res = MP_OKAY;
1481 
1482 LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
1483   return res;
1484 }
1485 
1486 /* hac 14.61, pp608 */
1487 static int
1488 mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
1489 {
1490   mp_int  x, y, u, v, A, B, C, D;
1491   int     res;
1492 
1493   /* b cannot be negative */
1494   if (b->sign == MP_NEG || BN_is_zero(b) == 1) {
1495     return MP_VAL;
1496   }
1497 
1498   /* init temps */
1499   if ((res = mp_init_multi(&x, &y, &u, &v,
1500                            &A, &B, &C, &D, NULL)) != MP_OKAY) {
1501      return res;
1502   }
1503 
1504   /* x = a, y = b */
1505   if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
1506       goto LBL_ERR;
1507   }
1508   if ((res = mp_copy (b, &y)) != MP_OKAY) {
1509     goto LBL_ERR;
1510   }
1511 
1512   /* 2. [modified] if x,y are both even then return an error! */
1513   if (BN_is_even (&x) == 1 && BN_is_even (&y) == 1) {
1514     res = MP_VAL;
1515     goto LBL_ERR;
1516   }
1517 
1518   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
1519   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
1520     goto LBL_ERR;
1521   }
1522   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
1523     goto LBL_ERR;
1524   }
1525   mp_set (&A, (unsigned long)1);
1526   mp_set (&D, (unsigned long)1);
1527 
1528 top:
1529   /* 4.  while u is even do */
1530   while (BN_is_even (&u) == 1) {
1531     /* 4.1 u = u/2 */
1532     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
1533       goto LBL_ERR;
1534     }
1535     /* 4.2 if A or B is odd then */
1536     if (BN_is_odd (&A) == 1 || BN_is_odd (&B) == 1) {
1537       /* A = (A+y)/2, B = (B-x)/2 */
1538       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
1539          goto LBL_ERR;
1540       }
1541       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
1542          goto LBL_ERR;
1543       }
1544     }
1545     /* A = A/2, B = B/2 */
1546     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
1547       goto LBL_ERR;
1548     }
1549     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
1550       goto LBL_ERR;
1551     }
1552   }
1553 
1554   /* 5.  while v is even do */
1555   while (BN_is_even (&v) == 1) {
1556     /* 5.1 v = v/2 */
1557     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
1558       goto LBL_ERR;
1559     }
1560     /* 5.2 if C or D is odd then */
1561     if (BN_is_odd (&C) == 1 || BN_is_odd (&D) == 1) {
1562       /* C = (C+y)/2, D = (D-x)/2 */
1563       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
1564          goto LBL_ERR;
1565       }
1566       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
1567          goto LBL_ERR;
1568       }
1569     }
1570     /* C = C/2, D = D/2 */
1571     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
1572       goto LBL_ERR;
1573     }
1574     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
1575       goto LBL_ERR;
1576     }
1577   }
1578 
1579   /* 6.  if u >= v then */
1580   if (mp_cmp (&u, &v) != MP_LT) {
1581     /* u = u - v, A = A - C, B = B - D */
1582     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
1583       goto LBL_ERR;
1584     }
1585 
1586     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
1587       goto LBL_ERR;
1588     }
1589 
1590     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
1591       goto LBL_ERR;
1592     }
1593   } else {
1594     /* v - v - u, C = C - A, D = D - B */
1595     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
1596       goto LBL_ERR;
1597     }
1598 
1599     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
1600       goto LBL_ERR;
1601     }
1602 
1603     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
1604       goto LBL_ERR;
1605     }
1606   }
1607 
1608   /* if not zero goto step 4 */
1609   if (BN_is_zero (&u) == 0)
1610     goto top;
1611 
1612   /* now a = C, b = D, gcd == g*v */
1613 
1614   /* if v != 1 then there is no inverse */
1615   if (mp_cmp_d (&v, 1) != MP_EQ) {
1616     res = MP_VAL;
1617     goto LBL_ERR;
1618   }
1619 
1620   /* if its too low */
1621   while (mp_cmp_d(&C, 0) == MP_LT) {
1622       if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
1623          goto LBL_ERR;
1624       }
1625   }
1626 
1627   /* too big */
1628   while (mp_cmp_mag(&C, b) != MP_LT) {
1629       if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
1630          goto LBL_ERR;
1631       }
1632   }
1633 
1634   /* C is now the inverse */
1635   mp_exch (&C, c);
1636   res = MP_OKAY;
1637 LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
1638   return res;
1639 }
1640 
1641 static int
1642 mp_invmod(mp_int *c, mp_int *a, mp_int *b)
1643 {
1644   /* b cannot be negative */
1645   if (b->sign == MP_NEG || BN_is_zero(b) == 1) {
1646     return MP_VAL;
1647   }
1648 
1649   /* if the modulus is odd we can use a faster routine instead */
1650   if (BN_is_odd (b) == 1) {
1651     return fast_mp_invmod(a, b, c);
1652   }
1653 
1654   return mp_invmod_slow(a, b, c);
1655 
1656   /*NOTREACHED*/
1657   return MP_VAL;
1658 }
1659 
1660 /* b = |a|
1661  *
1662  * Simple function copies the input and fixes the sign to positive
1663  */
1664 static int
1665 mp_abs (mp_int * a, mp_int * b)
1666 {
1667   int     res;
1668 
1669   /* copy a to b */
1670   if (a != b) {
1671      if ((res = mp_copy (a, b)) != MP_OKAY) {
1672        return res;
1673      }
1674   }
1675 
1676   /* force the sign of b to positive */
1677   b->sign = MP_ZPOS;
1678 
1679   return MP_OKAY;
1680 }
1681 
1682 /* determines if reduce_2k_l can be used */
1683 static int mp_reduce_is_2k_l(mp_int *a)
1684 {
1685    int ix, iy;
1686 
1687    if (a->used == 0) {
1688       return MP_NO;
1689    } else if (a->used == 1) {
1690       return MP_YES;
1691    } else if (a->used > 1) {
1692       /* if more than half of the digits are -1 we're sold */
1693       for (iy = ix = 0; ix < a->used; ix++) {
1694           if (a->dp[ix] == MP_MASK) {
1695               ++iy;
1696           }
1697       }
1698       return (iy >= (a->used/2)) ? MP_YES : MP_NO;
1699 
1700    }
1701    return MP_NO;
1702 }
1703 
1704 /* computes a = 2**b
1705  *
1706  * Simple algorithm which zeroes the int, grows it then just sets one bit
1707  * as required.
1708  */
1709 static int
1710 mp_2expt (mp_int * a, int b)
1711 {
1712   int     res;
1713 
1714   /* zero a as per default */
1715   mp_zero (a);
1716 
1717   /* grow a to accomodate the single bit */
1718   if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
1719     return res;
1720   }
1721 
1722   /* set the used count of where the bit will go */
1723   a->used = b / DIGIT_BIT + 1;
1724 
1725   /* put the single bit in its place */
1726   a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
1727 
1728   return MP_OKAY;
1729 }
1730 
1731 /* pre-calculate the value required for Barrett reduction
1732  * For a given modulus "b" it calulates the value required in "a"
1733  */
1734 static int mp_reduce_setup (mp_int * a, mp_int * b)
1735 {
1736   int     res;
1737 
1738   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
1739     return res;
1740   }
1741   return mp_div (a, NULL, a, b);
1742 }
1743 
1744 /* b = a*2 */
1745 static int mp_mul_2(mp_int * a, mp_int * b)
1746 {
1747   int     x, res, oldused;
1748 
1749   /* grow to accomodate result */
1750   if (b->alloc < a->used + 1) {
1751     if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
1752       return res;
1753     }
1754   }
1755 
1756   oldused = b->used;
1757   b->used = a->used;
1758 
1759   {
1760     mp_digit r, rr, *tmpa, *tmpb;
1761 
1762     /* alias for source */
1763     tmpa = a->dp;
1764 
1765     /* alias for dest */
1766     tmpb = b->dp;
1767 
1768     /* carry */
1769     r = 0;
1770     for (x = 0; x < a->used; x++) {
1771 
1772       /* get what will be the *next* carry bit from the
1773        * MSB of the current digit
1774        */
1775       rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
1776 
1777       /* now shift up this digit, add in the carry [from the previous] */
1778       *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
1779 
1780       /* copy the carry that would be from the source
1781        * digit into the next iteration
1782        */
1783       r = rr;
1784     }
1785 
1786     /* new leading digit? */
1787     if (r != 0) {
1788       /* add a MSB which is always 1 at this point */
1789       *tmpb = 1;
1790       ++(b->used);
1791     }
1792 
1793     /* now zero any excess digits on the destination
1794      * that we didn't write to
1795      */
1796     tmpb = b->dp + b->used;
1797     for (x = b->used; x < oldused; x++) {
1798       *tmpb++ = 0;
1799     }
1800   }
1801   b->sign = a->sign;
1802   return MP_OKAY;
1803 }
1804 
1805 /* divide by three (based on routine from MPI and the GMP manual) */
1806 static int
1807 mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
1808 {
1809   mp_int   q;
1810   mp_word  w, t;
1811   mp_digit b;
1812   int      res, ix;
1813 
1814   /* b = 2**DIGIT_BIT / 3 */
1815   b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
1816 
1817   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1818      return res;
1819   }
1820 
1821   q.used = a->used;
1822   q.sign = a->sign;
1823   w = 0;
1824   for (ix = a->used - 1; ix >= 0; ix--) {
1825      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1826 
1827      if (w >= 3) {
1828         /* multiply w by [1/3] */
1829         t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
1830 
1831         /* now subtract 3 * [w/3] from w, to get the remainder */
1832         w -= t+t+t;
1833 
1834         /* fixup the remainder as required since
1835          * the optimization is not exact.
1836          */
1837         while (w >= 3) {
1838            t += 1;
1839            w -= 3;
1840         }
1841       } else {
1842         t = 0;
1843       }
1844       q.dp[ix] = (mp_digit)t;
1845   }
1846 
1847   /* [optional] store the remainder */
1848   if (d != NULL) {
1849      *d = (mp_digit)w;
1850   }
1851 
1852   /* [optional] store the quotient */
1853   if (c != NULL) {
1854      mp_clamp(&q);
1855      mp_exch(&q, c);
1856   }
1857   mp_clear(&q);
1858 
1859   return res;
1860 }
1861 
1862 /* multiplication using the Toom-Cook 3-way algorithm
1863  *
1864  * Much more complicated than Karatsuba but has a lower
1865  * asymptotic running time of O(N**1.464).  This algorithm is
1866  * only particularly useful on VERY large inputs
1867  * (we're talking 1000s of digits here...).
1868 */
1869 static int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
1870 {
1871     mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
1872     int res, B;
1873 
1874     /* init temps */
1875     if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4,
1876                              &a0, &a1, &a2, &b0, &b1,
1877                              &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
1878        return res;
1879     }
1880 
1881     /* B */
1882     B = MIN(a->used, b->used) / 3;
1883 
1884     /* a = a2 * B**2 + a1 * B + a0 */
1885     if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
1886        goto ERR;
1887     }
1888 
1889     if ((res = mp_copy(a, &a1)) != MP_OKAY) {
1890        goto ERR;
1891     }
1892     mp_rshd(&a1, B);
1893     mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
1894 
1895     if ((res = mp_copy(a, &a2)) != MP_OKAY) {
1896        goto ERR;
1897     }
1898     mp_rshd(&a2, B*2);
1899 
1900     /* b = b2 * B**2 + b1 * B + b0 */
1901     if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
1902        goto ERR;
1903     }
1904 
1905     if ((res = mp_copy(b, &b1)) != MP_OKAY) {
1906        goto ERR;
1907     }
1908     mp_rshd(&b1, B);
1909     mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
1910 
1911     if ((res = mp_copy(b, &b2)) != MP_OKAY) {
1912        goto ERR;
1913     }
1914     mp_rshd(&b2, B*2);
1915 
1916     /* w0 = a0*b0 */
1917     if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
1918        goto ERR;
1919     }
1920 
1921     /* w4 = a2 * b2 */
1922     if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
1923        goto ERR;
1924     }
1925 
1926     /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
1927     if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
1928        goto ERR;
1929     }
1930     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
1931        goto ERR;
1932     }
1933     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
1934        goto ERR;
1935     }
1936     if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
1937        goto ERR;
1938     }
1939 
1940     if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
1941        goto ERR;
1942     }
1943     if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
1944        goto ERR;
1945     }
1946     if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
1947        goto ERR;
1948     }
1949     if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
1950        goto ERR;
1951     }
1952 
1953     if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
1954        goto ERR;
1955     }
1956 
1957     /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
1958     if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
1959        goto ERR;
1960     }
1961     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
1962        goto ERR;
1963     }
1964     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
1965        goto ERR;
1966     }
1967     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
1968        goto ERR;
1969     }
1970 
1971     if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
1972        goto ERR;
1973     }
1974     if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
1975        goto ERR;
1976     }
1977     if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
1978        goto ERR;
1979     }
1980     if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
1981        goto ERR;
1982     }
1983 
1984     if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
1985        goto ERR;
1986     }
1987 
1988 
1989     /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
1990     if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
1991        goto ERR;
1992     }
1993     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
1994        goto ERR;
1995     }
1996     if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
1997        goto ERR;
1998     }
1999     if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
2000        goto ERR;
2001     }
2002     if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
2003        goto ERR;
2004     }
2005 
2006     /* now solve the matrix
2007 
2008        0  0  0  0  1
2009        1  2  4  8  16
2010        1  1  1  1  1
2011        16 8  4  2  1
2012        1  0  0  0  0
2013 
2014        using 12 subtractions, 4 shifts,
2015               2 small divisions and 1 small multiplication
2016      */
2017 
2018      /* r1 - r4 */
2019      if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
2020         goto ERR;
2021      }
2022      /* r3 - r0 */
2023      if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
2024         goto ERR;
2025      }
2026      /* r1/2 */
2027      if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
2028         goto ERR;
2029      }
2030      /* r3/2 */
2031      if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
2032         goto ERR;
2033      }
2034      /* r2 - r0 - r4 */
2035      if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
2036         goto ERR;
2037      }
2038      if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
2039         goto ERR;
2040      }
2041      /* r1 - r2 */
2042      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2043         goto ERR;
2044      }
2045      /* r3 - r2 */
2046      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2047         goto ERR;
2048      }
2049      /* r1 - 8r0 */
2050      if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
2051         goto ERR;
2052      }
2053      if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
2054         goto ERR;
2055      }
2056      /* r3 - 8r4 */
2057      if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
2058         goto ERR;
2059      }
2060      if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
2061         goto ERR;
2062      }
2063      /* 3r2 - r1 - r3 */
2064      if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
2065         goto ERR;
2066      }
2067      if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
2068         goto ERR;
2069      }
2070      if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
2071         goto ERR;
2072      }
2073      /* r1 - r2 */
2074      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2075         goto ERR;
2076      }
2077      /* r3 - r2 */
2078      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2079         goto ERR;
2080      }
2081      /* r1/3 */
2082      if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
2083         goto ERR;
2084      }
2085      /* r3/3 */
2086      if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
2087         goto ERR;
2088      }
2089 
2090      /* at this point shift W[n] by B*n */
2091      if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
2092         goto ERR;
2093      }
2094      if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
2095         goto ERR;
2096      }
2097      if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
2098         goto ERR;
2099      }
2100      if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
2101         goto ERR;
2102      }
2103 
2104      if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
2105         goto ERR;
2106      }
2107      if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
2108         goto ERR;
2109      }
2110      if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
2111         goto ERR;
2112      }
2113      if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
2114         goto ERR;
2115      }
2116 
2117 ERR:
2118      mp_clear_multi(&w0, &w1, &w2, &w3, &w4,
2119                     &a0, &a1, &a2, &b0, &b1,
2120                     &b2, &tmp1, &tmp2, NULL);
2121      return res;
2122 }
2123 
2124 #define TOOM_MUL_CUTOFF	350
2125 #define KARATSUBA_MUL_CUTOFF 80
2126 
2127 /* c = |a| * |b| using Karatsuba Multiplication using
2128  * three half size multiplications
2129  *
2130  * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2131  * let n represent half of the number of digits in
2132  * the min(a,b)
2133  *
2134  * a = a1 * B**n + a0
2135  * b = b1 * B**n + b0
2136  *
2137  * Then, a * b =>
2138    a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B + a0b0
2139  *
2140  * Note that a1b1 and a0b0 are used twice and only need to be
2141  * computed once.  So in total three half size (half # of
2142  * digit) multiplications are performed, a0b0, a1b1 and
2143  * (a1+b1)(a0+b0)
2144  *
2145  * Note that a multiplication of half the digits requires
2146  * 1/4th the number of single precision multiplications so in
2147  * total after one call 25% of the single precision multiplications
2148  * are saved.  Note also that the call to mp_mul can end up back
2149  * in this function if the a0, a1, b0, or b1 are above the threshold.
2150  * This is known as divide-and-conquer and leads to the famous
2151  * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
2152  * the standard O(N**2) that the baseline/comba methods use.
2153  * Generally though the overhead of this method doesn't pay off
2154  * until a certain size (N ~ 80) is reached.
2155  */
2156 static int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
2157 {
2158   mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
2159   int     B;
2160   int     err;
2161 
2162   /* default the return code to an error */
2163   err = MP_MEM;
2164 
2165   /* min # of digits */
2166   B = MIN (a->used, b->used);
2167 
2168   /* now divide in two */
2169   B = (int)((unsigned)B >> 1);
2170 
2171   /* init copy all the temps */
2172   if (mp_init_size (&x0, B) != MP_OKAY)
2173     goto ERR;
2174   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2175     goto X0;
2176   if (mp_init_size (&y0, B) != MP_OKAY)
2177     goto X1;
2178   if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2179     goto Y0;
2180 
2181   /* init temps */
2182   if (mp_init_size (&t1, B * 2) != MP_OKAY)
2183     goto Y1;
2184   if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2185     goto T1;
2186   if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2187     goto X0Y0;
2188 
2189   /* now shift the digits */
2190   x0.used = y0.used = B;
2191   x1.used = a->used - B;
2192   y1.used = b->used - B;
2193 
2194   {
2195     int x;
2196     mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2197 
2198     /* we copy the digits directly instead of using higher level functions
2199      * since we also need to shift the digits
2200      */
2201     tmpa = a->dp;
2202     tmpb = b->dp;
2203 
2204     tmpx = x0.dp;
2205     tmpy = y0.dp;
2206     for (x = 0; x < B; x++) {
2207       *tmpx++ = *tmpa++;
2208       *tmpy++ = *tmpb++;
2209     }
2210 
2211     tmpx = x1.dp;
2212     for (x = B; x < a->used; x++) {
2213       *tmpx++ = *tmpa++;
2214     }
2215 
2216     tmpy = y1.dp;
2217     for (x = B; x < b->used; x++) {
2218       *tmpy++ = *tmpb++;
2219     }
2220   }
2221 
2222   /* only need to clamp the lower words since by definition the
2223    * upper words x1/y1 must have a known number of digits
2224    */
2225   mp_clamp (&x0);
2226   mp_clamp (&y0);
2227 
2228   /* now calc the products x0y0 and x1y1 */
2229   /* after this x0 is no longer required, free temp [x0==t2]! */
2230   if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2231     goto X1Y1;          /* x0y0 = x0*y0 */
2232   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2233     goto X1Y1;          /* x1y1 = x1*y1 */
2234 
2235   /* now calc x1+x0 and y1+y0 */
2236   if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
2237     goto X1Y1;          /* t1 = x1 - x0 */
2238   if (s_mp_add (&y1, &y0, &x0) != MP_OKAY)
2239     goto X1Y1;          /* t2 = y1 - y0 */
2240   if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2241     goto X1Y1;          /* t1 = (x1 + x0) * (y1 + y0) */
2242 
2243   /* add x0y0 */
2244   if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2245     goto X1Y1;          /* t2 = x0y0 + x1y1 */
2246   if (s_mp_sub (&t1, &x0, &t1) != MP_OKAY)
2247     goto X1Y1;          /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */
2248 
2249   /* shift by B */
2250   if (mp_lshd (&t1, B) != MP_OKAY)
2251     goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2252   if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2253     goto X1Y1;          /* x1y1 = x1y1 << 2*B */
2254 
2255   if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2256     goto X1Y1;          /* t1 = x0y0 + t1 */
2257   if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2258     goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
2259 
2260   /* Algorithm succeeded set the return code to MP_OKAY */
2261   err = MP_OKAY;
2262 
2263 X1Y1:mp_clear (&x1y1);
2264 X0Y0:mp_clear (&x0y0);
2265 T1:mp_clear (&t1);
2266 Y1:mp_clear (&y1);
2267 Y0:mp_clear (&y0);
2268 X1:mp_clear (&x1);
2269 X0:mp_clear (&x0);
2270 ERR:
2271   return err;
2272 }
2273 
2274 /* Fast (comba) multiplier
2275  *
2276  * This is the fast column-array [comba] multiplier.  It is
2277  * designed to compute the columns of the product first
2278  * then handle the carries afterwards.  This has the effect
2279  * of making the nested loops that compute the columns very
2280  * simple and schedulable on super-scalar processors.
2281  *
2282  * This has been modified to produce a variable number of
2283  * digits of output so if say only a half-product is required
2284  * you don't have to compute the upper half (a feature
2285  * required for fast Barrett reduction).
2286  *
2287  * Based on Algorithm 14.12 on pp.595 of HAC.
2288  *
2289  */
2290 static int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2291 {
2292   int     olduse, res, pa, ix, iz;
2293   /*LINTED*/
2294   mp_digit W[MP_WARRAY];
2295   mp_word  _W;
2296 
2297   /* grow the destination as required */
2298   if (c->alloc < digs) {
2299     if ((res = mp_grow (c, digs)) != MP_OKAY) {
2300       return res;
2301     }
2302   }
2303 
2304   /* number of output digits to produce */
2305   pa = MIN(digs, a->used + b->used);
2306 
2307   /* clear the carry */
2308   _W = 0;
2309   for (ix = 0; ix < pa; ix++) {
2310       int      tx, ty;
2311       int      iy;
2312       mp_digit *tmpx, *tmpy;
2313 
2314       /* get offsets into the two bignums */
2315       ty = MIN(b->used-1, ix);
2316       tx = ix - ty;
2317 
2318       /* setup temp aliases */
2319       tmpx = a->dp + tx;
2320       tmpy = b->dp + ty;
2321 
2322       /* this is the number of times the loop will iterrate, essentially
2323          while (tx++ < a->used && ty-- >= 0) { ... }
2324        */
2325       iy = MIN(a->used-tx, ty+1);
2326 
2327       /* execute loop */
2328       for (iz = 0; iz < iy; ++iz) {
2329          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2330 
2331       }
2332 
2333       /* store term */
2334       W[ix] = ((mp_digit)_W) & MP_MASK;
2335 
2336       /* make next carry */
2337       _W = _W >> ((mp_word)DIGIT_BIT);
2338  }
2339 
2340   /* setup dest */
2341   olduse  = c->used;
2342   c->used = pa;
2343 
2344   {
2345     mp_digit *tmpc;
2346     tmpc = c->dp;
2347     for (ix = 0; ix < pa+1; ix++) {
2348       /* now extract the previous digit [below the carry] */
2349       *tmpc++ = W[ix];
2350     }
2351 
2352     /* clear unused digits [that existed in the old copy of c] */
2353     for (; ix < olduse; ix++) {
2354       *tmpc++ = 0;
2355     }
2356   }
2357   mp_clamp (c);
2358   return MP_OKAY;
2359 }
2360 
2361 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_mul_digs.c,v $ */
2362 /* Revision: 1.2 $ */
2363 /* Date: 2011/03/18 16:22:09 $ */
2364 
2365 
2366 /* multiplies |a| * |b| and only computes upto digs digits of result
2367  * HAC pp. 595, Algorithm 14.12  Modified so you can control how
2368  * many digits of output are created.
2369  */
2370 static int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2371 {
2372   mp_int  t;
2373   int     res, pa, pb, ix, iy;
2374   mp_digit u;
2375   mp_word r;
2376   mp_digit tmpx, *tmpt, *tmpy;
2377 
2378   /* can we use the fast multiplier? */
2379   if (((unsigned)(digs) < MP_WARRAY) &&
2380       MIN (a->used, b->used) <
2381           (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2382     return fast_s_mp_mul_digs (a, b, c, digs);
2383   }
2384 
2385   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
2386     return res;
2387   }
2388   t.used = digs;
2389 
2390   /* compute the digits of the product directly */
2391   pa = a->used;
2392   for (ix = 0; ix < pa; ix++) {
2393     /* set the carry to zero */
2394     u = 0;
2395 
2396     /* limit ourselves to making digs digits of output */
2397     pb = MIN (b->used, digs - ix);
2398 
2399     /* setup some aliases */
2400     /* copy of the digit from a used within the nested loop */
2401     tmpx = a->dp[ix];
2402 
2403     /* an alias for the destination shifted ix places */
2404     tmpt = t.dp + ix;
2405 
2406     /* an alias for the digits of b */
2407     tmpy = b->dp;
2408 
2409     /* compute the columns of the output and propagate the carry */
2410     for (iy = 0; iy < pb; iy++) {
2411       /* compute the column as a mp_word */
2412       r       = ((mp_word)*tmpt) +
2413                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2414                 ((mp_word) u);
2415 
2416       /* the new column is the lower part of the result */
2417       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2418 
2419       /* get the carry word from the result */
2420       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2421     }
2422     /* set carry if it is placed below digs */
2423     if (ix + iy < digs) {
2424       *tmpt = u;
2425     }
2426   }
2427 
2428   mp_clamp (&t);
2429   mp_exch (&t, c);
2430 
2431   mp_clear (&t);
2432   return MP_OKAY;
2433 }
2434 
2435 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_mul_digs.c,v $ */
2436 /* Revision: 1.3 $ */
2437 /* Date: 2011/03/18 16:43:04 $ */
2438 
2439 /* high level multiplication (handles sign) */
2440 static int
2441 mp_mul(mp_int * a, mp_int * b, mp_int * c)
2442 {
2443   int     res, neg;
2444   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2445 
2446   /* use Toom-Cook? */
2447   if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
2448     res = mp_toom_mul(a, b, c);
2449   } else
2450   /* use Karatsuba? */
2451   if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2452     res = mp_karatsuba_mul (a, b, c);
2453   } else
2454   {
2455     /* can we use the fast multiplier?
2456      *
2457      * The fast multiplier can be used if the output will
2458      * have less than MP_WARRAY digits and the number of
2459      * digits won't affect carry propagation
2460      */
2461     int     digs = a->used + b->used + 1;
2462 
2463     if (((unsigned)digs < MP_WARRAY) &&
2464         MIN(a->used, b->used) <=
2465         (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2466       res = fast_s_mp_mul_digs (a, b, c, digs);
2467     } else
2468       res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2469 
2470   }
2471   c->sign = (c->used > 0) ? neg : MP_ZPOS;
2472   return res;
2473 }
2474 
2475 /* this is a modified version of fast_s_mul_digs that only produces
2476  * output digits *above* digs.  See the comments for fast_s_mul_digs
2477  * to see how it works.
2478  *
2479  * This is used in the Barrett reduction since for one of the multiplications
2480  * only the higher digits were needed.  This essentially halves the work.
2481  *
2482  * Based on Algorithm 14.12 on pp.595 of HAC.
2483  */
2484 static int
2485 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2486 {
2487   int     olduse, res, pa, ix, iz;
2488   mp_digit W[MP_WARRAY];
2489   mp_word  _W;
2490 
2491   /* grow the destination as required */
2492   pa = a->used + b->used;
2493   if (c->alloc < pa) {
2494     if ((res = mp_grow (c, pa)) != MP_OKAY) {
2495       return res;
2496     }
2497   }
2498 
2499   /* number of output digits to produce */
2500   pa = a->used + b->used;
2501   _W = 0;
2502   for (ix = digs; ix < pa; ix++) {
2503       int      tx, ty, iy;
2504       mp_digit *tmpx, *tmpy;
2505 
2506       /* get offsets into the two bignums */
2507       ty = MIN(b->used-1, ix);
2508       tx = ix - ty;
2509 
2510       /* setup temp aliases */
2511       tmpx = a->dp + tx;
2512       tmpy = b->dp + ty;
2513 
2514       /* this is the number of times the loop will iterrate, essentially its
2515          while (tx++ < a->used && ty-- >= 0) { ... }
2516        */
2517       iy = MIN(a->used-tx, ty+1);
2518 
2519       /* execute loop */
2520       for (iz = 0; iz < iy; iz++) {
2521          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
2522       }
2523 
2524       /* store term */
2525       W[ix] = ((mp_digit)_W) & MP_MASK;
2526 
2527       /* make next carry */
2528       _W = _W >> ((mp_word)DIGIT_BIT);
2529   }
2530 
2531   /* setup dest */
2532   olduse  = c->used;
2533   c->used = pa;
2534 
2535   {
2536     mp_digit *tmpc;
2537 
2538     tmpc = c->dp + digs;
2539     for (ix = digs; ix < pa; ix++) {
2540       /* now extract the previous digit [below the carry] */
2541       *tmpc++ = W[ix];
2542     }
2543 
2544     /* clear unused digits [that existed in the old copy of c] */
2545     for (; ix < olduse; ix++) {
2546       *tmpc++ = 0;
2547     }
2548   }
2549   mp_clamp (c);
2550   return MP_OKAY;
2551 }
2552 
2553 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_mul_high_digs.c,v $ */
2554 /* Revision: 1.1.1.1 $ */
2555 /* Date: 2011/03/12 22:58:18 $ */
2556 
2557 /* multiplies |a| * |b| and does not compute the lower digs digits
2558  * [meant to get the higher part of the product]
2559  */
2560 static int
2561 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
2562 {
2563   mp_int  t;
2564   int     res, pa, pb, ix, iy;
2565   mp_digit u;
2566   mp_word r;
2567   mp_digit tmpx, *tmpt, *tmpy;
2568 
2569   /* can we use the fast multiplier? */
2570   if (((unsigned)(a->used + b->used + 1) < MP_WARRAY)
2571       && MIN (a->used, b->used) < (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2572     return fast_s_mp_mul_high_digs (a, b, c, digs);
2573   }
2574 
2575   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
2576     return res;
2577   }
2578   t.used = a->used + b->used + 1;
2579 
2580   pa = a->used;
2581   pb = b->used;
2582   for (ix = 0; ix < pa; ix++) {
2583     /* clear the carry */
2584     u = 0;
2585 
2586     /* left hand side of A[ix] * B[iy] */
2587     tmpx = a->dp[ix];
2588 
2589     /* alias to the address of where the digits will be stored */
2590     tmpt = &(t.dp[digs]);
2591 
2592     /* alias for where to read the right hand side from */
2593     tmpy = b->dp + (digs - ix);
2594 
2595     for (iy = digs - ix; iy < pb; iy++) {
2596       /* calculate the double precision result */
2597       r       = ((mp_word)*tmpt) +
2598                 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
2599                 ((mp_word) u);
2600 
2601       /* get the lower part */
2602       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
2603 
2604       /* carry the carry */
2605       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2606     }
2607     *tmpt = u;
2608   }
2609   mp_clamp (&t);
2610   mp_exch (&t, c);
2611   mp_clear (&t);
2612   return MP_OKAY;
2613 }
2614 
2615 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_mul_high_digs.c,v $ */
2616 /* Revision: 1.3 $ */
2617 /* Date: 2011/03/18 16:43:04 $ */
2618 
2619 /* reduces x mod m, assumes 0 < x < m**2, mu is
2620  * precomputed via mp_reduce_setup.
2621  * From HAC pp.604 Algorithm 14.42
2622  */
2623 static int
2624 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
2625 {
2626   mp_int  q;
2627   int     res, um = m->used;
2628 
2629   /* q = x */
2630   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
2631     return res;
2632   }
2633 
2634   /* q1 = x / b**(k-1)  */
2635   mp_rshd (&q, um - 1);
2636 
2637   /* according to HAC this optimization is ok */
2638   if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
2639     if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
2640       goto CLEANUP;
2641     }
2642   } else {
2643     if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
2644       goto CLEANUP;
2645     }
2646   }
2647 
2648   /* q3 = q2 / b**(k+1) */
2649   mp_rshd (&q, um + 1);
2650 
2651   /* x = x mod b**(k+1), quick (no division) */
2652   if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
2653     goto CLEANUP;
2654   }
2655 
2656   /* q = q * m mod b**(k+1), quick (no division) */
2657   if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
2658     goto CLEANUP;
2659   }
2660 
2661   /* x = x - q */
2662   if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
2663     goto CLEANUP;
2664   }
2665 
2666   /* If x < 0, add b**(k+1) to it */
2667   if (mp_cmp_d (x, 0) == MP_LT) {
2668     mp_set (&q, 1);
2669     if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
2670       goto CLEANUP;
2671     if ((res = mp_add (x, &q, x)) != MP_OKAY)
2672       goto CLEANUP;
2673   }
2674 
2675   /* Back off if it's too big */
2676   while (mp_cmp (x, m) != MP_LT) {
2677     if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
2678       goto CLEANUP;
2679     }
2680   }
2681 
2682 CLEANUP:
2683   mp_clear (&q);
2684 
2685   return res;
2686 }
2687 
2688 /* determines the setup value */
2689 static int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
2690 {
2691    int    res;
2692    mp_int tmp;
2693 
2694    if ((res = mp_init(&tmp)) != MP_OKAY) {
2695       return res;
2696    }
2697 
2698    if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
2699       goto ERR;
2700    }
2701 
2702    if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
2703       goto ERR;
2704    }
2705 
2706 ERR:
2707    mp_clear(&tmp);
2708    return res;
2709 }
2710 
2711 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_setup_l.c,v $ */
2712 /* Revision: 1.1.1.1 $ */
2713 /* Date: 2011/03/12 22:58:18 $ */
2714 
2715 /* reduces a modulo n where n is of the form 2**p - d
2716    This differs from reduce_2k since "d" can be larger
2717    than a single digit.
2718 */
2719 static int
2720 mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
2721 {
2722    mp_int q;
2723    int    p, res;
2724 
2725    if ((res = mp_init(&q)) != MP_OKAY) {
2726       return res;
2727    }
2728 
2729    p = mp_count_bits(n);
2730 top:
2731    /* q = a/2**p, a = a mod 2**p */
2732    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
2733       goto ERR;
2734    }
2735 
2736    /* q = q * d */
2737    if ((res = mp_mul(&q, d, &q)) != MP_OKAY) {
2738       goto ERR;
2739    }
2740 
2741    /* a = a + q */
2742    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
2743       goto ERR;
2744    }
2745 
2746    if (mp_cmp_mag(a, n) != MP_LT) {
2747       s_mp_sub(a, n, a);
2748       goto top;
2749    }
2750 
2751 ERR:
2752    mp_clear(&q);
2753    return res;
2754 }
2755 
2756 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_l.c,v $ */
2757 /* Revision: 1.1.1.1 $ */
2758 /* Date: 2011/03/12 22:58:18 $ */
2759 
2760 /* squaring using Toom-Cook 3-way algorithm */
2761 static int
2762 mp_toom_sqr(mp_int *a, mp_int *b)
2763 {
2764     mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
2765     int res, B;
2766 
2767     /* init temps */
2768     if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
2769        return res;
2770     }
2771 
2772     /* B */
2773     B = a->used / 3;
2774 
2775     /* a = a2 * B**2 + a1 * B + a0 */
2776     if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
2777        goto ERR;
2778     }
2779 
2780     if ((res = mp_copy(a, &a1)) != MP_OKAY) {
2781        goto ERR;
2782     }
2783     mp_rshd(&a1, B);
2784     mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
2785 
2786     if ((res = mp_copy(a, &a2)) != MP_OKAY) {
2787        goto ERR;
2788     }
2789     mp_rshd(&a2, B*2);
2790 
2791     /* w0 = a0*a0 */
2792     if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
2793        goto ERR;
2794     }
2795 
2796     /* w4 = a2 * a2 */
2797     if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
2798        goto ERR;
2799     }
2800 
2801     /* w1 = (a2 + 2(a1 + 2a0))**2 */
2802     if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
2803        goto ERR;
2804     }
2805     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
2806        goto ERR;
2807     }
2808     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
2809        goto ERR;
2810     }
2811     if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
2812        goto ERR;
2813     }
2814 
2815     if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
2816        goto ERR;
2817     }
2818 
2819     /* w3 = (a0 + 2(a1 + 2a2))**2 */
2820     if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
2821        goto ERR;
2822     }
2823     if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
2824        goto ERR;
2825     }
2826     if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
2827        goto ERR;
2828     }
2829     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
2830        goto ERR;
2831     }
2832 
2833     if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
2834        goto ERR;
2835     }
2836 
2837 
2838     /* w2 = (a2 + a1 + a0)**2 */
2839     if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
2840        goto ERR;
2841     }
2842     if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
2843        goto ERR;
2844     }
2845     if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
2846        goto ERR;
2847     }
2848 
2849     /* now solve the matrix
2850 
2851        0  0  0  0  1
2852        1  2  4  8  16
2853        1  1  1  1  1
2854        16 8  4  2  1
2855        1  0  0  0  0
2856 
2857        using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
2858      */
2859 
2860      /* r1 - r4 */
2861      if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
2862         goto ERR;
2863      }
2864      /* r3 - r0 */
2865      if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
2866         goto ERR;
2867      }
2868      /* r1/2 */
2869      if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
2870         goto ERR;
2871      }
2872      /* r3/2 */
2873      if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
2874         goto ERR;
2875      }
2876      /* r2 - r0 - r4 */
2877      if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
2878         goto ERR;
2879      }
2880      if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
2881         goto ERR;
2882      }
2883      /* r1 - r2 */
2884      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2885         goto ERR;
2886      }
2887      /* r3 - r2 */
2888      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2889         goto ERR;
2890      }
2891      /* r1 - 8r0 */
2892      if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
2893         goto ERR;
2894      }
2895      if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
2896         goto ERR;
2897      }
2898      /* r3 - 8r4 */
2899      if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
2900         goto ERR;
2901      }
2902      if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
2903         goto ERR;
2904      }
2905      /* 3r2 - r1 - r3 */
2906      if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
2907         goto ERR;
2908      }
2909      if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
2910         goto ERR;
2911      }
2912      if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
2913         goto ERR;
2914      }
2915      /* r1 - r2 */
2916      if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
2917         goto ERR;
2918      }
2919      /* r3 - r2 */
2920      if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
2921         goto ERR;
2922      }
2923      /* r1/3 */
2924      if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
2925         goto ERR;
2926      }
2927      /* r3/3 */
2928      if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
2929         goto ERR;
2930      }
2931 
2932      /* at this point shift W[n] by B*n */
2933      if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
2934         goto ERR;
2935      }
2936      if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
2937         goto ERR;
2938      }
2939      if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
2940         goto ERR;
2941      }
2942      if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
2943         goto ERR;
2944      }
2945 
2946      if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
2947         goto ERR;
2948      }
2949      if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
2950         goto ERR;
2951      }
2952      if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
2953         goto ERR;
2954      }
2955      if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
2956         goto ERR;
2957      }
2958 
2959 ERR:
2960      mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
2961      return res;
2962 }
2963 
2964 
2965 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_toom_sqr.c,v $ */
2966 /* Revision: 1.1.1.1 $ */
2967 /* Date: 2011/03/12 22:58:18 $ */
2968 
2969 /* Karatsuba squaring, computes b = a*a using three
2970  * half size squarings
2971  *
2972  * See comments of karatsuba_mul for details.  It
2973  * is essentially the same algorithm but merely
2974  * tuned to perform recursive squarings.
2975  */
2976 static int mp_karatsuba_sqr (mp_int * a, mp_int * b)
2977 {
2978   mp_int  x0, x1, t1, t2, x0x0, x1x1;
2979   int     B, err;
2980 
2981   err = MP_MEM;
2982 
2983   /* min # of digits */
2984   B = a->used;
2985 
2986   /* now divide in two */
2987   B = (unsigned)B >> 1;
2988 
2989   /* init copy all the temps */
2990   if (mp_init_size (&x0, B) != MP_OKAY)
2991     goto ERR;
2992   if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2993     goto X0;
2994 
2995   /* init temps */
2996   if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2997     goto X1;
2998   if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2999     goto T1;
3000   if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
3001     goto T2;
3002   if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
3003     goto X0X0;
3004 
3005   {
3006     int x;
3007     mp_digit *dst, *src;
3008 
3009     src = a->dp;
3010 
3011     /* now shift the digits */
3012     dst = x0.dp;
3013     for (x = 0; x < B; x++) {
3014       *dst++ = *src++;
3015     }
3016 
3017     dst = x1.dp;
3018     for (x = B; x < a->used; x++) {
3019       *dst++ = *src++;
3020     }
3021   }
3022 
3023   x0.used = B;
3024   x1.used = a->used - B;
3025 
3026   mp_clamp (&x0);
3027 
3028   /* now calc the products x0*x0 and x1*x1 */
3029   if (mp_sqr (&x0, &x0x0) != MP_OKAY)
3030     goto X1X1;           /* x0x0 = x0*x0 */
3031   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
3032     goto X1X1;           /* x1x1 = x1*x1 */
3033 
3034   /* now calc (x1+x0)**2 */
3035   if (s_mp_add (&x1, &x0, &t1) != MP_OKAY)
3036     goto X1X1;           /* t1 = x1 - x0 */
3037   if (mp_sqr (&t1, &t1) != MP_OKAY)
3038     goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
3039 
3040   /* add x0y0 */
3041   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
3042     goto X1X1;           /* t2 = x0x0 + x1x1 */
3043   if (s_mp_sub (&t1, &t2, &t1) != MP_OKAY)
3044     goto X1X1;           /* t1 = (x1+x0)**2 - (x0x0 + x1x1) */
3045 
3046   /* shift by B */
3047   if (mp_lshd (&t1, B) != MP_OKAY)
3048     goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
3049   if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
3050     goto X1X1;           /* x1x1 = x1x1 << 2*B */
3051 
3052   if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
3053     goto X1X1;           /* t1 = x0x0 + t1 */
3054   if (mp_add (&t1, &x1x1, b) != MP_OKAY)
3055     goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
3056 
3057   err = MP_OKAY;
3058 
3059 X1X1:mp_clear (&x1x1);
3060 X0X0:mp_clear (&x0x0);
3061 T2:mp_clear (&t2);
3062 T1:mp_clear (&t1);
3063 X1:mp_clear (&x1);
3064 X0:mp_clear (&x0);
3065 ERR:
3066   return err;
3067 }
3068 
3069 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_karatsuba_sqr.c,v $ */
3070 /* Revision: 1.2 $ */
3071 /* Date: 2011/03/12 23:43:54 $ */
3072 
3073 /* the jist of squaring...
3074  * you do like mult except the offset of the tmpx [one that
3075  * starts closer to zero] can't equal the offset of tmpy.
3076  * So basically you set up iy like before then you min it with
3077  * (ty-tx) so that it never happens.  You double all those
3078  * you add in the inner loop
3079 
3080 After that loop you do the squares and add them in.
3081 */
3082 
3083 static int fast_s_mp_sqr (mp_int * a, mp_int * b)
3084 {
3085   int       olduse, res, pa, ix, iz;
3086   mp_digit   W[MP_WARRAY], *tmpx;
3087   mp_word   W1;
3088 
3089   /* grow the destination as required */
3090   pa = a->used + a->used;
3091   if (b->alloc < pa) {
3092     if ((res = mp_grow (b, pa)) != MP_OKAY) {
3093       return res;
3094     }
3095   }
3096 
3097   /* number of output digits to produce */
3098   W1 = 0;
3099   for (ix = 0; ix < pa; ix++) {
3100       int      tx, ty, iy;
3101       mp_word  _W;
3102       mp_digit *tmpy;
3103 
3104       /* clear counter */
3105       _W = 0;
3106 
3107       /* get offsets into the two bignums */
3108       ty = MIN(a->used-1, ix);
3109       tx = ix - ty;
3110 
3111       /* setup temp aliases */
3112       tmpx = a->dp + tx;
3113       tmpy = a->dp + ty;
3114 
3115       /* this is the number of times the loop will iterrate, essentially
3116          while (tx++ < a->used && ty-- >= 0) { ... }
3117        */
3118       iy = MIN(a->used-tx, ty+1);
3119 
3120       /* now for squaring tx can never equal ty
3121        * we halve the distance since they approach at a rate of 2x
3122        * and we have to round because odd cases need to be executed
3123        */
3124       iy = MIN(iy, (int)((unsigned)(ty-tx+1)>>1));
3125 
3126       /* execute loop */
3127       for (iz = 0; iz < iy; iz++) {
3128          _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
3129       }
3130 
3131       /* double the inner product and add carry */
3132       _W = _W + _W + W1;
3133 
3134       /* even columns have the square term in them */
3135       if ((ix&1) == 0) {
3136          _W += ((mp_word)a->dp[(unsigned)ix>>1])*((mp_word)a->dp[(unsigned)ix>>1]);
3137       }
3138 
3139       /* store it */
3140       W[ix] = (mp_digit)(_W & MP_MASK);
3141 
3142       /* make next carry */
3143       W1 = _W >> ((mp_word)DIGIT_BIT);
3144   }
3145 
3146   /* setup dest */
3147   olduse  = b->used;
3148   b->used = a->used+a->used;
3149 
3150   {
3151     mp_digit *tmpb;
3152     tmpb = b->dp;
3153     for (ix = 0; ix < pa; ix++) {
3154       *tmpb++ = W[ix] & MP_MASK;
3155     }
3156 
3157     /* clear unused digits [that existed in the old copy of c] */
3158     for (; ix < olduse; ix++) {
3159       *tmpb++ = 0;
3160     }
3161   }
3162   mp_clamp (b);
3163   return MP_OKAY;
3164 }
3165 
3166 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_s_mp_sqr.c,v $ */
3167 /* Revision: 1.3 $ */
3168 /* Date: 2011/03/18 16:43:04 $ */
3169 
3170 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
3171 static int
3172 s_mp_sqr (mp_int * a, mp_int * b)
3173 {
3174   mp_int  t;
3175   int     res, ix, iy, pa;
3176   mp_word r;
3177   mp_digit u, tmpx, *tmpt;
3178 
3179   pa = a->used;
3180   if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
3181     return res;
3182   }
3183 
3184   /* default used is maximum possible size */
3185   t.used = 2*pa + 1;
3186 
3187   for (ix = 0; ix < pa; ix++) {
3188     /* first calculate the digit at 2*ix */
3189     /* calculate double precision result */
3190     r = ((mp_word) t.dp[2*ix]) +
3191         ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
3192 
3193     /* store lower part in result */
3194     t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
3195 
3196     /* get the carry */
3197     u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3198 
3199     /* left hand side of A[ix] * A[iy] */
3200     tmpx        = a->dp[ix];
3201 
3202     /* alias for where to store the results */
3203     tmpt        = t.dp + (2*ix + 1);
3204 
3205     for (iy = ix + 1; iy < pa; iy++) {
3206       /* first calculate the product */
3207       r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
3208 
3209       /* now calculate the double precision result, note we use
3210        * addition instead of *2 since it's easier to optimize
3211        */
3212       r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
3213 
3214       /* store lower part */
3215       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
3216 
3217       /* get carry */
3218       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3219     }
3220     /* propagate upwards */
3221     while (u != ((mp_digit) 0)) {
3222       r       = ((mp_word) *tmpt) + ((mp_word) u);
3223       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
3224       u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3225     }
3226   }
3227 
3228   mp_clamp (&t);
3229   mp_exch (&t, b);
3230   mp_clear (&t);
3231   return MP_OKAY;
3232 }
3233 
3234 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_s_mp_sqr.c,v $ */
3235 /* Revision: 1.1.1.1 $ */
3236 /* Date: 2011/03/12 22:58:18 $ */
3237 
3238 #define TOOM_SQR_CUTOFF      400
3239 #define KARATSUBA_SQR_CUTOFF 120
3240 
3241 /* computes b = a*a */
3242 static int
3243 mp_sqr (mp_int * a, mp_int * b)
3244 {
3245   int     res;
3246 
3247   /* use Toom-Cook? */
3248   if (a->used >= TOOM_SQR_CUTOFF) {
3249     res = mp_toom_sqr(a, b);
3250   /* Karatsuba? */
3251   } else
3252 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3253     res = mp_karatsuba_sqr (a, b);
3254   } else
3255   {
3256     /* can we use the fast comba multiplier? */
3257     if (((unsigned)a->used * 2 + 1) < MP_WARRAY &&
3258          a->used <
3259          (1 << (unsigned)(sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3260       res = fast_s_mp_sqr (a, b);
3261     } else
3262       res = s_mp_sqr (a, b);
3263   }
3264   b->sign = MP_ZPOS;
3265   return res;
3266 }
3267 
3268 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_sqr.c,v $ */
3269 /* Revision: 1.3 $ */
3270 /* Date: 2011/03/18 16:43:04 $ */
3271 
3272 #define TAB_SIZE 256
3273 
3274 static int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
3275 {
3276   mp_int  M[TAB_SIZE], res, mu;
3277   mp_digit buf;
3278   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3279   int (*redux)(mp_int*,mp_int*,mp_int*);
3280 
3281   /* find window size */
3282   x = mp_count_bits (X);
3283   if (x <= 7) {
3284     winsize = 2;
3285   } else if (x <= 36) {
3286     winsize = 3;
3287   } else if (x <= 140) {
3288     winsize = 4;
3289   } else if (x <= 450) {
3290     winsize = 5;
3291   } else if (x <= 1303) {
3292     winsize = 6;
3293   } else if (x <= 3529) {
3294     winsize = 7;
3295   } else {
3296     winsize = 8;
3297   }
3298 
3299   /* init M array */
3300   /* init first cell */
3301   if ((err = mp_init(&M[1])) != MP_OKAY) {
3302      return err;
3303   }
3304 
3305   /* now init the second half of the array */
3306   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3307     if ((err = mp_init(&M[x])) != MP_OKAY) {
3308       for (y = 1<<(winsize-1); y < x; y++) {
3309         mp_clear (&M[y]);
3310       }
3311       mp_clear(&M[1]);
3312       return err;
3313     }
3314   }
3315 
3316   /* create mu, used for Barrett reduction */
3317   if ((err = mp_init (&mu)) != MP_OKAY) {
3318     goto LBL_M;
3319   }
3320 
3321   if (redmode == 0) {
3322      if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
3323         goto LBL_MU;
3324      }
3325      redux = mp_reduce;
3326   } else {
3327      if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
3328         goto LBL_MU;
3329      }
3330      redux = mp_reduce_2k_l;
3331   }
3332 
3333   /* create M table
3334    *
3335    * The M table contains powers of the base,
3336    * e.g. M[x] = G**x mod P
3337    *
3338    * The first half of the table is not
3339    * computed though accept for M[0] and M[1]
3340    */
3341   if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
3342     goto LBL_MU;
3343   }
3344 
3345   /* compute the value at M[1<<(winsize-1)] by squaring
3346    * M[1] (winsize-1) times
3347    */
3348   if ((err = mp_copy ( &M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
3349     goto LBL_MU;
3350   }
3351 
3352   for (x = 0; x < (winsize - 1); x++) {
3353     /* square it */
3354     if ((err = mp_sqr (&M[1 << (winsize - 1)],
3355                        &M[1 << (winsize - 1)])) != MP_OKAY) {
3356       goto LBL_MU;
3357     }
3358 
3359     /* reduce modulo P */
3360     if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
3361       goto LBL_MU;
3362     }
3363   }
3364 
3365   /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
3366    * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
3367    */
3368   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
3369     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
3370       goto LBL_MU;
3371     }
3372     if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
3373       goto LBL_MU;
3374     }
3375   }
3376 
3377   /* setup result */
3378   if ((err = mp_init (&res)) != MP_OKAY) {
3379     goto LBL_MU;
3380   }
3381   mp_set (&res, 1);
3382 
3383   /* set initial mode and bit cnt */
3384   mode   = 0;
3385   bitcnt = 1;
3386   buf    = 0;
3387   digidx = X->used - 1;
3388   bitcpy = 0;
3389   bitbuf = 0;
3390 
3391   for (;;) {
3392     /* grab next digit as required */
3393     if (--bitcnt == 0) {
3394       /* if digidx == -1 we are out of digits */
3395       if (digidx == -1) {
3396         break;
3397       }
3398       /* read next digit and reset the bitcnt */
3399       buf    = X->dp[digidx--];
3400       bitcnt = (int) DIGIT_BIT;
3401     }
3402 
3403     /* grab the next msb from the exponent */
3404     y     = (unsigned)(buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
3405     buf <<= (mp_digit)1;
3406 
3407     /* if the bit is zero and mode == 0 then we ignore it
3408      * These represent the leading zero bits before the first 1 bit
3409      * in the exponent.  Technically this opt is not required but it
3410      * does lower the # of trivial squaring/reductions used
3411      */
3412     if (mode == 0 && y == 0) {
3413       continue;
3414     }
3415 
3416     /* if the bit is zero and mode == 1 then we square */
3417     if (mode == 1 && y == 0) {
3418       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3419         goto LBL_RES;
3420       }
3421       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3422         goto LBL_RES;
3423       }
3424       continue;
3425     }
3426 
3427     /* else we add it to the window */
3428     bitbuf |= (y << (winsize - ++bitcpy));
3429     mode    = 2;
3430 
3431     if (bitcpy == winsize) {
3432       /* ok window is filled so square as required and multiply  */
3433       /* square first */
3434       for (x = 0; x < winsize; x++) {
3435         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3436           goto LBL_RES;
3437         }
3438         if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3439           goto LBL_RES;
3440         }
3441       }
3442 
3443       /* then multiply */
3444       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
3445         goto LBL_RES;
3446       }
3447       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3448         goto LBL_RES;
3449       }
3450 
3451       /* empty window and reset */
3452       bitcpy = 0;
3453       bitbuf = 0;
3454       mode   = 1;
3455     }
3456   }
3457 
3458   /* if bits remain then square/multiply */
3459   if (mode == 2 && bitcpy > 0) {
3460     /* square then multiply if the bit is set */
3461     for (x = 0; x < bitcpy; x++) {
3462       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
3463         goto LBL_RES;
3464       }
3465       if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3466         goto LBL_RES;
3467       }
3468 
3469       bitbuf <<= 1;
3470       if ((bitbuf & (1 << winsize)) != 0) {
3471         /* then multiply */
3472         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
3473           goto LBL_RES;
3474         }
3475         if ((err = redux (&res, P, &mu)) != MP_OKAY) {
3476           goto LBL_RES;
3477         }
3478       }
3479     }
3480   }
3481 
3482   mp_exch (&res, Y);
3483   err = MP_OKAY;
3484 LBL_RES:mp_clear (&res);
3485 LBL_MU:mp_clear (&mu);
3486 LBL_M:
3487   mp_clear(&M[1]);
3488   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3489     mp_clear (&M[x]);
3490   }
3491   return err;
3492 }
3493 
3494 /* determines if a number is a valid DR modulus */
3495 static int
3496 mp_dr_is_modulus(mp_int *a)
3497 {
3498    int ix;
3499 
3500    /* must be at least two digits */
3501    if (a->used < 2) {
3502       return 0;
3503    }
3504 
3505    /* must be of the form b**k - a [a <= b] so all
3506     * but the first digit must be equal to -1 (mod b).
3507     */
3508    for (ix = 1; ix < a->used; ix++) {
3509        if (a->dp[ix] != MP_MASK) {
3510           return 0;
3511        }
3512    }
3513    return 1;
3514 }
3515 
3516 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_is_modulus.c,v $ */
3517 /* Revision: 1.1.1.1 $ */
3518 /* Date: 2011/03/12 22:58:18 $ */
3519 
3520 /* determines if mp_reduce_2k can be used */
3521 static int mp_reduce_is_2k(mp_int *a)
3522 {
3523    int ix, iy, iw;
3524    mp_digit iz;
3525 
3526    if (a->used == 0) {
3527       return MP_NO;
3528    } else if (a->used == 1) {
3529       return MP_YES;
3530    } else if (a->used > 1) {
3531       iy = mp_count_bits(a);
3532       iz = 1;
3533       iw = 1;
3534 
3535       /* Test every bit from the second digit up, must be 1 */
3536       for (ix = DIGIT_BIT; ix < iy; ix++) {
3537           if ((a->dp[iw] & iz) == 0) {
3538              return MP_NO;
3539           }
3540           iz <<= 1;
3541           if (iz > (mp_digit)MP_MASK) {
3542              ++iw;
3543              iz = 1;
3544           }
3545       }
3546    }
3547    return MP_YES;
3548 }
3549 
3550 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_is_2k.c,v $ */
3551 /* Revision: 1.1.1.1 $ */
3552 /* Date: 2011/03/12 22:58:18 $ */
3553 
3554 
3555 /* d = a * b (mod c) */
3556 static int
3557 mp_mulmod (mp_int *d, mp_int * a, mp_int * b, mp_int * c)
3558 {
3559   int     res;
3560   mp_int  t;
3561 
3562   if ((res = mp_init (&t)) != MP_OKAY) {
3563     return res;
3564   }
3565 
3566   if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3567     mp_clear (&t);
3568     return res;
3569   }
3570   res = mp_mod (&t, c, d);
3571   mp_clear (&t);
3572   return res;
3573 }
3574 
3575 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_mulmod.c,v $ */
3576 /* Revision: 1.1.1.1 $ */
3577 /* Date: 2011/03/12 22:58:18 $ */
3578 
3579 /* setups the montgomery reduction stuff */
3580 static int
3581 mp_montgomery_setup (mp_int * n, mp_digit * rho)
3582 {
3583   mp_digit x, b;
3584 
3585 /* fast inversion mod 2**k
3586  *
3587  * Based on the fact that
3588  *
3589  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
3590  *                    =>  2*X*A - X*X*A*A = 1
3591  *                    =>  2*(1) - (1)     = 1
3592  */
3593   b = n->dp[0];
3594 
3595   if ((b & 1) == 0) {
3596     return MP_VAL;
3597   }
3598 
3599   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
3600   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
3601   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
3602   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
3603   if (/*CONSTCOND*/sizeof(mp_digit) == 8) {
3604 	  x *= 2 - b * x;	/* here x*a==1 mod 2**64 */
3605   }
3606 
3607   /* rho = -1/m mod b */
3608   *rho = (unsigned long)(((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
3609 
3610   return MP_OKAY;
3611 }
3612 
3613 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_setup.c,v $ */
3614 /* Revision: 1.1.1.1 $ */
3615 /* Date: 2011/03/12 22:58:18 $ */
3616 
3617 /* computes xR**-1 == x (mod N) via Montgomery Reduction
3618  *
3619  * This is an optimized implementation of montgomery_reduce
3620  * which uses the comba method to quickly calculate the columns of the
3621  * reduction.
3622  *
3623  * Based on Algorithm 14.32 on pp.601 of HAC.
3624 */
3625 static int
3626 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
3627 {
3628   int     ix, res, olduse;
3629   /*LINTED*/
3630   mp_word W[MP_WARRAY];
3631 
3632   /* get old used count */
3633   olduse = x->used;
3634 
3635   /* grow a as required */
3636   if (x->alloc < n->used + 1) {
3637     if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
3638       return res;
3639     }
3640   }
3641 
3642   /* first we have to get the digits of the input into
3643    * an array of double precision words W[...]
3644    */
3645   {
3646     mp_word *_W;
3647     mp_digit *tmpx;
3648 
3649     /* alias for the W[] array */
3650     _W   = W;
3651 
3652     /* alias for the digits of  x*/
3653     tmpx = x->dp;
3654 
3655     /* copy the digits of a into W[0..a->used-1] */
3656     for (ix = 0; ix < x->used; ix++) {
3657       *_W++ = *tmpx++;
3658     }
3659 
3660     /* zero the high words of W[a->used..m->used*2] */
3661     for (; ix < n->used * 2 + 1; ix++) {
3662       *_W++ = 0;
3663     }
3664   }
3665 
3666   /* now we proceed to zero successive digits
3667    * from the least significant upwards
3668    */
3669   for (ix = 0; ix < n->used; ix++) {
3670     /* mu = ai * m' mod b
3671      *
3672      * We avoid a double precision multiplication (which isn't required)
3673      * by casting the value down to a mp_digit.  Note this requires
3674      * that W[ix-1] have  the carry cleared (see after the inner loop)
3675      */
3676     mp_digit mu;
3677     mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
3678 
3679     /* a = a + mu * m * b**i
3680      *
3681      * This is computed in place and on the fly.  The multiplication
3682      * by b**i is handled by offseting which columns the results
3683      * are added to.
3684      *
3685      * Note the comba method normally doesn't handle carries in the
3686      * inner loop In this case we fix the carry from the previous
3687      * column since the Montgomery reduction requires digits of the
3688      * result (so far) [see above] to work.  This is
3689      * handled by fixing up one carry after the inner loop.  The
3690      * carry fixups are done in order so after these loops the
3691      * first m->used words of W[] have the carries fixed
3692      */
3693     {
3694       int iy;
3695       mp_digit *tmpn;
3696       mp_word *_W;
3697 
3698       /* alias for the digits of the modulus */
3699       tmpn = n->dp;
3700 
3701       /* Alias for the columns set by an offset of ix */
3702       _W = W + ix;
3703 
3704       /* inner loop */
3705       for (iy = 0; iy < n->used; iy++) {
3706           *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
3707       }
3708     }
3709 
3710     /* now fix carry for next digit, W[ix+1] */
3711     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
3712   }
3713 
3714   /* now we have to propagate the carries and
3715    * shift the words downward [all those least
3716    * significant digits we zeroed].
3717    */
3718   {
3719     mp_digit *tmpx;
3720     mp_word *_W, *_W1;
3721 
3722     /* nox fix rest of carries */
3723 
3724     /* alias for current word */
3725     _W1 = W + ix;
3726 
3727     /* alias for next word, where the carry goes */
3728     _W = W + ++ix;
3729 
3730     for (; ix <= n->used * 2 + 1; ix++) {
3731       *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
3732     }
3733 
3734     /* copy out, A = A/b**n
3735      *
3736      * The result is A/b**n but instead of converting from an
3737      * array of mp_word to mp_digit than calling mp_rshd
3738      * we just copy them in the right order
3739      */
3740 
3741     /* alias for destination word */
3742     tmpx = x->dp;
3743 
3744     /* alias for shifted double precision result */
3745     _W = W + n->used;
3746 
3747     for (ix = 0; ix < n->used + 1; ix++) {
3748       *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
3749     }
3750 
3751     /* zero oldused digits, if the input a was larger than
3752      * m->used+1 we'll have to clear the digits
3753      */
3754     for (; ix < olduse; ix++) {
3755       *tmpx++ = 0;
3756     }
3757   }
3758 
3759   /* set the max used and clamp */
3760   x->used = n->used + 1;
3761   mp_clamp (x);
3762 
3763   /* if A >= m then A = A - m */
3764   if (mp_cmp_mag (x, n) != MP_LT) {
3765     return s_mp_sub (x, n, x);
3766   }
3767   return MP_OKAY;
3768 }
3769 
3770 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_fast_mp_montgomery_reduce.c,v $ */
3771 /* Revision: 1.2 $ */
3772 /* Date: 2011/03/18 16:22:09 $ */
3773 
3774 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
3775 static int
3776 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
3777 {
3778   int     ix, res, digs;
3779   mp_digit mu;
3780 
3781   /* can the fast reduction [comba] method be used?
3782    *
3783    * Note that unlike in mul you're safely allowed *less*
3784    * than the available columns [255 per default] since carries
3785    * are fixed up in the inner loop.
3786    */
3787   digs = n->used * 2 + 1;
3788   if (((unsigned)digs < MP_WARRAY) &&
3789       n->used <
3790       (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
3791     return fast_mp_montgomery_reduce (x, n, rho);
3792   }
3793 
3794   /* grow the input as required */
3795   if (x->alloc < digs) {
3796     if ((res = mp_grow (x, digs)) != MP_OKAY) {
3797       return res;
3798     }
3799   }
3800   x->used = digs;
3801 
3802   for (ix = 0; ix < n->used; ix++) {
3803     /* mu = ai * rho mod b
3804      *
3805      * The value of rho must be precalculated via
3806      * montgomery_setup() such that
3807      * it equals -1/n0 mod b this allows the
3808      * following inner loop to reduce the
3809      * input one digit at a time
3810      */
3811     mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
3812 
3813     /* a = a + mu * m * b**i */
3814     {
3815       int iy;
3816       mp_digit *tmpn, *tmpx, u;
3817       mp_word r;
3818 
3819       /* alias for digits of the modulus */
3820       tmpn = n->dp;
3821 
3822       /* alias for the digits of x [the input] */
3823       tmpx = x->dp + ix;
3824 
3825       /* set the carry to zero */
3826       u = 0;
3827 
3828       /* Multiply and add in place */
3829       for (iy = 0; iy < n->used; iy++) {
3830         /* compute product and sum */
3831         r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
3832                   ((mp_word) u) + ((mp_word) * tmpx);
3833 
3834         /* get carry */
3835         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
3836 
3837         /* fix digit */
3838         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
3839       }
3840       /* At this point the ix'th digit of x should be zero */
3841 
3842 
3843       /* propagate carries upwards as required*/
3844       while (u) {
3845         *tmpx   += u;
3846         u        = *tmpx >> DIGIT_BIT;
3847         *tmpx++ &= MP_MASK;
3848       }
3849     }
3850   }
3851 
3852   /* at this point the n.used'th least
3853    * significant digits of x are all zero
3854    * which means we can shift x to the
3855    * right by n.used digits and the
3856    * residue is unchanged.
3857    */
3858 
3859   /* x = x/b**n.used */
3860   mp_clamp(x);
3861   mp_rshd (x, n->used);
3862 
3863   /* if x >= n then x = x - n */
3864   if (mp_cmp_mag (x, n) != MP_LT) {
3865     return s_mp_sub (x, n, x);
3866   }
3867 
3868   return MP_OKAY;
3869 }
3870 
3871 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_reduce.c,v $ */
3872 /* Revision: 1.3 $ */
3873 /* Date: 2011/03/18 16:43:04 $ */
3874 
3875 /* determines the setup value */
3876 static void
3877 mp_dr_setup(mp_int *a, mp_digit *d)
3878 {
3879    /* the casts are required if DIGIT_BIT is one less than
3880     * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
3881     */
3882    *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
3883         ((mp_word)a->dp[0]));
3884 }
3885 
3886 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_setup.c,v $ */
3887 /* Revision: 1.1.1.1 $ */
3888 /* Date: 2011/03/12 22:58:18 $ */
3889 
3890 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
3891  *
3892  * Based on algorithm from the paper
3893  *
3894  * "Generating Efficient Primes for Discrete Log Cryptosystems"
3895  *                 Chae Hoon Lim, Pil Joong Lee,
3896  *          POSTECH Information Research Laboratories
3897  *
3898  * The modulus must be of a special format [see manual]
3899  *
3900  * Has been modified to use algorithm 7.10 from the LTM book instead
3901  *
3902  * Input x must be in the range 0 <= x <= (n-1)**2
3903  */
3904 static int
3905 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
3906 {
3907   int      err, i, m;
3908   mp_word  r;
3909   mp_digit mu, *tmpx1, *tmpx2;
3910 
3911   /* m = digits in modulus */
3912   m = n->used;
3913 
3914   /* ensure that "x" has at least 2m digits */
3915   if (x->alloc < m + m) {
3916     if ((err = mp_grow (x, m + m)) != MP_OKAY) {
3917       return err;
3918     }
3919   }
3920 
3921 /* top of loop, this is where the code resumes if
3922  * another reduction pass is required.
3923  */
3924 top:
3925   /* aliases for digits */
3926   /* alias for lower half of x */
3927   tmpx1 = x->dp;
3928 
3929   /* alias for upper half of x, or x/B**m */
3930   tmpx2 = x->dp + m;
3931 
3932   /* set carry to zero */
3933   mu = 0;
3934 
3935   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
3936   for (i = 0; i < m; i++) {
3937       r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
3938       *tmpx1++  = (mp_digit)(r & MP_MASK);
3939       mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
3940   }
3941 
3942   /* set final carry */
3943   *tmpx1++ = mu;
3944 
3945   /* zero words above m */
3946   for (i = m + 1; i < x->used; i++) {
3947       *tmpx1++ = 0;
3948   }
3949 
3950   /* clamp, sub and return */
3951   mp_clamp (x);
3952 
3953   /* if x >= n then subtract and reduce again
3954    * Each successive "recursion" makes the input smaller and smaller.
3955    */
3956   if (mp_cmp_mag (x, n) != MP_LT) {
3957     s_mp_sub(x, n, x);
3958     goto top;
3959   }
3960   return MP_OKAY;
3961 }
3962 
3963 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_dr_reduce.c,v $ */
3964 /* Revision: 1.1.1.1 $ */
3965 /* Date: 2011/03/12 22:58:18 $ */
3966 
3967 /* determines the setup value */
3968 static int
3969 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
3970 {
3971    int res, p;
3972    mp_int tmp;
3973 
3974    if ((res = mp_init(&tmp)) != MP_OKAY) {
3975       return res;
3976    }
3977 
3978    p = mp_count_bits(a);
3979    if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3980       mp_clear(&tmp);
3981       return res;
3982    }
3983 
3984    if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3985       mp_clear(&tmp);
3986       return res;
3987    }
3988 
3989    *d = tmp.dp[0];
3990    mp_clear(&tmp);
3991    return MP_OKAY;
3992 }
3993 
3994 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k_setup.c,v $ */
3995 /* Revision: 1.1.1.1 $ */
3996 /* Date: 2011/03/12 22:58:18 $ */
3997 
3998 /* reduces a modulo n where n is of the form 2**p - d */
3999 static int
4000 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
4001 {
4002    mp_int q;
4003    int    p, res;
4004 
4005    if ((res = mp_init(&q)) != MP_OKAY) {
4006       return res;
4007    }
4008 
4009    p = mp_count_bits(n);
4010 top:
4011    /* q = a/2**p, a = a mod 2**p */
4012    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
4013       goto ERR;
4014    }
4015 
4016    if (d != 1) {
4017       /* q = q * d */
4018       if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
4019          goto ERR;
4020       }
4021    }
4022 
4023    /* a = a + q */
4024    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
4025       goto ERR;
4026    }
4027 
4028    if (mp_cmp_mag(a, n) != MP_LT) {
4029       s_mp_sub(a, n, a);
4030       goto top;
4031    }
4032 
4033 ERR:
4034    mp_clear(&q);
4035    return res;
4036 }
4037 
4038 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_reduce_2k.c,v $ */
4039 /* Revision: 1.1.1.1 $ */
4040 /* Date: 2011/03/12 22:58:18 $ */
4041 
4042 /*
4043  * shifts with subtractions when the result is greater than b.
4044  *
4045  * The method is slightly modified to shift B unconditionally upto just under
4046  * the leading bit of b.  This saves alot of multiple precision shifting.
4047  */
4048 static int
4049 mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
4050 {
4051   int     x, bits, res;
4052 
4053   /* how many bits of last digit does b use */
4054   bits = mp_count_bits (b) % DIGIT_BIT;
4055 
4056   if (b->used > 1) {
4057      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
4058         return res;
4059      }
4060   } else {
4061      mp_set(a, 1);
4062      bits = 1;
4063   }
4064 
4065 
4066   /* now compute C = A * B mod b */
4067   for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
4068     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
4069       return res;
4070     }
4071     if (mp_cmp_mag (a, b) != MP_LT) {
4072       if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
4073         return res;
4074       }
4075     }
4076   }
4077 
4078   return MP_OKAY;
4079 }
4080 
4081 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_montgomery_calc_normalization.c,v $ */
4082 /* Revision: 1.1.1.1 $ */
4083 /* Date: 2011/03/12 22:58:18 $ */
4084 
4085 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
4086  *
4087  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
4088  * The value of k changes based on the size of the exponent.
4089  *
4090  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
4091  */
4092 
4093 #define TAB_SIZE 256
4094 
4095 static int
4096 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
4097 {
4098   mp_int  M[TAB_SIZE], res;
4099   mp_digit buf, mp;
4100   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
4101 
4102   /* use a pointer to the reduction algorithm.  This allows us to use
4103    * one of many reduction algorithms without modding the guts of
4104    * the code with if statements everywhere.
4105    */
4106   int     (*redux)(mp_int*,mp_int*,mp_digit);
4107 
4108   /* find window size */
4109   x = mp_count_bits (X);
4110   if (x <= 7) {
4111     winsize = 2;
4112   } else if (x <= 36) {
4113     winsize = 3;
4114   } else if (x <= 140) {
4115     winsize = 4;
4116   } else if (x <= 450) {
4117     winsize = 5;
4118   } else if (x <= 1303) {
4119     winsize = 6;
4120   } else if (x <= 3529) {
4121     winsize = 7;
4122   } else {
4123     winsize = 8;
4124   }
4125 
4126   /* init M array */
4127   /* init first cell */
4128   if ((err = mp_init(&M[1])) != MP_OKAY) {
4129      return err;
4130   }
4131 
4132   /* now init the second half of the array */
4133   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4134     if ((err = mp_init(&M[x])) != MP_OKAY) {
4135       for (y = 1<<(winsize-1); y < x; y++) {
4136         mp_clear (&M[y]);
4137       }
4138       mp_clear(&M[1]);
4139       return err;
4140     }
4141   }
4142 
4143   /* determine and setup reduction code */
4144   if (redmode == 0) {
4145      /* now setup montgomery  */
4146      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
4147         goto LBL_M;
4148      }
4149 
4150      /* automatically pick the comba one if available (saves quite a few calls/ifs) */
4151      if (((unsigned)(P->used * 2 + 1) < MP_WARRAY) &&
4152           P->used < (1 << (unsigned)((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4153         redux = fast_mp_montgomery_reduce;
4154      } else
4155      {
4156         /* use slower baseline Montgomery method */
4157         redux = mp_montgomery_reduce;
4158      }
4159   } else if (redmode == 1) {
4160      /* setup DR reduction for moduli of the form B**k - b */
4161      mp_dr_setup(P, &mp);
4162      redux = mp_dr_reduce;
4163   } else {
4164      /* setup DR reduction for moduli of the form 2**k - b */
4165      if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
4166         goto LBL_M;
4167      }
4168      redux = mp_reduce_2k;
4169   }
4170 
4171   /* setup result */
4172   if ((err = mp_init (&res)) != MP_OKAY) {
4173     goto LBL_M;
4174   }
4175 
4176   /* create M table
4177    *
4178 
4179    *
4180    * The first half of the table is not computed though accept for M[0] and M[1]
4181    */
4182 
4183   if (redmode == 0) {
4184      /* now we need R mod m */
4185      if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
4186        goto LBL_RES;
4187      }
4188 
4189      /* now set M[1] to G * R mod m */
4190      if ((err = mp_mulmod (&M[1], G, &res, P)) != MP_OKAY) {
4191        goto LBL_RES;
4192      }
4193   } else {
4194      mp_set(&res, 1);
4195      if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
4196         goto LBL_RES;
4197      }
4198   }
4199 
4200   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
4201   if ((err = mp_copy ( &M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4202     goto LBL_RES;
4203   }
4204 
4205   for (x = 0; x < (winsize - 1); x++) {
4206     if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
4207       goto LBL_RES;
4208     }
4209     if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
4210       goto LBL_RES;
4211     }
4212   }
4213 
4214   /* create upper table */
4215   for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4216     if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4217       goto LBL_RES;
4218     }
4219     if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
4220       goto LBL_RES;
4221     }
4222   }
4223 
4224   /* set initial mode and bit cnt */
4225   mode   = 0;
4226   bitcnt = 1;
4227   buf    = 0;
4228   digidx = X->used - 1;
4229   bitcpy = 0;
4230   bitbuf = 0;
4231 
4232   for (;;) {
4233     /* grab next digit as required */
4234     if (--bitcnt == 0) {
4235       /* if digidx == -1 we are out of digits so break */
4236       if (digidx == -1) {
4237         break;
4238       }
4239       /* read next digit and reset bitcnt */
4240       buf    = X->dp[digidx--];
4241       bitcnt = (int)DIGIT_BIT;
4242     }
4243 
4244     /* grab the next msb from the exponent */
4245     y     = (int)(mp_digit)((mp_digit)buf >> (unsigned)(DIGIT_BIT - 1)) & 1;
4246     buf <<= (mp_digit)1;
4247 
4248     /* if the bit is zero and mode == 0 then we ignore it
4249      * These represent the leading zero bits before the first 1 bit
4250      * in the exponent.  Technically this opt is not required but it
4251      * does lower the # of trivial squaring/reductions used
4252      */
4253     if (mode == 0 && y == 0) {
4254       continue;
4255     }
4256 
4257     /* if the bit is zero and mode == 1 then we square */
4258     if (mode == 1 && y == 0) {
4259       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4260         goto LBL_RES;
4261       }
4262       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4263         goto LBL_RES;
4264       }
4265       continue;
4266     }
4267 
4268     /* else we add it to the window */
4269     bitbuf |= (y << (winsize - ++bitcpy));
4270     mode    = 2;
4271 
4272     if (bitcpy == winsize) {
4273       /* ok window is filled so square as required and multiply  */
4274       /* square first */
4275       for (x = 0; x < winsize; x++) {
4276         if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4277           goto LBL_RES;
4278         }
4279         if ((err = redux (&res, P, mp)) != MP_OKAY) {
4280           goto LBL_RES;
4281         }
4282       }
4283 
4284       /* then multiply */
4285       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4286         goto LBL_RES;
4287       }
4288       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4289         goto LBL_RES;
4290       }
4291 
4292       /* empty window and reset */
4293       bitcpy = 0;
4294       bitbuf = 0;
4295       mode   = 1;
4296     }
4297   }
4298 
4299   /* if bits remain then square/multiply */
4300   if (mode == 2 && bitcpy > 0) {
4301     /* square then multiply if the bit is set */
4302     for (x = 0; x < bitcpy; x++) {
4303       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4304         goto LBL_RES;
4305       }
4306       if ((err = redux (&res, P, mp)) != MP_OKAY) {
4307         goto LBL_RES;
4308       }
4309 
4310       /* get next bit of the window */
4311       bitbuf <<= 1;
4312       if ((bitbuf & (1 << winsize)) != 0) {
4313         /* then multiply */
4314         if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4315           goto LBL_RES;
4316         }
4317         if ((err = redux (&res, P, mp)) != MP_OKAY) {
4318           goto LBL_RES;
4319         }
4320       }
4321     }
4322   }
4323 
4324   if (redmode == 0) {
4325      /* fixup result if Montgomery reduction is used
4326       * recall that any value in a Montgomery system is
4327       * actually multiplied by R mod n.  So we have
4328       * to reduce one more time to cancel out the factor
4329       * of R.
4330       */
4331      if ((err = redux(&res, P, mp)) != MP_OKAY) {
4332        goto LBL_RES;
4333      }
4334   }
4335 
4336   /* swap res with Y */
4337   mp_exch (&res, Y);
4338   err = MP_OKAY;
4339 LBL_RES:mp_clear (&res);
4340 LBL_M:
4341   mp_clear(&M[1]);
4342   for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4343     mp_clear (&M[x]);
4344   }
4345   return err;
4346 }
4347 
4348 /* Source: /usr/cvsroot/libtommath/dist/libtommath/bn_mp_exptmod_fast.c,v $ */
4349 /* Revision: 1.4 $ */
4350 /* Date: 2011/03/18 16:43:04 $ */
4351 
4352 /* this is a shell function that calls either the normal or Montgomery
4353  * exptmod functions.  Originally the call to the montgomery code was
4354  * embedded in the normal function but that wasted alot of stack space
4355  * for nothing (since 99% of the time the Montgomery code would be called)
4356  */
4357 static int
4358 mp_exptmod(mp_int * G, mp_int * X, mp_int * P, mp_int *Y)
4359 {
4360   int dr;
4361 
4362   /* modulus P must be positive */
4363   if (P->sign == MP_NEG) {
4364      return MP_VAL;
4365   }
4366 
4367   /* if exponent X is negative we have to recurse */
4368   if (X->sign == MP_NEG) {
4369      mp_int tmpG, tmpX;
4370      int err;
4371 
4372      /* first compute 1/G mod P */
4373      if ((err = mp_init(&tmpG)) != MP_OKAY) {
4374         return err;
4375      }
4376      if ((err = mp_invmod(&tmpG, G, P)) != MP_OKAY) {
4377         mp_clear(&tmpG);
4378         return err;
4379      }
4380 
4381      /* now get |X| */
4382      if ((err = mp_init(&tmpX)) != MP_OKAY) {
4383         mp_clear(&tmpG);
4384         return err;
4385      }
4386      if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
4387         mp_clear_multi(&tmpG, &tmpX, NULL);
4388         return err;
4389      }
4390 
4391      /* and now compute (1/G)**|X| instead of G**X [X < 0] */
4392      err = mp_exptmod(&tmpG, &tmpX, P, Y);
4393      mp_clear_multi(&tmpG, &tmpX, NULL);
4394      return err;
4395   }
4396 
4397 /* modified diminished radix reduction */
4398   if (mp_reduce_is_2k_l(P) == MP_YES) {
4399      return s_mp_exptmod(G, X, P, Y, 1);
4400   }
4401 
4402   /* is it a DR modulus? */
4403   dr = mp_dr_is_modulus(P);
4404 
4405   /* if not, is it a unrestricted DR modulus? */
4406   if (dr == 0) {
4407      dr = mp_reduce_is_2k(P) << 1;
4408   }
4409 
4410   /* if the modulus is odd or dr != 0 use the montgomery method */
4411   if (BN_is_odd (P) == 1 || dr !=  0) {
4412     return mp_exptmod_fast (G, X, P, Y, dr);
4413   } else {
4414     /* otherwise use the generic Barrett reduction technique */
4415     return s_mp_exptmod (G, X, P, Y, 0);
4416   }
4417 }
4418 
4419 /* reverse an array, used for radix code */
4420 static void
4421 bn_reverse(unsigned char *s, int len)
4422 {
4423   int     ix, iy;
4424   unsigned char t;
4425 
4426   ix = 0;
4427   iy = len - 1;
4428   while (ix < iy) {
4429     t     = s[ix];
4430     s[ix] = s[iy];
4431     s[iy] = t;
4432     ++ix;
4433     --iy;
4434   }
4435 }
4436 
4437 static int
4438 s_is_power_of_two(mp_digit b, int *p)
4439 {
4440    int x;
4441 
4442    /* fast return if no power of two */
4443    if ((b==0) || (b & (b-1))) {
4444       return 0;
4445    }
4446 
4447    for (x = 0; x < DIGIT_BIT; x++) {
4448       if (b == (((mp_digit)1)<<x)) {
4449          *p = x;
4450          return 1;
4451       }
4452    }
4453    return 0;
4454 }
4455 
4456 /* single digit division (based on routine from MPI) */
4457 static int
4458 mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d)
4459 {
4460   mp_int  q;
4461   mp_word w;
4462   mp_digit t;
4463   int     res, ix;
4464 
4465   /* cannot divide by zero */
4466   if (b == 0) {
4467      return MP_VAL;
4468   }
4469 
4470   /* quick outs */
4471   if (b == 1 || mp_iszero(a) == 1) {
4472      if (d != NULL) {
4473         *d = 0;
4474      }
4475      if (c != NULL) {
4476         return mp_copy(a, c);
4477      }
4478      return MP_OKAY;
4479   }
4480 
4481   /* power of two ? */
4482   if (s_is_power_of_two(b, &ix) == 1) {
4483      if (d != NULL) {
4484         *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
4485      }
4486      if (c != NULL) {
4487         return mp_div_2d(a, ix, c, NULL);
4488      }
4489      return MP_OKAY;
4490   }
4491 
4492 #ifdef BN_MP_DIV_3_C
4493   /* three? */
4494   if (b == 3) {
4495      return mp_div_3(a, c, d);
4496   }
4497 #endif
4498 
4499   /* no easy answer [c'est la vie].  Just division */
4500   if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
4501      return res;
4502   }
4503 
4504   q.used = a->used;
4505   q.sign = a->sign;
4506   w = 0;
4507   for (ix = a->used - 1; ix >= 0; ix--) {
4508      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
4509 
4510      if (w >= b) {
4511         t = (mp_digit)(w / b);
4512         w -= ((mp_word)t) * ((mp_word)b);
4513       } else {
4514         t = 0;
4515       }
4516       q.dp[ix] = (mp_digit)t;
4517   }
4518 
4519   if (d != NULL) {
4520      *d = (mp_digit)w;
4521   }
4522 
4523   if (c != NULL) {
4524      mp_clamp(&q);
4525      mp_exch(&q, c);
4526   }
4527   mp_clear(&q);
4528 
4529   return res;
4530 }
4531 
4532 static int
4533 mp_mod_d(mp_int *a, mp_digit b, mp_digit *c)
4534 {
4535   return mp_div_d(a, b, NULL, c);
4536 }
4537 
4538 static const mp_digit ltm_prime_tab[] = {
4539   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
4540   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
4541   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
4542   0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
4543 #ifndef MP_8BIT
4544   0x0083,
4545   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
4546   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
4547   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
4548   0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
4549 
4550   0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
4551   0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
4552   0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
4553   0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
4554   0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
4555   0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
4556   0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
4557   0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
4558 
4559   0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
4560   0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
4561   0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
4562   0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
4563   0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
4564   0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
4565   0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
4566   0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
4567 
4568   0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
4569   0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
4570   0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
4571   0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
4572   0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
4573   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
4574   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
4575   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
4576 #endif
4577 };
4578 
4579 #define PRIME_SIZE	__arraycount(ltm_prime_tab)
4580 
4581 static int
4582 mp_prime_is_divisible(mp_int *a, int *result)
4583 {
4584   int     err, ix;
4585   mp_digit res;
4586 
4587   /* default to not */
4588   *result = MP_NO;
4589 
4590   for (ix = 0; ix < (int)PRIME_SIZE; ix++) {
4591     /* what is a mod LBL_prime_tab[ix] */
4592     if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
4593       return err;
4594     }
4595 
4596     /* is the residue zero? */
4597     if (res == 0) {
4598       *result = MP_YES;
4599       return MP_OKAY;
4600     }
4601   }
4602 
4603   return MP_OKAY;
4604 }
4605 
4606 /* single digit addition */
4607 static int
4608 mp_add_d(mp_int *a, mp_digit b, mp_int *c)
4609 {
4610   int     res, ix, oldused;
4611   mp_digit *tmpa, *tmpc, mu;
4612 
4613   /* grow c as required */
4614   if (c->alloc < a->used + 1) {
4615      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
4616         return res;
4617      }
4618   }
4619 
4620   /* if a is negative and |a| >= b, call c = |a| - b */
4621   if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
4622      /* temporarily fix sign of a */
4623      a->sign = MP_ZPOS;
4624 
4625      /* c = |a| - b */
4626      res = mp_sub_d(a, b, c);
4627 
4628      /* fix sign  */
4629      a->sign = c->sign = MP_NEG;
4630 
4631      /* clamp */
4632      mp_clamp(c);
4633 
4634      return res;
4635   }
4636 
4637   /* old number of used digits in c */
4638   oldused = c->used;
4639 
4640   /* sign always positive */
4641   c->sign = MP_ZPOS;
4642 
4643   /* source alias */
4644   tmpa    = a->dp;
4645 
4646   /* destination alias */
4647   tmpc    = c->dp;
4648 
4649   /* if a is positive */
4650   if (a->sign == MP_ZPOS) {
4651      /* add digit, after this we're propagating
4652       * the carry.
4653       */
4654      *tmpc   = *tmpa++ + b;
4655      mu      = *tmpc >> DIGIT_BIT;
4656      *tmpc++ &= MP_MASK;
4657 
4658      /* now handle rest of the digits */
4659      for (ix = 1; ix < a->used; ix++) {
4660         *tmpc   = *tmpa++ + mu;
4661         mu      = *tmpc >> DIGIT_BIT;
4662         *tmpc++ &= MP_MASK;
4663      }
4664      /* set final carry */
4665      ix++;
4666      *tmpc++  = mu;
4667 
4668      /* setup size */
4669      c->used = a->used + 1;
4670   } else {
4671      /* a was negative and |a| < b */
4672      c->used  = 1;
4673 
4674      /* the result is a single digit */
4675      if (a->used == 1) {
4676         *tmpc++  =  b - a->dp[0];
4677      } else {
4678         *tmpc++  =  b;
4679      }
4680 
4681      /* setup count so the clearing of oldused
4682       * can fall through correctly
4683       */
4684      ix       = 1;
4685   }
4686 
4687   /* now zero to oldused */
4688   while (ix++ < oldused) {
4689      *tmpc++ = 0;
4690   }
4691   mp_clamp(c);
4692 
4693   return MP_OKAY;
4694 }
4695 
4696 /* single digit subtraction */
4697 static int
4698 mp_sub_d(mp_int *a, mp_digit b, mp_int *c)
4699 {
4700   mp_digit *tmpa, *tmpc, mu;
4701   int       res, ix, oldused;
4702 
4703   /* grow c as required */
4704   if (c->alloc < a->used + 1) {
4705      if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
4706         return res;
4707      }
4708   }
4709 
4710   /* if a is negative just do an unsigned
4711    * addition [with fudged signs]
4712    */
4713   if (a->sign == MP_NEG) {
4714      a->sign = MP_ZPOS;
4715      res     = mp_add_d(a, b, c);
4716      a->sign = c->sign = MP_NEG;
4717 
4718      /* clamp */
4719      mp_clamp(c);
4720 
4721      return res;
4722   }
4723 
4724   /* setup regs */
4725   oldused = c->used;
4726   tmpa    = a->dp;
4727   tmpc    = c->dp;
4728 
4729   /* if a <= b simply fix the single digit */
4730   if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
4731      if (a->used == 1) {
4732         *tmpc++ = b - *tmpa;
4733      } else {
4734         *tmpc++ = b;
4735      }
4736      ix      = 1;
4737 
4738      /* negative/1digit */
4739      c->sign = MP_NEG;
4740      c->used = 1;
4741   } else {
4742      /* positive/size */
4743      c->sign = MP_ZPOS;
4744      c->used = a->used;
4745 
4746      /* subtract first digit */
4747      *tmpc    = *tmpa++ - b;
4748      mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
4749      *tmpc++ &= MP_MASK;
4750 
4751      /* handle rest of the digits */
4752      for (ix = 1; ix < a->used; ix++) {
4753         *tmpc    = *tmpa++ - mu;
4754         mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
4755         *tmpc++ &= MP_MASK;
4756      }
4757   }
4758 
4759   /* zero excess digits */
4760   while (ix++ < oldused) {
4761      *tmpc++ = 0;
4762   }
4763   mp_clamp(c);
4764   return MP_OKAY;
4765 }
4766 
4767 static const int lnz[16] = {
4768    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
4769 };
4770 
4771 /* Counts the number of lsbs which are zero before the first zero bit */
4772 static int
4773 mp_cnt_lsb(mp_int *a)
4774 {
4775    int x;
4776    mp_digit q, qq;
4777 
4778    /* easy out */
4779    if (mp_iszero(a) == 1) {
4780       return 0;
4781    }
4782 
4783    /* scan lower digits until non-zero */
4784    for (x = 0; x < a->used && a->dp[x] == 0; x++);
4785    q = a->dp[x];
4786    x *= DIGIT_BIT;
4787 
4788    /* now scan this digit until a 1 is found */
4789    if ((q & 1) == 0) {
4790       do {
4791          qq  = q & 15;
4792 	 /* LINTED previous op ensures range of qq */
4793          x  += lnz[qq];
4794          q >>= 4;
4795       } while (qq == 0);
4796    }
4797    return x;
4798 }
4799 
4800 /* c = a * a (mod b) */
4801 static int
4802 mp_sqrmod(mp_int *a, mp_int *b, mp_int *c)
4803 {
4804   int     res;
4805   mp_int  t;
4806 
4807   if ((res = mp_init (&t)) != MP_OKAY) {
4808     return res;
4809   }
4810 
4811   if ((res = mp_sqr (a, &t)) != MP_OKAY) {
4812     mp_clear (&t);
4813     return res;
4814   }
4815   res = mp_mod (&t, b, c);
4816   mp_clear (&t);
4817   return res;
4818 }
4819 static int
4820 mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result)
4821 {
4822   mp_int  n1, y, r;
4823   int     s, j, err;
4824 
4825   /* default */
4826   *result = MP_NO;
4827 
4828   /* ensure b > 1 */
4829   if (mp_cmp_d(b, 1) != MP_GT) {
4830      return MP_VAL;
4831   }
4832 
4833   /* get n1 = a - 1 */
4834   if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
4835     return err;
4836   }
4837   if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
4838     goto LBL_N1;
4839   }
4840 
4841   /* set 2**s * r = n1 */
4842   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
4843     goto LBL_N1;
4844   }
4845 
4846   /* count the number of least significant bits
4847    * which are zero
4848    */
4849   s = mp_cnt_lsb(&r);
4850 
4851   /* now divide n - 1 by 2**s */
4852   if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
4853     goto LBL_R;
4854   }
4855 
4856   /* compute y = b**r mod a */
4857   if ((err = mp_init (&y)) != MP_OKAY) {
4858     goto LBL_R;
4859   }
4860   if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
4861     goto LBL_Y;
4862   }
4863 
4864   /* if y != 1 and y != n1 do */
4865   if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
4866     j = 1;
4867     /* while j <= s-1 and y != n1 */
4868     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
4869       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
4870          goto LBL_Y;
4871       }
4872 
4873       /* if y == 1 then composite */
4874       if (mp_cmp_d (&y, 1) == MP_EQ) {
4875          goto LBL_Y;
4876       }
4877 
4878       ++j;
4879     }
4880 
4881     /* if y != n1 then composite */
4882     if (mp_cmp (&y, &n1) != MP_EQ) {
4883       goto LBL_Y;
4884     }
4885   }
4886 
4887   /* probably prime now */
4888   *result = MP_YES;
4889 LBL_Y:mp_clear (&y);
4890 LBL_R:mp_clear (&r);
4891 LBL_N1:mp_clear (&n1);
4892   return err;
4893 }
4894 
4895 /* performs a variable number of rounds of Miller-Rabin
4896  *
4897  * Probability of error after t rounds is no more than
4898 
4899  *
4900  * Sets result to 1 if probably prime, 0 otherwise
4901  */
4902 static int
4903 mp_prime_is_prime(mp_int *a, int t, int *result)
4904 {
4905   mp_int  b;
4906   int     ix, err, res;
4907 
4908   /* default to no */
4909   *result = MP_NO;
4910 
4911   /* valid value of t? */
4912   if (t <= 0 || t > (int)PRIME_SIZE) {
4913     return MP_VAL;
4914   }
4915 
4916   /* is the input equal to one of the primes in the table? */
4917   for (ix = 0; ix < (int)PRIME_SIZE; ix++) {
4918       if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
4919          *result = 1;
4920          return MP_OKAY;
4921       }
4922   }
4923 
4924   /* first perform trial division */
4925   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
4926     return err;
4927   }
4928 
4929   /* return if it was trivially divisible */
4930   if (res == MP_YES) {
4931     return MP_OKAY;
4932   }
4933 
4934   /* now perform the miller-rabin rounds */
4935   if ((err = mp_init (&b)) != MP_OKAY) {
4936     return err;
4937   }
4938 
4939   for (ix = 0; ix < t; ix++) {
4940     /* set the prime */
4941     mp_set (&b, ltm_prime_tab[ix]);
4942 
4943     if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
4944       goto LBL_B;
4945     }
4946 
4947     if (res == MP_NO) {
4948       goto LBL_B;
4949     }
4950   }
4951 
4952   /* passed the test */
4953   *result = MP_YES;
4954 LBL_B:mp_clear (&b);
4955   return err;
4956 }
4957 
4958 /* returns size of ASCII reprensentation */
4959 static int
4960 mp_radix_size (mp_int *a, int radix, int *size)
4961 {
4962   int     res, digs;
4963   mp_int  t;
4964   mp_digit d;
4965 
4966   *size = 0;
4967 
4968   /* special case for binary */
4969   if (radix == 2) {
4970     *size = mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
4971     return MP_OKAY;
4972   }
4973 
4974   /* make sure the radix is in range */
4975   if (radix < 2 || radix > 64) {
4976     return MP_VAL;
4977   }
4978 
4979   if (mp_iszero(a) == MP_YES) {
4980     *size = 2;
4981     return MP_OKAY;
4982   }
4983 
4984   /* digs is the digit count */
4985   digs = 0;
4986 
4987   /* if it's negative add one for the sign */
4988   if (a->sign == MP_NEG) {
4989     ++digs;
4990   }
4991 
4992   /* init a copy of the input */
4993   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
4994     return res;
4995   }
4996 
4997   /* force temp to positive */
4998   t.sign = MP_ZPOS;
4999 
5000   /* fetch out all of the digits */
5001   while (mp_iszero (&t) == MP_NO) {
5002     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
5003       mp_clear (&t);
5004       return res;
5005     }
5006     ++digs;
5007   }
5008   mp_clear (&t);
5009 
5010   /* return digs + 1, the 1 is for the NULL byte that would be required. */
5011   *size = digs + 1;
5012   return MP_OKAY;
5013 }
5014 
5015 static const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
5016 
5017 /* stores a bignum as a ASCII string in a given radix (2..64)
5018  *
5019  * Stores upto maxlen-1 chars and always a NULL byte
5020  */
5021 static int
5022 mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
5023 {
5024   int     res, digs;
5025   mp_int  t;
5026   mp_digit d;
5027   char   *_s = str;
5028 
5029   /* check range of the maxlen, radix */
5030   if (maxlen < 2 || radix < 2 || radix > 64) {
5031     return MP_VAL;
5032   }
5033 
5034   /* quick out if its zero */
5035   if (mp_iszero(a) == MP_YES) {
5036      *str++ = '0';
5037      *str = '\0';
5038      return MP_OKAY;
5039   }
5040 
5041   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
5042     return res;
5043   }
5044 
5045   /* if it is negative output a - */
5046   if (t.sign == MP_NEG) {
5047     /* we have to reverse our digits later... but not the - sign!! */
5048     ++_s;
5049 
5050     /* store the flag and mark the number as positive */
5051     *str++ = '-';
5052     t.sign = MP_ZPOS;
5053 
5054     /* subtract a char */
5055     --maxlen;
5056   }
5057 
5058   digs = 0;
5059   while (mp_iszero (&t) == 0) {
5060     if (--maxlen < 1) {
5061        /* no more room */
5062        break;
5063     }
5064     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
5065       mp_clear (&t);
5066       return res;
5067     }
5068     /* LINTED -- radix' range is checked above, limits d's range */
5069     *str++ = mp_s_rmap[d];
5070     ++digs;
5071   }
5072 
5073   /* reverse the digits of the string.  In this case _s points
5074    * to the first digit [exluding the sign] of the number
5075    */
5076   bn_reverse ((unsigned char *)_s, digs);
5077 
5078   /* append a NULL so the string is properly terminated */
5079   *str = '\0';
5080 
5081   mp_clear (&t);
5082   return MP_OKAY;
5083 }
5084 
5085 static char *
5086 formatbn(const BIGNUM *a, const int radix)
5087 {
5088 	char	*s;
5089 	int	 len;
5090 
5091 	if (mp_radix_size(__UNCONST(a), radix, &len) != MP_OKAY) {
5092 		return NULL;
5093 	}
5094 	if ((s = netpgp_allocate(1, (size_t)len)) != NULL) {
5095 		if (mp_toradix_n(__UNCONST(a), s, radix, len) != MP_OKAY) {
5096 			netpgp_deallocate(s, (size_t)len);
5097 			return NULL;
5098 		}
5099 	}
5100 	return s;
5101 }
5102 
5103 static int
5104 mp_getradix_num(mp_int *a, int radix, char *s)
5105 {
5106    int err, ch, neg, y;
5107 
5108    /* clear a */
5109    mp_zero(a);
5110 
5111    /* if first digit is - then set negative */
5112    if ((ch = *s++) == '-') {
5113       neg = MP_NEG;
5114       ch = *s++;
5115    } else {
5116       neg = MP_ZPOS;
5117    }
5118 
5119    for (;;) {
5120       /* find y in the radix map */
5121       for (y = 0; y < radix; y++) {
5122           if (mp_s_rmap[y] == ch) {
5123              break;
5124           }
5125       }
5126       if (y == radix) {
5127          break;
5128       }
5129 
5130       /* shift up and add */
5131       if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
5132          return err;
5133       }
5134       if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
5135          return err;
5136       }
5137 
5138       ch = *s++;
5139    }
5140    if (mp_cmp_d(a, 0) != MP_EQ) {
5141       a->sign = neg;
5142    }
5143 
5144    return MP_OKAY;
5145 }
5146 
5147 static int
5148 getbn(BIGNUM **a, const char *str, int radix)
5149 {
5150 	int	len;
5151 
5152 	if (a == NULL || str == NULL || (*a = BN_new()) == NULL) {
5153 		return 0;
5154 	}
5155 	if (mp_getradix_num(*a, radix, __UNCONST(str)) != MP_OKAY) {
5156 		return 0;
5157 	}
5158 	mp_radix_size(__UNCONST(a), radix, &len);
5159 	return len - 1;
5160 }
5161 
5162 /* d = a - b (mod c) */
5163 static int
5164 mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
5165 {
5166   int     res;
5167   mp_int  t;
5168 
5169 
5170   if ((res = mp_init (&t)) != MP_OKAY) {
5171     return res;
5172   }
5173 
5174   if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
5175     mp_clear (&t);
5176     return res;
5177   }
5178   res = mp_mod (&t, c, d);
5179   mp_clear (&t);
5180   return res;
5181 }
5182 
5183 /**************************************************************************/
5184 
5185 /* BIGNUM emulation layer */
5186 
5187 /* essentiually, these are just wrappers around the libtommath functions */
5188 /* usually the order of args changes */
5189 /* the BIGNUM API tends to have more const poisoning */
5190 /* these wrappers also check the arguments passed for sanity */
5191 
5192 BIGNUM *
5193 BN_bin2bn(const uint8_t *data, int len, BIGNUM *ret)
5194 {
5195 	if (data == NULL) {
5196 		return BN_new();
5197 	}
5198 	if (ret == NULL) {
5199 		ret = BN_new();
5200 	}
5201 	return (mp_read_unsigned_bin(ret, data, len) == MP_OKAY) ? ret : NULL;
5202 }
5203 
5204 /* store in unsigned [big endian] format */
5205 int
5206 BN_bn2bin(const BIGNUM *a, unsigned char *b)
5207 {
5208 	BIGNUM	t;
5209 	int    	x;
5210 
5211 	if (a == NULL || b == NULL) {
5212 		return -1;
5213 	}
5214 	if (mp_init_copy (&t, __UNCONST(a)) != MP_OKAY) {
5215 		return -1;
5216 	}
5217 	for (x = 0; !BN_is_zero(&t) ; ) {
5218 		b[x++] = (unsigned char) (t.dp[0] & 0xff);
5219 		if (mp_div_2d (&t, 8, &t, NULL) != MP_OKAY) {
5220 			mp_clear(&t);
5221 			return -1;
5222 		}
5223 	}
5224 	bn_reverse(b, x);
5225 	mp_clear(&t);
5226 	return x;
5227 }
5228 
5229 void
5230 BN_init(BIGNUM *a)
5231 {
5232 	if (a != NULL) {
5233 		mp_init(a);
5234 	}
5235 }
5236 
5237 BIGNUM *
5238 BN_new(void)
5239 {
5240 	BIGNUM	*a;
5241 
5242 	if ((a = netpgp_allocate(1, sizeof(*a))) != NULL) {
5243 		mp_init(a);
5244 	}
5245 	return a;
5246 }
5247 
5248 /* copy, b = a */
5249 int
5250 BN_copy(BIGNUM *b, const BIGNUM *a)
5251 {
5252 	if (a == NULL || b == NULL) {
5253 		return MP_VAL;
5254 	}
5255 	return mp_copy(__UNCONST(a), b);
5256 }
5257 
5258 BIGNUM *
5259 BN_dup(const BIGNUM *a)
5260 {
5261 	BIGNUM	*ret;
5262 
5263 	if (a == NULL) {
5264 		return NULL;
5265 	}
5266 	if ((ret = BN_new()) != NULL) {
5267 		BN_copy(ret, a);
5268 	}
5269 	return ret;
5270 }
5271 
5272 void
5273 BN_swap(BIGNUM *a, BIGNUM *b)
5274 {
5275 	if (a && b) {
5276 		mp_exch(a, b);
5277 	}
5278 }
5279 
5280 int
5281 BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
5282 {
5283 	if (r == NULL || a == NULL || n < 0) {
5284 		return 0;
5285 	}
5286 	BN_copy(r, a);
5287 	return mp_lshd(r, n) == MP_OKAY;
5288 }
5289 
5290 int
5291 BN_lshift1(BIGNUM *r, BIGNUM *a)
5292 {
5293 	if (r == NULL || a == NULL) {
5294 		return 0;
5295 	}
5296 	BN_copy(r, a);
5297 	return mp_lshd(r, 1) == MP_OKAY;
5298 }
5299 
5300 int
5301 BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
5302 {
5303 	if (r == NULL || a == NULL || n < 0) {
5304 		return MP_VAL;
5305 	}
5306 	BN_copy(r, a);
5307 	return mp_rshd(r, n) == MP_OKAY;
5308 }
5309 
5310 int
5311 BN_rshift1(BIGNUM *r, BIGNUM *a)
5312 {
5313 	if (r == NULL || a == NULL) {
5314 		return 0;
5315 	}
5316 	BN_copy(r, a);
5317 	return mp_rshd(r, 1) == MP_OKAY;
5318 }
5319 
5320 int
5321 BN_set_word(BIGNUM *a, BN_ULONG w)
5322 {
5323 	if (a == NULL) {
5324 		return 0;
5325 	}
5326 	mp_set(a, w);
5327 	return 1;
5328 }
5329 
5330 int
5331 BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
5332 {
5333 	if (a == NULL || b == NULL || r == NULL) {
5334 		return 0;
5335 	}
5336 	return mp_add(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5337 }
5338 
5339 int
5340 BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
5341 {
5342 	if (a == NULL || b == NULL || r == NULL) {
5343 		return 0;
5344 	}
5345 	return mp_sub(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5346 }
5347 
5348 int
5349 BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
5350 {
5351 	if (a == NULL || b == NULL || r == NULL) {
5352 		return 0;
5353 	}
5354 	USE_ARG(ctx);
5355 	return mp_mul(__UNCONST(a), __UNCONST(b), r) == MP_OKAY;
5356 }
5357 
5358 int
5359 BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *a, const BIGNUM *d, BN_CTX *ctx)
5360 {
5361 	if ((dv == NULL && rem == NULL) || a == NULL || d == NULL) {
5362 		return 0;
5363 	}
5364 	USE_ARG(ctx);
5365 	return mp_div(dv, rem, __UNCONST(a), __UNCONST(d)) == MP_OKAY;
5366 }
5367 
5368 void
5369 BN_free(BIGNUM *a)
5370 {
5371 	if (a) {
5372 		mp_clear(a);
5373 	}
5374 }
5375 
5376 void
5377 BN_clear(BIGNUM *a)
5378 {
5379 	if (a) {
5380 		mp_clear(a);
5381 	}
5382 }
5383 
5384 void
5385 BN_clear_free(BIGNUM *a)
5386 {
5387 	if (a) {
5388 		mp_clear(a);
5389 	}
5390 }
5391 
5392 int
5393 BN_num_bytes(const BIGNUM *a)
5394 {
5395 	if (a == NULL) {
5396 		return MP_VAL;
5397 	}
5398 	return mp_unsigned_bin_size(__UNCONST(a));
5399 }
5400 
5401 int
5402 BN_num_bits(const BIGNUM *a)
5403 {
5404 	if (a == NULL) {
5405 		return 0;
5406 	}
5407 	return mp_count_bits(a);
5408 }
5409 
5410 void
5411 BN_set_negative(BIGNUM *a, int n)
5412 {
5413 	if (a) {
5414 		a->sign = (n) ? MP_NEG : 0;
5415 	}
5416 }
5417 
5418 int
5419 BN_cmp(BIGNUM *a, BIGNUM *b)
5420 {
5421 	if (a == NULL || b == NULL) {
5422 		return MP_VAL;
5423 	}
5424 	switch(mp_cmp(a, b)) {
5425 	case MP_LT:
5426 		return -1;
5427 	case MP_GT:
5428 		return 1;
5429 	case MP_EQ:
5430 	default:
5431 		return 0;
5432 	}
5433 }
5434 
5435 int
5436 BN_mod_exp(BIGNUM *Y, BIGNUM *G, BIGNUM *X, BIGNUM *P, BN_CTX *ctx)
5437 {
5438 	if (Y == NULL || G == NULL || X == NULL || P == NULL) {
5439 		return MP_VAL;
5440 	}
5441 	USE_ARG(ctx);
5442 	return mp_exptmod(G, X, P, Y) == MP_OKAY;
5443 }
5444 
5445 BIGNUM *
5446 BN_mod_inverse(BIGNUM *r, BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
5447 {
5448 	USE_ARG(ctx);
5449 	if (r == NULL || a == NULL || n == NULL) {
5450 		return NULL;
5451 	}
5452 	return (mp_invmod(r, a, __UNCONST(n)) == MP_OKAY) ? r : NULL;
5453 }
5454 
5455 int
5456 BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
5457 {
5458 	USE_ARG(ctx);
5459 	if (ret == NULL || a == NULL || b == NULL || m == NULL) {
5460 		return 0;
5461 	}
5462 	return mp_mulmod(ret, a, b, __UNCONST(m)) == MP_OKAY;
5463 }
5464 
5465 BN_CTX *
5466 BN_CTX_new(void)
5467 {
5468 	return netpgp_allocate(1, sizeof(BN_CTX));
5469 }
5470 
5471 void
5472 BN_CTX_init(BN_CTX *c)
5473 {
5474 	if (c != NULL) {
5475 		c->arraysize = 15;
5476 		if ((c->v = netpgp_allocate(sizeof(*c->v), c->arraysize)) == NULL) {
5477 			c->arraysize = 0;
5478 		}
5479 	}
5480 }
5481 
5482 BIGNUM *
5483 BN_CTX_get(BN_CTX *ctx)
5484 {
5485 	if (ctx == NULL || ctx->v == NULL || ctx->arraysize == 0 || ctx->count == ctx->arraysize - 1) {
5486 		return NULL;
5487 	}
5488 	return ctx->v[ctx->count++] = BN_new();
5489 }
5490 
5491 void
5492 BN_CTX_start(BN_CTX *ctx)
5493 {
5494 	BN_CTX_init(ctx);
5495 }
5496 
5497 void
5498 BN_CTX_free(BN_CTX *c)
5499 {
5500 	unsigned	i;
5501 
5502 	if (c != NULL && c->v != NULL) {
5503 		for (i = 0 ; i < c->count ; i++) {
5504 			BN_clear_free(c->v[i]);
5505 		}
5506 		netpgp_deallocate(c->v, sizeof(*c->v) * c->arraysize);
5507 	}
5508 }
5509 
5510 void
5511 BN_CTX_end(BN_CTX *ctx)
5512 {
5513 	BN_CTX_free(ctx);
5514 }
5515 
5516 char *
5517 BN_bn2hex(const BIGNUM *a)
5518 {
5519 	return (a == NULL) ? NULL : formatbn(a, 16);
5520 }
5521 
5522 char *
5523 BN_bn2dec(const BIGNUM *a)
5524 {
5525 	return (a == NULL) ? NULL : formatbn(a, 10);
5526 }
5527 
5528 #ifndef _KERNEL
5529 int
5530 BN_print_fp(FILE *fp, const BIGNUM *a)
5531 {
5532 	char	*s;
5533 	int	 ret;
5534 
5535 	if (fp == NULL || a == NULL) {
5536 		return 0;
5537 	}
5538 	s = BN_bn2hex(a);
5539 	ret = fprintf(fp, "%s", s);
5540 	netpgp_deallocate(s, strlen(s) + 1);
5541 	return ret;
5542 }
5543 #endif
5544 
5545 int
5546 BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
5547 {
5548 	uint64_t	r;
5549 	int		digits;
5550 	int		i;
5551 
5552 	if (rnd == NULL) {
5553 		return 0;
5554 	}
5555 	mp_init_size(rnd, digits = howmany(bits, DIGIT_BIT));
5556 	for (i = 0 ; i < digits ; i++) {
5557 		r = (uint64_t)arc4random();
5558 		r <<= 32;
5559 		r |= arc4random();
5560 		rnd->dp[i] = (r & MP_MASK);
5561 	}
5562 	if (top == 0) {
5563 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)DIGIT_BIT));
5564 	}
5565 	if (top == 1) {
5566 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)DIGIT_BIT));
5567 		rnd->dp[rnd->used - 1] |= (((mp_digit)1)<<((mp_digit)(DIGIT_BIT - 1)));
5568 	}
5569 	if (bottom) {
5570 		rnd->dp[0] |= 0x1;
5571 	}
5572 	return 1;
5573 }
5574 
5575 int
5576 BN_rand_range(BIGNUM *rnd, BIGNUM *range)
5577 {
5578 	if (rnd == NULL || range == NULL || BN_is_zero(range)) {
5579 		return 0;
5580 	}
5581 	BN_rand(rnd, BN_num_bits(range), 1, 0);
5582 	return mp_mod(rnd, range, rnd) == MP_OKAY;
5583 }
5584 
5585 int
5586 BN_is_prime(const BIGNUM *a, int checks, void (*callback)(int, int, void *), BN_CTX *ctx, void *cb_arg)
5587 {
5588 	int	primality;
5589 
5590 	if (a == NULL) {
5591 		return 0;
5592 	}
5593 	USE_ARG(ctx);
5594 	USE_ARG(cb_arg);
5595 	USE_ARG(callback);
5596 	return (mp_prime_is_prime(__UNCONST(a), checks, &primality) == MP_OKAY) ? primality : 0;
5597 }
5598 
5599 const BIGNUM *
5600 BN_value_one(void)
5601 {
5602 	static mp_digit		digit = 1UL;
5603 	static const BIGNUM	one = { &digit, 1, 1, 0 };
5604 
5605 	return &one;
5606 }
5607 
5608 int
5609 BN_hex2bn(BIGNUM **a, const char *str)
5610 {
5611 	return getbn(a, str, 16);
5612 }
5613 
5614 int
5615 BN_dec2bn(BIGNUM **a, const char *str)
5616 {
5617 	return getbn(a, str, 10);
5618 }
5619 
5620 int
5621 BN_mod_sub(BIGNUM *r, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
5622 {
5623 	USE_ARG(ctx);
5624 	if (r == NULL || a == NULL || b == NULL || m == NULL) {
5625 		return 0;
5626 	}
5627 	return mp_submod(a, b, __UNCONST(m), r) == MP_OKAY;
5628 }
5629 
5630 int
5631 BN_is_bit_set(const BIGNUM *a, int n)
5632 {
5633 	if (a == NULL || n < 0 || n >= a->used * DIGIT_BIT) {
5634 		return 0;
5635 	}
5636 	return (a->dp[n / DIGIT_BIT] & (1 << (n % DIGIT_BIT))) ? 1 : 0;
5637 }
5638