xref: /openbsd-src/lib/libcrypto/bn/bn_exp.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 /* $OpenBSD: bn_exp.c,v 1.26 2016/09/03 17:26:29 bcook Exp $ */
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3  * All rights reserved.
4  *
5  * This package is an SSL implementation written
6  * by Eric Young (eay@cryptsoft.com).
7  * The implementation was written so as to conform with Netscapes SSL.
8  *
9  * This library is free for commercial and non-commercial use as long as
10  * the following conditions are aheared to.  The following conditions
11  * apply to all code found in this distribution, be it the RC4, RSA,
12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
13  * included with this distribution is covered by the same copyright terms
14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15  *
16  * Copyright remains Eric Young's, and as such any Copyright notices in
17  * the code are not to be removed.
18  * If this package is used in a product, Eric Young should be given attribution
19  * as the author of the parts of the library used.
20  * This can be in the form of a textual message at program startup or
21  * in documentation (online or textual) provided with the package.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  * 3. All advertising materials mentioning features or use of this software
32  *    must display the following acknowledgement:
33  *    "This product includes cryptographic software written by
34  *     Eric Young (eay@cryptsoft.com)"
35  *    The word 'cryptographic' can be left out if the rouines from the library
36  *    being used are not cryptographic related :-).
37  * 4. If you include any Windows specific code (or a derivative thereof) from
38  *    the apps directory (application code) you must include an acknowledgement:
39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40  *
41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  *
53  * The licence and distribution terms for any publically available version or
54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
55  * copied and put under another distribution licence
56  * [including the GNU Public Licence.]
57  */
58 /* ====================================================================
59  * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
60  *
61  * Redistribution and use in source and binary forms, with or without
62  * modification, are permitted provided that the following conditions
63  * are met:
64  *
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  *
68  * 2. Redistributions in binary form must reproduce the above copyright
69  *    notice, this list of conditions and the following disclaimer in
70  *    the documentation and/or other materials provided with the
71  *    distribution.
72  *
73  * 3. All advertising materials mentioning features or use of this
74  *    software must display the following acknowledgment:
75  *    "This product includes software developed by the OpenSSL Project
76  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77  *
78  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79  *    endorse or promote products derived from this software without
80  *    prior written permission. For written permission, please contact
81  *    openssl-core@openssl.org.
82  *
83  * 5. Products derived from this software may not be called "OpenSSL"
84  *    nor may "OpenSSL" appear in their names without prior written
85  *    permission of the OpenSSL Project.
86  *
87  * 6. Redistributions of any form whatsoever must retain the following
88  *    acknowledgment:
89  *    "This product includes software developed by the OpenSSL Project
90  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91  *
92  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
96  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103  * OF THE POSSIBILITY OF SUCH DAMAGE.
104  * ====================================================================
105  *
106  * This product includes cryptographic software written by Eric Young
107  * (eay@cryptsoft.com).  This product includes software written by Tim
108  * Hudson (tjh@cryptsoft.com).
109  *
110  */
111 
112 #include <stdlib.h>
113 #include <string.h>
114 
115 #include <openssl/err.h>
116 
117 #include "bn_lcl.h"
118 #include "constant_time_locl.h"
119 
120 /* maximum precomputation table size for *variable* sliding windows */
121 #define TABLE_SIZE	32
122 
123 /* this one works - simple but works */
124 int
125 BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
126 {
127 	int i, bits, ret = 0;
128 	BIGNUM *v, *rr;
129 
130 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
131 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
132 		BNerr(BN_F_BN_EXP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
133 		return -1;
134 	}
135 
136 	BN_CTX_start(ctx);
137 	if ((r == a) || (r == p))
138 		rr = BN_CTX_get(ctx);
139 	else
140 		rr = r;
141 	v = BN_CTX_get(ctx);
142 	if (rr == NULL || v == NULL)
143 		goto err;
144 
145 	if (BN_copy(v, a) == NULL)
146 		goto err;
147 	bits = BN_num_bits(p);
148 
149 	if (BN_is_odd(p)) {
150 		if (BN_copy(rr, a) == NULL)
151 			goto err;
152 	} else {
153 		if (!BN_one(rr))
154 			goto err;
155 	}
156 
157 	for (i = 1; i < bits; i++) {
158 		if (!BN_sqr(v, v, ctx))
159 			goto err;
160 		if (BN_is_bit_set(p, i)) {
161 			if (!BN_mul(rr, rr, v, ctx))
162 				goto err;
163 		}
164 	}
165 	ret = 1;
166 
167 err:
168 	if (r != rr && rr != NULL)
169 		BN_copy(r, rr);
170 	BN_CTX_end(ctx);
171 	bn_check_top(r);
172 	return (ret);
173 }
174 
175 int
176 BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
177     BN_CTX *ctx)
178 {
179 	int ret;
180 
181 	bn_check_top(a);
182 	bn_check_top(p);
183 	bn_check_top(m);
184 
185 	/* For even modulus  m = 2^k*m_odd,  it might make sense to compute
186 	 * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
187 	 * exponentiation for the odd part), using appropriate exponent
188 	 * reductions, and combine the results using the CRT.
189 	 *
190 	 * For now, we use Montgomery only if the modulus is odd; otherwise,
191 	 * exponentiation using the reciprocal-based quick remaindering
192 	 * algorithm is used.
193 	 *
194 	 * (Timing obtained with expspeed.c [computations  a^p mod m
195 	 * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
196 	 * 4096, 8192 bits], compared to the running time of the
197 	 * standard algorithm:
198 	 *
199 	 *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
200          *                     55 .. 77 %  [UltraSparc processor, but
201 	 *                                  debug-solaris-sparcv8-gcc conf.]
202 	 *
203 	 *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
204 	 *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
205 	 *
206 	 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
207 	 * at 2048 and more bits, but at 512 and 1024 bits, it was
208 	 * slower even than the standard algorithm!
209 	 *
210 	 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
211 	 * should be obtained when the new Montgomery reduction code
212 	 * has been integrated into OpenSSL.)
213 	 */
214 
215 #define MONT_MUL_MOD
216 #define MONT_EXP_WORD
217 #define RECP_MUL_MOD
218 
219 #ifdef MONT_MUL_MOD
220 	/* I have finally been able to take out this pre-condition of
221 	 * the top bit being set.  It was caused by an error in BN_div
222 	 * with negatives.  There was also another problem when for a^b%m
223 	 * a >= m.  eay 07-May-97 */
224 /*	if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */
225 
226 	if (BN_is_odd(m)) {
227 #  ifdef MONT_EXP_WORD
228 		if (a->top == 1 && !a->neg &&
229 		    (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)) {
230 			BN_ULONG A = a->d[0];
231 			ret = BN_mod_exp_mont_word(r, A,p, m,ctx, NULL);
232 		} else
233 #  endif
234 			ret = BN_mod_exp_mont(r, a,p, m,ctx, NULL);
235 	} else
236 #endif
237 #ifdef RECP_MUL_MOD
238 	{
239 		ret = BN_mod_exp_recp(r, a,p, m, ctx);
240 	}
241 #else
242 	{
243 		ret = BN_mod_exp_simple(r, a,p, m, ctx);
244 	}
245 #endif
246 
247 	bn_check_top(r);
248 	return (ret);
249 }
250 
251 int
252 BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
253     BN_CTX *ctx)
254 {
255 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
256 	int start = 1;
257 	BIGNUM *aa;
258 	/* Table of variables obtained from 'ctx' */
259 	BIGNUM *val[TABLE_SIZE];
260 	BN_RECP_CTX recp;
261 
262 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
263 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
264 		BNerr(BN_F_BN_MOD_EXP_RECP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
265 		return -1;
266 	}
267 
268 	bits = BN_num_bits(p);
269 	if (bits == 0) {
270 		/* x**0 mod 1 is still zero. */
271 		if (BN_is_one(m)) {
272 			ret = 1;
273 			BN_zero(r);
274 		} else
275 			ret = BN_one(r);
276 		return ret;
277 	}
278 
279 	BN_CTX_start(ctx);
280 	if ((aa = BN_CTX_get(ctx)) == NULL)
281 		goto err;
282 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
283 		goto err;
284 
285 	BN_RECP_CTX_init(&recp);
286 	if (m->neg) {
287 		/* ignore sign of 'm' */
288 		if (!BN_copy(aa, m))
289 			goto err;
290 		aa->neg = 0;
291 		if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
292 			goto err;
293 	} else {
294 		if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
295 			goto err;
296 	}
297 
298 	if (!BN_nnmod(val[0], a, m, ctx))
299 		goto err;		/* 1 */
300 	if (BN_is_zero(val[0])) {
301 		BN_zero(r);
302 		ret = 1;
303 		goto err;
304 	}
305 
306 	window = BN_window_bits_for_exponent_size(bits);
307 	if (window > 1) {
308 		if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
309 			goto err;				/* 2 */
310 		j = 1 << (window - 1);
311 		for (i = 1; i < j; i++) {
312 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
313 			    !BN_mod_mul_reciprocal(val[i], val[i - 1],
314 			    aa, &recp, ctx))
315 				goto err;
316 		}
317 	}
318 
319 	start = 1;		/* This is used to avoid multiplication etc
320 				 * when there is only the value '1' in the
321 				 * buffer. */
322 	wvalue = 0;		/* The 'value' of the window */
323 	wstart = bits - 1;	/* The top bit of the window */
324 	wend = 0;		/* The bottom bit of the window */
325 
326 	if (!BN_one(r))
327 		goto err;
328 
329 	for (;;) {
330 		if (BN_is_bit_set(p, wstart) == 0) {
331 			if (!start)
332 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
333 					goto err;
334 			if (wstart == 0)
335 				break;
336 			wstart--;
337 			continue;
338 		}
339 		/* We now have wstart on a 'set' bit, we now need to work out
340 		 * how bit a window to do.  To do this we need to scan
341 		 * forward until the last set bit before the end of the
342 		 * window */
343 		j = wstart;
344 		wvalue = 1;
345 		wend = 0;
346 		for (i = 1; i < window; i++) {
347 			if (wstart - i < 0)
348 				break;
349 			if (BN_is_bit_set(p, wstart - i)) {
350 				wvalue <<= (i - wend);
351 				wvalue |= 1;
352 				wend = i;
353 			}
354 		}
355 
356 		/* wend is the size of the current window */
357 		j = wend + 1;
358 		/* add the 'bytes above' */
359 		if (!start)
360 			for (i = 0; i < j; i++) {
361 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
362 					goto err;
363 			}
364 
365 		/* wvalue will be an odd number < 2^window */
366 		if (!BN_mod_mul_reciprocal(r, r,val[wvalue >> 1], &recp, ctx))
367 			goto err;
368 
369 		/* move the 'window' down further */
370 		wstart -= wend + 1;
371 		wvalue = 0;
372 		start = 0;
373 		if (wstart < 0)
374 			break;
375 	}
376 	ret = 1;
377 
378 err:
379 	BN_CTX_end(ctx);
380 	BN_RECP_CTX_free(&recp);
381 	bn_check_top(r);
382 	return (ret);
383 }
384 
385 int
386 BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
387     BN_CTX *ctx, BN_MONT_CTX *in_mont)
388 {
389 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
390 	int start = 1;
391 	BIGNUM *d, *r;
392 	const BIGNUM *aa;
393 	/* Table of variables obtained from 'ctx' */
394 	BIGNUM *val[TABLE_SIZE];
395 	BN_MONT_CTX *mont = NULL;
396 
397 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
398 		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
399 	}
400 
401 	bn_check_top(a);
402 	bn_check_top(p);
403 	bn_check_top(m);
404 
405 	if (!BN_is_odd(m)) {
406 		BNerr(BN_F_BN_MOD_EXP_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
407 		return (0);
408 	}
409 
410 	bits = BN_num_bits(p);
411 	if (bits == 0) {
412 		/* x**0 mod 1 is still zero. */
413 		if (BN_is_one(m)) {
414 			ret = 1;
415 			BN_zero(rr);
416 		} else
417 			ret = BN_one(rr);
418 		return ret;
419 	}
420 
421 	BN_CTX_start(ctx);
422 	if ((d = BN_CTX_get(ctx)) == NULL)
423 		goto err;
424 	if ((r = BN_CTX_get(ctx)) == NULL)
425 		goto err;
426 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
427 		goto err;
428 
429 	/* If this is not done, things will break in the montgomery
430 	 * part */
431 
432 	if (in_mont != NULL)
433 		mont = in_mont;
434 	else {
435 		if ((mont = BN_MONT_CTX_new()) == NULL)
436 			goto err;
437 		if (!BN_MONT_CTX_set(mont, m, ctx))
438 			goto err;
439 	}
440 
441 	if (a->neg || BN_ucmp(a, m) >= 0) {
442 		if (!BN_nnmod(val[0], a,m, ctx))
443 			goto err;
444 		aa = val[0];
445 	} else
446 		aa = a;
447 	if (BN_is_zero(aa)) {
448 		BN_zero(rr);
449 		ret = 1;
450 		goto err;
451 	}
452 	if (!BN_to_montgomery(val[0], aa, mont, ctx))
453 		goto err; /* 1 */
454 
455 	window = BN_window_bits_for_exponent_size(bits);
456 	if (window > 1) {
457 		if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx))
458 			goto err; /* 2 */
459 		j = 1 << (window - 1);
460 		for (i = 1; i < j; i++) {
461 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
462 			    !BN_mod_mul_montgomery(val[i], val[i - 1],
463 			    d, mont, ctx))
464 				goto err;
465 		}
466 	}
467 
468 	start = 1;		/* This is used to avoid multiplication etc
469 				 * when there is only the value '1' in the
470 				 * buffer. */
471 	wvalue = 0;		/* The 'value' of the window */
472 	wstart = bits - 1;	/* The top bit of the window */
473 	wend = 0;		/* The bottom bit of the window */
474 
475 	if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
476 		goto err;
477 	for (;;) {
478 		if (BN_is_bit_set(p, wstart) == 0) {
479 			if (!start) {
480 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
481 					goto err;
482 			}
483 			if (wstart == 0)
484 				break;
485 			wstart--;
486 			continue;
487 		}
488 		/* We now have wstart on a 'set' bit, we now need to work out
489 		 * how bit a window to do.  To do this we need to scan
490 		 * forward until the last set bit before the end of the
491 		 * window */
492 		j = wstart;
493 		wvalue = 1;
494 		wend = 0;
495 		for (i = 1; i < window; i++) {
496 			if (wstart - i < 0)
497 				break;
498 			if (BN_is_bit_set(p, wstart - i)) {
499 				wvalue <<= (i - wend);
500 				wvalue |= 1;
501 				wend = i;
502 			}
503 		}
504 
505 		/* wend is the size of the current window */
506 		j = wend + 1;
507 		/* add the 'bytes above' */
508 		if (!start)
509 			for (i = 0; i < j; i++) {
510 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
511 					goto err;
512 			}
513 
514 		/* wvalue will be an odd number < 2^window */
515 		if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx))
516 			goto err;
517 
518 		/* move the 'window' down further */
519 		wstart -= wend + 1;
520 		wvalue = 0;
521 		start = 0;
522 		if (wstart < 0)
523 			break;
524 	}
525 	if (!BN_from_montgomery(rr, r,mont, ctx))
526 		goto err;
527 	ret = 1;
528 
529 err:
530 	if ((in_mont == NULL) && (mont != NULL))
531 		BN_MONT_CTX_free(mont);
532 	BN_CTX_end(ctx);
533 	bn_check_top(rr);
534 	return (ret);
535 }
536 
537 
538 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
539  * so that accessing any of these table values shows the same access pattern as far
540  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
541  * from/to that table. */
542 
543 static int
544 MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf,
545     int idx, int window)
546 {
547 	int i, j;
548 	int width = 1 << window;
549 	BN_ULONG *table = (BN_ULONG *)buf;
550 
551 	if (top > b->top)
552 		top = b->top; /* this works because 'buf' is explicitly zeroed */
553 
554 	for (i = 0, j = idx; i < top; i++, j += width) {
555 		table[j] = b->d[i];
556 	}
557 
558 	return 1;
559 }
560 
561 static int
562 MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx,
563     int window)
564 {
565 	int i, j;
566 	int width = 1 << window;
567 	volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
568 
569 	if (bn_wexpand(b, top) == NULL)
570 		return 0;
571 
572 	if (window <= 3) {
573 		for (i = 0; i < top; i++, table += width) {
574 		    BN_ULONG acc = 0;
575 
576 		    for (j = 0; j < width; j++) {
577 			acc |= table[j] &
578 			       ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
579 		    }
580 
581 		    b->d[i] = acc;
582 		}
583 	} else {
584 		int xstride = 1 << (window - 2);
585 		BN_ULONG y0, y1, y2, y3;
586 
587 		i = idx >> (window - 2);        /* equivalent of idx / xstride */
588 		idx &= xstride - 1;             /* equivalent of idx % xstride */
589 
590 		y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
591 		y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
592 		y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
593 		y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
594 
595 		for (i = 0; i < top; i++, table += width) {
596 		    BN_ULONG acc = 0;
597 
598 		    for (j = 0; j < xstride; j++) {
599 			acc |= ( (table[j + 0 * xstride] & y0) |
600 				 (table[j + 1 * xstride] & y1) |
601 				 (table[j + 2 * xstride] & y2) |
602 				 (table[j + 3 * xstride] & y3) )
603 			       & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
604 		    }
605 
606 		    b->d[i] = acc;
607 		}
608 	}
609 	b->top = top;
610 	bn_correct_top(b);
611 	return 1;
612 }
613 
614 /* Given a pointer value, compute the next address that is a cache line multiple. */
615 #define MOD_EXP_CTIME_ALIGN(x_) \
616 	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
617 
618 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
619  * precomputation memory layout to limit data-dependency to a minimum
620  * to protect secret exponents (cf. the hyper-threading timing attacks
621  * pointed out by Colin Percival,
622  * http://www.daemonology.net/hyperthreading-considered-harmful/)
623  */
624 int
625 BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
626     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
627 {
628 	int i, bits, ret = 0, window, wvalue;
629 	int top;
630 	BN_MONT_CTX *mont = NULL;
631 	int numPowers;
632 	unsigned char *powerbufFree = NULL;
633 	int powerbufLen = 0;
634 	unsigned char *powerbuf = NULL;
635 	BIGNUM tmp, am;
636 
637 	bn_check_top(a);
638 	bn_check_top(p);
639 	bn_check_top(m);
640 
641 	if (!BN_is_odd(m)) {
642 		BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,
643 		    BN_R_CALLED_WITH_EVEN_MODULUS);
644 		return (0);
645 	}
646 
647 	top = m->top;
648 
649 	bits = BN_num_bits(p);
650 	if (bits == 0) {
651 		/* x**0 mod 1 is still zero. */
652 		if (BN_is_one(m)) {
653 			ret = 1;
654 			BN_zero(rr);
655 		} else
656 			ret = BN_one(rr);
657 		return ret;
658 	}
659 
660 	BN_CTX_start(ctx);
661 
662 	/* Allocate a montgomery context if it was not supplied by the caller.
663 	 * If this is not done, things will break in the montgomery part.
664  	 */
665 	if (in_mont != NULL)
666 		mont = in_mont;
667 	else {
668 		if ((mont = BN_MONT_CTX_new()) == NULL)
669 			goto err;
670 		if (!BN_MONT_CTX_set(mont, m, ctx))
671 			goto err;
672 	}
673 
674 	/* Get the window size to use with size of p. */
675 	window = BN_window_bits_for_ctime_exponent_size(bits);
676 #if defined(OPENSSL_BN_ASM_MONT5)
677 	if (window == 6 && bits <= 1024)
678 		window = 5;	/* ~5% improvement of 2048-bit RSA sign */
679 #endif
680 
681 	/* Allocate a buffer large enough to hold all of the pre-computed
682 	 * powers of am, am itself and tmp.
683 	 */
684 	numPowers = 1 << window;
685 	powerbufLen = sizeof(m->d[0]) * (top * numPowers +
686 	    ((2*top) > numPowers ? (2*top) : numPowers));
687 	if ((powerbufFree = malloc(powerbufLen +
688 	    MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
689 		goto err;
690 
691 	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
692 	memset(powerbuf, 0, powerbufLen);
693 
694 	/* lay down tmp and am right after powers table */
695 	tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
696 	am.d = tmp.d + top;
697 	tmp.top = am.top = 0;
698 	tmp.dmax = am.dmax = top;
699 	tmp.neg = am.neg = 0;
700 	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
701 
702 	/* prepare a^0 in Montgomery domain */
703 #if 1
704 	if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
705 		goto err;
706 #else
707 	tmp.d[0] = (0 - m - >d[0]) & BN_MASK2;	/* 2^(top*BN_BITS2) - m */
708 	for (i = 1; i < top; i++)
709 		tmp.d[i] = (~m->d[i]) & BN_MASK2;
710 	tmp.top = top;
711 #endif
712 
713 	/* prepare a^1 in Montgomery domain */
714 	if (a->neg || BN_ucmp(a, m) >= 0) {
715 		if (!BN_mod(&am, a,m, ctx))
716 			goto err;
717 		if (!BN_to_montgomery(&am, &am, mont, ctx))
718 			goto err;
719 	} else if (!BN_to_montgomery(&am, a,mont, ctx))
720 		goto err;
721 
722 #if defined(OPENSSL_BN_ASM_MONT5)
723 	/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
724 	 * specifically optimization of cache-timing attack countermeasures
725 	 * and pre-computation optimization. */
726 
727 	/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
728 	 * 512-bit RSA is hardly relevant, we omit it to spare size... */
729 	if (window == 5 && top > 1) {
730 		void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
731 		    const void *table, const BN_ULONG *np,
732 		    const BN_ULONG *n0, int num, int power);
733 		void bn_scatter5(const BN_ULONG *inp, size_t num,
734 		    void *table, size_t power);
735 		void bn_gather5(BN_ULONG *out, size_t num,
736 		    void *table, size_t power);
737 
738 		BN_ULONG *np = mont->N.d, *n0 = mont->n0;
739 
740 		/* BN_to_montgomery can contaminate words above .top
741 		 * [in BN_DEBUG[_DEBUG] build]... */
742 		for (i = am.top; i < top; i++)
743 			am.d[i] = 0;
744 		for (i = tmp.top; i < top; i++)
745 			tmp.d[i] = 0;
746 
747 		bn_scatter5(tmp.d, top, powerbuf, 0);
748 		bn_scatter5(am.d, am.top, powerbuf, 1);
749 		bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
750 		bn_scatter5(tmp.d, top, powerbuf, 2);
751 
752 #if 0
753 		for (i = 3; i < 32; i++) {
754 			/* Calculate a^i = a^(i-1) * a */
755 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
756 			    n0, top, i - 1);
757 			bn_scatter5(tmp.d, top, powerbuf, i);
758 		}
759 #else
760 		/* same as above, but uses squaring for 1/2 of operations */
761 		for (i = 4; i < 32; i*=2) {
762 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
763 			bn_scatter5(tmp.d, top, powerbuf, i);
764 		}
765 		for (i = 3; i < 8; i += 2) {
766 			int j;
767 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
768 			    n0, top, i - 1);
769 			bn_scatter5(tmp.d, top, powerbuf, i);
770 			for (j = 2 * i; j < 32; j *= 2) {
771 				bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
772 				bn_scatter5(tmp.d, top, powerbuf, j);
773 			}
774 		}
775 		for (; i < 16; i += 2) {
776 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
777 			    n0, top, i - 1);
778 			bn_scatter5(tmp.d, top, powerbuf, i);
779 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
780 			bn_scatter5(tmp.d, top, powerbuf, 2*i);
781 		}
782 		for (; i < 32; i += 2) {
783 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
784 			    n0, top, i - 1);
785 			bn_scatter5(tmp.d, top, powerbuf, i);
786 		}
787 #endif
788 		bits--;
789 		for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
790 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
791 		bn_gather5(tmp.d, top, powerbuf, wvalue);
792 
793 		/* Scan the exponent one window at a time starting from the most
794 		 * significant bits.
795 		 */
796 		while (bits >= 0) {
797 			for (wvalue = 0, i = 0; i < 5; i++, bits--)
798 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
799 
800 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
801 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
802 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
803 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
804 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
805 			bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
806 		}
807 
808 		tmp.top = top;
809 		bn_correct_top(&tmp);
810 	} else
811 #endif
812 	{
813 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0,
814 		    window))
815 			goto err;
816 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1,
817 		    window))
818 			goto err;
819 
820 		/* If the window size is greater than 1, then calculate
821 		 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
822 		 * (even powers could instead be computed as (a^(i/2))^2
823 		 * to use the slight performance advantage of sqr over mul).
824 		 */
825 		if (window > 1) {
826 			if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
827 				goto err;
828 			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf,
829 			    2, window))
830 				goto err;
831 			for (i = 3; i < numPowers; i++) {
832 				/* Calculate a^i = a^(i-1) * a */
833 				if (!BN_mod_mul_montgomery(&tmp, &am, &tmp,
834 				    mont, ctx))
835 					goto err;
836 				if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top,
837 				    powerbuf, i, window))
838 					goto err;
839 			}
840 		}
841 
842 		bits--;
843 		for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
844 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
845 		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf,
846 		    wvalue, window))
847 			goto err;
848 
849 		/* Scan the exponent one window at a time starting from the most
850 		 * significant bits.
851 		 */
852 		while (bits >= 0) {
853 			wvalue = 0; /* The 'value' of the window */
854 
855 			/* Scan the window, squaring the result as we go */
856 			for (i = 0; i < window; i++, bits--) {
857 				if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp,
858 				    mont, ctx))
859 					goto err;
860 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
861 			}
862 
863 			/* Fetch the appropriate pre-computed value from the pre-buf */
864 			if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf,
865 			    wvalue, window))
866 				goto err;
867 
868 			/* Multiply the result into the intermediate result */
869 			if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
870 				goto err;
871 		}
872 	}
873 
874 	/* Convert the final result from montgomery to standard format */
875 	if (!BN_from_montgomery(rr, &tmp, mont, ctx))
876 		goto err;
877 	ret = 1;
878 
879 err:
880 	if ((in_mont == NULL) && (mont != NULL))
881 		BN_MONT_CTX_free(mont);
882 	if (powerbuf != NULL) {
883 		explicit_bzero(powerbuf, powerbufLen);
884 		free(powerbufFree);
885 	}
886 	BN_CTX_end(ctx);
887 	return (ret);
888 }
889 
890 int
891 BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, const BIGNUM *m,
892     BN_CTX *ctx, BN_MONT_CTX *in_mont)
893 {
894 	BN_MONT_CTX *mont = NULL;
895 	int b, bits, ret = 0;
896 	int r_is_one;
897 	BN_ULONG w, next_w;
898 	BIGNUM *d, *r, *t;
899 	BIGNUM *swap_tmp;
900 
901 #define BN_MOD_MUL_WORD(r, w, m) \
902 		(BN_mul_word(r, (w)) && \
903 		(/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
904 			(BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
905 		/* BN_MOD_MUL_WORD is only used with 'w' large,
906 		 * so the BN_ucmp test is probably more overhead
907 		 * than always using BN_mod (which uses BN_copy if
908 		 * a similar test returns true). */
909 		/* We can use BN_mod and do not need BN_nnmod because our
910 		 * accumulator is never negative (the result of BN_mod does
911 		 * not depend on the sign of the modulus).
912 		 */
913 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
914 		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
915 
916 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
917 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
918 		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,
919 		    ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
920 		return -1;
921 	}
922 
923 	bn_check_top(p);
924 	bn_check_top(m);
925 
926 	if (!BN_is_odd(m)) {
927 		BNerr(BN_F_BN_MOD_EXP_MONT_WORD, BN_R_CALLED_WITH_EVEN_MODULUS);
928 		return (0);
929 	}
930 	if (m->top == 1)
931 		a %= m->d[0]; /* make sure that 'a' is reduced */
932 
933 	bits = BN_num_bits(p);
934 	if (bits == 0) {
935 		/* x**0 mod 1 is still zero. */
936 		if (BN_is_one(m)) {
937 			ret = 1;
938 			BN_zero(rr);
939 		} else
940 			ret = BN_one(rr);
941 		return ret;
942 	}
943 	if (a == 0) {
944 		BN_zero(rr);
945 		ret = 1;
946 		return ret;
947 	}
948 
949 	BN_CTX_start(ctx);
950 	if ((d = BN_CTX_get(ctx)) == NULL)
951 		goto err;
952 	if ((r = BN_CTX_get(ctx)) == NULL)
953 		goto err;
954 	if ((t = BN_CTX_get(ctx)) == NULL)
955 		goto err;
956 
957 	if (in_mont != NULL)
958 		mont = in_mont;
959 	else {
960 		if ((mont = BN_MONT_CTX_new()) == NULL)
961 			goto err;
962 		if (!BN_MONT_CTX_set(mont, m, ctx))
963 			goto err;
964 	}
965 
966 	r_is_one = 1; /* except for Montgomery factor */
967 
968 	/* bits-1 >= 0 */
969 
970 	/* The result is accumulated in the product r*w. */
971 	w = a; /* bit 'bits-1' of 'p' is always set */
972 	for (b = bits - 2; b >= 0; b--) {
973 		/* First, square r*w. */
974 		next_w = w * w;
975 		if ((next_w / w) != w) /* overflow */
976 		{
977 			if (r_is_one) {
978 				if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
979 					goto err;
980 				r_is_one = 0;
981 			} else {
982 				if (!BN_MOD_MUL_WORD(r, w, m))
983 					goto err;
984 			}
985 			next_w = 1;
986 		}
987 		w = next_w;
988 		if (!r_is_one) {
989 			if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
990 				goto err;
991 		}
992 
993 		/* Second, multiply r*w by 'a' if exponent bit is set. */
994 		if (BN_is_bit_set(p, b)) {
995 			next_w = w * a;
996 			if ((next_w / a) != w) /* overflow */
997 			{
998 				if (r_is_one) {
999 					if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1000 						goto err;
1001 					r_is_one = 0;
1002 				} else {
1003 					if (!BN_MOD_MUL_WORD(r, w, m))
1004 						goto err;
1005 				}
1006 				next_w = a;
1007 			}
1008 			w = next_w;
1009 		}
1010 	}
1011 
1012 	/* Finally, set r:=r*w. */
1013 	if (w != 1) {
1014 		if (r_is_one) {
1015 			if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1016 				goto err;
1017 			r_is_one = 0;
1018 		} else {
1019 			if (!BN_MOD_MUL_WORD(r, w, m))
1020 				goto err;
1021 		}
1022 	}
1023 
1024 	if (r_is_one) /* can happen only if a == 1*/
1025 	{
1026 		if (!BN_one(rr))
1027 			goto err;
1028 	} else {
1029 		if (!BN_from_montgomery(rr, r, mont, ctx))
1030 			goto err;
1031 	}
1032 	ret = 1;
1033 
1034 err:
1035 	if ((in_mont == NULL) && (mont != NULL))
1036 		BN_MONT_CTX_free(mont);
1037 	BN_CTX_end(ctx);
1038 	bn_check_top(rr);
1039 	return (ret);
1040 }
1041 
1042 
1043 /* The old fallback, simple version :-) */
1044 int
1045 BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
1046     BN_CTX *ctx)
1047 {
1048 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
1049 	int start = 1;
1050 	BIGNUM *d;
1051 	/* Table of variables obtained from 'ctx' */
1052 	BIGNUM *val[TABLE_SIZE];
1053 
1054 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
1055 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1056 		BNerr(BN_F_BN_MOD_EXP_SIMPLE,
1057 		    ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1058 		return -1;
1059 	}
1060 
1061 	bits = BN_num_bits(p);
1062 	if (bits == 0) {
1063 		/* x**0 mod 1 is still zero. */
1064 		if (BN_is_one(m)) {
1065 			ret = 1;
1066 			BN_zero(r);
1067 		} else
1068 			ret = BN_one(r);
1069 		return ret;
1070 	}
1071 
1072 	BN_CTX_start(ctx);
1073 	if ((d = BN_CTX_get(ctx)) == NULL)
1074 		goto err;
1075 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
1076 		goto err;
1077 
1078 	if (!BN_nnmod(val[0],a,m,ctx))
1079 		goto err;		/* 1 */
1080 	if (BN_is_zero(val[0])) {
1081 		BN_zero(r);
1082 		ret = 1;
1083 		goto err;
1084 	}
1085 
1086 	window = BN_window_bits_for_exponent_size(bits);
1087 	if (window > 1) {
1088 		if (!BN_mod_mul(d, val[0], val[0], m, ctx))
1089 			goto err;				/* 2 */
1090 		j = 1 << (window - 1);
1091 		for (i = 1; i < j; i++) {
1092 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
1093 			    !BN_mod_mul(val[i], val[i - 1], d,m, ctx))
1094 				goto err;
1095 		}
1096 	}
1097 
1098 	start = 1;		/* This is used to avoid multiplication etc
1099 				 * when there is only the value '1' in the
1100 				 * buffer. */
1101 	wvalue = 0;		/* The 'value' of the window */
1102 	wstart = bits - 1;	/* The top bit of the window */
1103 	wend = 0;		/* The bottom bit of the window */
1104 
1105 	if (!BN_one(r))
1106 		goto err;
1107 
1108 	for (;;) {
1109 		if (BN_is_bit_set(p, wstart) == 0) {
1110 			if (!start)
1111 				if (!BN_mod_mul(r, r, r, m, ctx))
1112 					goto err;
1113 			if (wstart == 0)
1114 				break;
1115 			wstart--;
1116 			continue;
1117 		}
1118 		/* We now have wstart on a 'set' bit, we now need to work out
1119 		 * how bit a window to do.  To do this we need to scan
1120 		 * forward until the last set bit before the end of the
1121 		 * window */
1122 		j = wstart;
1123 		wvalue = 1;
1124 		wend = 0;
1125 		for (i = 1; i < window; i++) {
1126 			if (wstart - i < 0)
1127 				break;
1128 			if (BN_is_bit_set(p, wstart - i)) {
1129 				wvalue <<= (i - wend);
1130 				wvalue |= 1;
1131 				wend = i;
1132 			}
1133 		}
1134 
1135 		/* wend is the size of the current window */
1136 		j = wend + 1;
1137 		/* add the 'bytes above' */
1138 		if (!start)
1139 			for (i = 0; i < j; i++) {
1140 				if (!BN_mod_mul(r, r, r, m, ctx))
1141 					goto err;
1142 			}
1143 
1144 		/* wvalue will be an odd number < 2^window */
1145 		if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
1146 			goto err;
1147 
1148 		/* move the 'window' down further */
1149 		wstart -= wend + 1;
1150 		wvalue = 0;
1151 		start = 0;
1152 		if (wstart < 0)
1153 			break;
1154 	}
1155 	ret = 1;
1156 
1157 err:
1158 	BN_CTX_end(ctx);
1159 	bn_check_top(r);
1160 	return (ret);
1161 }
1162