1 /*
2 * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include "internal/cryptlib.h"
11 #include "internal/constant_time.h"
12 #include "bn_local.h"
13
14 #include <stdlib.h>
15 #ifdef _WIN32
16 # include <malloc.h>
17 # ifndef alloca
18 # define alloca _alloca
19 # endif
20 #elif defined(__GNUC__)
21 # ifndef __SSP__
22 # ifndef alloca
23 # define alloca(s) __builtin_alloca((s))
24 # endif
25 # else
26 # undef alloca
27 # endif
28 #elif defined(__sun)
29 # include <alloca.h>
30 #endif
31
32 #include "rsaz_exp.h"
33
34 #undef SPARC_T4_MONT
35 #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
36 # include "crypto/sparc_arch.h"
37 # define SPARC_T4_MONT
38 #endif
39
40 /* maximum precomputation table size for *variable* sliding windows */
41 #define TABLE_SIZE 32
42
43 /*
44 * Beyond this limit the constant time code is disabled due to
45 * the possible overflow in the computation of powerbufLen in
46 * BN_mod_exp_mont_consttime.
47 * When this limit is exceeded, the computation will be done using
48 * non-constant time code, but it will take very long.
49 */
50 #define BN_CONSTTIME_SIZE_LIMIT (INT_MAX / BN_BYTES / 256)
51
52 /* this one works - simple but works */
BN_exp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,BN_CTX * ctx)53 int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
54 {
55 int i, bits, ret = 0;
56 BIGNUM *v, *rr;
57
58 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
59 || BN_get_flags(a, BN_FLG_CONSTTIME) != 0) {
60 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
61 ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
62 return 0;
63 }
64
65 BN_CTX_start(ctx);
66 rr = ((r == a) || (r == p)) ? BN_CTX_get(ctx) : r;
67 v = BN_CTX_get(ctx);
68 if (rr == NULL || v == NULL)
69 goto err;
70
71 if (BN_copy(v, a) == NULL)
72 goto err;
73 bits = BN_num_bits(p);
74
75 if (BN_is_odd(p)) {
76 if (BN_copy(rr, a) == NULL)
77 goto err;
78 } else {
79 if (!BN_one(rr))
80 goto err;
81 }
82
83 for (i = 1; i < bits; i++) {
84 if (!BN_sqr(v, v, ctx))
85 goto err;
86 if (BN_is_bit_set(p, i)) {
87 if (!BN_mul(rr, rr, v, ctx))
88 goto err;
89 }
90 }
91 if (r != rr && BN_copy(r, rr) == NULL)
92 goto err;
93
94 ret = 1;
95 err:
96 BN_CTX_end(ctx);
97 bn_check_top(r);
98 return ret;
99 }
100
BN_mod_exp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx)101 int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
102 BN_CTX *ctx)
103 {
104 int ret;
105
106 bn_check_top(a);
107 bn_check_top(p);
108 bn_check_top(m);
109
110 /*-
111 * For even modulus m = 2^k*m_odd, it might make sense to compute
112 * a^p mod m_odd and a^p mod 2^k separately (with Montgomery
113 * exponentiation for the odd part), using appropriate exponent
114 * reductions, and combine the results using the CRT.
115 *
116 * For now, we use Montgomery only if the modulus is odd; otherwise,
117 * exponentiation using the reciprocal-based quick remaindering
118 * algorithm is used.
119 *
120 * (Timing obtained with expspeed.c [computations a^p mod m
121 * where a, p, m are of the same length: 256, 512, 1024, 2048,
122 * 4096, 8192 bits], compared to the running time of the
123 * standard algorithm:
124 *
125 * BN_mod_exp_mont 33 .. 40 % [AMD K6-2, Linux, debug configuration]
126 * 55 .. 77 % [UltraSparc processor, but
127 * debug-solaris-sparcv8-gcc conf.]
128 *
129 * BN_mod_exp_recp 50 .. 70 % [AMD K6-2, Linux, debug configuration]
130 * 62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
131 *
132 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
133 * at 2048 and more bits, but at 512 and 1024 bits, it was
134 * slower even than the standard algorithm!
135 *
136 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
137 * should be obtained when the new Montgomery reduction code
138 * has been integrated into OpenSSL.)
139 */
140
141 #define MONT_MUL_MOD
142 #define MONT_EXP_WORD
143 #define RECP_MUL_MOD
144
145 #ifdef MONT_MUL_MOD
146 if (BN_is_odd(m)) {
147 # ifdef MONT_EXP_WORD
148 if (a->top == 1 && !a->neg
149 && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)
150 && (BN_get_flags(a, BN_FLG_CONSTTIME) == 0)
151 && (BN_get_flags(m, BN_FLG_CONSTTIME) == 0)) {
152 BN_ULONG A = a->d[0];
153 ret = BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
154 } else
155 # endif
156 ret = BN_mod_exp_mont(r, a, p, m, ctx, NULL);
157 } else
158 #endif
159 #ifdef RECP_MUL_MOD
160 {
161 ret = BN_mod_exp_recp(r, a, p, m, ctx);
162 }
163 #else
164 {
165 ret = BN_mod_exp_simple(r, a, p, m, ctx);
166 }
167 #endif
168
169 bn_check_top(r);
170 return ret;
171 }
172
BN_mod_exp_recp(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx)173 int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
174 const BIGNUM *m, BN_CTX *ctx)
175 {
176 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
177 int start = 1;
178 BIGNUM *aa;
179 /* Table of variables obtained from 'ctx' */
180 BIGNUM *val[TABLE_SIZE];
181 BN_RECP_CTX recp;
182
183 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
184 || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
185 || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
186 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
187 ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
188 return 0;
189 }
190
191 bits = BN_num_bits(p);
192 if (bits == 0) {
193 /* x**0 mod 1, or x**0 mod -1 is still zero. */
194 if (BN_abs_is_word(m, 1)) {
195 ret = 1;
196 BN_zero(r);
197 } else {
198 ret = BN_one(r);
199 }
200 return ret;
201 }
202
203 BN_RECP_CTX_init(&recp);
204
205 BN_CTX_start(ctx);
206 aa = BN_CTX_get(ctx);
207 val[0] = BN_CTX_get(ctx);
208 if (val[0] == NULL)
209 goto err;
210
211 if (m->neg) {
212 /* ignore sign of 'm' */
213 if (!BN_copy(aa, m))
214 goto err;
215 aa->neg = 0;
216 if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
217 goto err;
218 } else {
219 if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
220 goto err;
221 }
222
223 if (!BN_nnmod(val[0], a, m, ctx))
224 goto err; /* 1 */
225 if (BN_is_zero(val[0])) {
226 BN_zero(r);
227 ret = 1;
228 goto err;
229 }
230
231 window = BN_window_bits_for_exponent_size(bits);
232 if (window > 1) {
233 if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
234 goto err; /* 2 */
235 j = 1 << (window - 1);
236 for (i = 1; i < j; i++) {
237 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
238 !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx))
239 goto err;
240 }
241 }
242
243 start = 1; /* This is used to avoid multiplication etc
244 * when there is only the value '1' in the
245 * buffer. */
246 wvalue = 0; /* The 'value' of the window */
247 wstart = bits - 1; /* The top bit of the window */
248 wend = 0; /* The bottom bit of the window */
249
250 if (r == p) {
251 BIGNUM *p_dup = BN_CTX_get(ctx);
252
253 if (p_dup == NULL || BN_copy(p_dup, p) == NULL)
254 goto err;
255 p = p_dup;
256 }
257
258 if (!BN_one(r))
259 goto err;
260
261 for (;;) {
262 if (BN_is_bit_set(p, wstart) == 0) {
263 if (!start)
264 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
265 goto err;
266 if (wstart == 0)
267 break;
268 wstart--;
269 continue;
270 }
271 /*
272 * We now have wstart on a 'set' bit, we now need to work out how bit
273 * a window to do. To do this we need to scan forward until the last
274 * set bit before the end of the window
275 */
276 wvalue = 1;
277 wend = 0;
278 for (i = 1; i < window; i++) {
279 if (wstart - i < 0)
280 break;
281 if (BN_is_bit_set(p, wstart - i)) {
282 wvalue <<= (i - wend);
283 wvalue |= 1;
284 wend = i;
285 }
286 }
287
288 /* wend is the size of the current window */
289 j = wend + 1;
290 /* add the 'bytes above' */
291 if (!start)
292 for (i = 0; i < j; i++) {
293 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
294 goto err;
295 }
296
297 /* wvalue will be an odd number < 2^window */
298 if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx))
299 goto err;
300
301 /* move the 'window' down further */
302 wstart -= wend + 1;
303 wvalue = 0;
304 start = 0;
305 if (wstart < 0)
306 break;
307 }
308 ret = 1;
309 err:
310 BN_CTX_end(ctx);
311 BN_RECP_CTX_free(&recp);
312 bn_check_top(r);
313 return ret;
314 }
315
BN_mod_exp_mont(BIGNUM * rr,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,BN_MONT_CTX * in_mont)316 int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
317 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
318 {
319 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
320 int start = 1;
321 BIGNUM *d, *r;
322 const BIGNUM *aa;
323 /* Table of variables obtained from 'ctx' */
324 BIGNUM *val[TABLE_SIZE];
325 BN_MONT_CTX *mont = NULL;
326
327 bn_check_top(a);
328 bn_check_top(p);
329 bn_check_top(m);
330
331 if (!BN_is_odd(m)) {
332 ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
333 return 0;
334 }
335
336 if (m->top <= BN_CONSTTIME_SIZE_LIMIT
337 && (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
338 || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
339 || BN_get_flags(m, BN_FLG_CONSTTIME) != 0)) {
340 return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
341 }
342
343 bits = BN_num_bits(p);
344 if (bits == 0) {
345 /* x**0 mod 1, or x**0 mod -1 is still zero. */
346 if (BN_abs_is_word(m, 1)) {
347 ret = 1;
348 BN_zero(rr);
349 } else {
350 ret = BN_one(rr);
351 }
352 return ret;
353 }
354
355 BN_CTX_start(ctx);
356 d = BN_CTX_get(ctx);
357 r = BN_CTX_get(ctx);
358 val[0] = BN_CTX_get(ctx);
359 if (val[0] == NULL)
360 goto err;
361
362 /*
363 * If this is not done, things will break in the montgomery part
364 */
365
366 if (in_mont != NULL)
367 mont = in_mont;
368 else {
369 if ((mont = BN_MONT_CTX_new()) == NULL)
370 goto err;
371 if (!BN_MONT_CTX_set(mont, m, ctx))
372 goto err;
373 }
374
375 if (a->neg || BN_ucmp(a, m) >= 0) {
376 if (!BN_nnmod(val[0], a, m, ctx))
377 goto err;
378 aa = val[0];
379 } else
380 aa = a;
381 if (!bn_to_mont_fixed_top(val[0], aa, mont, ctx))
382 goto err; /* 1 */
383
384 window = BN_window_bits_for_exponent_size(bits);
385 if (window > 1) {
386 if (!bn_mul_mont_fixed_top(d, val[0], val[0], mont, ctx))
387 goto err; /* 2 */
388 j = 1 << (window - 1);
389 for (i = 1; i < j; i++) {
390 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
391 !bn_mul_mont_fixed_top(val[i], val[i - 1], d, mont, ctx))
392 goto err;
393 }
394 }
395
396 start = 1; /* This is used to avoid multiplication etc
397 * when there is only the value '1' in the
398 * buffer. */
399 wvalue = 0; /* The 'value' of the window */
400 wstart = bits - 1; /* The top bit of the window */
401 wend = 0; /* The bottom bit of the window */
402
403 #if 1 /* by Shay Gueron's suggestion */
404 j = m->top; /* borrow j */
405 if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
406 if (bn_wexpand(r, j) == NULL)
407 goto err;
408 /* 2^(top*BN_BITS2) - m */
409 r->d[0] = (0 - m->d[0]) & BN_MASK2;
410 for (i = 1; i < j; i++)
411 r->d[i] = (~m->d[i]) & BN_MASK2;
412 r->top = j;
413 r->flags |= BN_FLG_FIXED_TOP;
414 } else
415 #endif
416 if (!bn_to_mont_fixed_top(r, BN_value_one(), mont, ctx))
417 goto err;
418 for (;;) {
419 if (BN_is_bit_set(p, wstart) == 0) {
420 if (!start) {
421 if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))
422 goto err;
423 }
424 if (wstart == 0)
425 break;
426 wstart--;
427 continue;
428 }
429 /*
430 * We now have wstart on a 'set' bit, we now need to work out how bit
431 * a window to do. To do this we need to scan forward until the last
432 * set bit before the end of the window
433 */
434 wvalue = 1;
435 wend = 0;
436 for (i = 1; i < window; i++) {
437 if (wstart - i < 0)
438 break;
439 if (BN_is_bit_set(p, wstart - i)) {
440 wvalue <<= (i - wend);
441 wvalue |= 1;
442 wend = i;
443 }
444 }
445
446 /* wend is the size of the current window */
447 j = wend + 1;
448 /* add the 'bytes above' */
449 if (!start)
450 for (i = 0; i < j; i++) {
451 if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))
452 goto err;
453 }
454
455 /* wvalue will be an odd number < 2^window */
456 if (!bn_mul_mont_fixed_top(r, r, val[wvalue >> 1], mont, ctx))
457 goto err;
458
459 /* move the 'window' down further */
460 wstart -= wend + 1;
461 wvalue = 0;
462 start = 0;
463 if (wstart < 0)
464 break;
465 }
466 /*
467 * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery
468 * removes padding [if any] and makes return value suitable for public
469 * API consumer.
470 */
471 #if defined(SPARC_T4_MONT)
472 if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
473 j = mont->N.top; /* borrow j */
474 val[0]->d[0] = 1; /* borrow val[0] */
475 for (i = 1; i < j; i++)
476 val[0]->d[i] = 0;
477 val[0]->top = j;
478 if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
479 goto err;
480 } else
481 #endif
482 if (!BN_from_montgomery(rr, r, mont, ctx))
483 goto err;
484 ret = 1;
485 err:
486 if (in_mont == NULL)
487 BN_MONT_CTX_free(mont);
488 BN_CTX_end(ctx);
489 bn_check_top(rr);
490 return ret;
491 }
492
bn_get_bits(const BIGNUM * a,int bitpos)493 static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
494 {
495 BN_ULONG ret = 0;
496 int wordpos;
497
498 wordpos = bitpos / BN_BITS2;
499 bitpos %= BN_BITS2;
500 if (wordpos >= 0 && wordpos < a->top) {
501 ret = a->d[wordpos] & BN_MASK2;
502 if (bitpos) {
503 ret >>= bitpos;
504 if (++wordpos < a->top)
505 ret |= a->d[wordpos] << (BN_BITS2 - bitpos);
506 }
507 }
508
509 return ret & BN_MASK2;
510 }
511
512 /*
513 * BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
514 * layout so that accessing any of these table values shows the same access
515 * pattern as far as cache lines are concerned. The following functions are
516 * used to transfer a BIGNUM from/to that table.
517 */
518
MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM * b,int top,unsigned char * buf,int idx,int window)519 static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
520 unsigned char *buf, int idx,
521 int window)
522 {
523 int i, j;
524 int width = 1 << window;
525 BN_ULONG *table = (BN_ULONG *)buf;
526
527 if (top > b->top)
528 top = b->top; /* this works because 'buf' is explicitly
529 * zeroed */
530 for (i = 0, j = idx; i < top; i++, j += width) {
531 table[j] = b->d[i];
532 }
533
534 return 1;
535 }
536
MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM * b,int top,unsigned char * buf,int idx,int window)537 static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
538 unsigned char *buf, int idx,
539 int window)
540 {
541 int i, j;
542 int width = 1 << window;
543 /*
544 * We declare table 'volatile' in order to discourage compiler
545 * from reordering loads from the table. Concern is that if
546 * reordered in specific manner loads might give away the
547 * information we are trying to conceal. Some would argue that
548 * compiler can reorder them anyway, but it can as well be
549 * argued that doing so would be violation of standard...
550 */
551 volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
552
553 if (bn_wexpand(b, top) == NULL)
554 return 0;
555
556 if (window <= 3) {
557 for (i = 0; i < top; i++, table += width) {
558 BN_ULONG acc = 0;
559
560 for (j = 0; j < width; j++) {
561 acc |= table[j] &
562 ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
563 }
564
565 b->d[i] = acc;
566 }
567 } else {
568 int xstride = 1 << (window - 2);
569 BN_ULONG y0, y1, y2, y3;
570
571 i = idx >> (window - 2); /* equivalent of idx / xstride */
572 idx &= xstride - 1; /* equivalent of idx % xstride */
573
574 y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
575 y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
576 y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
577 y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
578
579 for (i = 0; i < top; i++, table += width) {
580 BN_ULONG acc = 0;
581
582 for (j = 0; j < xstride; j++) {
583 acc |= ( (table[j + 0 * xstride] & y0) |
584 (table[j + 1 * xstride] & y1) |
585 (table[j + 2 * xstride] & y2) |
586 (table[j + 3 * xstride] & y3) )
587 & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
588 }
589
590 b->d[i] = acc;
591 }
592 }
593
594 b->top = top;
595 b->flags |= BN_FLG_FIXED_TOP;
596 return 1;
597 }
598
599 /*
600 * Given a pointer value, compute the next address that is a cache line
601 * multiple.
602 */
603 #define MOD_EXP_CTIME_ALIGN(x_) \
604 ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
605
606 /*
607 * This variant of BN_mod_exp_mont() uses fixed windows and the special
608 * precomputation memory layout to limit data-dependency to a minimum to
609 * protect secret exponents (cf. the hyper-threading timing attacks pointed
610 * out by Colin Percival,
611 * http://www.daemonology.net/hyperthreading-considered-harmful/)
612 */
BN_mod_exp_mont_consttime(BIGNUM * rr,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,BN_MONT_CTX * in_mont)613 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
614 const BIGNUM *m, BN_CTX *ctx,
615 BN_MONT_CTX *in_mont)
616 {
617 int i, bits, ret = 0, window, wvalue, wmask, window0;
618 int top;
619 BN_MONT_CTX *mont = NULL;
620
621 int numPowers;
622 unsigned char *powerbufFree = NULL;
623 int powerbufLen = 0;
624 unsigned char *powerbuf = NULL;
625 BIGNUM tmp, am;
626 #if defined(SPARC_T4_MONT)
627 unsigned int t4 = 0;
628 #endif
629
630 bn_check_top(a);
631 bn_check_top(p);
632 bn_check_top(m);
633
634 if (!BN_is_odd(m)) {
635 ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
636 return 0;
637 }
638
639 top = m->top;
640
641 if (top > BN_CONSTTIME_SIZE_LIMIT) {
642 /* Prevent overflowing the powerbufLen computation below */
643 return BN_mod_exp_mont(rr, a, p, m, ctx, in_mont);
644 }
645
646 /*
647 * Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak
648 * whether the top bits are zero.
649 */
650 bits = p->top * BN_BITS2;
651 if (bits == 0) {
652 /* x**0 mod 1, or x**0 mod -1 is still zero. */
653 if (BN_abs_is_word(m, 1)) {
654 ret = 1;
655 BN_zero(rr);
656 } else {
657 ret = BN_one(rr);
658 }
659 return ret;
660 }
661
662 BN_CTX_start(ctx);
663
664 /*
665 * Allocate a montgomery context if it was not supplied by the caller. If
666 * this is not done, things will break in the montgomery part.
667 */
668 if (in_mont != NULL)
669 mont = in_mont;
670 else {
671 if ((mont = BN_MONT_CTX_new()) == NULL)
672 goto err;
673 if (!BN_MONT_CTX_set(mont, m, ctx))
674 goto err;
675 }
676
677 if (a->neg || BN_ucmp(a, m) >= 0) {
678 BIGNUM *reduced = BN_CTX_get(ctx);
679 if (reduced == NULL
680 || !BN_nnmod(reduced, a, m, ctx)) {
681 goto err;
682 }
683 a = reduced;
684 }
685
686 #ifdef RSAZ_ENABLED
687 /*
688 * If the size of the operands allow it, perform the optimized
689 * RSAZ exponentiation. For further information see
690 * crypto/bn/rsaz_exp.c and accompanying assembly modules.
691 */
692 if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
693 && rsaz_avx2_eligible()) {
694 if (NULL == bn_wexpand(rr, 16))
695 goto err;
696 RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
697 mont->n0[0]);
698 rr->top = 16;
699 rr->neg = 0;
700 bn_correct_top(rr);
701 ret = 1;
702 goto err;
703 } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
704 if (NULL == bn_wexpand(rr, 8))
705 goto err;
706 RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
707 rr->top = 8;
708 rr->neg = 0;
709 bn_correct_top(rr);
710 ret = 1;
711 goto err;
712 }
713 #endif
714
715 /* Get the window size to use with size of p. */
716 window = BN_window_bits_for_ctime_exponent_size(bits);
717 #if defined(SPARC_T4_MONT)
718 if (window >= 5 && (top & 15) == 0 && top <= 64 &&
719 (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
720 (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
721 window = 5;
722 else
723 #endif
724 #if defined(OPENSSL_BN_ASM_MONT5)
725 if (window >= 5 && top <= BN_SOFT_LIMIT) {
726 window = 5; /* ~5% improvement for RSA2048 sign, and even
727 * for RSA4096 */
728 /* reserve space for mont->N.d[] copy */
729 powerbufLen += top * sizeof(mont->N.d[0]);
730 }
731 #endif
732 (void)0;
733
734 /*
735 * Allocate a buffer large enough to hold all of the pre-computed powers
736 * of am, am itself and tmp.
737 */
738 numPowers = 1 << window;
739 powerbufLen += sizeof(m->d[0]) * (top * numPowers +
740 ((2 * top) >
741 numPowers ? (2 * top) : numPowers));
742 #ifdef alloca
743 if (powerbufLen < 3072)
744 powerbufFree =
745 alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
746 else
747 #endif
748 if ((powerbufFree =
749 OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH))
750 == NULL)
751 goto err;
752
753 powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
754 memset(powerbuf, 0, powerbufLen);
755
756 #ifdef alloca
757 if (powerbufLen < 3072)
758 powerbufFree = NULL;
759 #endif
760
761 /* lay down tmp and am right after powers table */
762 tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
763 am.d = tmp.d + top;
764 tmp.top = am.top = 0;
765 tmp.dmax = am.dmax = top;
766 tmp.neg = am.neg = 0;
767 tmp.flags = am.flags = BN_FLG_STATIC_DATA;
768
769 /* prepare a^0 in Montgomery domain */
770 #if 1 /* by Shay Gueron's suggestion */
771 if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
772 /* 2^(top*BN_BITS2) - m */
773 tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
774 for (i = 1; i < top; i++)
775 tmp.d[i] = (~m->d[i]) & BN_MASK2;
776 tmp.top = top;
777 } else
778 #endif
779 if (!bn_to_mont_fixed_top(&tmp, BN_value_one(), mont, ctx))
780 goto err;
781
782 /* prepare a^1 in Montgomery domain */
783 if (!bn_to_mont_fixed_top(&am, a, mont, ctx))
784 goto err;
785
786 if (top > BN_SOFT_LIMIT)
787 goto fallback;
788
789 #if defined(SPARC_T4_MONT)
790 if (t4) {
791 typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
792 const BN_ULONG *n0, const void *table,
793 int power, int bits);
794 int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
795 const BN_ULONG *n0, const void *table,
796 int power, int bits);
797 int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
798 const BN_ULONG *n0, const void *table,
799 int power, int bits);
800 int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
801 const BN_ULONG *n0, const void *table,
802 int power, int bits);
803 int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
804 const BN_ULONG *n0, const void *table,
805 int power, int bits);
806 static const bn_pwr5_mont_f pwr5_funcs[4] = {
807 bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
808 bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
809 };
810 bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];
811
812 typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
813 const void *bp, const BN_ULONG *np,
814 const BN_ULONG *n0);
815 int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
816 const BN_ULONG *np, const BN_ULONG *n0);
817 int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
818 const void *bp, const BN_ULONG *np,
819 const BN_ULONG *n0);
820 int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
821 const void *bp, const BN_ULONG *np,
822 const BN_ULONG *n0);
823 int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
824 const void *bp, const BN_ULONG *np,
825 const BN_ULONG *n0);
826 static const bn_mul_mont_f mul_funcs[4] = {
827 bn_mul_mont_t4_8, bn_mul_mont_t4_16,
828 bn_mul_mont_t4_24, bn_mul_mont_t4_32
829 };
830 bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];
831
832 void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
833 const void *bp, const BN_ULONG *np,
834 const BN_ULONG *n0, int num);
835 void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
836 const void *bp, const BN_ULONG *np,
837 const BN_ULONG *n0, int num);
838 void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
839 const void *table, const BN_ULONG *np,
840 const BN_ULONG *n0, int num, int power);
841 void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
842 void *table, size_t power);
843 void bn_gather5_t4(BN_ULONG *out, size_t num,
844 void *table, size_t power);
845 void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);
846
847 BN_ULONG *np = mont->N.d, *n0 = mont->n0;
848 int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less
849 * than 32 */
850
851 /*
852 * BN_to_montgomery can contaminate words above .top [in
853 * BN_DEBUG build...
854 */
855 for (i = am.top; i < top; i++)
856 am.d[i] = 0;
857 for (i = tmp.top; i < top; i++)
858 tmp.d[i] = 0;
859
860 bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
861 bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
862 if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
863 !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
864 bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
865 bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);
866
867 for (i = 3; i < 32; i++) {
868 /* Calculate a^i = a^(i-1) * a */
869 if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
870 !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
871 bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
872 bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
873 }
874
875 /* switch to 64-bit domain */
876 np = alloca(top * sizeof(BN_ULONG));
877 top /= 2;
878 bn_flip_t4(np, mont->N.d, top);
879
880 /*
881 * The exponent may not have a whole number of fixed-size windows.
882 * To simplify the main loop, the initial window has between 1 and
883 * full-window-size bits such that what remains is always a whole
884 * number of windows
885 */
886 window0 = (bits - 1) % 5 + 1;
887 wmask = (1 << window0) - 1;
888 bits -= window0;
889 wvalue = bn_get_bits(p, bits) & wmask;
890 bn_gather5_t4(tmp.d, top, powerbuf, wvalue);
891
892 /*
893 * Scan the exponent one window at a time starting from the most
894 * significant bits.
895 */
896 while (bits > 0) {
897 if (bits < stride)
898 stride = bits;
899 bits -= stride;
900 wvalue = bn_get_bits(p, bits);
901
902 if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
903 continue;
904 /* retry once and fall back */
905 if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
906 continue;
907
908 bits += stride - 5;
909 wvalue >>= stride - 5;
910 wvalue &= 31;
911 bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
912 bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
913 bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
914 bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
915 bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
916 bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
917 wvalue);
918 }
919
920 bn_flip_t4(tmp.d, tmp.d, top);
921 top *= 2;
922 /* back to 32-bit domain */
923 tmp.top = top;
924 bn_correct_top(&tmp);
925 OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
926 } else
927 #endif
928 #if defined(OPENSSL_BN_ASM_MONT5)
929 if (window == 5 && top > 1) {
930 /*
931 * This optimization uses ideas from https://eprint.iacr.org/2011/239,
932 * specifically optimization of cache-timing attack countermeasures,
933 * pre-computation optimization, and Almost Montgomery Multiplication.
934 *
935 * The paper discusses a 4-bit window to optimize 512-bit modular
936 * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer
937 * important.
938 *
939 * |bn_mul_mont_gather5| and |bn_power5| implement the "almost"
940 * reduction variant, so the values here may not be fully reduced.
941 * They are bounded by R (i.e. they fit in |top| words), not |m|.
942 * Additionally, we pass these "almost" reduced inputs into
943 * |bn_mul_mont|, which implements the normal reduction variant.
944 * Given those inputs, |bn_mul_mont| may not give reduced
945 * output, but it will still produce "almost" reduced output.
946 */
947 void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
948 const void *table, const BN_ULONG *np,
949 const BN_ULONG *n0, int num, int power);
950 void bn_scatter5(const BN_ULONG *inp, size_t num,
951 void *table, size_t power);
952 void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
953 void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
954 const void *table, const BN_ULONG *np,
955 const BN_ULONG *n0, int num, int power);
956 int bn_get_bits5(const BN_ULONG *ap, int off);
957
958 BN_ULONG *n0 = mont->n0, *np;
959
960 /*
961 * BN_to_montgomery can contaminate words above .top [in
962 * BN_DEBUG build...
963 */
964 for (i = am.top; i < top; i++)
965 am.d[i] = 0;
966 for (i = tmp.top; i < top; i++)
967 tmp.d[i] = 0;
968
969 /*
970 * copy mont->N.d[] to improve cache locality
971 */
972 for (np = am.d + top, i = 0; i < top; i++)
973 np[i] = mont->N.d[i];
974
975 bn_scatter5(tmp.d, top, powerbuf, 0);
976 bn_scatter5(am.d, am.top, powerbuf, 1);
977 bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
978 bn_scatter5(tmp.d, top, powerbuf, 2);
979
980 # if 0
981 for (i = 3; i < 32; i++) {
982 /* Calculate a^i = a^(i-1) * a */
983 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
984 bn_scatter5(tmp.d, top, powerbuf, i);
985 }
986 # else
987 /* same as above, but uses squaring for 1/2 of operations */
988 for (i = 4; i < 32; i *= 2) {
989 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
990 bn_scatter5(tmp.d, top, powerbuf, i);
991 }
992 for (i = 3; i < 8; i += 2) {
993 int j;
994 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
995 bn_scatter5(tmp.d, top, powerbuf, i);
996 for (j = 2 * i; j < 32; j *= 2) {
997 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
998 bn_scatter5(tmp.d, top, powerbuf, j);
999 }
1000 }
1001 for (; i < 16; i += 2) {
1002 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
1003 bn_scatter5(tmp.d, top, powerbuf, i);
1004 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1005 bn_scatter5(tmp.d, top, powerbuf, 2 * i);
1006 }
1007 for (; i < 32; i += 2) {
1008 bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
1009 bn_scatter5(tmp.d, top, powerbuf, i);
1010 }
1011 # endif
1012 /*
1013 * The exponent may not have a whole number of fixed-size windows.
1014 * To simplify the main loop, the initial window has between 1 and
1015 * full-window-size bits such that what remains is always a whole
1016 * number of windows
1017 */
1018 window0 = (bits - 1) % 5 + 1;
1019 wmask = (1 << window0) - 1;
1020 bits -= window0;
1021 wvalue = bn_get_bits(p, bits) & wmask;
1022 bn_gather5(tmp.d, top, powerbuf, wvalue);
1023
1024 /*
1025 * Scan the exponent one window at a time starting from the most
1026 * significant bits.
1027 */
1028 if (top & 7) {
1029 while (bits > 0) {
1030 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1031 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1032 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1033 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1034 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
1035 bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
1036 bn_get_bits5(p->d, bits -= 5));
1037 }
1038 } else {
1039 while (bits > 0) {
1040 bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top,
1041 bn_get_bits5(p->d, bits -= 5));
1042 }
1043 }
1044
1045 tmp.top = top;
1046 /*
1047 * The result is now in |tmp| in Montgomery form, but it may not be
1048 * fully reduced. This is within bounds for |BN_from_montgomery|
1049 * (tmp < R <= m*R) so it will, when converting from Montgomery form,
1050 * produce a fully reduced result.
1051 *
1052 * This differs from Figure 2 of the paper, which uses AMM(h, 1) to
1053 * convert from Montgomery form with unreduced output, followed by an
1054 * extra reduction step. In the paper's terminology, we replace
1055 * steps 9 and 10 with MM(h, 1).
1056 */
1057 } else
1058 #endif
1059 {
1060 fallback:
1061 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
1062 goto err;
1063 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
1064 goto err;
1065
1066 /*
1067 * If the window size is greater than 1, then calculate
1068 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) (even
1069 * powers could instead be computed as (a^(i/2))^2 to use the slight
1070 * performance advantage of sqr over mul).
1071 */
1072 if (window > 1) {
1073 if (!bn_mul_mont_fixed_top(&tmp, &am, &am, mont, ctx))
1074 goto err;
1075 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
1076 window))
1077 goto err;
1078 for (i = 3; i < numPowers; i++) {
1079 /* Calculate a^i = a^(i-1) * a */
1080 if (!bn_mul_mont_fixed_top(&tmp, &am, &tmp, mont, ctx))
1081 goto err;
1082 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
1083 window))
1084 goto err;
1085 }
1086 }
1087
1088 /*
1089 * The exponent may not have a whole number of fixed-size windows.
1090 * To simplify the main loop, the initial window has between 1 and
1091 * full-window-size bits such that what remains is always a whole
1092 * number of windows
1093 */
1094 window0 = (bits - 1) % window + 1;
1095 wmask = (1 << window0) - 1;
1096 bits -= window0;
1097 wvalue = bn_get_bits(p, bits) & wmask;
1098 if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
1099 window))
1100 goto err;
1101
1102 wmask = (1 << window) - 1;
1103 /*
1104 * Scan the exponent one window at a time starting from the most
1105 * significant bits.
1106 */
1107 while (bits > 0) {
1108
1109 /* Square the result window-size times */
1110 for (i = 0; i < window; i++)
1111 if (!bn_mul_mont_fixed_top(&tmp, &tmp, &tmp, mont, ctx))
1112 goto err;
1113
1114 /*
1115 * Get a window's worth of bits from the exponent
1116 * This avoids calling BN_is_bit_set for each bit, which
1117 * is not only slower but also makes each bit vulnerable to
1118 * EM (and likely other) side-channel attacks like One&Done
1119 * (for details see "One&Done: A Single-Decryption EM-Based
1120 * Attack on OpenSSL's Constant-Time Blinded RSA" by M. Alam,
1121 * H. Khan, M. Dey, N. Sinha, R. Callan, A. Zajic, and
1122 * M. Prvulovic, in USENIX Security'18)
1123 */
1124 bits -= window;
1125 wvalue = bn_get_bits(p, bits) & wmask;
1126 /*
1127 * Fetch the appropriate pre-computed value from the pre-buf
1128 */
1129 if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
1130 window))
1131 goto err;
1132
1133 /* Multiply the result into the intermediate result */
1134 if (!bn_mul_mont_fixed_top(&tmp, &tmp, &am, mont, ctx))
1135 goto err;
1136 }
1137 }
1138
1139 /*
1140 * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery
1141 * removes padding [if any] and makes return value suitable for public
1142 * API consumer.
1143 */
1144 #if defined(SPARC_T4_MONT)
1145 if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
1146 am.d[0] = 1; /* borrow am */
1147 for (i = 1; i < top; i++)
1148 am.d[i] = 0;
1149 if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
1150 goto err;
1151 } else
1152 #endif
1153 if (!BN_from_montgomery(rr, &tmp, mont, ctx))
1154 goto err;
1155 ret = 1;
1156 err:
1157 if (in_mont == NULL)
1158 BN_MONT_CTX_free(mont);
1159 if (powerbuf != NULL) {
1160 OPENSSL_cleanse(powerbuf, powerbufLen);
1161 OPENSSL_free(powerbufFree);
1162 }
1163 BN_CTX_end(ctx);
1164 return ret;
1165 }
1166
BN_mod_exp_mont_word(BIGNUM * rr,BN_ULONG a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx,BN_MONT_CTX * in_mont)1167 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
1168 const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
1169 {
1170 BN_MONT_CTX *mont = NULL;
1171 int b, bits, ret = 0;
1172 int r_is_one;
1173 BN_ULONG w, next_w;
1174 BIGNUM *r, *t;
1175 BIGNUM *swap_tmp;
1176 #define BN_MOD_MUL_WORD(r, w, m) \
1177 (BN_mul_word(r, (w)) && \
1178 (/* BN_ucmp(r, (m)) < 0 ? 1 :*/ \
1179 (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
1180 /*
1181 * BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is
1182 * probably more overhead than always using BN_mod (which uses BN_copy if
1183 * a similar test returns true).
1184 */
1185 /*
1186 * We can use BN_mod and do not need BN_nnmod because our accumulator is
1187 * never negative (the result of BN_mod does not depend on the sign of
1188 * the modulus).
1189 */
1190 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
1191 (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
1192
1193 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
1194 || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
1195 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1196 ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1197 return 0;
1198 }
1199
1200 bn_check_top(p);
1201 bn_check_top(m);
1202
1203 if (!BN_is_odd(m)) {
1204 ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
1205 return 0;
1206 }
1207 if (m->top == 1)
1208 a %= m->d[0]; /* make sure that 'a' is reduced */
1209
1210 bits = BN_num_bits(p);
1211 if (bits == 0) {
1212 /* x**0 mod 1, or x**0 mod -1 is still zero. */
1213 if (BN_abs_is_word(m, 1)) {
1214 ret = 1;
1215 BN_zero(rr);
1216 } else {
1217 ret = BN_one(rr);
1218 }
1219 return ret;
1220 }
1221 if (a == 0) {
1222 BN_zero(rr);
1223 ret = 1;
1224 return ret;
1225 }
1226
1227 BN_CTX_start(ctx);
1228 r = BN_CTX_get(ctx);
1229 t = BN_CTX_get(ctx);
1230 if (t == NULL)
1231 goto err;
1232
1233 if (in_mont != NULL)
1234 mont = in_mont;
1235 else {
1236 if ((mont = BN_MONT_CTX_new()) == NULL)
1237 goto err;
1238 if (!BN_MONT_CTX_set(mont, m, ctx))
1239 goto err;
1240 }
1241
1242 r_is_one = 1; /* except for Montgomery factor */
1243
1244 /* bits-1 >= 0 */
1245
1246 /* The result is accumulated in the product r*w. */
1247 w = a; /* bit 'bits-1' of 'p' is always set */
1248 for (b = bits - 2; b >= 0; b--) {
1249 /* First, square r*w. */
1250 next_w = w * w;
1251 if ((next_w / w) != w) { /* overflow */
1252 if (r_is_one) {
1253 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1254 goto err;
1255 r_is_one = 0;
1256 } else {
1257 if (!BN_MOD_MUL_WORD(r, w, m))
1258 goto err;
1259 }
1260 next_w = 1;
1261 }
1262 w = next_w;
1263 if (!r_is_one) {
1264 if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
1265 goto err;
1266 }
1267
1268 /* Second, multiply r*w by 'a' if exponent bit is set. */
1269 if (BN_is_bit_set(p, b)) {
1270 next_w = w * a;
1271 if ((next_w / a) != w) { /* overflow */
1272 if (r_is_one) {
1273 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1274 goto err;
1275 r_is_one = 0;
1276 } else {
1277 if (!BN_MOD_MUL_WORD(r, w, m))
1278 goto err;
1279 }
1280 next_w = a;
1281 }
1282 w = next_w;
1283 }
1284 }
1285
1286 /* Finally, set r:=r*w. */
1287 if (w != 1) {
1288 if (r_is_one) {
1289 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1290 goto err;
1291 r_is_one = 0;
1292 } else {
1293 if (!BN_MOD_MUL_WORD(r, w, m))
1294 goto err;
1295 }
1296 }
1297
1298 if (r_is_one) { /* can happen only if a == 1 */
1299 if (!BN_one(rr))
1300 goto err;
1301 } else {
1302 if (!BN_from_montgomery(rr, r, mont, ctx))
1303 goto err;
1304 }
1305 ret = 1;
1306 err:
1307 if (in_mont == NULL)
1308 BN_MONT_CTX_free(mont);
1309 BN_CTX_end(ctx);
1310 bn_check_top(rr);
1311 return ret;
1312 }
1313
1314 /* The old fallback, simple version :-) */
BN_mod_exp_simple(BIGNUM * r,const BIGNUM * a,const BIGNUM * p,const BIGNUM * m,BN_CTX * ctx)1315 int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
1316 const BIGNUM *m, BN_CTX *ctx)
1317 {
1318 int i, j, bits, ret = 0, wstart, wend, window, wvalue;
1319 int start = 1;
1320 BIGNUM *d;
1321 /* Table of variables obtained from 'ctx' */
1322 BIGNUM *val[TABLE_SIZE];
1323
1324 if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
1325 || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
1326 || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
1327 /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1328 ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1329 return 0;
1330 }
1331
1332 if (r == m) {
1333 ERR_raise(ERR_LIB_BN, ERR_R_PASSED_INVALID_ARGUMENT);
1334 return 0;
1335 }
1336
1337 bits = BN_num_bits(p);
1338 if (bits == 0) {
1339 /* x**0 mod 1, or x**0 mod -1 is still zero. */
1340 if (BN_abs_is_word(m, 1)) {
1341 ret = 1;
1342 BN_zero(r);
1343 } else {
1344 ret = BN_one(r);
1345 }
1346 return ret;
1347 }
1348
1349 BN_CTX_start(ctx);
1350 d = BN_CTX_get(ctx);
1351 val[0] = BN_CTX_get(ctx);
1352 if (val[0] == NULL)
1353 goto err;
1354
1355 if (!BN_nnmod(val[0], a, m, ctx))
1356 goto err; /* 1 */
1357 if (BN_is_zero(val[0])) {
1358 BN_zero(r);
1359 ret = 1;
1360 goto err;
1361 }
1362
1363 window = BN_window_bits_for_exponent_size(bits);
1364 if (window > 1) {
1365 if (!BN_mod_mul(d, val[0], val[0], m, ctx))
1366 goto err; /* 2 */
1367 j = 1 << (window - 1);
1368 for (i = 1; i < j; i++) {
1369 if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
1370 !BN_mod_mul(val[i], val[i - 1], d, m, ctx))
1371 goto err;
1372 }
1373 }
1374
1375 start = 1; /* This is used to avoid multiplication etc
1376 * when there is only the value '1' in the
1377 * buffer. */
1378 wvalue = 0; /* The 'value' of the window */
1379 wstart = bits - 1; /* The top bit of the window */
1380 wend = 0; /* The bottom bit of the window */
1381
1382 if (r == p) {
1383 BIGNUM *p_dup = BN_CTX_get(ctx);
1384
1385 if (p_dup == NULL || BN_copy(p_dup, p) == NULL)
1386 goto err;
1387 p = p_dup;
1388 }
1389
1390 if (!BN_one(r))
1391 goto err;
1392
1393 for (;;) {
1394 if (BN_is_bit_set(p, wstart) == 0) {
1395 if (!start)
1396 if (!BN_mod_mul(r, r, r, m, ctx))
1397 goto err;
1398 if (wstart == 0)
1399 break;
1400 wstart--;
1401 continue;
1402 }
1403 /*
1404 * We now have wstart on a 'set' bit, we now need to work out how bit
1405 * a window to do. To do this we need to scan forward until the last
1406 * set bit before the end of the window
1407 */
1408 wvalue = 1;
1409 wend = 0;
1410 for (i = 1; i < window; i++) {
1411 if (wstart - i < 0)
1412 break;
1413 if (BN_is_bit_set(p, wstart - i)) {
1414 wvalue <<= (i - wend);
1415 wvalue |= 1;
1416 wend = i;
1417 }
1418 }
1419
1420 /* wend is the size of the current window */
1421 j = wend + 1;
1422 /* add the 'bytes above' */
1423 if (!start)
1424 for (i = 0; i < j; i++) {
1425 if (!BN_mod_mul(r, r, r, m, ctx))
1426 goto err;
1427 }
1428
1429 /* wvalue will be an odd number < 2^window */
1430 if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
1431 goto err;
1432
1433 /* move the 'window' down further */
1434 wstart -= wend + 1;
1435 wvalue = 0;
1436 start = 0;
1437 if (wstart < 0)
1438 break;
1439 }
1440 ret = 1;
1441 err:
1442 BN_CTX_end(ctx);
1443 bn_check_top(r);
1444 return ret;
1445 }
1446
1447 /*
1448 * This is a variant of modular exponentiation optimization that does
1449 * parallel 2-primes exponentiation using 256-bit (AVX512VL) AVX512_IFMA ISA
1450 * in 52-bit binary redundant representation.
1451 * If such instructions are not available, or input data size is not supported,
1452 * it falls back to two BN_mod_exp_mont_consttime() calls.
1453 */
BN_mod_exp_mont_consttime_x2(BIGNUM * rr1,const BIGNUM * a1,const BIGNUM * p1,const BIGNUM * m1,BN_MONT_CTX * in_mont1,BIGNUM * rr2,const BIGNUM * a2,const BIGNUM * p2,const BIGNUM * m2,BN_MONT_CTX * in_mont2,BN_CTX * ctx)1454 int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1,
1455 const BIGNUM *m1, BN_MONT_CTX *in_mont1,
1456 BIGNUM *rr2, const BIGNUM *a2, const BIGNUM *p2,
1457 const BIGNUM *m2, BN_MONT_CTX *in_mont2,
1458 BN_CTX *ctx)
1459 {
1460 int ret = 0;
1461
1462 #ifdef RSAZ_ENABLED
1463 BN_MONT_CTX *mont1 = NULL;
1464 BN_MONT_CTX *mont2 = NULL;
1465
1466 if (ossl_rsaz_avx512ifma_eligible() &&
1467 ((a1->top == 16) && (p1->top == 16) && (BN_num_bits(m1) == 1024) &&
1468 (a2->top == 16) && (p2->top == 16) && (BN_num_bits(m2) == 1024))) {
1469
1470 if (bn_wexpand(rr1, 16) == NULL)
1471 goto err;
1472 if (bn_wexpand(rr2, 16) == NULL)
1473 goto err;
1474
1475 /* Ensure that montgomery contexts are initialized */
1476 if (in_mont1 != NULL) {
1477 mont1 = in_mont1;
1478 } else {
1479 if ((mont1 = BN_MONT_CTX_new()) == NULL)
1480 goto err;
1481 if (!BN_MONT_CTX_set(mont1, m1, ctx))
1482 goto err;
1483 }
1484 if (in_mont2 != NULL) {
1485 mont2 = in_mont2;
1486 } else {
1487 if ((mont2 = BN_MONT_CTX_new()) == NULL)
1488 goto err;
1489 if (!BN_MONT_CTX_set(mont2, m2, ctx))
1490 goto err;
1491 }
1492
1493 ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
1494 mont1->RR.d, mont1->n0[0],
1495 rr2->d, a2->d, p2->d, m2->d,
1496 mont2->RR.d, mont2->n0[0],
1497 1024 /* factor bit size */);
1498
1499 rr1->top = 16;
1500 rr1->neg = 0;
1501 bn_correct_top(rr1);
1502 bn_check_top(rr1);
1503
1504 rr2->top = 16;
1505 rr2->neg = 0;
1506 bn_correct_top(rr2);
1507 bn_check_top(rr2);
1508
1509 goto err;
1510 }
1511 #endif
1512
1513 /* rr1 = a1^p1 mod m1 */
1514 ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
1515 /* rr2 = a2^p2 mod m2 */
1516 ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);
1517
1518 #ifdef RSAZ_ENABLED
1519 err:
1520 if (in_mont2 == NULL)
1521 BN_MONT_CTX_free(mont2);
1522 if (in_mont1 == NULL)
1523 BN_MONT_CTX_free(mont1);
1524 #endif
1525
1526 return ret;
1527 }
1528