/*
 * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include "internal/cryptlib.h"
#include "internal/constant_time.h"
#include "bn_local.h"

#include <stdlib.h>
#ifdef _WIN32
# include <malloc.h>
# ifndef alloca
#  define alloca _alloca
# endif
#elif defined(__GNUC__)
# ifndef __SSP__
#  ifndef alloca
#   define alloca(s) __builtin_alloca((s))
#  endif
# else
#  undef alloca
# endif
#elif defined(__sun)
# include <alloca.h>
#endif

#include "rsaz_exp.h"

#undef SPARC_T4_MONT
#if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))
# include "crypto/sparc_arch.h"
# define SPARC_T4_MONT
#endif

/* maximum precomputation table size for *variable* sliding windows */
#define TABLE_SIZE      32

/*
 * Beyond this limit the constant time code is disabled due to
 * the possible overflow in the computation of powerbufLen in
 * BN_mod_exp_mont_consttime.
 * When this limit is exceeded, the computation will be done using
 * non-constant time code, but it will take very long.
 */
#define BN_CONSTTIME_SIZE_LIMIT (INT_MAX / BN_BYTES / 256)

/* this one works - simple but works */
int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
{
    int i, bits, ret = 0;
    BIGNUM *v, *rr;

    if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(a, BN_FLG_CONSTTIME) != 0) {
        /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
        ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
        return 0;
    }

    BN_CTX_start(ctx);
    rr = ((r == a) || (r == p)) ? BN_CTX_get(ctx) : r;
    v = BN_CTX_get(ctx);
    if (rr == NULL || v == NULL)
        goto err;

    if (BN_copy(v, a) == NULL)
        goto err;
    bits = BN_num_bits(p);

    if (BN_is_odd(p)) {
        if (BN_copy(rr, a) == NULL)
            goto err;
    } else {
        if (!BN_one(rr))
            goto err;
    }

    for (i = 1; i < bits; i++) {
        if (!BN_sqr(v, v, ctx))
            goto err;
        if (BN_is_bit_set(p, i)) {
            if (!BN_mul(rr, rr, v, ctx))
                goto err;
        }
    }
    if (r != rr && BN_copy(r, rr) == NULL)
        goto err;

    ret = 1;
 err:
    BN_CTX_end(ctx);
    bn_check_top(r);
    return ret;
}

int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
               BN_CTX *ctx)
{
    int ret;

    bn_check_top(a);
    bn_check_top(p);
    bn_check_top(m);

    /*-
     * For even modulus m = 2^k*m_odd, it might make sense to compute
     * a^p mod m_odd and a^p mod 2^k separately (with Montgomery
     * exponentiation for the odd part), using appropriate exponent
     * reductions, and combine the results using the CRT.
     *
     * For now, we use Montgomery only if the modulus is odd; otherwise,
     * exponentiation using the reciprocal-based quick remaindering
     * algorithm is used.
     *
     * (Timing obtained with expspeed.c [computations a^p mod m
     * where a, p, m are of the same length: 256, 512, 1024, 2048,
     * 4096, 8192 bits], compared to the running time of the
     * standard algorithm:
     *
     *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
     *                     55 .. 77 %  [UltraSparc processor, but
     *                                  debug-solaris-sparcv8-gcc conf.]
     *
     *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
     *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
     *
     * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
     * at 2048 and more bits, but at 512 and 1024 bits, it was
     * slower even than the standard algorithm!
     *
     * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
     * should be obtained when the new Montgomery reduction code
     * has been integrated into OpenSSL.)
     */

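    /*
     * Sketch of the CRT combination mentioned above (not implemented here):
     * with x1 = a^p mod m_odd and x2 = a^p mod 2^k, the result would be
     * x = x1 + m_odd * (((x2 - x1) * m_odd^-1) mod 2^k), the unique value
     * below m that is congruent to x1 mod m_odd and to x2 mod 2^k.
     */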

#define MONT_MUL_MOD
#define MONT_EXP_WORD
#define RECP_MUL_MOD

#ifdef MONT_MUL_MOD
    if (BN_is_odd(m)) {
# ifdef MONT_EXP_WORD
        if (a->top == 1 && !a->neg
            && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)
            && (BN_get_flags(a, BN_FLG_CONSTTIME) == 0)
            && (BN_get_flags(m, BN_FLG_CONSTTIME) == 0)) {
            BN_ULONG A = a->d[0];
            ret = BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
        } else
# endif
            ret = BN_mod_exp_mont(r, a, p, m, ctx, NULL);
    } else
#endif
#ifdef RECP_MUL_MOD
    {
        ret = BN_mod_exp_recp(r, a, p, m, ctx);
    }
#else
    {
        ret = BN_mod_exp_simple(r, a, p, m, ctx);
    }
#endif

    bn_check_top(r);
    return ret;
}

int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
                    const BIGNUM *m, BN_CTX *ctx)
{
    int i, j, bits, ret = 0, wstart, wend, window, wvalue;
    int start = 1;
    BIGNUM *aa;
    /* Table of variables obtained from 'ctx' */
    BIGNUM *val[TABLE_SIZE];
    BN_RECP_CTX recp;

    if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
        /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
        ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
        return 0;
    }

    bits = BN_num_bits(p);
    if (bits == 0) {
        /* x**0 mod 1, or x**0 mod -1 is still zero. */
        if (BN_abs_is_word(m, 1)) {
            ret = 1;
            BN_zero(r);
        } else {
            ret = BN_one(r);
        }
        return ret;
    }

    BN_RECP_CTX_init(&recp);

    BN_CTX_start(ctx);
    aa = BN_CTX_get(ctx);
    val[0] = BN_CTX_get(ctx);
    if (val[0] == NULL)
        goto err;

    if (m->neg) {
        /* ignore sign of 'm' */
        if (!BN_copy(aa, m))
            goto err;
        aa->neg = 0;
        if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
            goto err;
    } else {
        if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
            goto err;
    }

    if (!BN_nnmod(val[0], a, m, ctx))
        goto err;               /* 1 */
    if (BN_is_zero(val[0])) {
        BN_zero(r);
        ret = 1;
        goto err;
    }

    window = BN_window_bits_for_exponent_size(bits);
    if (window > 1) {
        if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
            goto err;           /* 2 */
        j = 1 << (window - 1);
        for (i = 1; i < j; i++) {
            if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
                !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx))
                goto err;
        }
    }

    start = 1;                  /* This is used to avoid multiplication etc
                                 * when there is only the value '1' in the
                                 * buffer. */
    wvalue = 0;                 /* The 'value' of the window */
    wstart = bits - 1;          /* The top bit of the window */
    wend = 0;                   /* The bottom bit of the window */

    if (!BN_one(r))
        goto err;

    for (;;) {
        if (BN_is_bit_set(p, wstart) == 0) {
            if (!start)
                if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
                    goto err;
            if (wstart == 0)
                break;
            wstart--;
            continue;
        }
        /*
         * We now have wstart on a 'set' bit, we now need to work out how big
         * a window to do. To do this we need to scan forward until the last
         * set bit before the end of the window
         */
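        /*
         * For example, with window = 4 and exponent bits 1,0,1,1,0,...
         * (starting at wstart), the scan below ends with wend = 3 and
         * wvalue = 0b1011 = 11, so val[11 >> 1] = val[5] = a^11 is
         * multiplied in after squaring r wend+1 = 4 times.
         */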
        wvalue = 1;
        wend = 0;
        for (i = 1; i < window; i++) {
            if (wstart - i < 0)
                break;
            if (BN_is_bit_set(p, wstart - i)) {
                wvalue <<= (i - wend);
                wvalue |= 1;
                wend = i;
            }
        }

        /* wend is the size of the current window */
        j = wend + 1;
        /* add the 'bytes above' */
        if (!start)
            for (i = 0; i < j; i++) {
                if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))
                    goto err;
            }

        /* wvalue will be an odd number < 2^window */
        if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx))
            goto err;

        /* move the 'window' down further */
        wstart -= wend + 1;
        wvalue = 0;
        start = 0;
        if (wstart < 0)
            break;
    }
    ret = 1;
 err:
    BN_CTX_end(ctx);
    BN_RECP_CTX_free(&recp);
    bn_check_top(r);
    return ret;
}

int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
    int i, j, bits, ret = 0, wstart, wend, window, wvalue;
    int start = 1;
    BIGNUM *d, *r;
    const BIGNUM *aa;
    /* Table of variables obtained from 'ctx' */
    BIGNUM *val[TABLE_SIZE];
    BN_MONT_CTX *mont = NULL;

    bn_check_top(a);
    bn_check_top(p);
    bn_check_top(m);

    if (!BN_is_odd(m)) {
        ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
        return 0;
    }

    if (m->top <= BN_CONSTTIME_SIZE_LIMIT
        && (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(m, BN_FLG_CONSTTIME) != 0)) {
        return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
    }

    bits = BN_num_bits(p);
    if (bits == 0) {
        /* x**0 mod 1, or x**0 mod -1 is still zero. */
        if (BN_abs_is_word(m, 1)) {
            ret = 1;
            BN_zero(rr);
        } else {
            ret = BN_one(rr);
        }
        return ret;
    }

    BN_CTX_start(ctx);
    d = BN_CTX_get(ctx);
    r = BN_CTX_get(ctx);
    val[0] = BN_CTX_get(ctx);
    if (val[0] == NULL)
        goto err;

    /*
     * If this is not done, things will break in the montgomery part
     */

    if (in_mont != NULL)
        mont = in_mont;
    else {
        if ((mont = BN_MONT_CTX_new()) == NULL)
            goto err;
        if (!BN_MONT_CTX_set(mont, m, ctx))
            goto err;
    }

    if (a->neg || BN_ucmp(a, m) >= 0) {
        if (!BN_nnmod(val[0], a, m, ctx))
            goto err;
        aa = val[0];
    } else
        aa = a;
    if (!bn_to_mont_fixed_top(val[0], aa, mont, ctx))
        goto err;               /* 1 */

    window = BN_window_bits_for_exponent_size(bits);
    if (window > 1) {
        if (!bn_mul_mont_fixed_top(d, val[0], val[0], mont, ctx))
            goto err;           /* 2 */
        j = 1 << (window - 1);
        for (i = 1; i < j; i++) {
            if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
                !bn_mul_mont_fixed_top(val[i], val[i - 1], d, mont, ctx))
                goto err;
        }
    }

    start = 1;                  /* This is used to avoid multiplication etc
                                 * when there is only the value '1' in the
                                 * buffer. */
    wvalue = 0;                 /* The 'value' of the window */
    wstart = bits - 1;          /* The top bit of the window */
    wend = 0;                   /* The bottom bit of the window */

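    /*
     * Prepare 1 (i.e. R mod m) in the Montgomery domain: when the top bit of
     * m is set, R = 2^(top*BN_BITS2) satisfies m < R < 2*m (m is odd), so
     * R mod m is simply R - m and can be written down directly instead of
     * going through bn_to_mont_fixed_top().
     */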
#if 1                           /* by Shay Gueron's suggestion */
    j = m->top;                 /* borrow j */
    if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
        if (bn_wexpand(r, j) == NULL)
            goto err;
        /* 2^(top*BN_BITS2) - m */
        r->d[0] = (0 - m->d[0]) & BN_MASK2;
        for (i = 1; i < j; i++)
            r->d[i] = (~m->d[i]) & BN_MASK2;
        r->top = j;
        r->flags |= BN_FLG_FIXED_TOP;
    } else
#endif
        if (!bn_to_mont_fixed_top(r, BN_value_one(), mont, ctx))
            goto err;
    for (;;) {
        if (BN_is_bit_set(p, wstart) == 0) {
            if (!start) {
                if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))
                    goto err;
            }
            if (wstart == 0)
                break;
            wstart--;
            continue;
        }
        /*
         * We now have wstart on a 'set' bit, we now need to work out how big
         * a window to do. To do this we need to scan forward until the last
         * set bit before the end of the window
         */
        wvalue = 1;
        wend = 0;
        for (i = 1; i < window; i++) {
            if (wstart - i < 0)
                break;
            if (BN_is_bit_set(p, wstart - i)) {
                wvalue <<= (i - wend);
                wvalue |= 1;
                wend = i;
            }
        }

        /* wend is the size of the current window */
        j = wend + 1;
        /* add the 'bytes above' */
        if (!start)
            for (i = 0; i < j; i++) {
                if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))
                    goto err;
            }

        /* wvalue will be an odd number < 2^window */
        if (!bn_mul_mont_fixed_top(r, r, val[wvalue >> 1], mont, ctx))
            goto err;

        /* move the 'window' down further */
        wstart -= wend + 1;
        wvalue = 0;
        start = 0;
        if (wstart < 0)
            break;
    }
    /*
     * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery
     * removes padding [if any] and makes return value suitable for public
     * API consumer.
     */
#if defined(SPARC_T4_MONT)
    if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
        j = mont->N.top;        /* borrow j */
        val[0]->d[0] = 1;       /* borrow val[0] */
        for (i = 1; i < j; i++)
            val[0]->d[i] = 0;
        val[0]->top = j;
        if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))
            goto err;
    } else
#endif
        if (!BN_from_montgomery(rr, r, mont, ctx))
            goto err;
    ret = 1;
 err:
    if (in_mont == NULL)
        BN_MONT_CTX_free(mont);
    BN_CTX_end(ctx);
    bn_check_top(rr);
    return ret;
}

static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)
{
    BN_ULONG ret = 0;
    int wordpos;

    wordpos = bitpos / BN_BITS2;
    bitpos %= BN_BITS2;
    if (wordpos >= 0 && wordpos < a->top) {
        ret = a->d[wordpos] & BN_MASK2;
        if (bitpos) {
            ret >>= bitpos;
            if (++wordpos < a->top)
                ret |= a->d[wordpos] << (BN_BITS2 - bitpos);
        }
    }

    return ret & BN_MASK2;
}

/*
 * BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
 * layout so that accessing any of these table values shows the same access
 * pattern as far as cache lines are concerned. The following functions are
 * used to transfer a BIGNUM from/to that table.
 */

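/*
 * Illustrative layout: with window = 2 the table is width = 4 entries wide
 * and word i of entry idx is stored at table[i * width + idx], i.e. the
 * candidate words for a given i sit next to each other. Reading word i of
 * any entry therefore touches the same cache lines regardless of idx.
 */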
static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,
                                        unsigned char *buf, int idx,
                                        int window)
{
    int i, j;
    int width = 1 << window;
    BN_ULONG *table = (BN_ULONG *)buf;

    if (top > b->top)
        top = b->top;           /* this works because 'buf' is explicitly
                                 * zeroed */
    for (i = 0, j = idx; i < top; i++, j += width) {
        table[j] = b->d[i];
    }

    return 1;
}

static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,
                                          unsigned char *buf, int idx,
                                          int window)
{
    int i, j;
    int width = 1 << window;
    /*
     * We declare table 'volatile' in order to discourage compiler
     * from reordering loads from the table. Concern is that if
     * reordered in specific manner loads might give away the
     * information we are trying to conceal. Some would argue that
     * compiler can reorder them anyway, but it can as well be
     * argued that doing so would be violation of standard...
     */
    volatile BN_ULONG *table = (volatile BN_ULONG *)buf;

    if (bn_wexpand(b, top) == NULL)
        return 0;

    if (window <= 3) {
        for (i = 0; i < top; i++, table += width) {
            BN_ULONG acc = 0;

            for (j = 0; j < width; j++) {
                acc |= table[j] &
                       ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
            }

            b->d[i] = acc;
        }
    } else {
        int xstride = 1 << (window - 2);
        BN_ULONG y0, y1, y2, y3;

        i = idx >> (window - 2);        /* equivalent of idx / xstride */
        idx &= xstride - 1;             /* equivalent of idx % xstride */

        y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
        y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
        y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
        y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);

        for (i = 0; i < top; i++, table += width) {
            BN_ULONG acc = 0;

            for (j = 0; j < xstride; j++) {
                acc |= ( (table[j + 0 * xstride] & y0) |
                         (table[j + 1 * xstride] & y1) |
                         (table[j + 2 * xstride] & y2) |
                         (table[j + 3 * xstride] & y3) )
                       & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
            }

            b->d[i] = acc;
        }
    }

    b->top = top;
    b->flags |= BN_FLG_FIXED_TOP;
    return 1;
}

/*
 * Given a pointer value, compute the next address that is a cache line
 * multiple.
 */
#define MOD_EXP_CTIME_ALIGN(x_) \
        ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))

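/*
 * MOD_EXP_CTIME_ALIGN() advances the pointer by between 1 and
 * MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH bytes, which is why the powerbuf
 * allocations below reserve MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH bytes of
 * slack on top of powerbufLen.
 */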
/*
 * This variant of BN_mod_exp_mont() uses fixed windows and the special
 * precomputation memory layout to limit data-dependency to a minimum to
 * protect secret exponents (cf. the hyper-threading timing attacks pointed
 * out by Colin Percival,
 * http://www.daemonology.net/hyperthreading-considered-harmful/)
 */
int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                              const BIGNUM *m, BN_CTX *ctx,
                              BN_MONT_CTX *in_mont)
{
    int i, bits, ret = 0, window, wvalue, wmask, window0;
    int top;
    BN_MONT_CTX *mont = NULL;

    int numPowers;
    unsigned char *powerbufFree = NULL;
    int powerbufLen = 0;
    unsigned char *powerbuf = NULL;
    BIGNUM tmp, am;
#if defined(SPARC_T4_MONT)
    unsigned int t4 = 0;
#endif

    bn_check_top(a);
    bn_check_top(p);
    bn_check_top(m);

    if (!BN_is_odd(m)) {
        ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
        return 0;
    }

    top = m->top;

    if (top > BN_CONSTTIME_SIZE_LIMIT) {
        /* Prevent overflowing the powerbufLen computation below */
        return BN_mod_exp_mont(rr, a, p, m, ctx, in_mont);
    }

    /*
     * Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak
     * whether the top bits are zero.
     */
    bits = p->top * BN_BITS2;
    if (bits == 0) {
        /* x**0 mod 1, or x**0 mod -1 is still zero. */
        if (BN_abs_is_word(m, 1)) {
            ret = 1;
            BN_zero(rr);
        } else {
            ret = BN_one(rr);
        }
        return ret;
    }

    BN_CTX_start(ctx);

    /*
     * Allocate a montgomery context if it was not supplied by the caller. If
     * this is not done, things will break in the montgomery part.
     */
    if (in_mont != NULL)
        mont = in_mont;
    else {
        if ((mont = BN_MONT_CTX_new()) == NULL)
            goto err;
        if (!BN_MONT_CTX_set(mont, m, ctx))
            goto err;
    }

    if (a->neg || BN_ucmp(a, m) >= 0) {
        BIGNUM *reduced = BN_CTX_get(ctx);
        if (reduced == NULL
            || !BN_nnmod(reduced, a, m, ctx)) {
            goto err;
        }
        a = reduced;
    }

#ifdef RSAZ_ENABLED
    /*
     * If the size of the operands allow it, perform the optimized
     * RSAZ exponentiation. For further information see
     * crypto/bn/rsaz_exp.c and accompanying assembly modules.
     */
    if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)
        && rsaz_avx2_eligible()) {
        if (NULL == bn_wexpand(rr, 16))
            goto err;
        RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,
                               mont->n0[0]);
        rr->top = 16;
        rr->neg = 0;
        bn_correct_top(rr);
        ret = 1;
        goto err;
    } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
        if (NULL == bn_wexpand(rr, 8))
            goto err;
        RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
        rr->top = 8;
        rr->neg = 0;
        bn_correct_top(rr);
        ret = 1;
        goto err;
    }
#endif

    /* Get the window size to use with size of p. */
    window = BN_window_bits_for_ctime_exponent_size(bits);
#if defined(SPARC_T4_MONT)
    if (window >= 5 && (top & 15) == 0 && top <= 64 &&
        (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==
        (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))
        window = 5;
    else
#endif
#if defined(OPENSSL_BN_ASM_MONT5)
    if (window >= 5 && top <= BN_SOFT_LIMIT) {
        window = 5;             /* ~5% improvement for RSA2048 sign, and even
                                 * for RSA4096 */
        /* reserve space for mont->N.d[] copy */
        powerbufLen += top * sizeof(mont->N.d[0]);
    }
#endif
    (void)0;

    /*
     * Allocate a buffer large enough to hold all of the pre-computed powers
     * of am, am itself and tmp.
     */
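    /*
     * The table itself takes top * numPowers words; the tail after it holds
     * the zero-padded working values tmp and am (2 * top words) and is sized
     * to at least numPowers words.
     */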
    numPowers = 1 << window;
    powerbufLen += sizeof(m->d[0]) * (top * numPowers +
                                      ((2 * top) >
                                       numPowers ? (2 * top) : numPowers));
#ifdef alloca
    if (powerbufLen < 3072)
        powerbufFree =
            alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
    else
#endif
        if ((powerbufFree =
             OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH))
            == NULL)
            goto err;

    powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
    memset(powerbuf, 0, powerbufLen);

#ifdef alloca
    if (powerbufLen < 3072)
        powerbufFree = NULL;
#endif

    /* lay down tmp and am right after powers table */
    tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
    am.d = tmp.d + top;
    tmp.top = am.top = 0;
    tmp.dmax = am.dmax = top;
    tmp.neg = am.neg = 0;
    tmp.flags = am.flags = BN_FLG_STATIC_DATA;

    /* prepare a^0 in Montgomery domain */
#if 1                           /* by Shay Gueron's suggestion */
    if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
        /* 2^(top*BN_BITS2) - m */
        tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
        for (i = 1; i < top; i++)
            tmp.d[i] = (~m->d[i]) & BN_MASK2;
        tmp.top = top;
    } else
#endif
        if (!bn_to_mont_fixed_top(&tmp, BN_value_one(), mont, ctx))
            goto err;

    /* prepare a^1 in Montgomery domain */
    if (!bn_to_mont_fixed_top(&am, a, mont, ctx))
        goto err;

    if (top > BN_SOFT_LIMIT)
        goto fallback;

#if defined(SPARC_T4_MONT)
    if (t4) {
        typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,
                                       const BN_ULONG *n0, const void *table,
                                       int power, int bits);
        int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,
                              const BN_ULONG *n0, const void *table,
                              int power, int bits);
        int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,
                               const BN_ULONG *n0, const void *table,
                               int power, int bits);
        static const bn_pwr5_mont_f pwr5_funcs[4] = {
            bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,
            bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32
        };
        bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];

        typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,
                                      const void *bp, const BN_ULONG *np,
                                      const BN_ULONG *n0);
        int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,
                             const BN_ULONG *np, const BN_ULONG *n0);
        int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0);
        static const bn_mul_mont_f mul_funcs[4] = {
            bn_mul_mont_t4_8, bn_mul_mont_t4_16,
            bn_mul_mont_t4_24, bn_mul_mont_t4_32
        };
        bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];

        void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,
                              const void *bp, const BN_ULONG *np,
                              const BN_ULONG *n0, int num);
        void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,
                            const void *bp, const BN_ULONG *np,
                            const BN_ULONG *n0, int num);
        void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,
                                    const void *table, const BN_ULONG *np,
                                    const BN_ULONG *n0, int num, int power);
        void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,
                                   void *table, size_t power);
        void bn_gather5_t4(BN_ULONG *out, size_t num,
                           void *table, size_t power);
        void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);

        BN_ULONG *np = mont->N.d, *n0 = mont->n0;
        int stride = 5 * (6 - (top / 16 - 1));  /* multiple of 5, but less
                                                 * than 32 */

        /*
         * BN_to_montgomery can contaminate words above .top [in
         * BN_DEBUG build...
         */
        for (i = am.top; i < top; i++)
            am.d[i] = 0;
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;

        bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);
        bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);
        if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&
            !(*mul_worker) (tmp.d, am.d, am.d, np, n0))
            bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);
        bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);

        for (i = 3; i < 32; i++) {
            /* Calculate a^i = a^(i-1) * a */
            if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&
                !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))
                bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);
            bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);
        }

        /* switch to 64-bit domain */
        np = alloca(top * sizeof(BN_ULONG));
        top /= 2;
        bn_flip_t4(np, mont->N.d, top);

        /*
         * The exponent may not have a whole number of fixed-size windows.
         * To simplify the main loop, the initial window has between 1 and
         * full-window-size bits such that what remains is always a whole
         * number of windows
         */
        window0 = (bits - 1) % 5 + 1;
        wmask = (1 << window0) - 1;
        bits -= window0;
        wvalue = bn_get_bits(p, bits) & wmask;
        bn_gather5_t4(tmp.d, top, powerbuf, wvalue);

        /*
         * Scan the exponent one window at a time starting from the most
         * significant bits.
         */
        while (bits > 0) {
            if (bits < stride)
                stride = bits;
            bits -= stride;
            wvalue = bn_get_bits(p, bits);

            if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
                continue;
            /* retry once and fall back */
            if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))
                continue;

            bits += stride - 5;
            wvalue >>= stride - 5;
            wvalue &= 31;
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,
                                   wvalue);
        }

        bn_flip_t4(tmp.d, tmp.d, top);
        top *= 2;
        /* back to 32-bit domain */
        tmp.top = top;
        bn_correct_top(&tmp);
        OPENSSL_cleanse(np, top * sizeof(BN_ULONG));
    } else
#endif
#if defined(OPENSSL_BN_ASM_MONT5)
    if (window == 5 && top > 1) {
        /*
         * This optimization uses ideas from https://eprint.iacr.org/2011/239,
         * specifically optimization of cache-timing attack countermeasures,
         * pre-computation optimization, and Almost Montgomery Multiplication.
         *
         * The paper discusses a 4-bit window to optimize 512-bit modular
         * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer
         * important.
         *
         * |bn_mul_mont_gather5| and |bn_power5| implement the "almost"
         * reduction variant, so the values here may not be fully reduced.
         * They are bounded by R (i.e. they fit in |top| words), not |m|.
         * Additionally, we pass these "almost" reduced inputs into
         * |bn_mul_mont|, which implements the normal reduction variant.
         * Given those inputs, |bn_mul_mont| may not give reduced
         * output, but it will still produce "almost" reduced output.
         */
        void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
                                 const void *table, const BN_ULONG *np,
                                 const BN_ULONG *n0, int num, int power);
        void bn_scatter5(const BN_ULONG *inp, size_t num,
                         void *table, size_t power);
        void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);
        void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
                       const void *table, const BN_ULONG *np,
                       const BN_ULONG *n0, int num, int power);
        int bn_get_bits5(const BN_ULONG *ap, int off);

        BN_ULONG *n0 = mont->n0, *np;

        /*
         * BN_to_montgomery can contaminate words above .top [in
         * BN_DEBUG build...
         */
        for (i = am.top; i < top; i++)
            am.d[i] = 0;
        for (i = tmp.top; i < top; i++)
            tmp.d[i] = 0;

        /*
         * copy mont->N.d[] to improve cache locality
         */
        for (np = am.d + top, i = 0; i < top; i++)
            np[i] = mont->N.d[i];

        bn_scatter5(tmp.d, top, powerbuf, 0);
        bn_scatter5(am.d, am.top, powerbuf, 1);
        bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
        bn_scatter5(tmp.d, top, powerbuf, 2);

# if 0
        for (i = 3; i < 32; i++) {
            /* Calculate a^i = a^(i-1) * a */
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
# else
        /* same as above, but uses squaring for 1/2 of operations */
        for (i = 4; i < 32; i *= 2) {
            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
        for (i = 3; i < 8; i += 2) {
            int j;
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            for (j = 2 * i; j < 32; j *= 2) {
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_scatter5(tmp.d, top, powerbuf, j);
            }
        }
        for (; i < 16; i += 2) {
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
            bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
            bn_scatter5(tmp.d, top, powerbuf, 2 * i);
        }
        for (; i < 32; i += 2) {
            bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);
            bn_scatter5(tmp.d, top, powerbuf, i);
        }
# endif
        /*
         * The exponent may not have a whole number of fixed-size windows.
         * To simplify the main loop, the initial window has between 1 and
         * full-window-size bits such that what remains is always a whole
         * number of windows
         */
        window0 = (bits - 1) % 5 + 1;
        wmask = (1 << window0) - 1;
        bits -= window0;
        wvalue = bn_get_bits(p, bits) & wmask;
        bn_gather5(tmp.d, top, powerbuf, wvalue);

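        /*
         * Illustration of the short initial window above: with bits = 1024
         * and 5-bit windows, window0 = (1024 - 1) % 5 + 1 = 4, leaving
         * 1020 bits, i.e. exactly 204 full windows for the main loop.
         */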
        /*
         * Scan the exponent one window at a time starting from the most
         * significant bits.
         */
        if (top & 7) {
            while (bits > 0) {
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
                bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,
                                    bn_get_bits5(p->d, bits -= 5));
            }
        } else {
            while (bits > 0) {
                bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top,
                          bn_get_bits5(p->d, bits -= 5));
            }
        }

        tmp.top = top;
        /*
         * The result is now in |tmp| in Montgomery form, but it may not be
         * fully reduced. This is within bounds for |BN_from_montgomery|
         * (tmp < R <= m*R) so it will, when converting from Montgomery form,
         * produce a fully reduced result.
         *
         * This differs from Figure 2 of the paper, which uses AMM(h, 1) to
         * convert from Montgomery form with unreduced output, followed by an
         * extra reduction step. In the paper's terminology, we replace
         * steps 9 and 10 with MM(h, 1).
         */
    } else
#endif
    {
 fallback:
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))
            goto err;
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))
            goto err;

        /*
         * If the window size is greater than 1, then calculate
         * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) (even
         * powers could instead be computed as (a^(i/2))^2 to use the slight
         * performance advantage of sqr over mul).
         */
        if (window > 1) {
            if (!bn_mul_mont_fixed_top(&tmp, &am, &am, mont, ctx))
                goto err;
            if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,
                                              window))
                goto err;
            for (i = 3; i < numPowers; i++) {
                /* Calculate a^i = a^(i-1) * a */
                if (!bn_mul_mont_fixed_top(&tmp, &am, &tmp, mont, ctx))
                    goto err;
                if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,
                                                  window))
                    goto err;
            }
        }

        /*
         * The exponent may not have a whole number of fixed-size windows.
         * To simplify the main loop, the initial window has between 1 and
         * full-window-size bits such that what remains is always a whole
         * number of windows
         */
        window0 = (bits - 1) % window + 1;
        wmask = (1 << window0) - 1;
        bits -= window0;
        wvalue = bn_get_bits(p, bits) & wmask;
        if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,
                                            window))
            goto err;

        wmask = (1 << window) - 1;
        /*
         * Scan the exponent one window at a time starting from the most
         * significant bits.
         */
        while (bits > 0) {

            /* Square the result window-size times */
            for (i = 0; i < window; i++)
                if (!bn_mul_mont_fixed_top(&tmp, &tmp, &tmp, mont, ctx))
                    goto err;

            /*
             * Get a window's worth of bits from the exponent
             * This avoids calling BN_is_bit_set for each bit, which
             * is not only slower but also makes each bit vulnerable to
             * EM (and likely other) side-channel attacks like One&Done
             * (for details see "One&Done: A Single-Decryption EM-Based
             * Attack on OpenSSL's Constant-Time Blinded RSA" by M. Alam,
             * H. Khan, M. Dey, N. Sinha, R. Callan, A. Zajic, and
             * M. Prvulovic, in USENIX Security'18)
             */
            bits -= window;
            wvalue = bn_get_bits(p, bits) & wmask;
            /*
             * Fetch the appropriate pre-computed value from the pre-buf
             */
            if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,
                                                window))
                goto err;

            /* Multiply the result into the intermediate result */
            if (!bn_mul_mont_fixed_top(&tmp, &tmp, &am, mont, ctx))
                goto err;
        }
    }

    /*
     * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery
     * removes padding [if any] and makes return value suitable for public
     * API consumer.
     */
#if defined(SPARC_T4_MONT)
    if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) {
        am.d[0] = 1;            /* borrow am */
        for (i = 1; i < top; i++)
            am.d[i] = 0;
        if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))
            goto err;
    } else
#endif
        if (!BN_from_montgomery(rr, &tmp, mont, ctx))
            goto err;
    ret = 1;
 err:
    if (in_mont == NULL)
        BN_MONT_CTX_free(mont);
    if (powerbuf != NULL) {
        OPENSSL_cleanse(powerbuf, powerbufLen);
        OPENSSL_free(powerbufFree);
    }
    BN_CTX_end(ctx);
    return ret;
}

int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
    BN_MONT_CTX *mont = NULL;
    int b, bits, ret = 0;
    int r_is_one;
    BN_ULONG w, next_w;
    BIGNUM *r, *t;
    BIGNUM *swap_tmp;
#define BN_MOD_MUL_WORD(r, w, m) \
        (BN_mul_word(r, (w)) && \
         (/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
          (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
    /*
     * BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is
     * probably more overhead than always using BN_mod (which uses BN_copy if
     * a similar test returns true).
     */
    /*
     * We can use BN_mod and do not need BN_nnmod because our accumulator is
     * never negative (the result of BN_mod does not depend on the sign of
     * the modulus).
     */
#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
        (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))

    if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
        /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
        ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
        return 0;
    }

    bn_check_top(p);
    bn_check_top(m);

    if (!BN_is_odd(m)) {
        ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);
        return 0;
    }
    if (m->top == 1)
        a %= m->d[0];           /* make sure that 'a' is reduced */

    bits = BN_num_bits(p);
    if (bits == 0) {
        /* x**0 mod 1, or x**0 mod -1 is still zero. */
        if (BN_abs_is_word(m, 1)) {
            ret = 1;
            BN_zero(rr);
        } else {
            ret = BN_one(rr);
        }
        return ret;
    }
    if (a == 0) {
        BN_zero(rr);
        ret = 1;
        return ret;
    }

    BN_CTX_start(ctx);
    r = BN_CTX_get(ctx);
    t = BN_CTX_get(ctx);
    if (t == NULL)
        goto err;

    if (in_mont != NULL)
        mont = in_mont;
    else {
        if ((mont = BN_MONT_CTX_new()) == NULL)
            goto err;
        if (!BN_MONT_CTX_set(mont, m, ctx))
            goto err;
    }

    r_is_one = 1;               /* except for Montgomery factor */

    /* bits-1 >= 0 */

    /* The result is accumulated in the product r*w. */
    w = a;                      /* bit 'bits-1' of 'p' is always set */
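    /*
     * Loop sketch: the value computed so far is always r * w (mod m), with r
     * kept in Montgomery form and w an ordinary single word.  Squarings and
     * multiplications by 'a' are absorbed into w for as long as they fit in
     * one word; as soon as a word product would overflow, w is folded into r
     * via BN_TO_MONTGOMERY_WORD()/BN_MOD_MUL_WORD() and restarted.
     */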
    for (b = bits - 2; b >= 0; b--) {
        /* First, square r*w. */
        next_w = w * w;
        if ((next_w / w) != w) { /* overflow */
            if (r_is_one) {
                if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
                    goto err;
                r_is_one = 0;
            } else {
                if (!BN_MOD_MUL_WORD(r, w, m))
                    goto err;
            }
            next_w = 1;
        }
        w = next_w;
        if (!r_is_one) {
            if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
                goto err;
        }

        /* Second, multiply r*w by 'a' if exponent bit is set. */
        if (BN_is_bit_set(p, b)) {
            next_w = w * a;
            if ((next_w / a) != w) { /* overflow */
                if (r_is_one) {
                    if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
                        goto err;
                    r_is_one = 0;
                } else {
                    if (!BN_MOD_MUL_WORD(r, w, m))
                        goto err;
                }
                next_w = a;
            }
            w = next_w;
        }
    }

    /* Finally, set r:=r*w. */
    if (w != 1) {
        if (r_is_one) {
            if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
                goto err;
            r_is_one = 0;
        } else {
            if (!BN_MOD_MUL_WORD(r, w, m))
                goto err;
        }
    }

    if (r_is_one) {             /* can happen only if a == 1 */
        if (!BN_one(rr))
            goto err;
    } else {
        if (!BN_from_montgomery(rr, r, mont, ctx))
            goto err;
    }
    ret = 1;
 err:
    if (in_mont == NULL)
        BN_MONT_CTX_free(mont);
    BN_CTX_end(ctx);
    bn_check_top(rr);
    return ret;
}

/* The old fallback, simple version :-) */
int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
                      const BIGNUM *m, BN_CTX *ctx)
{
    int i, j, bits, ret = 0, wstart, wend, window, wvalue;
    int start = 1;
    BIGNUM *d;
    /* Table of variables obtained from 'ctx' */
    BIGNUM *val[TABLE_SIZE];

    if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(a, BN_FLG_CONSTTIME) != 0
            || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) {
        /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
        ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
        return 0;
    }

    bits = BN_num_bits(p);
    if (bits == 0) {
        /* x**0 mod 1, or x**0 mod -1 is still zero. */
        if (BN_abs_is_word(m, 1)) {
            ret = 1;
            BN_zero(r);
        } else {
            ret = BN_one(r);
        }
        return ret;
    }

    BN_CTX_start(ctx);
    d = BN_CTX_get(ctx);
    val[0] = BN_CTX_get(ctx);
    if (val[0] == NULL)
        goto err;

    if (!BN_nnmod(val[0], a, m, ctx))
        goto err;               /* 1 */
    if (BN_is_zero(val[0])) {
        BN_zero(r);
        ret = 1;
        goto err;
    }

    window = BN_window_bits_for_exponent_size(bits);
    if (window > 1) {
        if (!BN_mod_mul(d, val[0], val[0], m, ctx))
            goto err;           /* 2 */
        j = 1 << (window - 1);
        for (i = 1; i < j; i++) {
            if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
                !BN_mod_mul(val[i], val[i - 1], d, m, ctx))
                goto err;
        }
    }

    start = 1;                  /* This is used to avoid multiplication etc
                                 * when there is only the value '1' in the
                                 * buffer. */
    wvalue = 0;                 /* The 'value' of the window */
    wstart = bits - 1;          /* The top bit of the window */
    wend = 0;                   /* The bottom bit of the window */

    if (!BN_one(r))
        goto err;

    for (;;) {
        if (BN_is_bit_set(p, wstart) == 0) {
            if (!start)
                if (!BN_mod_mul(r, r, r, m, ctx))
                    goto err;
            if (wstart == 0)
                break;
            wstart--;
            continue;
        }
        /*
         * We now have wstart on a 'set' bit, we now need to work out how big
         * a window to do. To do this we need to scan forward until the last
         * set bit before the end of the window
         */
        wvalue = 1;
        wend = 0;
        for (i = 1; i < window; i++) {
            if (wstart - i < 0)
                break;
            if (BN_is_bit_set(p, wstart - i)) {
                wvalue <<= (i - wend);
                wvalue |= 1;
                wend = i;
            }
        }

        /* wend is the size of the current window */
        j = wend + 1;
        /* add the 'bytes above' */
        if (!start)
            for (i = 0; i < j; i++) {
                if (!BN_mod_mul(r, r, r, m, ctx))
                    goto err;
            }

        /* wvalue will be an odd number < 2^window */
        if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
            goto err;

        /* move the 'window' down further */
        wstart -= wend + 1;
        wvalue = 0;
        start = 0;
        if (wstart < 0)
            break;
    }
    ret = 1;
 err:
    BN_CTX_end(ctx);
    bn_check_top(r);
    return ret;
}

/*
 * This is a variant of modular exponentiation optimization that does
 * parallel 2-primes exponentiation using 256-bit (AVX512VL) AVX512_IFMA ISA
 * in 52-bit binary redundant representation.
 * If such instructions are not available, or input data size is not supported,
 * it falls back to two BN_mod_exp_mont_consttime() calls.
 */
int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1,
                                 const BIGNUM *m1, BN_MONT_CTX *in_mont1,
                                 BIGNUM *rr2, const BIGNUM *a2, const BIGNUM *p2,
                                 const BIGNUM *m2, BN_MONT_CTX *in_mont2,
                                 BN_CTX *ctx)
{
    int ret = 0;

#ifdef RSAZ_ENABLED
    BN_MONT_CTX *mont1 = NULL;
    BN_MONT_CTX *mont2 = NULL;

    if (ossl_rsaz_avx512ifma_eligible() &&
        ((a1->top == 16) && (p1->top == 16) && (BN_num_bits(m1) == 1024) &&
         (a2->top == 16) && (p2->top == 16) && (BN_num_bits(m2) == 1024))) {

        if (bn_wexpand(rr1, 16) == NULL)
            goto err;
        if (bn_wexpand(rr2, 16) == NULL)
            goto err;

        /* Ensure that montgomery contexts are initialized */
        if (in_mont1 != NULL) {
            mont1 = in_mont1;
        } else {
            if ((mont1 = BN_MONT_CTX_new()) == NULL)
                goto err;
            if (!BN_MONT_CTX_set(mont1, m1, ctx))
                goto err;
        }
        if (in_mont2 != NULL) {
            mont2 = in_mont2;
        } else {
            if ((mont2 = BN_MONT_CTX_new()) == NULL)
                goto err;
            if (!BN_MONT_CTX_set(mont2, m2, ctx))
                goto err;
        }

        ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
                                          mont1->RR.d, mont1->n0[0],
                                          rr2->d, a2->d, p2->d, m2->d,
                                          mont2->RR.d, mont2->n0[0],
                                          1024 /* factor bit size */);

        rr1->top = 16;
        rr1->neg = 0;
        bn_correct_top(rr1);
        bn_check_top(rr1);

        rr2->top = 16;
        rr2->neg = 0;
        bn_correct_top(rr2);
        bn_check_top(rr2);

        goto err;
    }
#endif

    /* rr1 = a1^p1 mod m1 */
    ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
    /* rr2 = a2^p2 mod m2 */
    ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);

#ifdef RSAZ_ENABLED
 err:
    if (in_mont2 == NULL)
        BN_MONT_CTX_free(mont2);
    if (in_mont1 == NULL)
        BN_MONT_CTX_free(mont1);
#endif

    return ret;
}
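
/*
 * Illustrative only (not built): a minimal sketch of how a caller might feed
 * the two half-size exponentiations of an RSA-CRT private-key operation into
 * BN_mod_exp_mont_consttime_x2().  The helper name and its reduced error
 * handling are made up for the example.
 */
#if 0
static int crt_halves(BIGNUM *o1, BIGNUM *o2, const BIGNUM *c,
                      const BIGNUM *p, const BIGNUM *dP,
                      const BIGNUM *q, const BIGNUM *dQ, BN_CTX *ctx)
{
    int ok = 0;
    BIGNUM *cp, *cq;

    BN_CTX_start(ctx);
    cp = BN_CTX_get(ctx);       /* c reduced mod p */
    cq = BN_CTX_get(ctx);       /* c reduced mod q */
    if (cq == NULL
        || !BN_nnmod(cp, c, p, ctx)
        || !BN_nnmod(cq, c, q, ctx))
        goto err;

    /* o1 = cp^dP mod p and o2 = cq^dQ mod q, computed pairwise */
    if (!BN_mod_exp_mont_consttime_x2(o1, cp, dP, p, NULL,
                                      o2, cq, dQ, q, NULL, ctx))
        goto err;

    ok = 1;
 err:
    BN_CTX_end(ctx);
    return ok;
}
#endif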