1 /* $NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2014 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Taylor R. Campbell. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.13 2015/04/13 22:43:41 riastradh Exp $"); 34 35 #include <sys/types.h> 36 #include <sys/param.h> 37 #include <sys/bitops.h> 38 #include <sys/cprng.h> 39 #include <sys/cpu.h> 40 #include <sys/intr.h> 41 #include <sys/percpu.h> 42 #include <sys/rnd.h> /* rnd_initial_entropy */ 43 44 /* ChaCha core */ 45 46 #define crypto_core_OUTPUTWORDS 16 47 #define crypto_core_INPUTWORDS 4 48 #define crypto_core_KEYWORDS 8 49 #define crypto_core_CONSTWORDS 4 50 51 #define crypto_core_ROUNDS 8 52 53 static uint32_t 54 rotate(uint32_t u, unsigned c) 55 { 56 57 return (u << c) | (u >> (32 - c)); 58 } 59 60 #define QUARTERROUND(a, b, c, d) do { \ 61 (a) += (b); (d) ^= (a); (d) = rotate((d), 16); \ 62 (c) += (d); (b) ^= (c); (b) = rotate((b), 12); \ 63 (a) += (b); (d) ^= (a); (d) = rotate((d), 8); \ 64 (c) += (d); (b) ^= (c); (b) = rotate((b), 7); \ 65 } while (0) 66 67 static void 68 crypto_core(uint32_t *out, const uint32_t *in, const uint32_t *k, 69 const uint32_t *c) 70 { 71 uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 72 int i; 73 74 x0 = c[0]; 75 x1 = c[1]; 76 x2 = c[2]; 77 x3 = c[3]; 78 x4 = k[0]; 79 x5 = k[1]; 80 x6 = k[2]; 81 x7 = k[3]; 82 x8 = k[4]; 83 x9 = k[5]; 84 x10 = k[6]; 85 x11 = k[7]; 86 x12 = in[0]; 87 x13 = in[1]; 88 x14 = in[2]; 89 x15 = in[3]; 90 91 for (i = crypto_core_ROUNDS; i > 0; i -= 2) { 92 QUARTERROUND( x0, x4, x8,x12); 93 QUARTERROUND( x1, x5, x9,x13); 94 QUARTERROUND( x2, x6,x10,x14); 95 QUARTERROUND( x3, x7,x11,x15); 96 QUARTERROUND( x0, x5,x10,x15); 97 QUARTERROUND( x1, x6,x11,x12); 98 QUARTERROUND( x2, x7, x8,x13); 99 QUARTERROUND( x3, x4, x9,x14); 100 } 101 102 out[0] = x0 + c[0]; 103 out[1] = x1 + c[1]; 104 out[2] = x2 + c[2]; 105 out[3] = x3 + c[3]; 106 out[4] = x4 + k[0]; 107 out[5] = x5 + k[1]; 108 out[6] = x6 + k[2]; 109 out[7] = x7 + k[3]; 110 out[8] = x8 + k[4]; 111 out[9] = x9 + k[5]; 112 out[10] = x10 + k[6]; 113 out[11] = x11 + k[7]; 114 out[12] = x12 + in[0]; 115 out[13] = x13 + in[1]; 116 out[14] = x14 + in[2]; 117 out[15] = x15 + in[3]; 118 } 119 120 /* `expand 32-byte k' */ 121 static const uint32_t crypto_core_constant32[4] = { 122 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U, 123 }; 124 125 /* 126 * Test vector for ChaCha20 from 127 * <http://tools.ietf.org/html/draft-strombergson-chacha-test-vectors-00>, 128 * test vectors for ChaCha12 and ChaCha8 generated by the same 129 * crypto_core code with crypto_core_ROUNDS varied. 130 */ 131 132 #define check(E) do \ 133 { \ 134 if (!(E)) \ 135 panic("crypto self-test failed: %s", #E); \ 136 } while (0) 137 138 static void 139 crypto_core_selftest(void) 140 { 141 const uint32_t zero32[8] = {0}; 142 const uint8_t sigma[] = "expand 32-byte k"; 143 uint32_t block[16]; 144 unsigned i; 145 146 #if crypto_core_ROUNDS == 8 147 static const uint8_t out[64] = { 148 0x3e,0x00,0xef,0x2f,0x89,0x5f,0x40,0xd6, 149 0x7f,0x5b,0xb8,0xe8,0x1f,0x09,0xa5,0xa1, 150 0x2c,0x84,0x0e,0xc3,0xce,0x9a,0x7f,0x3b, 151 0x18,0x1b,0xe1,0x88,0xef,0x71,0x1a,0x1e, 152 0x98,0x4c,0xe1,0x72,0xb9,0x21,0x6f,0x41, 153 0x9f,0x44,0x53,0x67,0x45,0x6d,0x56,0x19, 154 0x31,0x4a,0x42,0xa3,0xda,0x86,0xb0,0x01, 155 0x38,0x7b,0xfd,0xb8,0x0e,0x0c,0xfe,0x42, 156 }; 157 #elif crypto_core_ROUNDS == 12 158 static const uint8_t out[64] = { 159 0x9b,0xf4,0x9a,0x6a,0x07,0x55,0xf9,0x53, 160 0x81,0x1f,0xce,0x12,0x5f,0x26,0x83,0xd5, 161 0x04,0x29,0xc3,0xbb,0x49,0xe0,0x74,0x14, 162 0x7e,0x00,0x89,0xa5,0x2e,0xae,0x15,0x5f, 163 0x05,0x64,0xf8,0x79,0xd2,0x7a,0xe3,0xc0, 164 0x2c,0xe8,0x28,0x34,0xac,0xfa,0x8c,0x79, 165 0x3a,0x62,0x9f,0x2c,0xa0,0xde,0x69,0x19, 166 0x61,0x0b,0xe8,0x2f,0x41,0x13,0x26,0xbe, 167 }; 168 #elif crypto_core_ROUNDS == 20 169 static const uint8_t out[64] = { 170 0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90, 171 0x40,0x5d,0x6a,0xe5,0x53,0x86,0xbd,0x28, 172 0xbd,0xd2,0x19,0xb8,0xa0,0x8d,0xed,0x1a, 173 0xa8,0x36,0xef,0xcc,0x8b,0x77,0x0d,0xc7, 174 0xda,0x41,0x59,0x7c,0x51,0x57,0x48,0x8d, 175 0x77,0x24,0xe0,0x3f,0xb8,0xd8,0x4a,0x37, 176 0x6a,0x43,0xb8,0xf4,0x15,0x18,0xa1,0x1c, 177 0xc3,0x87,0xb6,0x69,0xb2,0xee,0x65,0x86, 178 }; 179 #else 180 #error crypto_core_ROUNDS must be 8, 12, or 20. 181 #endif 182 183 check(crypto_core_constant32[0] == le32dec(&sigma[0])); 184 check(crypto_core_constant32[1] == le32dec(&sigma[4])); 185 check(crypto_core_constant32[2] == le32dec(&sigma[8])); 186 check(crypto_core_constant32[3] == le32dec(&sigma[12])); 187 188 crypto_core(block, zero32, zero32, crypto_core_constant32); 189 for (i = 0; i < 16; i++) 190 check(block[i] == le32dec(&out[i*4])); 191 } 192 193 #undef check 194 195 #define CPRNG_FAST_SEED_BYTES (crypto_core_KEYWORDS * sizeof(uint32_t)) 196 197 struct cprng_fast { 198 uint32_t buffer[crypto_core_OUTPUTWORDS]; 199 uint32_t key[crypto_core_KEYWORDS]; 200 uint32_t nonce[crypto_core_INPUTWORDS]; 201 bool have_initial; 202 }; 203 204 __CTASSERT(sizeof ((struct cprng_fast *)0)->key == CPRNG_FAST_SEED_BYTES); 205 206 static void cprng_fast_init_cpu(void *, void *, struct cpu_info *); 207 static void cprng_fast_schedule_reseed(struct cprng_fast *); 208 static void cprng_fast_intr(void *); 209 210 static void cprng_fast_seed(struct cprng_fast *, const void *); 211 static void cprng_fast_buf(struct cprng_fast *, void *, unsigned); 212 213 static void cprng_fast_buf_short(void *, size_t); 214 static void cprng_fast_buf_long(void *, size_t); 215 216 static percpu_t *cprng_fast_percpu __read_mostly; 217 static void *cprng_fast_softint __read_mostly; 218 219 void 220 cprng_fast_init(void) 221 { 222 223 crypto_core_selftest(); 224 cprng_fast_percpu = percpu_alloc(sizeof(struct cprng_fast)); 225 percpu_foreach(cprng_fast_percpu, &cprng_fast_init_cpu, NULL); 226 cprng_fast_softint = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE, 227 &cprng_fast_intr, NULL); 228 } 229 230 static void 231 cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused) 232 { 233 struct cprng_fast *const cprng = p; 234 uint8_t seed[CPRNG_FAST_SEED_BYTES]; 235 236 cprng_strong(kern_cprng, seed, sizeof seed, 0); 237 cprng_fast_seed(cprng, seed); 238 cprng->have_initial = rnd_initial_entropy; 239 (void)explicit_memset(seed, 0, sizeof seed); 240 } 241 242 static inline int 243 cprng_fast_get(struct cprng_fast **cprngp) 244 { 245 struct cprng_fast *cprng; 246 int s; 247 248 *cprngp = cprng = percpu_getref(cprng_fast_percpu); 249 s = splvm(); 250 251 if (__predict_false(!cprng->have_initial)) 252 cprng_fast_schedule_reseed(cprng); 253 254 return s; 255 } 256 257 static inline void 258 cprng_fast_put(struct cprng_fast *cprng, int s) 259 { 260 261 KASSERT((cprng == percpu_getref(cprng_fast_percpu)) && 262 (percpu_putref(cprng_fast_percpu), true)); 263 splx(s); 264 percpu_putref(cprng_fast_percpu); 265 } 266 267 static void 268 cprng_fast_schedule_reseed(struct cprng_fast *cprng __unused) 269 { 270 271 softint_schedule(cprng_fast_softint); 272 } 273 274 static void 275 cprng_fast_intr(void *cookie __unused) 276 { 277 struct cprng_fast *cprng; 278 uint8_t seed[CPRNG_FAST_SEED_BYTES]; 279 int s; 280 281 cprng_strong(kern_cprng, seed, sizeof(seed), 0); 282 283 cprng = percpu_getref(cprng_fast_percpu); 284 s = splvm(); 285 cprng_fast_seed(cprng, seed); 286 cprng->have_initial = rnd_initial_entropy; 287 splx(s); 288 percpu_putref(cprng_fast_percpu); 289 290 explicit_memset(seed, 0, sizeof(seed)); 291 } 292 293 /* CPRNG algorithm */ 294 295 /* 296 * The state consists of a key, the current nonce, and a 64-byte buffer 297 * of output. Since we fill the buffer only when we need output, and 298 * eat a 32-bit word at a time, one 32-bit word of the buffer would be 299 * wasted. Instead, we repurpose it to count the number of entries in 300 * the buffer remaining, counting from high to low in order to allow 301 * comparison to zero to detect when we need to refill it. 302 */ 303 #define CPRNG_FAST_BUFIDX (crypto_core_OUTPUTWORDS - 1) 304 305 static void 306 cprng_fast_seed(struct cprng_fast *cprng, const void *seed) 307 { 308 309 (void)memset(cprng->buffer, 0, sizeof cprng->buffer); 310 (void)memcpy(cprng->key, seed, sizeof cprng->key); 311 (void)memset(cprng->nonce, 0, sizeof cprng->nonce); 312 } 313 314 static inline uint32_t 315 cprng_fast_word(struct cprng_fast *cprng) 316 { 317 uint32_t v; 318 319 if (__predict_true(0 < cprng->buffer[CPRNG_FAST_BUFIDX])) { 320 v = cprng->buffer[--cprng->buffer[CPRNG_FAST_BUFIDX]]; 321 } else { 322 /* If we don't have enough words, refill the buffer. */ 323 crypto_core(cprng->buffer, cprng->nonce, cprng->key, 324 crypto_core_constant32); 325 if (__predict_false(++cprng->nonce[0] == 0)) { 326 cprng->nonce[1]++; 327 cprng_fast_schedule_reseed(cprng); 328 } 329 v = cprng->buffer[CPRNG_FAST_BUFIDX]; 330 cprng->buffer[CPRNG_FAST_BUFIDX] = CPRNG_FAST_BUFIDX; 331 } 332 333 return v; 334 } 335 336 static inline void 337 cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned n) 338 { 339 uint8_t *p = buf; 340 uint32_t v; 341 unsigned w, r; 342 343 w = n / sizeof(uint32_t); 344 while (w--) { 345 v = cprng_fast_word(cprng); 346 (void)memcpy(p, &v, 4); 347 p += 4; 348 } 349 350 r = n % sizeof(uint32_t); 351 if (r) { 352 v = cprng_fast_word(cprng); 353 while (r--) { 354 *p++ = (v & 0xff); 355 v >>= 8; 356 } 357 } 358 } 359 360 /* 361 * crypto_onetimestream: Expand a short unpredictable one-time seed 362 * into a long unpredictable output. 363 */ 364 static void 365 crypto_onetimestream(const uint32_t seed[crypto_core_KEYWORDS], void *buf, 366 size_t n) 367 { 368 uint32_t block[crypto_core_OUTPUTWORDS]; 369 uint32_t nonce[crypto_core_INPUTWORDS] = {0}; 370 uint8_t *p8; 371 uint32_t *p32; 372 size_t ni, nb, nf; 373 374 /* 375 * Guarantee we can generate up to n bytes. We have 376 * 2^(32*INPUTWORDS) possible inputs yielding output of 377 * 4*OUTPUTWORDS*2^(32*INPUTWORDS) bytes. It suffices to 378 * require that sizeof n > (1/CHAR_BIT) log_2 n be less than 379 * (1/CHAR_BIT) log_2 of the total output stream length. We 380 * have 381 * 382 * log_2 (4 o 2^(32 i)) = log_2 (4 o) + log_2 2^(32 i) 383 * = 2 + log_2 o + 32 i. 384 */ 385 __CTASSERT(CHAR_BIT*sizeof n <= 386 (2 + ilog2(crypto_core_OUTPUTWORDS) + 32*crypto_core_INPUTWORDS)); 387 388 p8 = buf; 389 p32 = (uint32_t *)roundup2((uintptr_t)p8, sizeof(uint32_t)); 390 ni = (uint8_t *)p32 - p8; 391 if (n < ni) 392 ni = n; 393 nb = (n - ni) / sizeof block; 394 nf = (n - ni) % sizeof block; 395 396 KASSERT(((uintptr_t)p32 & 3) == 0); 397 KASSERT(ni <= n); 398 KASSERT(nb <= (n / sizeof block)); 399 KASSERT(nf <= n); 400 KASSERT(n == (ni + (nb * sizeof block) + nf)); 401 KASSERT(ni < sizeof(uint32_t)); 402 KASSERT(nf < sizeof block); 403 404 if (ni) { 405 crypto_core(block, nonce, seed, crypto_core_constant32); 406 nonce[0]++; 407 (void)memcpy(p8, block, ni); 408 } 409 while (nb--) { 410 crypto_core(p32, nonce, seed, crypto_core_constant32); 411 if (++nonce[0] == 0) 412 nonce[1]++; 413 p32 += crypto_core_OUTPUTWORDS; 414 } 415 if (nf) { 416 crypto_core(block, nonce, seed, crypto_core_constant32); 417 if (++nonce[0] == 0) 418 nonce[1]++; 419 (void)memcpy(p32, block, nf); 420 } 421 422 if (ni | nf) 423 (void)explicit_memset(block, 0, sizeof block); 424 } 425 426 /* Public API */ 427 428 uint32_t 429 cprng_fast32(void) 430 { 431 struct cprng_fast *cprng; 432 uint32_t v; 433 int s; 434 435 s = cprng_fast_get(&cprng); 436 v = cprng_fast_word(cprng); 437 cprng_fast_put(cprng, s); 438 439 return v; 440 } 441 442 uint64_t 443 cprng_fast64(void) 444 { 445 struct cprng_fast *cprng; 446 uint32_t hi, lo; 447 int s; 448 449 s = cprng_fast_get(&cprng); 450 hi = cprng_fast_word(cprng); 451 lo = cprng_fast_word(cprng); 452 cprng_fast_put(cprng, s); 453 454 return ((uint64_t)hi << 32) | lo; 455 } 456 457 static void 458 cprng_fast_buf_short(void *buf, size_t len) 459 { 460 struct cprng_fast *cprng; 461 int s; 462 463 s = cprng_fast_get(&cprng); 464 cprng_fast_buf(cprng, buf, len); 465 cprng_fast_put(cprng, s); 466 } 467 468 static __noinline void 469 cprng_fast_buf_long(void *buf, size_t len) 470 { 471 uint32_t seed[crypto_core_KEYWORDS]; 472 struct cprng_fast *cprng; 473 int s; 474 475 s = cprng_fast_get(&cprng); 476 cprng_fast_buf(cprng, seed, sizeof seed); 477 cprng_fast_put(cprng, s); 478 479 crypto_onetimestream(seed, buf, len); 480 481 (void)explicit_memset(seed, 0, sizeof seed); 482 } 483 484 size_t 485 cprng_fast(void *buf, size_t len) 486 { 487 488 /* 489 * We don't want to hog the CPU, so we use the short version, 490 * to generate output without preemption, only if we can do it 491 * with at most one crypto_core. 492 */ 493 if (len <= (sizeof(uint32_t) * crypto_core_OUTPUTWORDS)) 494 cprng_fast_buf_short(buf, len); 495 else 496 cprng_fast_buf_long(buf, len); 497 498 return len; 499 } 500