/*	$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}
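
/*
 * Example: a minimal usage sketch (not compiled; the demo key and
 * plaintext bytes are arbitrary).  The struct aesenc layout and the
 * 10-round count for a 16-byte key come from the code in this file.
 */
#if 0
	static const uint8_t key[16] = { 0 };	/* arbitrary demo key */
	static const uint8_t pt[16] = { 0 };	/* arbitrary demo block */
	uint8_t ct[16];
	struct aesenc enc;

	aes_sse2_setkey(enc.aese_aes.aes_rk64, key, 10);
	aes_sse2_enc(&enc, pt, ct, 10);
#endif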

void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
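
/*
 * Note: the bitsliced core always processes four 128-bit lanes at a
 * time, so the single-block aes_sse2_enc/aes_sse2_dec paths pay for
 * four block encryptions to produce one.  The bulk-mode routines
 * below keep more lanes busy where the mode permits (four for CBC
 * decryption and XTS, two for CCM).
 */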

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block. */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV. */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV. */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block. */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV. */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV. */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block. */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV. */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if not an even multiple of four. */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}

		/* Decrypt. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up upper cipher block from cv.
		 * 2. Load lower cipher block into cv and set it up.
		 * 3. Decrypt.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block. */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks. */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block. */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block. */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}
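
/*
 * Reference for aes_sse2_xts_update above (a scalar sketch, not
 * compiled): the XTS tweak is multiplied by x in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1 between blocks, i.e. shifted left one
 * bit, folding a carry out of bit 127 back in as 0x87.  The SSE2
 * version computes both 64-bit lane carries branchlessly by building
 * masks from the lanes' high bits and swapping them across lanes.
 */
#if 0
static inline void
xts_update_ref(uint64_t t[static 2])	/* t[0] holds bits 0..63 */
{
	uint64_t carry0 = t[0] >> 63;	/* bit 63 carries into t[1] */
	uint64_t carry1 = t[1] >> 63;	/* bit 127 folds back as 0x87 */

	t[1] = (t[1] << 1) | carry0;
	t[0] = (t[0] << 1) ^ (carry1 ? 0x87 : 0);
}
#endif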

static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/*
	 * Handle the first one to three blocks separately if the
	 * block count is not a multiple of four.
	 */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}
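
/*
 * Both XTS routines write the updated tweak back to `tweak', so a
 * large request may be split into consecutive calls that share the
 * tweak buffer -- a sketch with hypothetical 512-byte halves:
 *
 *	aes_sse2_xts_enc(enc, in, out, 512, tweak, nrounds);
 *	aes_sse2_xts_enc(enc, in + 512, out + 512, 512, tweak, nrounds);
 */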

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak. */
	t[0] = _mm_loadu_epi8(tweak);

	/*
	 * Handle the first one to three blocks separately if the
	 * block count is not a multiple of four.
	 */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs. */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs. */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block. */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak. */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator. */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator. */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/* Load initial counter block, big-endian so we can increment it. */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/*
	 * Set the other two blocks to garbage -- can't take advantage
	 * of those lanes because the CBC-MAC chain is serial.
	 */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator. */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter. */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output. */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator. */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter. */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}
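
/*
 * Layout note: both CCM routines treat authctr[static 32] as two
 * 16-byte halves.  authctr[0..15] holds the running CBC-MAC
 * authenticator; authctr[16..31] holds the counter block, whose last
 * 32-bit word is big-endian on the wire and is incremented once per
 * block.  The first three counter words are never incremented, so
 * loading them with le32dec and rebuilding the block with
 * _mm_set_epi32 just round-trips their bytes on little-endian x86.
 */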

void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing. */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial counter block, big-endian so we can increment it. */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter. */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR. */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator. */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	for (;; in += 16, out += 16) {
		/* Decrypt the block. */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator. */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext. */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop. */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter. */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR. */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate last plaintext. We're only doing this for the
	 * authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3]. (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator. */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter. */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers. */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}