/*	$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_sse2_subr.c,v 1.4 2020/09/08 22:48:24 riastradh Exp $");

#ifdef _KERNEL
#include <sys/systm.h>
#include <lib/libkern/libkern.h>
#else
#include <err.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#define	KASSERT			assert
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include <crypto/aes/aes.h>
#include <crypto/aes/arch/x86/aes_sse2.h>

#include "aes_sse2_impl.h"

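/*
 * Overview of the calling convention used throughout this file: each
 * routine expands the 64-bit round keys with aes_sse2_skey_expand,
 * packs up to four 128-bit blocks into the bitsliced representation
 * with aes_sse2_interleave_in and aes_sse2_ortho, runs the bitsliced
 * rounds with aes_sse2_bitslice_encrypt/decrypt, and undoes the
 * transformation with aes_sse2_ortho and aes_sse2_interleave_out.
 * Unused slots in q[4] are filled with zero (or don't-care) blocks
 * whose outputs are discarded.
 */
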
void
aes_sse2_setkey(uint64_t rk[static 30], const void *key, uint32_t nrounds)
{
	size_t key_len;

	switch (nrounds) {
	case 10:
		key_len = 16;
		break;
	case 12:
		key_len = 24;
		break;
	case 14:
		key_len = 32;
		break;
	default:
		panic("invalid AES nrounds: %u", nrounds);
	}

	aes_sse2_keysched(rk, key, key_len);
}

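/*
 * Illustrative usage sketch (not part of this file; assumes
 * AES_128_NROUNDS from <crypto/aes/aes.h>): encrypt one block with
 * AES-128 by scheduling the key into the aes_rk64 round-key words
 * that the routines below consume.
 *
 *	struct aesenc enc;
 *	uint8_t key[16], in[16], out[16];
 *
 *	aes_sse2_setkey(enc.aese_aes.aes_rk64, key, AES_128_NROUNDS);
 *	aes_sse2_enc(&enc, in, out, AES_128_NROUNDS);
 */
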
void
aes_sse2_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, encrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

void
aes_sse2_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load input block interleaved with garbage blocks.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(in));
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Transform to bitslice, decrypt, transform from bitslice.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Store output block.  */
	_mm_storeu_epi8(out, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

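/*
 * CBC encryption is inherently sequential -- each input block must be
 * XORed with the previous ciphertext block before it can be encrypted
 * -- so only bitslice lane q[0] carries real data here; the other
 * lanes hold don't-care values whose outputs are never read.
 */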
void
aes_sse2_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t iv[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load the IV.  */
	cv = _mm_loadu_epi8(iv);

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Load input block and apply CV.  */
		q[0] = aes_sse2_interleave_in(cv ^ _mm_loadu_epi8(in));

		/* Transform to bitslice, encrypt, transform from bitslice.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Remember ciphertext as CV and store output block.  */
		cv = aes_sse2_interleave_out(q[0]);
		_mm_storeu_epi8(out, cv);
	}

	/* Store updated IV.  */
	_mm_storeu_epi8(iv, cv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

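/*
 * CBC decryption parallelizes, since all the ciphertext blocks are in
 * hand up front.  aes_sse2_cbc_dec therefore walks the buffer from
 * the end toward the beginning, peeling off a sub-four-block tail
 * first and then decrypting four blocks per bitsliced batch.
 */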
void
aes_sse2_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t ivp[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i cv, iv, w;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load the IV.  */
	iv = _mm_loadu_epi8(ivp);

	/* Load the last cipher block.  */
	cv = _mm_loadu_epi8(in + nbytes - 16);

	/* Store the updated IV.  */
	_mm_storeu_epi8(ivp, cv);

	/* Process the last blocks if not an even multiple of four.  */
	if (nbytes % (4*16)) {
		unsigned n = (nbytes/16) % 4;

		KASSERT(n > 0);
		KASSERT(n < 4);

		q[1] = q[2] = q[3] = _mm_setzero_si128();
		q[n - 1] = aes_sse2_interleave_in(cv);
		switch (nbytes % 64) {
		case 48:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[1] = aes_sse2_interleave_in(w);
			w = _mm_loadu_epi8(in + nbytes - 48);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 32:
			w = _mm_loadu_epi8(in + nbytes - 32);
			q[0] = aes_sse2_interleave_in(w);
			break;
		case 16:
			break;
		}

		/* Decrypt.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		do {
			n--;
			w = aes_sse2_interleave_out(q[n]);
			if ((nbytes -= 16) == 0)
				goto out;
			cv = _mm_loadu_epi8(in + nbytes - 16);
			_mm_storeu_epi8(out + nbytes, w ^ cv);
		} while (n);
	}

	for (;;) {
		KASSERT(nbytes >= 64);
		nbytes -= 64;

		/*
		 * 1. Set up the upper cipher block from cv.
		 * 2. Load the remaining cipher blocks from the input.
		 * 3. Decrypt.
		 */
		q[3] = aes_sse2_interleave_in(cv);

		w = _mm_loadu_epi8(in + nbytes + 4*8);
		q[2] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*4);
		q[1] = aes_sse2_interleave_in(w);

		w = _mm_loadu_epi8(in + nbytes + 4*0);
		q[0] = aes_sse2_interleave_in(w);

		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the upper output block.  */
		w = aes_sse2_interleave_out(q[3]);
		cv = _mm_loadu_epi8(in + nbytes + 4*8);
		_mm_storeu_epi8(out + nbytes + 4*12, w ^ cv);

		/* Store the middle output blocks.  */
		w = aes_sse2_interleave_out(q[2]);
		cv = _mm_loadu_epi8(in + nbytes + 4*4);
		_mm_storeu_epi8(out + nbytes + 4*8, w ^ cv);

		w = aes_sse2_interleave_out(q[1]);
		cv = _mm_loadu_epi8(in + nbytes + 4*0);
		_mm_storeu_epi8(out + nbytes + 4*4, w ^ cv);

		/*
		 * Get the first output block, but don't load the CV
		 * yet -- it might be the previous ciphertext block, or
		 * it might be the IV.
		 */
		w = aes_sse2_interleave_out(q[0]);

		/* Stop if we've reached the first output block.  */
		if (nbytes == 0)
			goto out;

		/*
		 * Load the preceding cipher block, and apply it as the
		 * chaining value to this one.
		 */
		cv = _mm_loadu_epi8(in + nbytes - 16);
		_mm_storeu_epi8(out + nbytes, w ^ cv);
	}

out:	/* Store the first output block.  */
	_mm_storeu_epi8(out, w ^ iv);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

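/*
 * The XTS tweak is updated for each block by multiplying it by x in
 * GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1,
 * i.e. a 128-bit left shift that folds any carry out of the top back
 * in as 0x87.  SSE2 has no 128-bit shift, so aes_sse2_xts_update
 * shifts the two 64-bit lanes separately and patches up the carries
 * branchlessly: the mask m records which lanes overflowed, swapped
 * across halves, and c supplies the carry-in -- 1 into the high lane
 * for a low-lane overflow, 0x87 into the low lane for a high-lane
 * overflow.
 */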
static inline __m128i
aes_sse2_xts_update(__m128i t)
{
	const __m128i one = _mm_set_epi64x(1, 1);
	__m128i s, m, c;

	s = _mm_srli_epi64(t, 63);	/* 1 if high bit set else 0 */
	m = _mm_sub_epi64(s, one);	/* 0 if high bit set else -1 */
	m = _mm_shuffle_epi32(m, 0x4e);	/* swap halves */
	c = _mm_set_epi64x(1, 0x87);	/* carry */

	return _mm_slli_epi64(t, 1) ^ (c & ~m);
}

static int
aes_sse2_xts_update_selftest(void)
{
	static const struct {
		uint32_t in[4], out[4];
	} cases[] = {
		[0] = { {1}, {2} },
		[1] = { {0x80000000U,0,0,0}, {0,1,0,0} },
		[2] = { {0,0x80000000U,0,0}, {0,0,1,0} },
		[3] = { {0,0,0x80000000U,0}, {0,0,0,1} },
		[4] = { {0,0,0,0x80000000U}, {0x87,0,0,0} },
		[5] = { {0,0x80000000U,0,0x80000000U}, {0x87,0,1,0} },
	};
	unsigned i;
	uint32_t t[4];
	int result = 0;

	for (i = 0; i < sizeof(cases)/sizeof(cases[0]); i++) {
		t[0] = cases[i].in[0];
		t[1] = cases[i].in[1];
		t[2] = cases[i].in[2];
		t[3] = cases[i].in[3];
		_mm_storeu_epi8(t, aes_sse2_xts_update(_mm_loadu_epi8(t)));
		if (t[0] != cases[i].out[0] ||
		    t[1] != cases[i].out[1] ||
		    t[2] != cases[i].out[2] ||
		    t[3] != cases[i].out[3]) {
			printf("%s %u:"
			    " %"PRIx32" %"PRIx32" %"PRIx32" %"PRIx32"\n",
			    __func__, i, t[0], t[1], t[2], t[3]);
			result = -1;
		}
	}

	return result;
}

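/*
 * Unlike CBC, XTS parallelizes in both directions, since each block
 * depends only on its own tweak.  The encrypt and decrypt routines
 * below therefore peel off nbytes % 64 as a leading partial group,
 * computing successive tweaks as they go, so that the main loop
 * always handles four full blocks (and five tweaks, t[0] through
 * t[4]) per bitsliced batch.
 */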
void
aes_sse2_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading blocks separately if not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Encrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Encrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next group of four blocks.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

void
aes_sse2_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t tweak[static 16],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i w;
	__m128i t[5];
	unsigned i;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, dec->aesd_aes.aes_rk64);

	/* Load tweak.  */
	t[0] = _mm_loadu_epi8(tweak);

	/* Handle the leading blocks separately if not a multiple of four.  */
	if (nbytes % (4*16)) {
		/* Load up the tweaked inputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}
		for (; i < 4; i++)
			q[i] = _mm_setzero_si128();

		/* Decrypt up to four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < (nbytes/16) % 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next block.  */
		t[0] = t[i];
		in += nbytes % (4*16);
		out += nbytes % (4*16);
		nbytes -= nbytes % (4*16);
		if (nbytes == 0)
			goto out;
	}

	do {
		KASSERT(nbytes % 64 == 0);
		KASSERT(nbytes >= 64);

		/* Load up the tweaked inputs.  */
		for (i = 0; i < 4; i++) {
			w = _mm_loadu_epi8(in + 16*i) ^ t[i];
			q[i] = aes_sse2_interleave_in(w);
			t[i + 1] = aes_sse2_xts_update(t[i]);
		}

		/* Decrypt four blocks.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_decrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Store the tweaked outputs.  */
		for (i = 0; i < 4; i++) {
			w = aes_sse2_interleave_out(q[i]);
			_mm_storeu_epi8(out + 16*i, w ^ t[i]);
		}

		/* Advance to the next group of four blocks.  */
		t[0] = t[4];
		in += 64;
		out += 64;
		nbytes -= 64;
	} while (nbytes);

out:	/* Store the updated tweak.  */
	_mm_storeu_epi8(tweak, t[0]);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
	explicit_memset(t, 0, sizeof t);
}

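/*
 * CBC-MAC is strictly sequential -- each block must be absorbed into
 * the authenticator and encrypted before the next can be processed --
 * so, as in CBC encryption, only lane q[0] does useful work here.
 */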
void
aes_sse2_cbcmac_update1(const struct aesenc *enc, const uint8_t in[static 16],
    size_t nbytes, uint8_t auth[static 16], uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Load initial authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(auth));

	for (; nbytes; nbytes -= 16, in += 16) {
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/* Store updated authenticator.  */
	_mm_storeu_epi8(auth, aes_sse2_interleave_out(q[0]));

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

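/*
 * The 32-byte authctr buffer holds the CCM state: bytes 0-15 are the
 * running CBC-MAC authenticator, bytes 16-31 the CTR-mode counter
 * block.  Only the last 32-bit word of the counter block increments,
 * and, per CCM's convention, it is stored big-endian -- hence it is
 * decoded with be32dec, incremented in host order, and byte-swapped
 * back into the vector.
 */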
void
aes_sse2_ccm_enc1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/* Set first block to authenticator.  */
	q[0] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	/*
	 * Load the initial counter block; its last word is stored
	 * big-endian, so decode it in order to increment it.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Set the other blocks to garbage -- only two lanes carry data.  */
	q[2] = q[3] = _mm_setzero_si128();

	for (; nbytes; nbytes -= 16, in += 16, out += 16) {
		/* Update authenticator.  */
		q[0] ^= aes_sse2_interleave_in(_mm_loadu_epi8(in));

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[1] = aes_sse2_interleave_in(ctr);

		/* Encrypt authenticator and counter.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);

		/* Encrypt with CTR output.  */
		_mm_storeu_epi8(out,
		    _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[1]));
	}

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[0]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

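/*
 * Decryption cannot authenticate a block until it has been decrypted,
 * so aes_sse2_ccm_dec1 pipelines the two AES invocations: lane q[0]
 * encrypts the next counter block while lane q[1] absorbs the
 * plaintext recovered in the previous iteration into the CBC-MAC,
 * both in a single bitsliced batch.
 */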
void
aes_sse2_ccm_dec1(const struct aesenc *enc, const uint8_t in[static 16],
    uint8_t out[static 16], size_t nbytes, uint8_t authctr[static 32],
    uint32_t nrounds)
{
	uint64_t sk_exp[120];
	__m128i q[4];
	__m128i ctr, block;
	uint32_t c0, c1, c2, c3;

	KASSERT(nbytes);
	KASSERT(nbytes % 16 == 0);

	/* Expand round keys for bitslicing.  */
	aes_sse2_skey_expand(sk_exp, nrounds, enc->aese_aes.aes_rk64);

	/*
	 * Load the initial counter block; its last word is stored
	 * big-endian, so decode it in order to increment it.
	 */
	c0 = le32dec(authctr + 16 + 4*0);
	c1 = le32dec(authctr + 16 + 4*1);
	c2 = le32dec(authctr + 16 + 4*2);
	c3 = be32dec(authctr + 16 + 4*3);

	/* Increment 32-bit counter.  */
	ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
	q[0] = aes_sse2_interleave_in(ctr);

	/*
	 * Set the other blocks to garbage -- we don't have any
	 * plaintext to authenticate yet.
	 */
	q[1] = q[2] = q[3] = _mm_setzero_si128();

	/* Encrypt first CTR.  */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Load the initial authenticator.  */
	q[1] = aes_sse2_interleave_in(_mm_loadu_epi8(authctr));

	for (;; in += 16, out += 16) {
		/* Decrypt the block.  */
		block = _mm_loadu_epi8(in) ^ aes_sse2_interleave_out(q[0]);

		/* Update authenticator.  */
		q[1] ^= aes_sse2_interleave_in(block);

		/* Store plaintext.  */
		_mm_storeu_epi8(out, block);

		/* If this is the last block, stop.  */
		if ((nbytes -= 16) == 0)
			break;

		/* Increment 32-bit counter.  */
		ctr = _mm_set_epi32(bswap32(++c3), c2, c1, c0);
		q[0] = aes_sse2_interleave_in(ctr);

		/* Authenticate previous plaintext, encrypt next CTR.  */
		aes_sse2_ortho(q);
		aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
		aes_sse2_ortho(q);
	}

	/*
	 * Authenticate the last plaintext.  We're only doing this for
	 * the authenticator, not for the counter, so don't bother to
	 * initialize q[0], q[2], q[3].  (Even for the sake of
	 * sanitizers, they're already initialized to something by
	 * now.)
	 */
	aes_sse2_ortho(q);
	aes_sse2_bitslice_encrypt(nrounds, sk_exp, q);
	aes_sse2_ortho(q);

	/* Update authenticator.  */
	_mm_storeu_epi8(authctr, aes_sse2_interleave_out(q[1]));

	/* Update counter.  */
	be32enc(authctr + 16 + 4*3, c3);

	/* Paranoia: Zero temporary buffers.  */
	explicit_memset(sk_exp, 0, sizeof sk_exp);
	explicit_memset(q, 0, sizeof q);
}

int
aes_sse2_selftest(void)
{

	if (aes_sse2_xts_update_selftest())
		return -1;

	/* XXX test aes_sse2_bitslice_decrypt */
	/* XXX test aes_sse2_bitslice_encrypt */
	/* XXX test aes_sse2_keysched */
	/* XXX test aes_sse2_ortho */
	/* XXX test aes_sse2_skey_expand */

	return 0;
}