/*	$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Permutation-based AES using SSSE3, derived from Mike Hamburg's VPAES
 * software, at <https://crypto.stanford.edu/vpaes/>, described in
 *
 *	Mike Hamburg, `Accelerating AES with Vector Permute
 *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
 *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
 *	Springer LNCS 5747, pp. 18-32.
 *
 *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
 */
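
/*
 * Rough sketch of the approach: SSSE3's PSHUFB performs sixteen parallel
 * lookups into a 16-byte table, indexed by the low four bits of each
 * selector byte (with the selector's top bit zeroing that lane).  VPAES
 * therefore splits every state byte into its two nibbles and replaces
 * the 256-entry AES S-box with a handful of 16-entry tables over GF(16),
 * roughly
 *
 *	lo = x & 0x0f (per byte)
 *	hi = x >> 4   (per byte)
 *	y  = pshufb(table_lo, lo) ^ pshufb(table_hi, hi)
 *
 * where table_lo/table_hi stand for one of the 16-byte table pairs
 * defined below.  Most of the 128-bit constants that follow are such
 * tables or byte permutations, stored as two 64-bit halves.
 */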

#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: aes_ssse3.c,v 1.2 2020/06/30 20:32:11 riastradh Exp $");

#include <sys/types.h>

#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <err.h>
#define	panic(fmt, args...)	err(1, fmt, ##args)
#endif

#include "aes_ssse3_impl.h"

static const union m128const {
	uint64_t u64[2];
	__m128i m;
}
mc_forward[4] = {
	{.u64 = {0x0407060500030201, 0x0C0F0E0D080B0A09}},
	{.u64 = {0x080B0A0904070605, 0x000302010C0F0E0D}},
	{.u64 = {0x0C0F0E0D080B0A09, 0x0407060500030201}},
	{.u64 = {0x000302010C0F0E0D, 0x080B0A0904070605}},
},
mc_backward[4] = {
	{.u64 = {0x0605040702010003, 0x0E0D0C0F0A09080B}},
	{.u64 = {0x020100030E0D0C0F, 0x0A09080B06050407}},
	{.u64 = {0x0E0D0C0F0A09080B, 0x0605040702010003}},
	{.u64 = {0x0A09080B06050407, 0x020100030E0D0C0F}},
},
ipt[2] = {
	{.u64 = {0xC2B2E8985A2A7000, 0xCABAE09052227808}},
	{.u64 = {0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81}},
},
opt[2] = {
	{.u64 = {0xFF9F4929D6B66000, 0xF7974121DEBE6808}},
	{.u64 = {0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0}},
},
dipt[2] = {
	{.u64 = {0x0F505B040B545F00, 0x154A411E114E451A}},
	{.u64 = {0x86E383E660056500, 0x12771772F491F194}},
},
sb1[2] = {
	{.u64 = {0xB19BE18FCB503E00, 0xA5DF7A6E142AF544}},
	{.u64 = {0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF}},
},
sb2[2] = {
	{.u64 = {0xE27A93C60B712400, 0x5EB7E955BC982FCD}},
	{.u64 = {0x69EB88400AE12900, 0xC2A163C8AB82234A}},
},
sbo[2] = {
	{.u64 = {0xD0D26D176FBDC700, 0x15AABF7AC502A878}},
	{.u64 = {0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA}},
},
dsb9[2] = {
	{.u64 = {0x851C03539A86D600, 0xCAD51F504F994CC9}},
	{.u64 = {0xC03B1789ECD74900, 0x725E2C9EB2FBA565}},
},
dsbd[2] = {
	{.u64 = {0x7D57CCDFE6B1A200, 0xF56E9B13882A4439}},
	{.u64 = {0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3}},
},
dsbb[2] = {
	{.u64 = {0xD022649296B44200, 0x602646F6B0F2D404}},
	{.u64 = {0xC19498A6CD596700, 0xF3FF0C3E3255AA6B}},
},
dsbe[2] = {
	{.u64 = {0x46F2929626D4D000, 0x2242600464B4F6B0}},
	{.u64 = {0x0C55A6CDFFAAC100, 0x9467F36B98593E32}},
},
dsbo[2] = {
	{.u64 = {0x1387EA537EF94000, 0xC7AA6DB9D4943E2D}},
	{.u64 = {0x12D7560F93441D00, 0xCA4B8159D8C58E9C}},
},
dks1[2] = {
	{.u64 = {0xB6116FC87ED9A700, 0x4AED933482255BFC}},
	{.u64 = {0x4576516227143300, 0x8BB89FACE9DAFDCE}},
},
dks2[2] = {
	{.u64 = {0x27438FEBCCA86400, 0x4622EE8AADC90561}},
	{.u64 = {0x815C13CE4F92DD00, 0x73AEE13CBD602FF2}},
},
dks3[2] = {
	{.u64 = {0x03C4C50201C6C700, 0xF83F3EF9FA3D3CFB}},
	{.u64 = {0xEE1921D638CFF700, 0xA5526A9D7384BC4B}},
},
dks4[2] = {
	{.u64 = {0xE3C390B053732000, 0xA080D3F310306343}},
	{.u64 = {0xA0CA214B036982E8, 0x2F45AEC48CE60D67}},
},
deskew[2] = {
	{.u64 = {0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A}},
	{.u64 = {0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77}},
},
sr[4] = {
	{.u64 = {0x0706050403020100, 0x0F0E0D0C0B0A0908}},
	{.u64 = {0x030E09040F0A0500, 0x0B06010C07020D08}},
	{.u64 = {0x0F060D040B020900, 0x070E050C030A0108}},
	{.u64 = {0x0B0E0104070A0D00, 0x0306090C0F020508}},
},
rcon =	{.u64 = {0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81}},
s63 =	{.u64 = {0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B}},
of =	{.u64 = {0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F}},
inv =	{.u64 = {0x0E05060F0D080180, 0x040703090A0B0C02}},
inva =	{.u64 = {0x01040A060F0B0780, 0x030D0E0C02050809}};

static inline __m128i
loadroundkey(const uint32_t *rk32)
{
	return _mm_load_si128((const void *)rk32);
}

static inline void
storeroundkey(uint32_t *rk32, __m128i rk)
{
	_mm_store_si128((void *)rk32, rk);
}

/* Given abcdefgh, set *lo = 0b0d0f0h and *hi = 0a0c0e0g.  */
static inline void
bytes2nybbles(__m128i *restrict lo, __m128i *restrict hi, __m128i x)
{

	*lo = x & of.m;
	*hi = _mm_srli_epi32(x & ~of.m, 4);
}
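
/*
 * E.g., an input byte 0xAB yields 0x0B in the corresponding byte of *lo
 * and 0x0A in *hi.  Clearing the low nibbles (x & ~of.m) before the
 * 32-bit shift ensures no bits from a neighbouring byte are dragged into
 * the high half, so no per-byte shift instruction is needed.
 */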

/* Given 0p0q0r0s, return 0x0y0z0w where x = a/p, y = a/q, &c.  */
static inline __m128i
gf16_inva(__m128i x)
{
	return _mm_shuffle_epi8(inva.m, x);
}

/* Given 0p0q0r0s, return 0x0y0z0w where x = 1/p, y = 1/q, &c.  */
static inline __m128i
gf16_inv(__m128i x)
{
	return _mm_shuffle_epi8(inv.m, x);
}

/*
 * t is a pair of maps respectively from low and high nybbles to bytes.
 * Apply t to the nybbles, and add the results in GF(2).
 */
static __m128i
aes_schedule_transform(__m128i x, const union m128const t[static 2])
{
	__m128i lo, hi;

	bytes2nybbles(&lo, &hi, x);
	return _mm_shuffle_epi8(t[0].m, lo) ^ _mm_shuffle_epi8(t[1].m, hi);
}
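
/*
 * Per byte, the result is t[0][x & 0xf] ^ t[1][x >> 4].  The table pairs
 * ipt, opt, dipt, dks1-dks4, and deskew above are all applied through
 * this routine.
 */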

static inline void
subbytes(__m128i *io, __m128i *jo, __m128i x)
{
	__m128i k, i, ak, j;

	bytes2nybbles(&k, &i, x);
	ak = gf16_inva(k);
	j = i ^ k;
	*io = j ^ gf16_inv(ak ^ gf16_inv(i));
	*jo = i ^ gf16_inv(ak ^ gf16_inv(j));
}
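
/*
 * Per the VPAES construction, subbytes computes the inversion step of
 * the AES S-box in a GF(16) tower representation.  The two nybble-wise
 * outputs *io and *jo are not the S-box output themselves; they are used
 * as indices into a following pair of output tables (sb1/sb2/sbo for
 * encryption, dsb9/dsbd/dsbb/dsbe/dsbo for decryption) that finish the
 * job.
 */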

static __m128i
aes_schedule_low_round(__m128i rk, __m128i prk)
{
	__m128i io, jo;

	/* smear prk */
	prk ^= _mm_slli_si128(prk, 4);
	prk ^= _mm_slli_si128(prk, 8);
	prk ^= s63.m;

	/* subbytes */
	subbytes(&io, &jo, rk);
	rk = _mm_shuffle_epi8(sb1[0].m, io) ^ _mm_shuffle_epi8(sb1[1].m, jo);

	/* add in smeared stuff */
	return rk ^ prk;
}
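
/*
 * The two byte-shift/xor steps turn prk = (w0,w1,w2,w3) into the prefix
 * xors (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3), which is the standard AES
 * key-schedule recurrence w[i] = w[i-1] ^ w[i-4] unrolled across one
 * 128-bit block.  The s63 constant appears to compensate for the S-box's
 * affine constant in the permuted basis used here.
 */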

static __m128i
aes_schedule_round(__m128i rk, __m128i prk, __m128i *rcon_rot)
{

	/* extract rcon from rcon_rot */
	prk ^= _mm_alignr_epi8(_mm_setzero_si128(), *rcon_rot, 15);
	*rcon_rot = _mm_alignr_epi8(*rcon_rot, *rcon_rot, 15);

	/* rotate */
	rk = _mm_shuffle_epi32(rk, 0xff);
	rk = _mm_alignr_epi8(rk, rk, 1);

	return aes_schedule_low_round(rk, prk);
}
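
/*
 * One full key-schedule core: the alignr against zero pulls the current
 * round-constant byte out of *rcon_rot and xors it into prk, and the
 * second alignr rotates *rcon_rot by one byte to queue up the next
 * constant (stored in what appears to be a pre-transformed form rather
 * than the usual 01, 02, 04, ...).  The shuffle(0xff)/alignr pair
 * broadcasts the last 32-bit word and performs RotWord;
 * aes_schedule_low_round then supplies SubWord and the xor chain.
 */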

static __m128i
aes_schedule_mangle_enc(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x ^= s63.m;

	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;
	x = _mm_shuffle_epi8(x, mc_forward[0].m);
	y ^= x;

	return _mm_shuffle_epi8(y, sr_i);
}
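
/*
 * "Mangling" stores the round key in a massaged form: y accumulates x
 * permuted by mc_forward once, twice, and three times, and the result is
 * then permuted by the ShiftRows-style table sr_i.  Per the VPAES design
 * this folds work the encryption rounds would otherwise have to do into
 * the stored schedule, so aes_ssse3_enc1 can xor the round key in
 * directly.
 */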

static __m128i
aes_schedule_mangle_last_enc(__m128i x, __m128i sr_i)
{

	return aes_schedule_transform(_mm_shuffle_epi8(x, sr_i) ^ s63.m, opt);
}

static __m128i
aes_schedule_mangle_dec(__m128i x, __m128i sr_i)
{
	__m128i y = _mm_setzero_si128();

	x = aes_schedule_transform(x, dks1);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks2);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks3);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);
	x = aes_schedule_transform(x, dks4);
	y = _mm_shuffle_epi8(y ^ x, mc_forward[0].m);

	return _mm_shuffle_epi8(y, sr_i);
}

static __m128i
aes_schedule_mangle_last_dec(__m128i x)
{

	return aes_schedule_transform(x ^ s63.m, deskew);
}

static __m128i
aes_schedule_192_smear(__m128i prkhi, __m128i prk)
{
	__m128i rk;

	rk = prkhi;
	rk ^= _mm_shuffle_epi32(prkhi, 0x80);
	rk ^= _mm_shuffle_epi32(prk, 0xfe);

	return rk;
}
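
/*
 * AES-192 produces six 32-bit words per key-schedule iteration, i.e. one
 * and a half 128-bit round keys, so the 192-bit paths below run the
 * schedule core twice per loop iteration and use aes_schedule_192_smear
 * and aes_schedule_192_smearhi to stitch the odd half-blocks together.
 */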

static __m128i
aes_schedule_192_smearhi(__m128i rk)
{
	return (__m128i)_mm_movehl_ps((__m128)rk, _mm_setzero_ps());
}

void
aes_ssse3_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(_mm_loadu_epi8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4].m));
}
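
/*
 * nrounds selects the key size: 10, 12, and 14 rounds correspond to
 * 128-, 192-, and 256-bit keys respectively, so the switch above also
 * determines how many key bytes are consumed.  The schedule left in
 * enc->aese_aes.aes_rk is in the permuted ("mangled") basis and is only
 * intended for the matching aes_ssse3_enc1 routine, not for a generic
 * AES implementation.
 */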

void
aes_ssse3_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	__m128i mrk;		/* mangled round key */
	__m128i ork;		/* original round key */
	__m128i rk;		/* round key */
	__m128i prk;		/* previous round key */
	__m128i rcon_rot = rcon.m;
	unsigned i = nrounds == 12 ? 0 : 2;

	ork = _mm_loadu_epi8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, _mm_shuffle_epi8(ork, sr[i].m));
	rk32 -= 4;
	i ^= 3;

	switch (nrounds) {
	case 10:
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		__m128i prkhi;		/* high half of previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = _mm_alignr_epi8(prk, prkhi, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		__m128i pprk;		/* previous previous round key */

		prk = rk;
		rk = aes_schedule_transform(_mm_loadu_epi8(key + 16), ipt);
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4].m);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round */
			rk = _mm_shuffle_epi32(rk, 0xff);
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}

__m128i
aes_ssse3_enc1(const struct aesenc *enc, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	__m128i io, jo;
	unsigned rmod4 = 0;

	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		__m128i A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x);

		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;

		A = _mm_shuffle_epi8(sb1[0].m, io) ^
		    _mm_shuffle_epi8(sb1[1].m, jo);
		A ^= loadroundkey(rk32);
		A2 = _mm_shuffle_epi8(sb2[0].m, io) ^
		    _mm_shuffle_epi8(sb2[1].m, jo);
		A2_B = A2 ^ _mm_shuffle_epi8(A, mc_forward[rmod4].m);
		A2_B_D = A2_B ^ _mm_shuffle_epi8(A, mc_backward[rmod4].m);
		x = A2_B_D ^ _mm_shuffle_epi8(A2_B, mc_forward[rmod4].m);
	}
	x = _mm_shuffle_epi8(sbo[0].m, io) ^ _mm_shuffle_epi8(sbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[rmod4].m);
}
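
/*
 * Round structure (per the VPAES construction): the ipt transform moves
 * the block into the permuted basis, each middle round looks up two
 * S-box output shares A (sb1) and A2 (sb2) from the subbytes indices,
 * and the mc_forward/mc_backward shuffles recombine them so that
 * ShiftRows and MixColumns reduce to byte permutations and xors.  The
 * final round instead uses the sbo tables and the sr permutation,
 * matching the missing MixColumns of the last AES round.
 */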

__m128i
aes_ssse3_dec1(const struct aesdec *dec, __m128i x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	unsigned i = 3 & ~(nrounds - 1);
	__m128i io, jo, mc;

	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3].m;
	for (;;) {
		subbytes(&io, &jo, x);
		if (--nrounds == 0)
			break;

		x = _mm_shuffle_epi8(dsb9[0].m, io) ^
		    _mm_shuffle_epi8(dsb9[1].m, jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbd[0].m, io) ^
		    _mm_shuffle_epi8(dsbd[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbb[0].m, io) ^
		    _mm_shuffle_epi8(dsbb[1].m, jo);

		x = _mm_shuffle_epi8(x, mc);
		x ^= _mm_shuffle_epi8(dsbe[0].m, io) ^
		    _mm_shuffle_epi8(dsbe[1].m, jo);

		mc = _mm_alignr_epi8(mc, mc, 12);
	}
	x = _mm_shuffle_epi8(dsbo[0].m, io) ^ _mm_shuffle_epi8(dsbo[1].m, jo);
	x ^= loadroundkey(rk32);
	return _mm_shuffle_epi8(x, sr[i].m);
}
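
/*
 * Illustrative usage (a sketch; the variable names and calling context
 * are hypothetical, and in the kernel these routines are expected to run
 * with the FPU made available to the caller):
 *
 *	struct aesenc enc;
 *	uint8_t key[16], pt[16];	// AES-128: 16-byte key, 10 rounds
 *	__m128i ct;
 *
 *	aes_ssse3_setenckey(&enc, key, 10);
 *	ct = aes_ssse3_enc1(&enc, _mm_loadu_epi8(pt), 10);
 *
 * For AES-192/AES-256 pass 24/32 key bytes with nrounds = 12/14.
 */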
562