1 /*	$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * Permutation-based AES using NEON, derived from Mike Hamburg's VPAES
31  * software, at <https://crypto.stanford.edu/vpaes/>, described in
32  *
33  *	Mike Hamburg, `Accelerating AES with Vector Permute
34  *	Instructions', in Christophe Clavier and Kris Gaj (eds.),
35  *	Cryptographic Hardware and Embedded Systems -- CHES 2009,
36  *	Springer LNCS 5747, pp. 18-32.
37  *
38  *	https://link.springer.com/chapter/10.1007/978-3-642-04138-9_2
39  */
40 
41 #include <sys/cdefs.h>
42 __KERNEL_RCSID(1, "$NetBSD: aes_neon.c,v 1.6 2020/11/21 08:09:21 rin Exp $");
43 
44 #include <sys/types.h>
45 
46 #ifdef _KERNEL
47 #include <sys/systm.h>
48 #else
49 #include <err.h>
50 #define	panic(fmt, args...)		err(1, fmt, ##args)
51 #endif
52 
53 #include "aes_neon_impl.h"
54 
55 #ifdef __aarch64__
56 #define	__aarch64_used
57 #else
58 #define	__aarch64_used	__unused
59 #endif
60 
/*
 * Constant 16-byte tables for the vector-permute AES code in this file.
 * Every table is consumed through vqtbl1q_u8, either as the table being
 * indexed or as the vector of byte indices.  Several declarators carry
 * __aarch64_used, which expands to nothing on aarch64 and to __unused
 * otherwise.
 */
static const uint8x16_t
/*
 * Byte-index permutations.  Entry 0 is used by the key-schedule mangling
 * routines, entry rmod4 by the encryption rounds, and entry 3 is the
 * starting permutation of the decryption rounds.
 */
mc_forward[4] = {
	VQ_N_U8(0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04,
	    0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C),
	VQ_N_U8(0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08,
	    0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00),
	VQ_N_U8(0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C,
	    0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04),
	VQ_N_U8(0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00,
	    0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08),
},
/* Byte-index permutations used, indexed by rmod4, in the encryption rounds. */
mc_backward[4] __aarch64_used = {
	VQ_N_U8(0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06,
	    0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E),
	VQ_N_U8(0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02,
	    0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A),
	VQ_N_U8(0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E,
	    0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06),
	VQ_N_U8(0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A,
	    0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02),
},
/*
 * Low/high nybble maps for aes_schedule_transform: the input transform
 * applied to key material and to blocks entering encryption.
 */
ipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x70,0x2A,0x5A,0x98,0xE8,0xB2,0xC2,
	    0x08,0x78,0x22,0x52,0x90,0xE0,0xBA,0xCA),
	VQ_N_U8(0x00,0x4D,0x7C,0x31,0x7D,0x30,0x01,0x4C,
	    0x81,0xCC,0xFD,0xB0,0xFC,0xB1,0x80,0xCD),
},
/* Nybble maps applied by aes_schedule_mangle_last_enc. */
opt[2] = {
	VQ_N_U8(0x00,0x60,0xB6,0xD6,0x29,0x49,0x9F,0xFF,
	    0x08,0x68,0xBE,0xDE,0x21,0x41,0x97,0xF7),
	VQ_N_U8(0x00,0xEC,0xBC,0x50,0x51,0xBD,0xED,0x01,
	    0xE0,0x0C,0x5C,0xB0,0xB1,0x5D,0x0D,0xE1),
},
/* Nybble maps applied to blocks entering decryption. */
dipt[2] __aarch64_used = {
	VQ_N_U8(0x00,0x5F,0x54,0x0B,0x04,0x5B,0x50,0x0F,
	    0x1A,0x45,0x4E,0x11,0x1E,0x41,0x4A,0x15),
	VQ_N_U8(0x00,0x65,0x05,0x60,0xE6,0x83,0xE3,0x86,
	    0x94,0xF1,0x91,0xF4,0x72,0x17,0x77,0x12),
},
/*
 * sb1, sb2, sbo: table pairs indexed by the io/jo outputs of subbytes.
 * sb1 and sb2 feed the middle encryption rounds (sb1 also the key
 * schedule); sbo produces the output of the last encryption round.
 */
sb1[2] __aarch64_used = {
	VQ_N_U8(0x00,0x3E,0x50,0xCB,0x8F,0xE1,0x9B,0xB1,
	    0x44,0xF5,0x2A,0x14,0x6E,0x7A,0xDF,0xA5),
	VQ_N_U8(0x00,0x23,0xE2,0xFA,0x15,0xD4,0x18,0x36,
	    0xEF,0xD9,0x2E,0x0D,0xC1,0xCC,0xF7,0x3B),
},
sb2[2] __aarch64_used = {
	VQ_N_U8(0x00,0x24,0x71,0x0B,0xC6,0x93,0x7A,0xE2,
	    0xCD,0x2F,0x98,0xBC,0x55,0xE9,0xB7,0x5E),
	VQ_N_U8(0x00,0x29,0xE1,0x0A,0x40,0x88,0xEB,0x69,
	    0x4A,0x23,0x82,0xAB,0xC8,0x63,0xA1,0xC2),
},
sbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0xC7,0xBD,0x6F,0x17,0x6D,0xD2,0xD0,
	    0x78,0xA8,0x02,0xC5,0x7A,0xBF,0xAA,0x15),
	VQ_N_U8(0x00,0x6A,0xBB,0x5F,0xA5,0x74,0xE4,0xCF,
	    0xFA,0x35,0x2B,0x41,0xD1,0x90,0x1E,0x8E),
},
/*
 * dsb9, dsbd, dsbb, dsbe, dsbo: table pairs indexed by the io/jo outputs
 * of subbytes in the decryption rounds; dsbo is used for the last round.
 */
dsb9[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD6,0x86,0x9A,0x53,0x03,0x1C,0x85,
	    0xC9,0x4C,0x99,0x4F,0x50,0x1F,0xD5,0xCA),
	VQ_N_U8(0x00,0x49,0xD7,0xEC,0x89,0x17,0x3B,0xC0,
	    0x65,0xA5,0xFB,0xB2,0x9E,0x2C,0x5E,0x72),
},
dsbd[2] __aarch64_used = {
	VQ_N_U8(0x00,0xA2,0xB1,0xE6,0xDF,0xCC,0x57,0x7D,
	    0x39,0x44,0x2A,0x88,0x13,0x9B,0x6E,0xF5),
	VQ_N_U8(0x00,0xCB,0xC6,0x24,0xF7,0xFA,0xE2,0x3C,
	    0xD3,0xEF,0xDE,0x15,0x0D,0x18,0x31,0x29),
},
dsbb[2] __aarch64_used = {
	VQ_N_U8(0x00,0x42,0xB4,0x96,0x92,0x64,0x22,0xD0,
	    0x04,0xD4,0xF2,0xB0,0xF6,0x46,0x26,0x60),
	VQ_N_U8(0x00,0x67,0x59,0xCD,0xA6,0x98,0x94,0xC1,
	    0x6B,0xAA,0x55,0x32,0x3E,0x0C,0xFF,0xF3),
},
dsbe[2] __aarch64_used = {
	VQ_N_U8(0x00,0xD0,0xD4,0x26,0x96,0x92,0xF2,0x46,
	    0xB0,0xF6,0xB4,0x64,0x04,0x60,0x42,0x22),
	VQ_N_U8(0x00,0xC1,0xAA,0xFF,0xCD,0xA6,0x55,0x0C,
	    0x32,0x3E,0x59,0x98,0x6B,0xF3,0x67,0x94),
},
dsbo[2] __aarch64_used = {
	VQ_N_U8(0x00,0x40,0xF9,0x7E,0x53,0xEA,0x87,0x13,
	    0x2D,0x3E,0x94,0xD4,0xB9,0x6D,0xAA,0xC7),
	VQ_N_U8(0x00,0x1D,0x44,0x93,0x0F,0x56,0xD7,0x12,
	    0x9C,0x8E,0xC5,0xD8,0x59,0x81,0x4B,0xCA),
},
/*
 * dks1..dks4: nybble maps applied one after another by
 * aes_schedule_mangle_dec when producing decryption round keys.
 */
dks1[2] = {
	VQ_N_U8(0x00,0xA7,0xD9,0x7E,0xC8,0x6F,0x11,0xB6,
	    0xFC,0x5B,0x25,0x82,0x34,0x93,0xED,0x4A),
	VQ_N_U8(0x00,0x33,0x14,0x27,0x62,0x51,0x76,0x45,
	    0xCE,0xFD,0xDA,0xE9,0xAC,0x9F,0xB8,0x8B),
},
dks2[2] = {
	VQ_N_U8(0x00,0x64,0xA8,0xCC,0xEB,0x8F,0x43,0x27,
	    0x61,0x05,0xC9,0xAD,0x8A,0xEE,0x22,0x46),
	VQ_N_U8(0x00,0xDD,0x92,0x4F,0xCE,0x13,0x5C,0x81,
	    0xF2,0x2F,0x60,0xBD,0x3C,0xE1,0xAE,0x73),
},
dks3[2] = {
	VQ_N_U8(0x00,0xC7,0xC6,0x01,0x02,0xC5,0xC4,0x03,
	    0xFB,0x3C,0x3D,0xFA,0xF9,0x3E,0x3F,0xF8),
	VQ_N_U8(0x00,0xF7,0xCF,0x38,0xD6,0x21,0x19,0xEE,
	    0x4B,0xBC,0x84,0x73,0x9D,0x6A,0x52,0xA5),
},
dks4[2] = {
	VQ_N_U8(0x00,0x20,0x73,0x53,0xB0,0x90,0xC3,0xE3,
	    0x43,0x63,0x30,0x10,0xF3,0xD3,0x80,0xA0),
	VQ_N_U8(0xE8,0x82,0x69,0x03,0x4B,0x21,0xCA,0xA0,
	    0x67,0x0D,0xE6,0x8C,0xC4,0xAE,0x45,0x2F),
},
/* Nybble maps applied by aes_schedule_mangle_last_dec. */
deskew[2] = {
	VQ_N_U8(0x00,0xE3,0xA4,0x47,0x40,0xA3,0xE4,0x07,
	    0x1A,0xF9,0xBE,0x5D,0x5A,0xB9,0xFE,0x1D),
	VQ_N_U8(0x00,0x69,0xEA,0x83,0xDC,0xB5,0x36,0x5F,
	    0x77,0x1E,0x9D,0xF4,0xAB,0xC2,0x41,0x28),
},
/*
 * Byte-index permutations; sr[0] is the identity.  The key schedule
 * steps through them per round key, and encryption/decryption apply one
 * of them as the final shuffle of the output block.
 */
sr[4] __aarch64_used = {
	VQ_N_U8(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
	    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F),
	VQ_N_U8(0x00,0x05,0x0A,0x0F,0x04,0x09,0x0E,0x03,
	    0x08,0x0D,0x02,0x07,0x0C,0x01,0x06,0x0B),
	VQ_N_U8(0x00,0x09,0x02,0x0B,0x04,0x0D,0x06,0x0F,
	    0x08,0x01,0x0A,0x03,0x0C,0x05,0x0E,0x07),
	VQ_N_U8(0x00,0x0D,0x0A,0x07,0x04,0x01,0x0E,0x0B,
	    0x08,0x05,0x02,0x0F,0x0C,0x09,0x06,0x03),
},
/* Initial value of the rotating round-constant vector in the key schedule. */
rcon	= VQ_N_U8(0xB6,0xEE,0x9D,0xAF,0xB9,0x91,0x83,0x1F,
	    0x81,0x7D,0x7C,0x4D,0x08,0x98,0x2A,0x70),
/* 0x0F in every byte: the nybble mask used by bytes2nybbles. */
of	= VQ_N_U8(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,
	    0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F),
/* 0x5B in every byte: constant XORed in by the key-schedule routines. */
s63	= VQ_N_U8(0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,
	    0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B,0x5B),
/* inv, inva: the two 16-entry tables that subbytes indexes by nybble. */
inv	= VQ_N_U8(0x80,0x01,0x08,0x0D,0x0F,0x06,0x05,0x0E,
	    0x02,0x0C,0x0B,0x0A,0x09,0x03,0x07,0x04),
inva	= VQ_N_U8(0x80,0x07,0x0B,0x0F,0x06,0x0A,0x04,0x01,
	    0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03);
198 
199 #ifdef __aarch64__
/* Fetch the 16-byte round key stored at rkp into a vector. */
static inline uint8x16_t
loadroundkey(const void *rkp)
{
	const uint8x16_t rk = vld1q_u8(rkp);

	return rk;
}
205 #endif
206 
/* Write the 16-byte round key rk to the memory at rkp. */
static inline void
storeroundkey(void *rkp, uint8x16_t rk)
{
	uint8_t *const dst = rkp;

	vst1q_u8(dst, rk);
}
212 
/*
 * Split every byte of x into its two 4-bit halves.  *lo receives the low
 * nybbles and *hi the high nybbles, each zero-extended to a whole byte:
 * for the nybble string abcdefgh, *lo = 0b0d0f0h and *hi = 0a0c0e0g.
 */
static inline void
bytes2nybbles(uint8x16_t *restrict lo, uint8x16_t *restrict hi, uint8x16_t x)
{
	const uint8x16_t upper = vshrq_n_u8(x, 4);

	*lo = x & of;
	*hi = upper & of;
}
221 
/*
 * t[0] maps low nybbles to bytes and t[1] maps high nybbles to bytes.
 * Apply t to the nybbles of every byte of x and add the two lookup
 * results in GF(2), i.e. XOR them.
 */
static uint8x16_t
aes_schedule_transform(uint8x16_t x, const uint8x16_t t[static 2])
{
	uint8x16_t lonyb, hinyb;
	uint8x16_t from_lo, from_hi;

	bytes2nybbles(&lonyb, &hinyb, x);
	from_lo = vqtbl1q_u8(t[0], lonyb);
	from_hi = vqtbl1q_u8(t[1], hinyb);

	return from_lo ^ from_hi;
}
234 
/*
 * From the bytes of x, compute the two index vectors *io and *jo that
 * callers feed into the sb and dsb table pairs.  inv_ and inva_ are the
 * two 16-entry nybble tables, passed by value so callers can preload them.
 */
static inline void
subbytes(uint8x16_t *io, uint8x16_t *jo, uint8x16_t x, uint8x16_t inv_,
    uint8x16_t inva_)
{
	uint8x16_t lo, hi, mixed, alo;
	uint8x16_t inv_hi, inv_mixed;

	bytes2nybbles(&lo, &hi, x);
	mixed = hi ^ lo;
	alo = vqtbl1q_u8(inva_, lo);

	inv_hi = vqtbl1q_u8(inv_, hi);
	inv_mixed = vqtbl1q_u8(inv_, mixed);

	*io = mixed ^ vqtbl1q_u8(inv_, alo ^ inv_hi);
	*jo = hi ^ vqtbl1q_u8(inv_, alo ^ inv_mixed);
}
247 
/*
 * Key-schedule step without rotation or round constant: substitute the
 * bytes of rk through the sb1 tables and XOR the result with a smeared
 * copy of prk.
 */
static uint8x16_t
aes_schedule_low_round(uint8x16_t rk, uint8x16_t prk)
{
	const uint8x16_t zero = vdupq_n_u8(0);
	uint8x16_t io, jo, substituted;

	/*
	 * Smear prk: XOR in a copy of itself moved up by four bytes, then
	 * a copy of that result moved up by eight bytes, then the s63
	 * constant.
	 */
	prk ^= vextq_u8(zero, prk, 12);
	prk ^= vextq_u8(zero, prk, 8);
	prk ^= s63;

	/* Substitute the bytes of rk. */
	subbytes(&io, &jo, rk, inv, inva);
	substituted = vqtbl1q_u8(sb1[0], io) ^ vqtbl1q_u8(sb1[1], jo);

	/* Combine with the smeared previous key. */
	return substituted ^ prk;
}
265 
/*
 * Full key-schedule step: fold the next round constant from *rcon_rot
 * into prk, advance *rcon_rot, broadcast and rotate the last 32-bit word
 * of rk, and finish with aes_schedule_low_round.
 */
static uint8x16_t
aes_schedule_round(uint8x16_t rk, uint8x16_t prk, uint8x16_t *rcon_rot)
{
	const uint8x16_t rc = *rcon_rot;
	uint32_t lastword;
	uint8x16_t spread;

	/* Byte 15 of the rcon vector lands in byte 0 of prk; rotate rcon. */
	prk ^= vextq_u8(rc, vdupq_n_u8(0), 15);
	*rcon_rot = vextq_u8(rc, rc, 15);

	/* Copy lane 3 of rk into all four lanes, then rotate by one byte. */
	lastword = vgetq_lane_u32(vreinterpretq_u32_u8(rk), 3);
	spread = vreinterpretq_u8_u32(vdupq_n_u32(lastword));
	spread = vextq_u8(spread, spread, 1);

	return aes_schedule_low_round(spread, prk);
}
283 
/*
 * Turn schedule output x into a stored encryption round key: XOR in s63,
 * accumulate three successive mc_forward[0] permutations of the result,
 * and shuffle the accumulated value by sr_i.
 */
static uint8x16_t
aes_schedule_mangle_enc(uint8x16_t x, uint8x16_t sr_i)
{
	const uint8x16_t perm = mc_forward[0];
	uint8x16_t acc = vdupq_n_u8(0);
	unsigned pass;

	x ^= s63;

	for (pass = 0; pass < 3; pass++) {
		x = vqtbl1q_u8(x, perm);
		acc ^= x;
	}

	return vqtbl1q_u8(acc, sr_i);
}
300 
/*
 * Produce the final encryption round key: shuffle x by sr_i, XOR in s63,
 * and apply the opt nybble maps.
 */
static uint8x16_t
aes_schedule_mangle_last_enc(uint8x16_t x, uint8x16_t sr_i)
{
	uint8x16_t shuffled = vqtbl1q_u8(x, sr_i);

	shuffled ^= s63;
	return aes_schedule_transform(shuffled, opt);
}
307 
/*
 * Turn schedule output x into a stored decryption round key: run x
 * through the dks1..dks4 nybble maps in turn, folding each intermediate
 * into an accumulator that is permuted by mc_forward[0] after every
 * step, and finally shuffle the accumulator by sr_i.
 */
static uint8x16_t
aes_schedule_mangle_dec(uint8x16_t x, uint8x16_t sr_i)
{
	const uint8x16_t *const maps[4] = { dks1, dks2, dks3, dks4 };
	const uint8x16_t perm = mc_forward[0];
	uint8x16_t acc = vdupq_n_u8(0);
	unsigned step;

	for (step = 0; step < 4; step++) {
		x = aes_schedule_transform(x, maps[step]);
		acc = vqtbl1q_u8(acc ^ x, perm);
	}

	return vqtbl1q_u8(acc, sr_i);
}
324 
/*
 * Produce the last decryption round key written by the schedule: XOR in
 * s63 and apply the deskew nybble maps.
 */
static uint8x16_t
aes_schedule_mangle_last_dec(uint8x16_t x)
{
	x ^= s63;
	return aes_schedule_transform(x, deskew);
}
331 
/*
 * 192-bit schedule helper.  Viewing both arguments as four 32-bit lanes,
 * return prkhi XOR (prkhi lane 0 in lanes 0-2, prkhi lane 2 in lane 3)
 * XOR (prk lane 2 in lane 0, prk lane 3 in lanes 1-3).
 */
static uint8x16_t
aes_schedule_192_smear(uint8x16_t prkhi, uint8x16_t prk)
{
	const uint32x4_t hi32 = vreinterpretq_u32_u8(prkhi);
	const uint32x4_t lo32 = vreinterpretq_u32_u8(prk);
	uint32x4_t himix, lomix;

	himix = vdupq_n_u32(vgetq_lane_u32(hi32, 0));
	himix = vsetq_lane_u32(vgetq_lane_u32(hi32, 2), himix, 3);

	lomix = vdupq_n_u32(vgetq_lane_u32(lo32, 3));
	lomix = vsetq_lane_u32(vgetq_lane_u32(lo32, 2), lomix, 0);

	return vreinterpretq_u8_u32(hi32 ^ himix ^ lomix);
}
349 
/*
 * 192-bit schedule helper: return rk with 64-bit lane 0 cleared to zero,
 * leaving lane 1 untouched.
 */
static uint8x16_t
aes_schedule_192_smearhi(uint8x16_t rk)
{
	const uint64x2_t cleared =
	    vsetq_lane_u64(0, vreinterpretq_u64_u8(rk), 0);

	return vreinterpretq_u8_u64(cleared);
}
359 
/*
 * Expand key into the encryption round keys at enc->aese_aes.aes_rk.
 *
 * nrounds selects the schedule and must be 10, 12 or 14; any other value
 * panics (after the first round key has already been stored).  The
 * schedule reads 16 key bytes for 10 rounds, key[0..23] for 12 rounds and
 * key[0..31] for 14 rounds, and writes nrounds + 1 round keys of 16 bytes
 * (four uint32_t) each, in ascending order.
 *
 * The first stored key is the ipt-transformed key, the middle ones go
 * through aes_schedule_mangle_enc, and the last one through
 * aes_schedule_mangle_last_enc.
 */
void
aes_neon_setenckey(struct aesenc *enc, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = enc->aese_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	/*
	 * Index into sr[]; each `i-- % 4' below yields 3, 2, 1, 0, 3, ...
	 * (unsigned wraparound keeps the residue sequence going past zero).
	 */
	uint64_t i = 3;

	/* input transform */
	rk = aes_schedule_transform(vld1q_u8(key), ipt);
	storeroundkey(rk32, rk);
	rk32 += 4;

	switch (nrounds) {
	case 10:
		/* One schedule round per stored key; nrounds counts down. */
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;	/* last key is stored after the switch */
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		/* Second input vector overlaps the first: key bytes 8..23. */
		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		/* Each pass accounts for three round keys (nrounds -= 3). */
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;	/* last key is stored after the switch */

			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		/* Second input vector is key bytes 16..31. */
		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		/* Each pass accounts for two round keys (nrounds -= 2). */
		for (;;) {
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;	/* last key is stored after the switch */
			mrk = aes_schedule_mangle_enc(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 += 4;

			/* low round: broadcast lane 3, no rotation or rcon */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		panic("invalid number of AES rounds: %u", nrounds);
	}
	/* Final round key uses the last-round mangling. */
	storeroundkey(rk32, aes_schedule_mangle_last_enc(rk, sr[i-- % 4]));
}
450 
/*
 * Expand key into the decryption round keys at dec->aesd_aes.aes_rk.
 *
 * nrounds selects the schedule and must be 10, 12 or 14; any other value
 * panics before anything is written.  The schedule reads 16 key bytes for
 * 10 rounds, key[0..23] for 12 rounds and key[0..31] for 14 rounds, and
 * writes nrounds + 1 round keys of 16 bytes (four uint32_t) each.
 *
 * Keys are stored from the end of the array towards the start: slot
 * nrounds holds the original key shuffled by sr[], the middle slots go
 * through aes_schedule_mangle_dec, and slot 0 through
 * aes_schedule_mangle_last_dec.
 */
void
aes_neon_setdeckey(struct aesdec *dec, const uint8_t *key, unsigned nrounds)
{
	uint32_t *rk32 = dec->aesd_aes.aes_rk;
	uint8x16_t mrk;		/* mangled round key */
	uint8x16_t ork;		/* original round key */
	uint8x16_t rk;		/* round key */
	uint8x16_t prk;		/* previous round key */
	uint8x16_t rcon_rot = rcon;
	unsigned i = nrounds == 12 ? 0 : 2;

	/*
	 * Validate before touching the key array: the first store below is
	 * addressed by nrounds, so a bogus count would write out of bounds
	 * before the switch had a chance to reject it.
	 */
	if (nrounds != 10 && nrounds != 12 && nrounds != 14)
		panic("invalid number of AES rounds: %u", nrounds);

	ork = vld1q_u8(key);

	/* input transform */
	rk = aes_schedule_transform(ork, ipt);

	/* go from end */
	rk32 += 4*nrounds;
	storeroundkey(rk32, vqtbl1q_u8(ork, sr[i]));
	rk32 -= 4;
	/* Each `i-- % 4' below steps the sr[] index down, wrapping 0 -> 3. */
	i ^= 3;

	switch (nrounds) {
	case 10:
		/* One schedule round per stored key; nrounds counts down. */
		for (;;) {
			rk = aes_schedule_round(rk, rk, &rcon_rot);
			if (--nrounds == 0)
				break;	/* last key is stored after the switch */
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
		}
		break;
	case 12: {
		uint8x16_t prkhi;	/* high half of previous round key */

		/* Second input vector overlaps the first: key bytes 8..23. */
		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 8), ipt);
		prkhi = aes_schedule_192_smearhi(rk);
		/* Each pass accounts for three round keys (nrounds -= 3). */
		for (;;) {
			prk = aes_schedule_round(rk, prk, &rcon_rot);
			rk = vextq_u8(prkhi, prk, 8);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 3) == 0)
				break;	/* last key is stored after the switch */

			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			rk = aes_schedule_192_smear(prkhi, prk);
			prkhi = aes_schedule_192_smearhi(rk);
		}
		break;
	}
	case 14: {
		uint8x16_t pprk;	/* previous previous round key */

		/* Second input vector is key bytes 16..31. */
		prk = rk;
		rk = aes_schedule_transform(vld1q_u8(key + 16), ipt);
		/* Each pass accounts for two round keys (nrounds -= 2). */
		for (;;) {
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;
			pprk = rk;

			/* high round */
			rk = prk = aes_schedule_round(rk, prk, &rcon_rot);
			if ((nrounds -= 2) == 0)
				break;	/* last key is stored after the switch */
			mrk = aes_schedule_mangle_dec(rk, sr[i-- % 4]);
			storeroundkey(rk32, mrk);
			rk32 -= 4;

			/* low round: broadcast lane 3, no rotation or rcon */
			rk = vreinterpretq_u8_u32(
				vdupq_n_u32(
				    vgetq_lane_u32(vreinterpretq_u32_u8(rk),
					3)));
			rk = aes_schedule_low_round(rk, pprk);
		}
		break;
	}
	default:
		/* Not reached: nrounds was validated above. */
		panic("invalid number of AES rounds: %u", nrounds);
	}
	/* By now rk32 has walked back to slot 0. */
	storeroundkey(rk32, aes_schedule_mangle_last_dec(rk));
}
548 
549 #ifdef __aarch64__
550 
551 /*
552  * GCC does a lousy job of compiling NEON intrinsics for arm32, so we
553  * do the performance-critical parts -- encryption and decryption -- in
554  * hand-written assembly on arm32.
555  */
556 
/*
 * Encrypt the single 16-byte block x with the round keys in
 * enc->aese_aes.aes_rk and return the result.
 *
 * nrounds is the number of rounds; round keys 0 through nrounds are read,
 * 16 bytes apiece.  The value is not validated here, and a value of zero
 * would wrap the countdown below.
 */
uint8x16_t
aes_neon_enc1(const struct aesenc *enc, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	/*
	 * Copy the hot tables into locals through volatile-qualified reads.
	 * NOTE(review): presumably this pins one load per table up front
	 * rather than leaving it to the compiler -- confirm the intent.
	 */
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t io, jo;
	unsigned rmod4 = 0;	/* round number mod 4; picks mc_*[] and sr[] */

	/* Input transform, then add round key 0. */
	x = aes_schedule_transform(x, ipt);
	x ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A, A2, A2_B, A2_B_D;

		subbytes(&io, &jo, x, inv_, inva_);

		/* Advance to the next round key before testing for the end. */
		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;	/* io/jo feed the last round below */

		/*
		 * Middle round: combine the sb1 and sb2 lookups with the
		 * round key and the mc_forward/mc_backward permutations.
		 */
		A = vqtbl1q_u8(sb1_0, io) ^ vqtbl1q_u8(sb1_1, jo);
		A ^= loadroundkey(rk32);
		A2 = vqtbl1q_u8(sb2_0, io) ^ vqtbl1q_u8(sb2_1, jo);
		A2_B = A2 ^ vqtbl1q_u8(A, mc_forward[rmod4]);
		A2_B_D = A2_B ^ vqtbl1q_u8(A, mc_backward[rmod4]);
		x = A2_B_D ^ vqtbl1q_u8(A2_B, mc_forward[rmod4]);
	}
	/* Last round: sbo lookup, final round key, then the sr shuffle. */
	x = vqtbl1q_u8(sbo[0], io) ^ vqtbl1q_u8(sbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[rmod4]);
}
593 
/*
 * Encrypt the two 16-byte blocks x.val[0] and x.val[1] with the round
 * keys in enc->aese_aes.aes_rk and return both results.  Both blocks use
 * the same round keys; the statements for the two blocks are interleaved
 * pairwise and otherwise follow aes_neon_enc1.
 *
 * nrounds is the number of rounds; round keys 0 through nrounds are read.
 * The value is not validated here.
 */
uint8x16x2_t
aes_neon_enc2(const struct aesenc *enc, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = enc->aese_aes.aes_rk;
	/*
	 * Copy the hot tables into locals through volatile-qualified reads.
	 * NOTE(review): presumably this pins one load per table up front
	 * rather than leaving it to the compiler -- confirm the intent.
	 */
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t sb1_0 = ((const volatile uint8x16_t *)sb1)[0];
	uint8x16_t sb1_1 = ((const volatile uint8x16_t *)sb1)[1];
	uint8x16_t sb2_0 = ((const volatile uint8x16_t *)sb2)[0];
	uint8x16_t sb2_1 = ((const volatile uint8x16_t *)sb2)[1];
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1;
	unsigned rmod4 = 0;	/* round number mod 4; picks mc_*[] and sr[] */

	/* Input transform, then add round key 0 to both blocks. */
	x0 = aes_schedule_transform(x0, ipt);
	x1 = aes_schedule_transform(x1, ipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	for (;;) {
		uint8x16_t A_0, A2_0, A2_B_0, A2_B_D_0;
		uint8x16_t A_1, A2_1, A2_B_1, A2_B_D_1;

		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);

		/* Advance to the next round key before testing for the end. */
		rk32 += 4;
		rmod4 = (rmod4 + 1) % 4;
		if (--nrounds == 0)
			break;	/* io*/jo* feed the last round below */

		/* Middle round, same recipe as aes_neon_enc1, per block. */
		A_0 = vqtbl1q_u8(sb1_0, io0) ^ vqtbl1q_u8(sb1_1, jo0);
		A_1 = vqtbl1q_u8(sb1_0, io1) ^ vqtbl1q_u8(sb1_1, jo1);
		A_0 ^= loadroundkey(rk32);
		A_1 ^= loadroundkey(rk32);
		A2_0 = vqtbl1q_u8(sb2_0, io0) ^ vqtbl1q_u8(sb2_1, jo0);
		A2_1 = vqtbl1q_u8(sb2_0, io1) ^ vqtbl1q_u8(sb2_1, jo1);
		A2_B_0 = A2_0 ^ vqtbl1q_u8(A_0, mc_forward[rmod4]);
		A2_B_1 = A2_1 ^ vqtbl1q_u8(A_1, mc_forward[rmod4]);
		A2_B_D_0 = A2_B_0 ^ vqtbl1q_u8(A_0, mc_backward[rmod4]);
		A2_B_D_1 = A2_B_1 ^ vqtbl1q_u8(A_1, mc_backward[rmod4]);
		x0 = A2_B_D_0 ^ vqtbl1q_u8(A2_B_0, mc_forward[rmod4]);
		x1 = A2_B_D_1 ^ vqtbl1q_u8(A2_B_1, mc_forward[rmod4]);
	}
	/* Last round: sbo lookup, final round key, then the sr shuffle. */
	x0 = vqtbl1q_u8(sbo[0], io0) ^ vqtbl1q_u8(sbo[1], jo0);
	x1 = vqtbl1q_u8(sbo[0], io1) ^ vqtbl1q_u8(sbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[rmod4]),
		[1] = vqtbl1q_u8(x1, sr[rmod4]),
	} };
}
646 
/*
 * Decrypt the single 16-byte block x with the round keys in
 * dec->aesd_aes.aes_rk and return the result.
 *
 * nrounds is the number of rounds; round keys 0 through nrounds are read
 * in ascending order, 16 bytes apiece.  The value is not validated here,
 * and a value of zero would wrap the countdown below.
 */
uint8x16_t
aes_neon_dec1(const struct aesdec *dec, uint8x16_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	/*
	 * sr[] index for the final shuffle: 2 for nrounds of 10 or 14 and
	 * 0 for 12, the same index aes_neon_setdeckey starts from.
	 */
	unsigned i = 3 & ~(nrounds - 1);
	/*
	 * Copy inv/inva into locals through volatile-qualified reads.
	 * NOTE(review): presumably this pins one load per table up front
	 * rather than leaving it to the compiler -- confirm the intent.
	 */
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t io, jo, mc;

	/* Input transform, then add round key 0. */
	x = aes_schedule_transform(x, dipt);
	x ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io, &jo, x, inv_, inva_);
		if (--nrounds == 0)
			break;	/* io/jo feed the last round below */

		/*
		 * Middle round: four table-pair lookups (dsb9, dsbd, dsbb,
		 * dsbe) folded together, with the running value permuted by
		 * mc between consecutive lookups.
		 */
		x = vqtbl1q_u8(dsb9[0], io) ^ vqtbl1q_u8(dsb9[1], jo);
		x ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbd[0], io) ^ vqtbl1q_u8(dsbd[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbb[0], io) ^ vqtbl1q_u8(dsbb[1], jo);

		x = vqtbl1q_u8(x, mc);
		x ^= vqtbl1q_u8(dsbe[0], io) ^ vqtbl1q_u8(dsbe[1], jo);

		/*
		 * Rotate mc by 12 bytes; starting from mc_forward[3] this
		 * yields mc_forward[2], [1], [0], [3], ... on later rounds.
		 */
		mc = vextq_u8(mc, mc, 12);
	}
	/* Last round: dsbo lookup, final round key, then the sr shuffle. */
	x = vqtbl1q_u8(dsbo[0], io) ^ vqtbl1q_u8(dsbo[1], jo);
	x ^= loadroundkey(rk32);
	return vqtbl1q_u8(x, sr[i]);
}
685 
/*
 * Decrypt the two 16-byte blocks x.val[0] and x.val[1] with the round
 * keys in dec->aesd_aes.aes_rk and return both results.  Both blocks use
 * the same round keys; the statements for the two blocks are interleaved
 * pairwise and otherwise follow aes_neon_dec1.
 *
 * nrounds is the number of rounds; round keys 0 through nrounds are read
 * in ascending order.  The value is not validated here.
 */
uint8x16x2_t
aes_neon_dec2(const struct aesdec *dec, uint8x16x2_t x, unsigned nrounds)
{
	const uint32_t *rk32 = dec->aesd_aes.aes_rk;
	/*
	 * sr[] index for the final shuffle: 2 for nrounds of 10 or 14 and
	 * 0 for 12, the same index aes_neon_setdeckey starts from.
	 */
	unsigned i = 3 & ~(nrounds - 1);
	/*
	 * Copy inv/inva into locals through volatile-qualified reads.
	 * NOTE(review): presumably this pins one load per table up front
	 * rather than leaving it to the compiler -- confirm the intent.
	 */
	uint8x16_t inv_ = *(const volatile uint8x16_t *)&inv;
	uint8x16_t inva_ = *(const volatile uint8x16_t *)&inva;
	uint8x16_t x0 = x.val[0], x1 = x.val[1];
	uint8x16_t io0, jo0, io1, jo1, mc;

	/* Input transform, then add round key 0 to both blocks. */
	x0 = aes_schedule_transform(x0, dipt);
	x1 = aes_schedule_transform(x1, dipt);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	rk32 += 4;

	mc = mc_forward[3];
	for (;;) {
		subbytes(&io0, &jo0, x0, inv_, inva_);
		subbytes(&io1, &jo1, x1, inv_, inva_);
		if (--nrounds == 0)
			break;	/* io*/jo* feed the last round below */

		/* Middle round, same recipe as aes_neon_dec1, per block. */
		x0 = vqtbl1q_u8(dsb9[0], io0) ^ vqtbl1q_u8(dsb9[1], jo0);
		x1 = vqtbl1q_u8(dsb9[0], io1) ^ vqtbl1q_u8(dsb9[1], jo1);
		x0 ^= loadroundkey(rk32);
		x1 ^= loadroundkey(rk32);
		rk32 += 4;				/* next round key */

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbd[0], io0) ^ vqtbl1q_u8(dsbd[1], jo0);
		x1 ^= vqtbl1q_u8(dsbd[0], io1) ^ vqtbl1q_u8(dsbd[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbb[0], io0) ^ vqtbl1q_u8(dsbb[1], jo0);
		x1 ^= vqtbl1q_u8(dsbb[0], io1) ^ vqtbl1q_u8(dsbb[1], jo1);

		x0 = vqtbl1q_u8(x0, mc);
		x1 = vqtbl1q_u8(x1, mc);
		x0 ^= vqtbl1q_u8(dsbe[0], io0) ^ vqtbl1q_u8(dsbe[1], jo0);
		x1 ^= vqtbl1q_u8(dsbe[0], io1) ^ vqtbl1q_u8(dsbe[1], jo1);

		/*
		 * Rotate mc by 12 bytes; starting from mc_forward[3] this
		 * yields mc_forward[2], [1], [0], [3], ... on later rounds.
		 */
		mc = vextq_u8(mc, mc, 12);
	}
	/* Last round: dsbo lookup, final round key, then the sr shuffle. */
	x0 = vqtbl1q_u8(dsbo[0], io0) ^ vqtbl1q_u8(dsbo[1], jo0);
	x1 = vqtbl1q_u8(dsbo[0], io1) ^ vqtbl1q_u8(dsbo[1], jo1);
	x0 ^= loadroundkey(rk32);
	x1 ^= loadroundkey(rk32);
	return (uint8x16x2_t) { .val = {
		[0] = vqtbl1q_u8(x0, sr[i]),
		[1] = vqtbl1q_u8(x1, sr[i]),
	} };
}
741 
742 #endif
743