1/*	$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <machine/asm.h>
30
31RCSID("$NetBSD: aes_ni_64.S,v 1.6 2020/07/27 20:57:23 riastradh Exp $")
32
33/*
34 * MOVDQA/MOVDQU are Move Double Quadword (Aligned/Unaligned), defined
35 * to operate on integers; MOVAPS/MOVUPS are Move (Aligned/Unaligned)
36 * Packed Single, defined to operate on binary32 floats.  They have
37 * exactly the same architectural effects (move a 128-bit quantity from
38 * memory into an xmm register).
39 *
40 * In principle, they might have different microarchitectural effects
41 * so that MOVAPS/MOVUPS might incur a penalty when the register is
42 * later used for integer paths, but in practice they don't.  So we use
43 * the one whose instruction encoding is shorter -- MOVAPS/MOVUPS.
44 */
45#define	movdqa	movaps
46#define	movdqu	movups
47
48/*
49 * aesni_setenckey128(struct aesenc *enckey@rdi, const uint8_t key[16] @rsi)
50 *
51 *	Expand a 16-byte AES-128 key into 10 round keys.
52 *
53 *	Standard ABI calling convention.
54 */
55ENTRY(aesni_setenckey128)
56	movdqu	(%rsi),%xmm0	/* load master key into %xmm0 */
57	movdqa	%xmm0,(%rdi)	/* store master key as the first round key */
58	lea	0x10(%rdi),%rdi	/* advance %rdi to next round key */
59	aeskeygenassist $0x1,%xmm0,%xmm2
60	call	aesni_expand128
61	aeskeygenassist $0x2,%xmm0,%xmm2
62	call	aesni_expand128
63	aeskeygenassist $0x4,%xmm0,%xmm2
64	call	aesni_expand128
65	aeskeygenassist $0x8,%xmm0,%xmm2
66	call	aesni_expand128
67	aeskeygenassist $0x10,%xmm0,%xmm2
68	call	aesni_expand128
69	aeskeygenassist $0x20,%xmm0,%xmm2
70	call	aesni_expand128
71	aeskeygenassist $0x40,%xmm0,%xmm2
72	call	aesni_expand128
73	aeskeygenassist $0x80,%xmm0,%xmm2
74	call	aesni_expand128
75	aeskeygenassist $0x1b,%xmm0,%xmm2
76	call	aesni_expand128
77	aeskeygenassist $0x36,%xmm0,%xmm2
78	call	aesni_expand128
79	ret
80END(aesni_setenckey128)
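
/*
 * Illustrative sketch (not part of the build): the sequence above is
 * the FIPS-197 AES-128 key schedule, one aeskeygenassist/expand128
 * pair per round, with the round constants 0x01,...,0x36 passed as
 * the aeskeygenassist immediates.  A rough word-oriented C rendition,
 * assuming subword() and rotword() helpers that are not defined here
 * and glossing over byte order within each 32-bit word:
 *
 *	static const uint8_t rcon[10] = {
 *		0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36
 *	};
 *
 *	void
 *	aes128_keysched(uint32_t w[44], const uint32_t k[4])
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < 4; i++)
 *			w[i] = k[i];
 *		for (i = 4; i < 44; i++) {
 *			uint32_t t = w[i - 1];
 *			if (i % 4 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/4 - 1];
 *			w[i] = w[i - 4] ^ t;
 *		}
 *	}
 *
 * Each group of four words w[4r..4r+3] is one 16-byte round key.
 */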
81
82/*
83 * aesni_setenckey192(struct aesenc *enckey@rdi, const uint8_t key[24] @rsi)
84 *
85 *	Expand a 24-byte AES-192 key into 12 round keys.
86 *
87 *	Standard ABI calling convention.
88 */
89ENTRY(aesni_setenckey192)
90	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
91	movq	0x10(%rsi),%xmm1 /* load master key [128:192) into %xmm1 */
92	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
93	lea	0x10(%rdi),%rdi /* advance %rdi to next round key */
94	aeskeygenassist $0x1,%xmm1,%xmm2
95	call	aesni_expand192a
96	aeskeygenassist $0x2,%xmm0,%xmm2
97	call	aesni_expand192b
98	aeskeygenassist $0x4,%xmm1,%xmm2
99	call	aesni_expand192a
100	aeskeygenassist $0x8,%xmm0,%xmm2
101	call	aesni_expand192b
102	aeskeygenassist $0x10,%xmm1,%xmm2
103	call	aesni_expand192a
104	aeskeygenassist $0x20,%xmm0,%xmm2
105	call	aesni_expand192b
106	aeskeygenassist $0x40,%xmm1,%xmm2
107	call	aesni_expand192a
108	aeskeygenassist $0x80,%xmm0,%xmm2
109	call	aesni_expand192b
110	ret
111END(aesni_setenckey192)
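
/*
 * Illustrative sketch (not part of the build): AES-192 expands six key
 * words at a time, so the 4-word round keys do not line up with the
 * key-schedule steps; that is why the driver above alternates between
 * aesni_expand192a (which emits two round keys) and aesni_expand192b
 * (which emits one and leaves the low half of the next in %xmm1).  The
 * underlying FIPS-197 recurrence, assuming the same subword(),
 * rotword(), and rcon[] as in the sketch above:
 *
 *	void
 *	aes192_keysched(uint32_t w[52], const uint32_t k[6])
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < 6; i++)
 *			w[i] = k[i];
 *		for (i = 6; i < 52; i++) {
 *			uint32_t t = w[i - 1];
 *			if (i % 6 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/6 - 1];
 *			w[i] = w[i - 6] ^ t;
 *		}
 *	}
 */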
112
113/*
114 * aesni_setenckey256(struct aesenc *enckey@rdi, const uint8_t key[32] @rsi)
115 *
116 *	Expand a 32-byte AES-256 key into 14 round keys.
117 *
118 *	Standard ABI calling convention.
119 */
120ENTRY(aesni_setenckey256)
121	movdqu	(%rsi),%xmm0	/* load master key [0:128) into %xmm0 */
122	movdqu	0x10(%rsi),%xmm1 /* load master key [128:256) into %xmm1 */
123	movdqa	%xmm0,(%rdi)	/* store master key [0:128) as round key */
124	movdqa	%xmm1,0x10(%rdi) /* store master key [128:256) as round key */
125	lea	0x20(%rdi),%rdi	/* advance %rdi to next round key */
126	aeskeygenassist $0x1,%xmm1,%xmm2
127	call	aesni_expand256a
128	aeskeygenassist $0x1,%xmm0,%xmm2
129	call	aesni_expand256b
130	aeskeygenassist $0x2,%xmm1,%xmm2
131	call	aesni_expand256a
132	aeskeygenassist $0x2,%xmm0,%xmm2
133	call	aesni_expand256b
134	aeskeygenassist $0x4,%xmm1,%xmm2
135	call	aesni_expand256a
136	aeskeygenassist $0x4,%xmm0,%xmm2
137	call	aesni_expand256b
138	aeskeygenassist $0x8,%xmm1,%xmm2
139	call	aesni_expand256a
140	aeskeygenassist $0x8,%xmm0,%xmm2
141	call	aesni_expand256b
142	aeskeygenassist $0x10,%xmm1,%xmm2
143	call	aesni_expand256a
144	aeskeygenassist $0x10,%xmm0,%xmm2
145	call	aesni_expand256b
146	aeskeygenassist $0x20,%xmm1,%xmm2
147	call	aesni_expand256a
148	aeskeygenassist $0x20,%xmm0,%xmm2
149	call	aesni_expand256b
150	aeskeygenassist $0x40,%xmm1,%xmm2
151	call	aesni_expand256a
152	ret
153END(aesni_setenckey256)
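
/*
 * Illustrative sketch (not part of the build): AES-256 alternates two
 * kinds of expansion steps -- RotWord+SubWord+rcon when i is a
 * multiple of 8 (aesni_expand256a) and SubWord alone when i mod 8 == 4
 * (aesni_expand256b) -- which is why each immediate appears twice in
 * the aeskeygenassist calls above but only the expand256a steps
 * consume the round constant.  In C, with the same assumed helpers:
 *
 *	void
 *	aes256_keysched(uint32_t w[60], const uint32_t k[8])
 *	{
 *		unsigned i;
 *
 *		for (i = 0; i < 8; i++)
 *			w[i] = k[i];
 *		for (i = 8; i < 60; i++) {
 *			uint32_t t = w[i - 1];
 *			if (i % 8 == 0)
 *				t = subword(rotword(t)) ^ rcon[i/8 - 1];
 *			else if (i % 8 == 4)
 *				t = subword(t);
 *			w[i] = w[i - 8] ^ t;
 *		}
 *	}
 */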
154
155/*
156 * aesni_expand128(uint128_t *rkp@rdi, uint128_t prk@xmm0,
157 *     uint128_t keygenassist@xmm2)
158 *
159 *	1. Compute the AES-128 round key using the previous round key.
160 *	2. Store it at *rkp.
161 *	3. Set %xmm0 to it.
162 *	4. Advance %rdi to point at the next round key.
163 *
164 *	Internal ABI.  On entry:
165 *
166 *		%rdi = rkp, pointer to round key to compute
167 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
168 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
169 *
170 *	On exit:
171 *
172 *		%rdi = &rkp[1], rkp advanced by one round key
173 *		%xmm0 = rk, the round key we just computed
174 *		%xmm2 = garbage
175 *		%xmm4 = garbage
176 *		%xmm5 = garbage
177 *		%xmm6 = garbage
178 *
179 *	Note: %xmm1 and %xmm3, and %xmm7 through %xmm15, are preserved, as
180 *	are all general-purpose registers other than %rdi.
181 */
182	.text
183	_ALIGN_TEXT
184	.type	aesni_expand128,@function
185aesni_expand128:
186	/*
187	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
188	 * i.e., set each word of %xmm2 to t := Rot(SubWord(prk[3])) ^ RCON.
189	 */
190	pshufd	$0b11111111,%xmm2,%xmm2
191
192	/*
193	 * %xmm4 := (0, prk[0], prk[1], prk[2])
194	 * %xmm5 := (0, 0, prk[0], prk[1])
195	 * %xmm6 := (0, 0, 0, prk[0])
196	 */
197	movdqa	%xmm0,%xmm4
198	movdqa	%xmm0,%xmm5
199	movdqa	%xmm0,%xmm6
200	pslldq	$4,%xmm4
201	pslldq	$8,%xmm5
202	pslldq	$12,%xmm6
203
204	/*
205	 * %xmm0 := (rk[0] = t ^ prk[0],
206	 *     rk[1] = t ^ prk[0] ^ prk[1],
207	 *     rk[2] = t ^ prk[0] ^ prk[1] ^ prk[2],
208	 *     rk[3] = t ^ prk[0] ^ prk[1] ^ prk[2] ^ prk[3])
209	 */
210	pxor	%xmm2,%xmm0
211	pxor	%xmm4,%xmm0
212	pxor	%xmm5,%xmm0
213	pxor	%xmm6,%xmm0
214
215	movdqa	%xmm0,(%rdi)	/* store round key */
216	lea	0x10(%rdi),%rdi	/* advance to next round key address */
217	ret
218END(aesni_expand128)
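
/*
 * Illustrative sketch (not part of the build): the three PSLLDQ/PXOR
 * steps above compute a prefix XOR.  The standard recurrence
 *
 *	rk[0] = t ^ prk[0]
 *	rk[i] = rk[i-1] ^ prk[i],	1 <= i <= 3
 *
 * unrolls to rk[i] = t ^ prk[0] ^ ... ^ prk[i], which is exactly what
 * XORing prk with copies of itself shifted by 1, 2, and 3 words
 * produces.  A scalar C rendition of the same step:
 *
 *	void
 *	expand128_step(uint32_t rk[4], const uint32_t prk[4], uint32_t t)
 *	{
 *		rk[0] = t ^ prk[0];
 *		rk[1] = rk[0] ^ prk[1];
 *		rk[2] = rk[1] ^ prk[2];
 *		rk[3] = rk[2] ^ prk[3];
 *	}
 */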
219
220/*
221 * aesni_expand192a(uint128_t *rkp@rdi, uint128_t prk@xmm0,
222 *     uint64_t rklo@xmm1, uint128_t keygenassist@xmm2)
223 *
224 *	Set even-numbered AES-192 round key.
225 *
226 *	Internal ABI.  On entry:
227 *
228 *		%rdi = rkp, pointer to two round keys to compute
229 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
230 *		%xmm1 = (rklo[0], rklo[1], xxx, xxx)
231 *		%xmm2 = (xxx, t = Rot(SubWord(rklo[1])) ^ RCON, xxx, xxx)
232 *
233 *	On exit:
234 *
235 *		%rdi = &rkp[2], rkp advanced by two round keys
236 *		%xmm0 = nrk, second round key we just computed
237 *		%xmm1 = rk, first round key we just computed
238 *		%xmm2 = garbage
239 *		%xmm4 = garbage
240 *		%xmm5 = garbage
241 *		%xmm6 = garbage
242 *		%xmm7 = garbage
243 */
244	.text
245	_ALIGN_TEXT
246	.type	aesni_expand192a,@function
247aesni_expand192a:
248	/*
249	 * %xmm2 := (%xmm2[1], %xmm2[1], %xmm2[1], %xmm2[1]),
250	 * i.e., set each word of %xmm2 to t := Rot(SubWord(rklo[1])) ^ RCON.
251	 */
252	pshufd	$0b01010101,%xmm2,%xmm2
253
254	/*
255	 * We need to compute:
256	 *
257	 * rk[0] := rklo[0]
258	 * rk[1] := rklo[1]
259	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
260	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
261	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
262	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
263	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
264	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
265	 *     ^ rklo[1]
266	 */
267
268	/*
269	 * %xmm4 := (prk[0], prk[1], prk[2], prk[3])
270	 * %xmm5 := (0, prk[0], prk[1], prk[2])
271	 * %xmm6 := (0, 0, prk[0], prk[1])
272	 * %xmm7 := (0, 0, 0, prk[0])
273	 */
274	movdqa	%xmm0,%xmm4
275	movdqa	%xmm0,%xmm5
276	movdqa	%xmm0,%xmm6
277	movdqa	%xmm0,%xmm7
278	pslldq	$4,%xmm5
279	pslldq	$8,%xmm6
280	pslldq	$12,%xmm7
281
282	/* %xmm4 := (rk[2], rk[3], nrk[0], nrk[1]) */
283	pxor	%xmm2,%xmm4
284	pxor	%xmm5,%xmm4
285	pxor	%xmm6,%xmm4
286	pxor	%xmm7,%xmm4
287
288	/*
289	 * At this point, rk is split across %xmm1 (rk[0],rk[1],...) and
290	 * %xmm4 (rk[2],rk[3],...); nrk is in %xmm4 (...,nrk[0],nrk[1]);
291	 * and we have yet to compute nrk[2] or nrk[3], which requires
292	 * rklo[0] and rklo[1] in %xmm1 (rklo[0], rklo[1], ...).  We need
293	 * nrk to end up in %xmm0 at the end, so gather rk into %xmm1 and
294	 * nrk into %xmm0.
295	 */
296
297	/* %xmm0 := (nrk[0], nrk[1], nrk[1], nrk[1]) */
298	pshufd	$0b11111110,%xmm4,%xmm0
299
300	/*
301	 * %xmm6 := (0, 0, rklo[0], rklo[1])
302	 * %xmm7 := (0, 0, 0, rklo[0])
303	 */
304	movdqa	%xmm1,%xmm6
305	movdqa	%xmm1,%xmm7
306
307	pslldq	$8,%xmm6
308	pslldq	$12,%xmm7
309
310	/*
311	 * %xmm0 := (nrk[0],
312	 *     nrk[1],
313	 *     nrk[2] = nrk[1] ^ rklo[0],
314	 *     nrk[3] = nrk[1] ^ rklo[0] ^ rklo[1])
315	 */
316	pxor	%xmm6,%xmm0
317	pxor	%xmm7,%xmm0
318
319	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
320	shufps	$0b01000100,%xmm4,%xmm1
321
322	movdqa	%xmm1,(%rdi)		/* store round key */
323	movdqa	%xmm0,0x10(%rdi)	/* store next round key */
324	lea	0x20(%rdi),%rdi		/* advance two round keys */
325	ret
326END(aesni_expand192a)
327
328/*
329 * aesni_expand192b(uint128_t *roundkey@rdi, uint128_t prk@xmm0,
330 *     uint128_t keygenassist@xmm2)
331 *
332 *	Set odd-numbered AES-192 round key.
333 *
334 *	Internal ABI.  On entry:
335 *
336 *		%rdi = rkp, pointer to round key to compute
337 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
338 *		%xmm1 = (xxx, xxx, pprk[2], pprk[3])
339 *		%xmm2 = (xxx, xxx, xxx, t = Rot(Sub(prk[3])) ^ RCON)
340 *
341 *	On exit:
342 *
343 *		%rdi = &rkp[1], rkp advanced by one round key
344 *		%xmm0 = rk, the round key we just computed
345 *		%xmm1 = (nrk[0], nrk[1], xxx, xxx), half of next round key
346 *		%xmm2 = garbage
347 *		%xmm4 = garbage
348 *		%xmm5 = garbage
349 *		%xmm6 = garbage
350 *		%xmm7 = garbage
351 */
352	.text
353	_ALIGN_TEXT
354	.type	aesni_expand192b,@function
355aesni_expand192b:
356	/*
357	 * %xmm2 := (%xmm2[3], %xmm2[3], %xmm2[3], %xmm2[3]),
358	 * i.e., set each word of %xmm2 to t := Rot(Sub(prk[3])) ^ RCON.
359	 */
360	pshufd	$0b11111111,%xmm2,%xmm2
361
362	/*
363	 * We need to compute:
364	 *
365	 * rk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2]
366	 * rk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3]
367	 * rk[2] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
368	 * rk[3] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
369	 *     ^ prk[1]
370	 * nrk[0] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
371	 *     ^ prk[1] ^ prk[2]
372	 * nrk[1] := Rot(Sub(prk[3])) ^ RCON ^ pprk[2] ^ pprk[3] ^ prk[0]
373	 *     ^ prk[1] ^ prk[2] ^ prk[3]
374	 */
375
376	/* %xmm1 := (pprk[2], pprk[3], prk[0], prk[1]) */
377	shufps	$0b01001110,%xmm0,%xmm1
378
379	/*
380	 * %xmm5 := (0, pprk[2], pprk[3], prk[0])
381	 * %xmm6 := (0, 0, pprk[2], pprk[3])
382	 * %xmm7 := (0, 0, 0, pprk[2])
383	 */
384	movdqa	%xmm1,%xmm5
385	movdqa	%xmm1,%xmm6
386	movdqa	%xmm1,%xmm7
387	pslldq	$4,%xmm5
388	pslldq	$8,%xmm6
389	pslldq	$12,%xmm7
390
391	/* %xmm1 := (rk[0], rk[1], rk[2], rk[3]) */
392	pxor	%xmm2,%xmm1
393	pxor	%xmm5,%xmm1
394	pxor	%xmm6,%xmm1
395	pxor	%xmm7,%xmm1
396
397	/* %xmm4 := (prk[2], prk[3], xxx, xxx) */
398	pshufd	$0b00001110,%xmm0,%xmm4
399
400	/* %xmm5 := (0, prk[2], xxx, xxx) */
401	movdqa	%xmm4,%xmm5
402	pslldq	$4,%xmm5
403
404	/* %xmm0 := (rk[0], rk[1], rk[2], rk[3]) */
405	movdqa	%xmm1,%xmm0
406
407	/* %xmm1 := (rk[3], rk[3], xxx, xxx) */
408	shufps	$0b00001111,%xmm1,%xmm1
409
410	/*
411	 * %xmm1 := (nrk[0] = rk[3] ^ prk[2],
412	 *     nrk[1] = rk[3] ^ prk[2] ^ prk[3],
413	 *     xxx,
414	 *     xxx)
415	 */
416	pxor	%xmm4,%xmm1
417	pxor	%xmm5,%xmm1
418
419	movdqa	%xmm0,(%rdi)	/* store round key */
420	lea	0x10(%rdi),%rdi	/* advance to next round key address */
421	ret
422END(aesni_expand192b)
423
424/*
425 * aesni_expand256a(uint128_t *rkp@rdi, uint128_t pprk@xmm0,
426 *     uint128_t prk@xmm1, uint128_t keygenassist@xmm2)
427 *
428 *	Set even-numbered AES-256 round key.
429 *
430 *	Internal ABI.  On entry:
431 *
432 *		%rdi = rkp, pointer to round key to compute
433 *		%xmm0 = (pprk[0], pprk[1], pprk[2], pprk[3])
434 *		%xmm1 = (prk[0], prk[1], prk[2], prk[3])
435 *		%xmm2 = (xxx, xxx, xxx, t = Rot(SubWord(prk[3])) ^ RCON)
436 *
437 *	On exit:
438 *
439 *		%rdi = &rkp[1], rkp advanced by one round key
440 *		%xmm0 = rk, the round key we just computed
441 *		%xmm1 = prk, previous round key, preserved from entry
442 *		%xmm2 = garbage
443 *		%xmm4 = garbage
444 *		%xmm5 = garbage
445 *		%xmm6 = garbage
446 *
447 *	The computation turns out to be the same as for AES-128; the
448 *	previous round key does not figure into it, only the
449 *	previous-previous round key.
450 */
451	aesni_expand256a = aesni_expand128
452
453/*
454 * aesni_expand256b(uint128_t *rkp@rdi, uint128_t prk@xmm0,
455 *     uint128_t pprk@xmm1, uint128_t keygenassist@xmm2)
456 *
457 *	Set odd-numbered AES-256 round key.
458 *
459 *	Internal ABI.  On entry:
460 *
461 *		%rdi = rkp, pointer to round key to compute
462 *		%xmm0 = (prk[0], prk[1], prk[2], prk[3])
463 *		%xmm1 = (pprk[0], pprk[1], pprk[2], pprk[3])
464 *		%xmm2 = (xxx, xxx, t = Sub(prk[3]), xxx)
465 *
466 *	On exit:
467 *
468 *		%rdi = &rkp[1], rkp advanced by one round key
469 *		%xmm0 = prk, previous round key, preserved from entry
470 *		%xmm1 = rk, the round key we just computed
471 *		%xmm2 = garbage
472 *		%xmm4 = garbage
473 *		%xmm5 = garbage
474 *		%xmm6 = garbage
475 */
476	.text
477	_ALIGN_TEXT
478	.type	aesni_expand256b,@function
479aesni_expand256b:
480	/*
481	 * %xmm2 := (%xmm2[2], %xmm2[2], %xmm2[2], %xmm2[2]),
482	 * i.e., set each word of %xmm2 to t := Sub(prk[3]).
483	 */
484	pshufd	$0b10101010,%xmm2,%xmm2
485
486	/*
487	 * %xmm4 := (0, pprk[0], pprk[1], pprk[2])
488	 * %xmm5 := (0, 0, pprk[0], pprk[1])
489	 * %xmm6 := (0, 0, 0, pprk[0])
490	 */
491	movdqa	%xmm1,%xmm4
492	movdqa	%xmm1,%xmm5
493	movdqa	%xmm1,%xmm6
494	pslldq	$4,%xmm4
495	pslldq	$8,%xmm5
496	pslldq	$12,%xmm6
497
498	/*
499	 * %xmm1 := (rk[0] = t ^ pprk[0],
500	 *     rk[1] = t ^ pprk[0] ^ pprk[1],
501	 *     rk[2] = t ^ pprk[0] ^ pprk[1] ^ pprk[2],
502	 *     rk[3] = t ^ pprk[0] ^ pprk[1] ^ pprk[2] ^ pprk[3])
503	 */
504	pxor	%xmm2,%xmm1
505	pxor	%xmm4,%xmm1
506	pxor	%xmm5,%xmm1
507	pxor	%xmm6,%xmm1
508
509	movdqa	%xmm1,(%rdi)	/* store round key */
510	lea	0x10(%rdi),%rdi	/* advance to next round key address */
511	ret
512END(aesni_expand256b)
513
514/*
515 * aesni_enctodec(const struct aesenc *enckey@rdi, struct aesdec *deckey@rsi,
516 *     uint32_t nrounds@rdx)
517 *
518 *	Convert AES encryption round keys to AES decryption round keys.
519 *	nrounds must be between 10 and 14.
520 *
521 *	Standard ABI calling convention.
522 */
523ENTRY(aesni_enctodec)
524	shl	$4,%edx		/* rdx := byte offset of last round key */
525	movdqa	(%rdi,%rdx),%xmm0	/* load last round key */
526	movdqa	%xmm0,(%rsi)	/* store last round key verbatim */
527	jmp	2f
528	_ALIGN_TEXT
5291:	movdqa	(%rdi,%rdx),%xmm0	/* load round key */
530	aesimc	%xmm0,%xmm0	/* convert encryption to decryption */
531	movdqa	%xmm0,(%rsi)	/* store round key */
5322:	sub	$0x10,%rdx	/* advance to next round key */
533	lea	0x10(%rsi),%rsi
534	jnz	1b		/* repeat if more rounds */
535	movdqa	(%rdi),%xmm0	/* load first round key */
536	movdqa	%xmm0,(%rsi)	/* store first round key verbatim */
537	ret
538END(aesni_enctodec)
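
/*
 * Illustrative sketch (not part of the build): the decryption key
 * schedule for the equivalent inverse cipher is the encryption key
 * schedule in reverse order, with InvMixColumns applied to every round
 * key except the first and last -- which is what the AESIMC loop above
 * does.  In C, assuming an invmixcolumns() helper that is not defined
 * here (and <string.h> for memcpy):
 *
 *	void
 *	enctodec(const uint8_t enc[][16], uint8_t dec[][16],
 *	    uint32_t nrounds)
 *	{
 *		uint32_t i;
 *
 *		memcpy(dec[0], enc[nrounds], 16);
 *		for (i = 1; i < nrounds; i++)
 *			invmixcolumns(enc[nrounds - i], dec[i]);
 *		memcpy(dec[nrounds], enc[0], 16);
 *	}
 */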
539
540/*
541 * aesni_enc(const struct aesenc *enckey@rdi, const uint8_t in[16] @rsi,
542 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
543 *
544 *	Encrypt a single block.
545 *
546 *	Standard ABI calling convention.
547 */
548ENTRY(aesni_enc)
549	movdqu	(%rsi),%xmm0
550	call	aesni_enc1
551	movdqu	%xmm0,(%rdx)
552	ret
553END(aesni_enc)
554
555/*
556 * aesni_dec(const struct aesdec *deckey@rdi, const uint8_t in[16] @rsi,
557 *     uint8_t out[16] @rdx, uint32_t nrounds@ecx)
558 *
559 *	Decrypt a single block.
560 *
561 *	Standard ABI calling convention.
562 */
563ENTRY(aesni_dec)
564	movdqu	(%rsi),%xmm0
565	call	aesni_dec1
566	movdqu	%xmm0,(%rdx)
567	ret
568END(aesni_dec)
569
570/*
571 * aesni_cbc_enc(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
572 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t iv[16] @r8,
573 *     uint32_t nrounds@r9d)
574 *
575 *	Encrypt a contiguous sequence of blocks with AES-CBC.
576 *
577 *	nbytes must be an integral multiple of 16.
578 *
579 *	Standard ABI calling convention.
580 */
581ENTRY(aesni_cbc_enc)
582	cmp	$0,%rcx
583	jz	2f
584	mov	%rcx,%r10		/* r10 := nbytes */
585	movdqu	(%r8),%xmm0		/* xmm0 := chaining value */
586	_ALIGN_TEXT
5871:	movdqu	(%rsi),%xmm1		/* xmm1 := plaintext block */
588	lea	0x10(%rsi),%rsi
589	pxor	%xmm1,%xmm0		/* xmm0 := cv ^ ptxt */
590	mov	%r9d,%ecx		/* ecx := nrounds */
591	call	aesni_enc1		/* xmm0 := ciphertext block */
592	movdqu	%xmm0,(%rdx)
593	lea	0x10(%rdx),%rdx
594	sub	$0x10,%r10
595	jnz	1b			/* repeat if r10 is nonzero */
596	movdqu	%xmm0,(%r8)		/* store chaining value */
5972:	ret
598END(aesni_cbc_enc)
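
/*
 * Illustrative sketch (not part of the build): the loop above is plain
 * CBC encryption, carrying the chaining value in %xmm0 the whole time
 * and writing it back to iv[] only at the end.  A C rendition,
 * assuming a single-block helper aes_enc(key, in, out, nrounds) that
 * is not defined here (the early return for nbytes == 0 is omitted):
 *
 *	void
 *	cbc_enc(const struct aesenc *key, const uint8_t *in,
 *	    uint8_t *out, size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t cv[16];
 *		unsigned i;
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				cv[i] ^= in[i];
 *			aes_enc(key, cv, cv, nrounds);
 *			memcpy(out, cv, 16);
 *		}
 *		memcpy(iv, cv, 16);
 *	}
 */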
599
600/*
601 * aesni_cbc_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
602 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
603 *     uint32_t nrounds@r9)
604 *
605 *	Decrypt a contiguous sequence of blocks with AES-CBC.
606 *
607 *	nbytes must be a positive integral multiple of 16.  This routine
608 *	is not vectorized; use aesni_cbc_dec8 for >=8 blocks at once.
609 *
610 *	Standard ABI calling convention.
611 */
612ENTRY(aesni_cbc_dec1)
613	push	%rbp			/* create stack frame uint128[1] */
614	mov	%rsp,%rbp
615	sub	$0x10,%rsp
616	movdqu	(%r8),%xmm8		/* xmm8 := iv */
617	movdqa	%xmm8,(%rsp)		/* save iv */
618	mov	%rcx,%r10		/* r10 := nbytes */
619	movdqu	-0x10(%rsi,%r10),%xmm0	/* xmm0 := last ciphertext block */
620	movdqu	%xmm0,(%r8)		/* update iv */
621	jmp	2f
622	_ALIGN_TEXT
6231:	movdqu	-0x10(%rsi,%r10),%xmm8	/* xmm8 := chaining value */
624	pxor	%xmm8,%xmm0		/* xmm0 := ptxt */
625	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
626	movdqa	%xmm8,%xmm0		/* move cv = ciphertext block */
6272:	mov	%r9d,%ecx		/* ecx := nrounds */
628	call	aesni_dec1		/* xmm0 := cv ^ ptxt */
629	sub	$0x10,%r10
630	jnz	1b			/* repeat if more blocks */
631	pxor	(%rsp),%xmm0		/* xmm0 := ptxt */
632	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
633	leave
634	ret
635END(aesni_cbc_dec1)
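
/*
 * Illustrative sketch (not part of the build): the loop above walks
 * the buffer from the last block to the first, so the new iv (the
 * last ciphertext block) can be stored up front and each plaintext
 * block is formed from the ciphertext block preceding it; the first
 * block's chaining value, the original iv, is kept on the stack until
 * the end.  A front-to-back C equivalent, assuming an aes_dec() helper
 * that is not defined here:
 *
 *	void
 *	cbc_dec(const struct aesdec *key, const uint8_t *in,
 *	    uint8_t *out, size_t nbytes, uint8_t iv[16], uint32_t nrounds)
 *	{
 *		uint8_t cv[16], tmp[16];
 *		unsigned i;
 *
 *		memcpy(cv, iv, 16);
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			aes_dec(key, in, tmp, nrounds);
 *			for (i = 0; i < 16; i++)
 *				tmp[i] ^= cv[i];
 *			memcpy(cv, in, 16);
 *			memcpy(out, tmp, 16);
 *		}
 *		memcpy(iv, cv, 16);
 *	}
 */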
636
637/*
638 * aesni_cbc_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
639 *     uint8_t *out@rdx, size_t nbytes@rcx, const uint8_t iv[16] @r8,
640 *     uint32_t nrounds@r9)
641 *
642 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
643 *
644 *	nbytes must be a positive integral multiple of 128.
645 *
646 *	Standard ABI calling convention.
647 */
648ENTRY(aesni_cbc_dec8)
649	push	%rbp			/* create stack frame uint128[1] */
650	mov	%rsp,%rbp
651	sub	$0x10,%rsp
652	movdqu	(%r8),%xmm8		/* xmm8 := iv */
653	movdqa	%xmm8,(%rsp)		/* save iv */
654	mov	%rcx,%r10		/* r10 := nbytes */
655	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := ciphertext block[n-1] */
656	movdqu	%xmm7,(%r8)		/* update iv */
657	jmp	2f
658	_ALIGN_TEXT
6591:	movdqu	-0x10(%rsi,%r10),%xmm7	/* xmm7 := cv[0] */
660	pxor	%xmm7,%xmm0		/* xmm0 := ptxt[0] */
661	movdqu	%xmm0,(%rdx,%r10)	/* store plaintext block */
6622:	movdqu	-0x20(%rsi,%r10),%xmm6	/* xmm6 := ciphertext block[n-2] */
663	movdqu	-0x30(%rsi,%r10),%xmm5	/* xmm5 := ciphertext block[n-3] */
664	movdqu	-0x40(%rsi,%r10),%xmm4	/* xmm4 := ciphertext block[n-4] */
665	movdqu	-0x50(%rsi,%r10),%xmm3	/* xmm3 := ciphertext block[n-5] */
666	movdqu	-0x60(%rsi,%r10),%xmm2	/* xmm2 := ciphertext block[n-6] */
667	movdqu	-0x70(%rsi,%r10),%xmm1	/* xmm1 := ciphertext block[n-7] */
668	movdqu	-0x80(%rsi,%r10),%xmm0	/* xmm0 := ciphertext block[n-8] */
669	movdqa	%xmm6,%xmm15		/* xmm[8+i] := cv[i], 0<i<8 */
670	movdqa	%xmm5,%xmm14
671	movdqa	%xmm4,%xmm13
672	movdqa	%xmm3,%xmm12
673	movdqa	%xmm2,%xmm11
674	movdqa	%xmm1,%xmm10
675	movdqa	%xmm0,%xmm9
676	mov	%r9d,%ecx		/* ecx := nrounds */
677	call	aesni_dec8		/* xmm[i] := cv[i] ^ ptxt[i], 0<=i<8 */
678	pxor	%xmm15,%xmm7		/* xmm[i] := ptxt[i], 0<i<8 */
679	pxor	%xmm14,%xmm6
680	pxor	%xmm13,%xmm5
681	pxor	%xmm12,%xmm4
682	pxor	%xmm11,%xmm3
683	pxor	%xmm10,%xmm2
684	pxor	%xmm9,%xmm1
685	movdqu	%xmm7,-0x10(%rdx,%r10)	/* store plaintext blocks */
686	movdqu	%xmm6,-0x20(%rdx,%r10)
687	movdqu	%xmm5,-0x30(%rdx,%r10)
688	movdqu	%xmm4,-0x40(%rdx,%r10)
689	movdqu	%xmm3,-0x50(%rdx,%r10)
690	movdqu	%xmm2,-0x60(%rdx,%r10)
691	movdqu	%xmm1,-0x70(%rdx,%r10)
692	sub	$0x80,%r10
693	jnz	1b			/* repeat if more blocks */
694	pxor	(%rsp),%xmm0		/* xmm0 := ptxt[0] */
695	movdqu	%xmm0,(%rdx)		/* store first plaintext block */
696	leave
697	ret
698END(aesni_cbc_dec8)
699
700/*
701 * aesni_xts_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
702 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
703 *     uint32_t nrounds@r9d)
704 *
705 *	Encrypt a contiguous sequence of blocks with AES-XTS.
706 *
707 *	nbytes must be a positive integral multiple of 16.  This routine
708 *	is not vectorized; use aesni_xts_enc8 for >=8 blocks at once.
709 *
710 *	Standard ABI calling convention.
711 */
712ENTRY(aesni_xts_enc1)
713	mov	%rcx,%r10		/* r10 := nbytes */
714	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
715	_ALIGN_TEXT
7161:	movdqu	(%rsi),%xmm0		/* xmm0 := ptxt */
717	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
718	pxor	%xmm15,%xmm0		/* xmm0 := ptxt ^ tweak */
719	mov	%r9d,%ecx		/* ecx := nrounds */
720	call	aesni_enc1		/* xmm0 := AES(ptxt ^ tweak) */
721	pxor	%xmm15,%xmm0		/* xmm0 := AES(ptxt ^ tweak) ^ tweak */
722	movdqu	%xmm0,(%rdx)		/* store ciphertext block */
723	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
724	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
725	sub	$0x10,%r10
726	jnz	1b			/* repeat if more blocks */
727	movdqu	%xmm15,(%r8)		/* update tweak */
728	ret
729END(aesni_xts_enc1)
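
/*
 * Illustrative sketch (not part of the build): each block is handled
 * as ctxt = AES(ptxt ^ tweak) ^ tweak, and the tweak is then advanced
 * by multiplication by x in GF(2^128) (aesni_xts_mulx below).  In C,
 * assuming aes_enc() and xts_mulx() helpers that are not defined here:
 *
 *	void
 *	xts_enc(const struct aesenc *key, const uint8_t *in,
 *	    uint8_t *out, size_t nbytes, uint8_t tweak[16],
 *	    uint32_t nrounds)
 *	{
 *		uint8_t b[16];
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				b[i] = in[i] ^ tweak[i];
 *			aes_enc(key, b, b, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[i] = b[i] ^ tweak[i];
 *			xts_mulx(tweak);
 *		}
 *	}
 */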
730
731/*
732 * aesni_xts_enc8(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
733 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
734 *     uint32_t nrounds@r9d)
735 *
736 *	Encrypt a contiguous sequence of blocks with AES-XTS.
737 *
738 *	nbytes must be a positive integral multiple of 128.
739 *
740 *	Standard ABI calling convention.
741 */
742ENTRY(aesni_xts_enc8)
743	push	%rbp			/* create stack frame uint128[1] */
744	mov	%rsp,%rbp
745	sub	$0x10,%rsp
746	mov	%rcx,%r10		/* r10 := nbytes */
747	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
748	_ALIGN_TEXT
7491:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
750	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
751	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
752	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
753	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
754	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
755	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
756	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
757	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
758	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
759	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
760	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
761	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
762	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
763	movdqu	(%rsi),%xmm0		/* xmm[i] := ptxt[i] */
764	movdqu	0x10(%rsi),%xmm1
765	movdqu	0x20(%rsi),%xmm2
766	movdqu	0x30(%rsi),%xmm3
767	movdqu	0x40(%rsi),%xmm4
768	movdqu	0x50(%rsi),%xmm5
769	movdqu	0x60(%rsi),%xmm6
770	movdqu	0x70(%rsi),%xmm7
771	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
772	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
773	pxor	%xmm8,%xmm0		/* xmm[i] := ptxt[i] ^ tweak[i] */
774	pxor	%xmm9,%xmm1
775	pxor	%xmm10,%xmm2
776	pxor	%xmm11,%xmm3
777	pxor	%xmm12,%xmm4
778	pxor	%xmm13,%xmm5
779	pxor	%xmm14,%xmm6
780	pxor	%xmm15,%xmm7
781	mov	%r9d,%ecx		/* ecx := nrounds */
782	call	aesni_enc8		/* xmm[i] := AES(ptxt[i] ^ tweak[i]) */
783	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
784	pxor	%xmm9,%xmm1
785	pxor	%xmm10,%xmm2
786	pxor	%xmm11,%xmm3
787	pxor	%xmm12,%xmm4
788	pxor	%xmm13,%xmm5
789	pxor	%xmm14,%xmm6
790	pxor	%xmm15,%xmm7
791	movdqu	%xmm0,(%rdx)		/* store ciphertext blocks */
792	movdqu	%xmm1,0x10(%rdx)
793	movdqu	%xmm2,0x20(%rdx)
794	movdqu	%xmm3,0x30(%rdx)
795	movdqu	%xmm4,0x40(%rdx)
796	movdqu	%xmm5,0x50(%rdx)
797	movdqu	%xmm6,0x60(%rdx)
798	movdqu	%xmm7,0x70(%rdx)
799	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
800	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
801	sub	$0x80,%r10
802	jnz	1b			/* repeat if more block groups */
803	movdqu	%xmm15,(%r8)		/* update tweak */
804	leave
805	ret
806END(aesni_xts_enc8)
807
808/*
809 * aesni_xts_dec1(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
810 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
811 *     uint32_t nrounds@r9d)
812 *
813 *	Decrypt a contiguous sequence of blocks with AES-XTS.
814 *
815 *	nbytes must be a positive integral multiple of 16.  This routine
816 *	is not vectorized; use aesni_xts_dec8 for >=8 blocks at once.
817 *
818 *	Standard ABI calling convention.
819 */
820ENTRY(aesni_xts_dec1)
821	mov	%rcx,%r10		/* r10 := nbytes */
822	movdqu	(%r8),%xmm15		/* xmm15 := tweak */
823	_ALIGN_TEXT
8241:	movdqu	(%rsi),%xmm0		/* xmm0 := ctxt */
825	lea	0x10(%rsi),%rsi		/* advance rsi to next block */
826	pxor	%xmm15,%xmm0		/* xmm0 := ctxt ^ tweak */
827	mov	%r9d,%ecx		/* ecx := nrounds */
828	call	aesni_dec1		/* xmm0 := AES(ctxt ^ tweak) */
829	pxor	%xmm15,%xmm0		/* xmm0 := AES(ctxt ^ tweak) ^ tweak */
830	movdqu	%xmm0,(%rdx)		/* store plaintext block */
831	lea	0x10(%rdx),%rdx		/* advance rdx to next block */
832	call	aesni_xts_mulx		/* xmm15 *= x; trash xmm0 */
833	sub	$0x10,%r10
834	jnz	1b			/* repeat if more blocks */
835	movdqu	%xmm15,(%r8)		/* update tweak */
836	ret
837END(aesni_xts_dec1)
838
839/*
840 * aesni_xts_dec8(const struct aesdec *deckey@rdi, const uint8_t *in@rsi,
841 *     uint8_t *out@rdx, size_t nbytes@rcx, uint8_t tweak[16] @r8,
842 *     uint32_t nrounds@r9d)
843 *
844 *	Decrypt a contiguous sequence of blocks with AES-XTS.
845 *
846 *	nbytes must be a positive integral multiple of 128.
847 *
848 *	Standard ABI calling convention.
849 */
850ENTRY(aesni_xts_dec8)
851	push	%rbp			/* create stack frame uint128[1] */
852	mov	%rsp,%rbp
853	sub	$0x10,%rsp
854	mov	%rcx,%r10		/* r10 := nbytes */
855	movdqu	(%r8),%xmm15		/* xmm15 := tweak[0] */
856	_ALIGN_TEXT
8571:	movdqa	%xmm15,%xmm8		/* xmm8 := tweak[0] */
858	call	aesni_xts_mulx		/* xmm15 := tweak[1] */
859	movdqa	%xmm15,%xmm9		/* xmm9 := tweak[1] */
860	call	aesni_xts_mulx		/* xmm15 := tweak[2] */
861	movdqa	%xmm15,%xmm10		/* xmm10 := tweak[2] */
862	call	aesni_xts_mulx		/* xmm15 := tweak[3] */
863	movdqa	%xmm15,%xmm11		/* xmm11 := tweak[3] */
864	call	aesni_xts_mulx		/* xmm15 := tweak[4] */
865	movdqa	%xmm15,%xmm12		/* xmm12 := tweak[4] */
866	call	aesni_xts_mulx		/* xmm15 := tweak[5] */
867	movdqa	%xmm15,%xmm13		/* xmm13 := tweak[5] */
868	call	aesni_xts_mulx		/* xmm15 := tweak[6] */
869	movdqa	%xmm15,%xmm14		/* xmm14 := tweak[6] */
870	call	aesni_xts_mulx		/* xmm15 := tweak[7] */
871	movdqu	(%rsi),%xmm0		/* xmm[i] := ctxt[i] */
872	movdqu	0x10(%rsi),%xmm1
873	movdqu	0x20(%rsi),%xmm2
874	movdqu	0x30(%rsi),%xmm3
875	movdqu	0x40(%rsi),%xmm4
876	movdqu	0x50(%rsi),%xmm5
877	movdqu	0x60(%rsi),%xmm6
878	movdqu	0x70(%rsi),%xmm7
879	lea	0x80(%rsi),%rsi		/* advance rsi to next block group */
880	movdqa	%xmm8,(%rsp)		/* save tweak[0] */
881	pxor	%xmm8,%xmm0		/* xmm[i] := ctxt[i] ^ tweak[i] */
882	pxor	%xmm9,%xmm1
883	pxor	%xmm10,%xmm2
884	pxor	%xmm11,%xmm3
885	pxor	%xmm12,%xmm4
886	pxor	%xmm13,%xmm5
887	pxor	%xmm14,%xmm6
888	pxor	%xmm15,%xmm7
889	mov	%r9d,%ecx		/* ecx := nrounds */
890	call	aesni_dec8		/* xmm[i] := AES(ctxt[i] ^ tweak[i]) */
891	pxor	(%rsp),%xmm0		/* xmm[i] := AES(...) ^ tweak[i] */
892	pxor	%xmm9,%xmm1
893	pxor	%xmm10,%xmm2
894	pxor	%xmm11,%xmm3
895	pxor	%xmm12,%xmm4
896	pxor	%xmm13,%xmm5
897	pxor	%xmm14,%xmm6
898	pxor	%xmm15,%xmm7
899	movdqu	%xmm0,(%rdx)		/* store plaintext blocks */
900	movdqu	%xmm1,0x10(%rdx)
901	movdqu	%xmm2,0x20(%rdx)
902	movdqu	%xmm3,0x30(%rdx)
903	movdqu	%xmm4,0x40(%rdx)
904	movdqu	%xmm5,0x50(%rdx)
905	movdqu	%xmm6,0x60(%rdx)
906	movdqu	%xmm7,0x70(%rdx)
907	lea	0x80(%rdx),%rdx		/* advance rdx to next block group */
908	call	aesni_xts_mulx		/* xmm15 := tweak[8] */
909	sub	$0x80,%r10
910	jnz	1b			/* repeat if more block groups */
911	movdqu	%xmm15,(%r8)		/* update tweak */
912	leave
913	ret
914END(aesni_xts_dec8)
915
916/*
917 * aesni_xts_mulx(tweak@xmm15)
918 *
919 *	Multiply xmm15 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
920 *	Uses %xmm0 as temporary.
921 */
922	.text
923	_ALIGN_TEXT
924	.type	aesni_xts_mulx,@function
925aesni_xts_mulx:
926	/*
927	 * Simultaneously determine
928	 * (a) whether the high bit of the low quadword must be
929	 *     shifted into the low bit of the high quadword, and
930	 * (b) whether the high bit of the high quadword must be
931	 *     carried into x^128 = x^7 + x^2 + x + 1.
932	 */
933	pxor	%xmm0,%xmm0	/* xmm0 := 0 */
934	pcmpgtq	%xmm15,%xmm0	/* xmm0[i] := -1 if 0 > xmm15[i] else 0 */
935	pshufd	$0b01001110,%xmm0,%xmm0	/* swap halves of xmm0 */
936	pand	xtscarry(%rip),%xmm0	/* copy xtscarry according to mask */
937	psllq	$1,%xmm15	/* shift */
938	pxor	%xmm0,%xmm15	/* incorporate (a) and (b) */
939	ret
940END(aesni_xts_mulx)
941
942	.section .rodata
943	.p2align 4
944	.type	xtscarry,@object
945xtscarry:
946	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
947END(xtscarry)
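
/*
 * Illustrative sketch (not part of the build): multiplication by x
 * modulo x^128 + x^7 + x^2 + x + 1, as done by aesni_xts_mulx above,
 * shifts the 128-bit tweak left by one bit; a bit that falls off the
 * top is folded back in as 0x87 in the low byte, and the bit shifted
 * out of the low quadword is carried into bit 0 of the high quadword.
 * A C version on two little-endian 64-bit halves:
 *
 *	void
 *	xts_mulx(uint64_t t[2])
 *	{
 *		uint64_t carry0 = t[0] >> 63;
 *		uint64_t carry1 = t[1] >> 63;
 *
 *		t[0] = (t[0] << 1) ^ (carry1 ? 0x87 : 0);
 *		t[1] = (t[1] << 1) ^ carry0;
 *	}
 */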
948
949/*
950 * aesni_xts_update(const uint8_t in[16] @rdi, uint8_t out[16] @rsi)
951 *
952 *	Update an AES-XTS tweak.
953 *
954 *	Standard ABI calling convention.
955 */
956ENTRY(aesni_xts_update)
957	movdqu	(%rdi),%xmm15
958	call	aesni_xts_mulx
959	movdqu	%xmm15,(%rsi)
960	ret
961END(aesni_xts_update)
962
963/*
964 * aesni_cbcmac_update1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
965 *     size_t nbytes@rdx, uint8_t auth[16] @rcx, uint32_t nrounds@r8d)
966 *
967 *	Update CBC-MAC.
968 *
969 *	nbytes must be a positive integral multiple of 16.
970 *
971 *	Standard ABI calling convention.
972 */
973ENTRY(aesni_cbcmac_update1)
974	movdqu	(%rcx),%xmm0		/* xmm0 := auth */
975	mov	%rdx,%r10		/* r10 := nbytes */
976	mov	%rcx,%rdx		/* rdx := &auth */
977	_ALIGN_TEXT
9781:	pxor	(%rsi),%xmm0		/* xmm0 ^= plaintext block */
979	lea	0x10(%rsi),%rsi
980	mov	%r8d,%ecx		/* ecx := nrounds */
981	call	aesni_enc1		/* xmm0 := auth'; trash rax,rcx,xmm8 */
982	sub	$0x10,%r10
983	jnz	1b
984	movdqu	%xmm0,(%rdx)		/* store auth' */
985	ret
986END(aesni_cbcmac_update1)
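
/*
 * Illustrative sketch (not part of the build): CBC-MAC absorbs each
 * block by XORing it into the running authenticator and
 * re-encrypting, i.e. auth = AES(auth ^ block).  In C, with the same
 * assumed aes_enc() helper as in the sketches above:
 *
 *	void
 *	cbcmac_update(const struct aesenc *key, const uint8_t *in,
 *	    size_t nbytes, uint8_t auth[16], uint32_t nrounds)
 *	{
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16) {
 *			for (i = 0; i < 16; i++)
 *				auth[i] ^= in[i];
 *			aes_enc(key, auth, auth, nrounds);
 *		}
 *	}
 */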
987
988/*
989 * aesni_ccm_enc1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
990 *     uint8_t *out@rdx, size_t nbytes@rcx,
991 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
992 *
993 *	Update CCM encryption.
994 *
995 *	nbytes must be a positive integral multiple of 16.
996 *
997 *	Standard ABI calling convention.
998 */
999ENTRY(aesni_ccm_enc1)
1000	mov	%rcx,%r10		/* r10 := nbytes */
1001	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
1002	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
1003	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
1004	movdqu	(%r8),%xmm0		/* xmm0 := auth */
1005	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
1006	_ALIGN_TEXT
10071:	movdqu	(%rsi),%xmm3		/* xmm3 := plaintext block */
1008	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
1009	lea	0x10(%rsi),%rsi
1010	movdqa	%xmm2,%xmm1		/* xmm1 := ctr (le) */
1011	mov	%r9d,%ecx		/* ecx := nrounds */
1012	pshufb	%xmm4,%xmm1		/* xmm1 := ctr (be) */
1013	pxor	%xmm3,%xmm0		/* xmm0 := auth ^ ptxt */
1014	call	aesni_enc2		/* xmm0 := auth', xmm1 := pad; trash rax/rcx/xmm8 */
1015	pxor	%xmm1,%xmm3		/* xmm3 := ciphertext block */
1016	sub	$0x10,%r10		/* count down bytes */
1017	movdqu	%xmm3,(%rdx)		/* store ciphertext block */
1018	lea	0x10(%rdx),%rdx
1019	jnz	1b			/* repeat if more blocks */
1020	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
1021	movdqu	%xmm0,(%r8)		/* store updated auth */
1022	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
1023	ret
1024END(aesni_ccm_enc1)
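
/*
 * Illustrative sketch (not part of the build): CCM encryption combines
 * CBC-MAC over the plaintext with CTR-mode encryption; the loop above
 * performs both AES invocations per block in one aesni_enc2 call so
 * they share the round-key loads, and, like the assembly, only the
 * last 32 bits of the counter are incremented (ctr32_inc).  A C
 * rendition, assuming the same aes_enc() helper as above and using
 * be32dec/be32enc from <sys/endian.h>:
 *
 *	void
 *	ccm_enc(const struct aesenc *key, const uint8_t *in,
 *	    uint8_t *out, size_t nbytes, uint8_t authctr[32],
 *	    uint32_t nrounds)
 *	{
 *		uint8_t *auth = authctr, *ctr = authctr + 16;
 *		uint8_t pad[16];
 *		unsigned i;
 *
 *		for (; nbytes; nbytes -= 16, in += 16, out += 16) {
 *			for (i = 0; i < 16; i++)
 *				auth[i] ^= in[i];
 *			aes_enc(key, auth, auth, nrounds);
 *			be32enc(ctr + 12, be32dec(ctr + 12) + 1);
 *			aes_enc(key, ctr, pad, nrounds);
 *			for (i = 0; i < 16; i++)
 *				out[i] = in[i] ^ pad[i];
 *		}
 *	}
 */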
1025
1026/*
1027 * aesni_ccm_dec1(const struct aesenc *enckey@rdi, const uint8_t *in@rsi,
1028 *     uint8_t *out@rdx, size_t nbytes@rcx,
1029 *     uint8_t authctr[32] @r8, uint32_t nrounds@r9d)
1030 *
1031 *	Update CCM decryption.
1032 *
1033 *	nbytes must be a positive integral multiple of 16.
1034 *
1035 *	Standard ABI calling convention.
1036 */
1037ENTRY(aesni_ccm_dec1)
1038	movdqu	0x10(%r8),%xmm2		/* xmm2 := ctr (be) */
1039	movdqa	bswap32(%rip),%xmm4	/* xmm4 := bswap32 table */
1040	movdqa	ctr32_inc(%rip),%xmm5	/* xmm5 := (0,0,0,1) (le) */
1041	movdqu	(%r8),%xmm1		/* xmm1 := auth */
1042	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (le) */
1043	mov	%rcx,%r10		/* r10 := nbytes */
1044
1045	/* Decrypt the first block.  */
1046	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
1047	mov	%r9d,%ecx		/* ecx := nrounds */
1048	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
1049	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
1050	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
1051	lea	0x10(%rsi),%rsi
1052	call	aesni_enc1		/* xmm0 := pad; trash rax/rcx/xmm8 */
1053	jmp	2f
1054
1055	_ALIGN_TEXT
10561:	/*
1057	 * Authenticate the last block and decrypt the next block
1058	 * simultaneously.
1059	 *
1060	 *	xmm1 = auth ^ ptxt[-1]
1061	 *	xmm2 = ctr[-1] (le)
1062	 */
1063	paddd	%xmm5,%xmm2		/* increment ctr (32-bit) */
1064	mov	%r9d,%ecx		/* ecx := nrounds */
1065	movdqa	%xmm2,%xmm0		/* xmm0 := ctr (le) */
1066	movdqu	(%rsi),%xmm3		/* xmm3 := ctxt */
1067	pshufb	%xmm4,%xmm0		/* xmm0 := ctr (be) */
1068	lea	0x10(%rsi),%rsi
1069	call	aesni_enc2		/* xmm0 := pad, xmm1 := auth';
1070					 * trash rax/rcx/xmm8 */
10712:	pxor	%xmm0,%xmm3		/* xmm3 := ptxt */
1072	sub	$0x10,%r10
1073	movdqu	%xmm3,(%rdx)		/* store plaintext */
1074	lea	0x10(%rdx),%rdx
1075	pxor	%xmm3,%xmm1		/* xmm1 := auth ^ ptxt */
1076	jnz	1b
1077
1078	/* Authenticate the last block.  */
1079	movdqa	%xmm1,%xmm0		/* xmm0 := auth ^ ptxt */
1080	mov	%r9d,%ecx		/* ecx := nrounds */
1081	call	aesni_enc1		/* xmm0 := auth' */
1082	pshufb	%xmm4,%xmm2		/* xmm2 := ctr (be) */
1083	movdqu	%xmm0,(%r8)		/* store updated auth */
1084	movdqu	%xmm2,0x10(%r8)		/* store updated ctr */
1085	ret
1086END(aesni_ccm_dec1)
1087
1088	.section .rodata
1089	.p2align 4
1090	.type	bswap32,@object
1091bswap32:
1092	.byte	3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
1093END(bswap32)
1094
1095	.section .rodata
1096	.p2align 4
1097	.type	ctr32_inc,@object
1098ctr32_inc:
1099	.byte	0,0,0,0, 0,0,0,0, 0,0,0,0, 1,0,0,0
1100END(ctr32_inc)
1101
1102/*
1103 * aesni_enc1(const struct aesenc *enckey@rdi, uint128_t block@xmm0,
1104 *     uint32_t nrounds@ecx)
1105 *
1106 *	Encrypt a single AES block in %xmm0.
1107 *
1108 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
1109 */
1110	.text
1111	_ALIGN_TEXT
1112	.type	aesni_enc1,@function
1113aesni_enc1:
1114	pxor	(%rdi),%xmm0	/* xor in first round key */
1115	shl	$4,%ecx		/* ecx := total byte size of round keys */
1116	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
1117	neg	%rcx		/* rcx := byte offset of round key from end */
1118	jmp	2f
1119	_ALIGN_TEXT
11201:	aesenc	%xmm8,%xmm0
11212:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
1122	add	$0x10,%rcx
1123	jnz	1b		/* repeat if more rounds */
1124	aesenclast %xmm8,%xmm0
1125	ret
1126END(aesni_enc1)
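
/*
 * Illustrative sketch (not part of the build): the same round loop
 * written with the AES-NI compiler intrinsics from <wmmintrin.h>
 * (compile with -maes), for an expanded key of nrounds+1 16-byte
 * round keys, 16-byte aligned as in struct aesenc:
 *
 *	#include <wmmintrin.h>
 *
 *	__m128i
 *	enc1(const __m128i *rk, __m128i m, unsigned nrounds)
 *	{
 *		unsigned i;
 *
 *		m = _mm_xor_si128(m, rk[0]);
 *		for (i = 1; i < nrounds; i++)
 *			m = _mm_aesenc_si128(m, rk[i]);
 *		return _mm_aesenclast_si128(m, rk[nrounds]);
 *	}
 */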
1127
1128/*
1129 * aesni_enc2(const struct aesenc *enckey@rdi, uint128_t block0@xmm0,
1130 *     uint128_t block1@xmm1, uint32_t nrounds@ecx)
1131 *
1132 *	Encrypt two AES blocks in %xmm0 and %xmm1.
1133 *
1134 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
1135 */
1136	.text
1137	_ALIGN_TEXT
1138	.type	aesni_enc2,@function
1139aesni_enc2:
1140	movdqa	(%rdi),%xmm8	/* xmm8 := first round key */
1141	shl	$4,%ecx		/* ecx := total byte size of round keys */
1142	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
1143	neg	%rcx		/* rcx := byte offset of round key from end */
1144	pxor	%xmm8,%xmm0	/* xor in first round key */
1145	pxor	%xmm8,%xmm1
1146	jmp	2f
1147	_ALIGN_TEXT
11481:	aesenc	%xmm8,%xmm0
1149	aesenc	%xmm8,%xmm1
11502:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
1151	add	$0x10,%rcx
1152	jnz	1b		/* repeat if there's more */
1153	aesenclast %xmm8,%xmm0
1154	aesenclast %xmm8,%xmm1
1155	ret
1156END(aesni_enc2)
1157
1158/*
1159 * aesni_enc8(const struct aesenc *enckey@rdi, uint128_t block0@xmm0, ...,
1160 *     block7@xmm7, uint32_t nrounds@ecx)
1161 *
1162 *	Encrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
1163 *
1164 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
1165 */
1166	.text
1167	_ALIGN_TEXT
1168	.type	aesni_enc8,@function
1169aesni_enc8:
1170	movdqa	(%rdi),%xmm8	/* xor in first round key */
1171	pxor	%xmm8,%xmm0
1172	pxor	%xmm8,%xmm1
1173	pxor	%xmm8,%xmm2
1174	pxor	%xmm8,%xmm3
1175	pxor	%xmm8,%xmm4
1176	pxor	%xmm8,%xmm5
1177	pxor	%xmm8,%xmm6
1178	pxor	%xmm8,%xmm7
1179	shl	$4,%ecx		/* ecx := total byte size of round keys */
1180	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
1181	neg	%rcx		/* rcx := byte offset of round key from end */
1182	jmp	2f
1183	_ALIGN_TEXT
11841:	aesenc	%xmm8,%xmm0
1185	aesenc	%xmm8,%xmm1
1186	aesenc	%xmm8,%xmm2
1187	aesenc	%xmm8,%xmm3
1188	aesenc	%xmm8,%xmm4
1189	aesenc	%xmm8,%xmm5
1190	aesenc	%xmm8,%xmm6
1191	aesenc	%xmm8,%xmm7
11922:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
1193	add	$0x10,%rcx
1194	jnz	1b		/* repeat if more rounds */
1195	aesenclast %xmm8,%xmm0
1196	aesenclast %xmm8,%xmm1
1197	aesenclast %xmm8,%xmm2
1198	aesenclast %xmm8,%xmm3
1199	aesenclast %xmm8,%xmm4
1200	aesenclast %xmm8,%xmm5
1201	aesenclast %xmm8,%xmm6
1202	aesenclast %xmm8,%xmm7
1203	ret
1204END(aesni_enc8)
1205
1206/*
1207 * aesni_dec1(const struct aesdec *deckey@rdi, uint128_t block@xmm0,
1208 *     uint32_t nrounds@ecx)
1209 *
1210 *	Decrypt a single AES block in %xmm0.
1211 *
1212 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
1213 */
1214	.text
1215	_ALIGN_TEXT
1216	.type	aesni_dec1,@function
1217aesni_dec1:
1218	pxor	(%rdi),%xmm0	/* xor in first round key */
1219	shl	$4,%ecx		/* ecx := total byte size of round keys */
1220	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
1221	neg	%rcx		/* rcx := byte offset of round key from end */
1222	jmp	2f
1223	_ALIGN_TEXT
12241:	aesdec	%xmm8,%xmm0
12252:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
1226	add	$0x10,%rcx
1227	jnz	1b		/* repeat if more rounds */
1228	aesdeclast %xmm8,%xmm0
1229	ret
1230END(aesni_dec1)
1231
1232/*
1233 * aesni_dec8(const struct aesdec *deckey@rdi, uint128_t block0@xmm0, ...,
1234 *     block7@xmm7, uint32_t nrounds@ecx)
1235 *
1236 *	Decrypt eight AES blocks in %xmm0 through %xmm7 in parallel.
1237 *
1238 *	Internal ABI.  Uses %rax and %xmm8 as temporaries.  Destroys %ecx.
1239 */
1240	.text
1241	_ALIGN_TEXT
1242	.type	aesni_dec8,@function
1243aesni_dec8:
1244	movdqa	(%rdi),%xmm8	/* xor in first round key */
1245	pxor	%xmm8,%xmm0
1246	pxor	%xmm8,%xmm1
1247	pxor	%xmm8,%xmm2
1248	pxor	%xmm8,%xmm3
1249	pxor	%xmm8,%xmm4
1250	pxor	%xmm8,%xmm5
1251	pxor	%xmm8,%xmm6
1252	pxor	%xmm8,%xmm7
1253	shl	$4,%ecx		/* ecx := total byte size of round keys */
1254	lea	0x10(%rdi,%rcx),%rax	/* rax := end of round key array */
1255	neg	%rcx		/* rcx := byte offset of round key from end */
1256	jmp	2f
1257	_ALIGN_TEXT
12581:	aesdec	%xmm8,%xmm0
1259	aesdec	%xmm8,%xmm1
1260	aesdec	%xmm8,%xmm2
1261	aesdec	%xmm8,%xmm3
1262	aesdec	%xmm8,%xmm4
1263	aesdec	%xmm8,%xmm5
1264	aesdec	%xmm8,%xmm6
1265	aesdec	%xmm8,%xmm7
12662:	movdqa	(%rax,%rcx),%xmm8	/* load round key */
1267	add	$0x10,%rcx
1268	jnz	1b		/* repeat if more rounds */
1269	aesdeclast %xmm8,%xmm0
1270	aesdeclast %xmm8,%xmm1
1271	aesdeclast %xmm8,%xmm2
1272	aesdeclast %xmm8,%xmm3
1273	aesdeclast %xmm8,%xmm4
1274	aesdeclast %xmm8,%xmm5
1275	aesdeclast %xmm8,%xmm6
1276	aesdeclast %xmm8,%xmm7
1277	ret
1278END(aesni_dec8)
1279