xref: /netbsd-src/sys/crypto/aes/arch/arm/aes_armv8_64.S (revision 32d1c65c71fbdb65a012e8392a62a757dd6853e9)
1/*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <aarch64/asm.h>
30
RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")

/* Let the assembler accept the AES instructions (aese, aesimc) used below. */
	.arch_extension	aes
34
/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) over
 *	GF(2), for n = 0, ..., 9.  Such elements of GF(2^8) need only
 *	eight bits to be represented, but we store them in 4-byte units
 *	so we can copy one into all four 4-byte lanes of a vector
 *	register with a single LD1R.  The access pattern is fixed, so
 *	indices into this table are never secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)
60
/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 *	The same four byte indices are repeated for each 32-bit lane.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)
76
/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words (no RotWord).  The same four byte
 *	indices are repeated for each 32-bit lane.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)
92
/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 *	The same four byte indices are repeated for each 32-bit lane.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)
108
/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys, stored after a
 *	verbatim copy of the master key (11 x 16 bytes are written
 *	starting at enckey).
 *
 *	Leaf routine.  Uses x2-x4, q0, q1, q3-q7, and q16 as
 *	temporaries, and advances x0 past the keys it stores.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey128)
	ld1	{v1.16b}, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) (AESE with an all-zero key) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)
165
/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys, stored after a
 *	verbatim copy of the first 16 bytes of the master key (13 x 16
 *	bytes are written starting at enckey).  Each loop iteration
 *	produces three round keys.
 *
 *	Leaf routine.  Uses x2-x5, q0-q7, q16, and q17 as temporaries,
 *	and advances x0 and x1.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
	ld1	{v2.8b}, [x1]	/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) (AESE with an all-zero key) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := ShiftRows(SubBytes(q3)) (AESE with an all-zero key) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) [already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)
344
/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys; the first two
 *	are the master key itself, and one further key follows them
 *	(15 x 16 bytes are written starting at enckey).
 *
 *	Leaf routine.  Uses x2-x5, q0-q7, q16, and q17 as temporaries,
 *	and advances x0 and x1.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ld1	{v1.16b-v2.16b}, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) (AESE with an all-zero key) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2		/* count down two rounds */
	b.eq	2f			/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) (AESE with an all-zero key) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) (no RotWord, no RCON here) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]		/* store last round key */
	ret
END(aesarmv8_setenckey256)
430
/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *     uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	The round keys are written to deckey in reverse order: the
 *	last and the first encryption round keys are copied verbatim,
 *	and every key in between goes through AESIMC.  Uses q0 as a
 *	temporary and advances x1.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f		/* last round key skips aesimc */
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)
452
/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Thin wrapper: loads the block into q0, calls aesarmv8_enc1 with
 *	enckey in x0 and nrounds in x3 as passed, and stores q0 to out.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)
470
/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *     uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Thin wrapper: loads the block into q0, calls aesarmv8_dec1 with
 *	deckey in x0 and nrounds in x3 as passed, and stores q0 to out.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)
488
/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16; zero is allowed and
 *	returns immediately without touching iv.  On return, iv holds
 *	the last ciphertext block.
 *
 *	x9 and x10 hold enckey and the remaining byte count across the
 *	calls to aesarmv8_enc1, which trashes x0 and x3.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f			/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v0.16b}, [x4]		/* q0 := chaining value */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc1		/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b			/* repeat if x10 is nonzero */
	st1	{v0.16b}, [x4]		/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)
520
/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Blocks are processed from the last one backward.  iv is read
 *	on entry and then overwritten with the last ciphertext block,
 *	so it is an in/out parameter.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x10
	ld1	{v0.16b}, [x1]		/* q0 := last ciphertext block */
	st1	{v0.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x10
	ld1	{v31.16b}, [x1]		/* q31 := chaining value */
	sub	x2, x2, #0x10
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	st1	{v0.16b}, [x2]		/* store plaintext block */
	mov	v0.16b, v31.16b		/* move cv = ciphertext block */
2:	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec1		/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x10		/* store first plaintext block */
	st1	{v0.16b}, [x2]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)
563
/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Units are processed from the last one backward.  iv is read on
 *	entry and then overwritten with the last ciphertext block, so
 *	it is an in/out parameter.  q24 holds the original iv and
 *	q25-q31 hold the chaining values for blocks 1-7 of the unit
 *	being decrypted.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]		/* q24 := iv */
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	add	x1, x1, x3		/* x1 := pointer past end of in */
	add	x2, x2, x3		/* x2 := pointer past end of out */
	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	st1	{v7.16b}, [x4]		/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := previous unit's last
					 * two ciphertext blocks */
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store pt0, pt1 of decrypted unit */
2:	sub	x1, x1, #0x20
	ld1	{v4.16b-v5.16b}, [x1]
	sub	x1, x1, #0x40
	ld1	{v0.16b-v3.16b}, [x1]

	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* q[i] := cv[i] ^ pt[i];
					 * trash x0/x3/q16 */
	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i] */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b
	subs	x10, x10, #0x80		/* count down nbytes */
	sub	x2, x2, #0x20		/* store plaintext blocks */
	st1	{v6.16b-v7.16b}, [x2]
	sub	x2, x2, #0x40
	st1	{v2.16b-v5.16b}, [x2]
	b.ne	1b			/* repeat if there's more */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)
628
/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	The tweak is multiplied by x after every block, and the final
 *	value is stored back to tweak on return.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)
662
/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of 8-block units with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	q24-q31 hold the eight tweaks of the unit being processed; the
 *	tweak following the last block is stored back to tweak on
 *	return.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_enc8		/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)
726
/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	The tweak is multiplied by x after every block, and the final
 *	value is stored back to tweak on return.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1		/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if more blocks */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)
760
/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	q24-q31 hold the eight tweaks of the unit being processed; the
 *	tweak following the last block is stored back to tweak on
 *	return.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0			/* x9 := deckey */
	mov	x10, x3			/* x10 := nbytes */
	ld1	{v31.16b}, [x4]		/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
					/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9			/* x0 := deckey */
	mov	x3, x5			/* x3 := nrounds */
	bl	aesarmv8_dec8		/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80		/* count down nbytes */
	b.ne	1b			/* repeat if more block groups */
	st1	{v31.16b}, [x4]		/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)
824
/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 *
 *	Internal ABI: the operand lives in q31 rather than in an
 *	argument register.  Leaf routine; does not touch the stack.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ld1	{v0.16b}, [x0]		/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)
851
/*
 * uint128_t xtscarry
 *
 *	Carry constants for aesarmv8_xts_mulx, which masks each 64-bit
 *	half with the sign of the other half of the tweak.  The low
 *	half is 0x87 = x^7 + x^2 + x + 1, the reduction of x^128; the
 *	high half is 1, the bit shifted out of the low half.
 */
	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0,  1,0,0,0, 0,0,0,0
END(xtscarry)
858
/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak: load it from in, multiply it by x via
 *	aesarmv8_xts_mulx, and store the result to out.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v31.16b}, [x0]		/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	st1	{v31.16b}, [x1]		/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)
875
/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *     const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *     uint32_t nrounds@x4)
 *
 *	Update CBC-MAC: for each 16-byte block of in, xor it into the
 *	authenticator and encrypt the result.  The updated
 *	authenticator is stored back to auth.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x3]		/* q0 := initial authenticator */
	mov	x9, x0			/* x9 := enckey */
	mov	x5, x3			/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x4			/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10		/* count down nbytes */
	b.ne	1b			/* repeat if x2 is nonzero */
	st1	{v0.16b}, [x5]		/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)
905
/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	authctr holds the authenticator in its first 16 bytes and the
 *	big-endian counter block in its last 16 bytes; both are
 *	updated in place.  Each iteration does the CBC-MAC step and
 *	the CTR pad generation in one aesarmv8_enc2 call.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	rev32	v2.16b, v1.16b		/* q2 := ctr (host-endian) */
	_ALIGN_TEXT
1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2		/* q0 := auth', q1 := pad;
					 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10		/* count down bytes */
	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b			/* repeat if more blocks */
	rev32	v1.16b, v2.16b		/* q1 := ctr (big-endian) */
	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)
944
/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *     uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *     uint32_t nrounds@x5)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	authctr holds the authenticator in its first 16 bytes and the
 *	big-endian counter block in its last 16 bytes; both are
 *	updated in place.  The pad for the first block and the
 *	authenticator update for the last block each use a lone
 *	aesarmv8_enc1 call; in between, aesarmv8_enc2 authenticates one
 *	block while generating the pad for the next.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc		/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]		/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0			/* x9 := enckey */
	mov	x10, x3			/* x10 := nbytes */
	rev32	v2.16b, v2.16b		/* q2 := ctr (host-endian) */

	/*
	 * Decrypt the first block.  x0 still holds enckey from entry,
	 * so it need not be reloaded from x9 here.
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc1		/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	q1 = auth ^ ptxt[-1]
	 *	q2 = ctr[-1] (le)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	rev32	v0.16b, v2.16b		/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc2		/* q0 := pad, q1 := auth';
					 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10		/* count down nbytes */
	st1	{v3.16b}, [x2], #0x10		/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b			/* repeat if more blocks */

	rev32	v2.16b, v2.16b		/* q2 := ctr (big-endian) */

	/* Authenticate the last block.  */
	mov	x0, x9			/* x0 := enckey */
	mov	x3, x5			/* x3 := nrounds */
	mov	v0.16b, v1.16b		/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1		/* q0 := auth'; trash x0/x3/q16 */

	mov	v1.16b, v2.16b		/* store updated auth/ctr */
	st1	{v0.16b-v1.16b}, [x4]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)
1008
1009	.section .rodata
/*
 * ctr32_inc
 *
 *	Read-only vector of four 32-bit words (0, 0, 0, 1), aligned to
 *	16 bytes.  aesarmv8_ccm_enc1 and aesarmv8_ccm_dec1 load it with
 *	ld1 {v5.4s} and add it lane-wise to the byte-swapped counter, so
 *	only the last 32-bit word of the counter is incremented.
 *
 *	This leaves the assembler in .rodata; the routines that follow
 *	switch back with their own .text directive.
 */
1010	.p2align 4
1011	.type	ctr32_inc,@object
1012ctr32_inc:
1013	.int	0, 0, 0, 1
1014END(ctr32_inc)
1015
1016/*
1017 * aesarmv8_enc1(const struct aesenc *enckey@x0,
1018 *     uint128_t block@q0, uint32_t nrounds@x3)
1019 *
1020 *	Encrypt a single AES block in q0.
1021 *
1022 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
1023 */
/*
 * Implementation notes for aesarmv8_enc1:
 *
 *	Reads nrounds + 1 consecutive 16-byte round keys starting at x0:
 *	one before the loop, one per loop iteration (nrounds - 1
 *	iterations), and the final key after the last aese.
 *
 *	The loop is tested at the bottom after decrementing x3, so it
 *	only terminates as intended when nrounds >= 2.
 *
 *	subs also modifies the condition flags.
 */
1024	.text
1025	_ALIGN_TEXT
1026	.type	aesarmv8_enc1,@function
1027aesarmv8_enc1:
1028	ldr	q16, [x0], #0x10	/* load round key */
	/* x3 := nrounds - 1: number of full rounds that include MixColumns */
1029	sub	x3, x3, #1
1030	_ALIGN_TEXT
10311:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
1032	aese	v0.16b, v16.16b
1033	aesmc	v0.16b, v0.16b
	/* load next round key */
1034	ldr	q16, [x0], #0x10
1035	subs	x3, x3, #1
1036	b.ne	1b
	/* Final round: no MixColumns, then XOR with the last round key. */
1037	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
1038	aese	v0.16b, v16.16b
1039	ldr	q16, [x0]		/* load last round key */
1040	/* q0 := AddRoundKey_q16(q0) */
1041	eor	v0.16b, v0.16b, v16.16b
1042	ret
1043END(aesarmv8_enc1)
1044
1045/*
1046 * aesarmv8_enc2(const struct aesenc *enckey@x0,
1047 *     uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
1048 *
1049 *	Encrypt two AES blocks in q0 and q1.
1050 *
1051 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
1052 */
/*
 * Implementation notes for aesarmv8_enc2:
 *
 *	Both blocks are encrypted under the same key schedule; each round
 *	key is loaded into q16 once and applied to q0 and then q1.
 *
 *	Reads nrounds + 1 consecutive 16-byte round keys starting at x0.
 *	The loop is tested at the bottom after decrementing x3, so it
 *	only terminates as intended when nrounds >= 2.
 *
 *	subs also modifies the condition flags.
 */
1053	.text
1054	_ALIGN_TEXT
1055	.type	aesarmv8_enc2,@function
1056aesarmv8_enc2:
1057	ldr	q16, [x0], #0x10	/* load round key */
	/* x3 := nrounds - 1: number of full rounds that include MixColumns */
1058	sub	x3, x3, #1
1059	_ALIGN_TEXT
10601:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
1061	aese	v0.16b, v16.16b
1062	aesmc	v0.16b, v0.16b
1063	aese	v1.16b, v16.16b
1064	aesmc	v1.16b, v1.16b
1065	ldr	q16, [x0], #0x10	/* load next round key */
1066	subs	x3, x3, #1
1067	b.ne	1b
	/* Final round: no MixColumns, then XOR with the last round key. */
1068	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
1069	aese	v0.16b, v16.16b
1070	aese	v1.16b, v16.16b
1071	ldr	q16, [x0]		/* load last round key */
1072	/* q[i] := AddRoundKey_q16(q[i]) */
1073	eor	v0.16b, v0.16b, v16.16b
1074	eor	v1.16b, v1.16b, v16.16b
1075	ret
1076END(aesarmv8_enc2)
1077
1078/*
1079 * aesarmv8_enc8(const struct aesenc *enckey@x0,
1080 *     uint128_t block0@q0, ..., uint128_t block7@q7,
1081 *     uint32_t nrounds@x3)
1082 *
1083 *	Encrypt eight AES blocks in q0 through q7 in parallel.
1084 *
1085 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
1086 */
/*
 * Implementation notes for aesarmv8_enc8:
 *
 *	All eight blocks share one key schedule; each round key is loaded
 *	into q16 once and applied to q0 through q7 in turn.
 *
 *	Reads nrounds + 1 consecutive 16-byte round keys starting at x0.
 *	The loop is tested at the bottom after decrementing x3, so it
 *	only terminates as intended when nrounds >= 2.
 *
 *	subs also modifies the condition flags.
 */
1087	.text
1088	_ALIGN_TEXT
1089	.type	aesarmv8_enc8,@function
1090aesarmv8_enc8:
1091	ldr	q16, [x0], #0x10	/* load round key */
	/* x3 := nrounds - 1: number of full rounds that include MixColumns */
1092	sub	x3, x3, #1
1093	_ALIGN_TEXT
10941:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
1095	aese	v0.16b, v16.16b
1096	aesmc	v0.16b, v0.16b
1097	aese	v1.16b, v16.16b
1098	aesmc	v1.16b, v1.16b
1099	aese	v2.16b, v16.16b
1100	aesmc	v2.16b, v2.16b
1101	aese	v3.16b, v16.16b
1102	aesmc	v3.16b, v3.16b
1103	aese	v4.16b, v16.16b
1104	aesmc	v4.16b, v4.16b
1105	aese	v5.16b, v16.16b
1106	aesmc	v5.16b, v5.16b
1107	aese	v6.16b, v16.16b
1108	aesmc	v6.16b, v6.16b
1109	aese	v7.16b, v16.16b
1110	aesmc	v7.16b, v7.16b
1111	ldr	q16, [x0], #0x10	/* load next round key */
1112	subs	x3, x3, #1
1113	b.ne	1b
	/* Final round: no MixColumns, then XOR with the last round key. */
1114	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
1115	aese	v0.16b, v16.16b
1116	aese	v1.16b, v16.16b
1117	aese	v2.16b, v16.16b
1118	aese	v3.16b, v16.16b
1119	aese	v4.16b, v16.16b
1120	aese	v5.16b, v16.16b
1121	aese	v6.16b, v16.16b
1122	aese	v7.16b, v16.16b
1123	ldr	q16, [x0]		/* load last round key */
1124	/* q[i] := AddRoundKey_q16(q[i]) */
1125	eor	v0.16b, v0.16b, v16.16b
1126	eor	v1.16b, v1.16b, v16.16b
1127	eor	v2.16b, v2.16b, v16.16b
1128	eor	v3.16b, v3.16b, v16.16b
1129	eor	v4.16b, v4.16b, v16.16b
1130	eor	v5.16b, v5.16b, v16.16b
1131	eor	v6.16b, v6.16b, v16.16b
1132	eor	v7.16b, v7.16b, v16.16b
1133	ret
1134END(aesarmv8_enc8)
1135
1136/*
1137 * aesarmv8_dec1(const struct aesdec *deckey@x0,
1138 *     uint128_t block@q0, uint32_t nrounds@x3)
1139 *
1140 *	Decrypt a single AES block in q0.
1141 *
1142 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
1143 */
/*
 * Implementation notes for aesarmv8_dec1:
 *
 *	Same shape as aesarmv8_enc1, with aesd/aesimc in place of
 *	aese/aesmc.  Round keys are read from deckey in ascending address
 *	order, nrounds + 1 keys of 16 bytes each.
 *
 *	The loop is tested at the bottom after decrementing x3, so it
 *	only terminates as intended when nrounds >= 2.
 *
 *	subs also modifies the condition flags.
 */
1144	.text
1145	_ALIGN_TEXT
1146	.type	aesarmv8_dec1,@function
1147aesarmv8_dec1:
1148	ldr	q16, [x0], #0x10	/* load round key */
	/* x3 := nrounds - 1: number of full rounds that include InMixColumns */
1149	sub	x3, x3, #1
1150	_ALIGN_TEXT
11511:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
1152	aesd	v0.16b, v16.16b
1153	/* q0 := InMixColumns(q0) */
1154	aesimc	v0.16b, v0.16b
1155	ldr	q16, [x0], #0x10	/* load next round key */
1156	subs	x3, x3, #1
1157	b.ne	1b
	/* Final round: no InMixColumns, then XOR with the last round key. */
1158	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
1159	aesd	v0.16b, v16.16b
1160	ldr	q16, [x0]		/* load last round key */
1161	/* q0 := AddRoundKey_q16(q0) */
1162	eor	v0.16b, v0.16b, v16.16b
1163	ret
1164END(aesarmv8_dec1)
1165
1166/*
1167 * aesarmv8_dec8(const struct aesdec *deckey@x0,
1168 *     uint128_t block0@q0, ..., uint128_t block7@q7,
1169 *     uint32_t nrounds@x3)
1170 *
1171 *	Decrypt eight AES blocks in q0 through q7 in parallel.
1172 *
1173 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
1174 */
/*
 * Implementation notes for aesarmv8_dec8:
 *
 *	All eight blocks share one key schedule; each round key is loaded
 *	into q16 once and applied to q0 through q7 in turn.  Round keys
 *	are read from deckey in ascending address order, nrounds + 1 keys
 *	of 16 bytes each.
 *
 *	The loop is tested at the bottom after decrementing x3, so it
 *	only terminates as intended when nrounds >= 2.
 *
 *	subs also modifies the condition flags.
 */
1175	.text
1176	_ALIGN_TEXT
1177	.type	aesarmv8_dec8,@function
1178aesarmv8_dec8:
1179	ldr	q16, [x0], #0x10	/* load round key */
	/* x3 := nrounds - 1: number of full rounds that include InMixColumns */
1180	sub	x3, x3, #1
1181	_ALIGN_TEXT
11821:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
1183	aesd	v0.16b, v16.16b
1184	/* q[i] := InMixColumns(q[i]) */
1185	aesimc	v0.16b, v0.16b
1186	aesd	v1.16b, v16.16b
1187	aesimc	v1.16b, v1.16b
1188	aesd	v2.16b, v16.16b
1189	aesimc	v2.16b, v2.16b
1190	aesd	v3.16b, v16.16b
1191	aesimc	v3.16b, v3.16b
1192	aesd	v4.16b, v16.16b
1193	aesimc	v4.16b, v4.16b
1194	aesd	v5.16b, v16.16b
1195	aesimc	v5.16b, v5.16b
1196	aesd	v6.16b, v16.16b
1197	aesimc	v6.16b, v6.16b
1198	aesd	v7.16b, v16.16b
1199	aesimc	v7.16b, v7.16b
1200	ldr	q16, [x0], #0x10	/* load next round key */
1201	subs	x3, x3, #1
1202	b.ne	1b
	/* Final round: no InMixColumns, then XOR with the last round key. */
1203	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
1204	aesd	v0.16b, v16.16b
1205	aesd	v1.16b, v16.16b
1206	aesd	v2.16b, v16.16b
1207	aesd	v3.16b, v16.16b
1208	aesd	v4.16b, v16.16b
1209	aesd	v5.16b, v16.16b
1210	aesd	v6.16b, v16.16b
1211	aesd	v7.16b, v16.16b
1212	ldr	q16, [x0]		/* load last round key */
1213	/* q[i] := AddRoundKey_q16(q[i]) */
1214	eor	v0.16b, v0.16b, v16.16b
1215	eor	v1.16b, v1.16b, v16.16b
1216	eor	v2.16b, v2.16b, v16.16b
1217	eor	v3.16b, v3.16b, v16.16b
1218	eor	v4.16b, v4.16b, v16.16b
1219	eor	v5.16b, v5.16b, v16.16b
1220	eor	v6.16b, v6.16b, v16.16b
1221	eor	v7.16b, v7.16b, v16.16b
1222	ret
1223END(aesarmv8_dec8)
1224