xref: /openbsd-src/sys/arch/amd64/amd64/aes_intel.S (revision 3dd0809fbcf33671ef9b3771096948131e7cafab)
1/*	$OpenBSD: aes_intel.S,v 1.14 2021/09/04 22:15:33 bluhm Exp $	*/
2
3/*
4 * Implement AES algorithm in Intel AES-NI instructions.
5 *
6 * The white paper of AES-NI instructions can be downloaded from:
7 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
8 *
9 * Copyright (C) 2008-2010, Intel Corporation
10 *    Author: Huang Ying <ying.huang@intel.com>
11 *            Vinodh Gopal <vinodh.gopal@intel.com>
12 *            Kahraman Akdemir
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following
16 * conditions are met:
17 *
18 * - Redistributions of source code must retain the above copyright
19 *   notice, this list of conditions and the following disclaimer.
20 *
21 * - Redistributions in binary form must reproduce the above copyright
22 *   notice, this list of conditions and the following disclaimer in the
23 *   documentation and/or other materials provided with the
24 *   distribution.
25 *
26 * - Neither the name of Intel Corporation nor the names of its
27 *   contributors may be used to endorse or promote products
28 *   derived from this software without specific prior written
29 *   permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
32 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
33 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
34 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
36 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
37 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
38 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
39 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
40 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 */
43
44/*
45 * Changes to the original source code released by Intel:
46 *
47 * - assembler macros were converted to the actual instructions;
48 * - aesni_ctr_enc was changed to be RFC 3686 compliant;
49 * - aes-gcm mode added;
50 * - aes-xts implementation added;
51 *
52 * Copyright (c) 2010,2011 Mike Belopuhov
53 * Copyright (c) 2013 Joel Sing <jsing@openbsd.org>
54 *
55 * Permission to use, copy, modify, and distribute this software for any
56 * purpose with or without fee is hereby granted, provided that the above
57 * copyright notice and this permission notice appear in all copies.
58 *
59 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
60 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
61 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
62 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
63 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
64 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
65 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
66 */
67
68#include <machine/param.h>
69#include <machine/asm.h>
70
/*
 * Register aliases.  These are cpp macros, so several names map to the
 * same hardware register: STATE/STATE1, IN/IN1, LEN/HSTATE, IVP/ICBP,
 * T1/TKEYP and T2/TCTR_LOW.
 */

/* cipher state blocks */
#define STATE1		%xmm0
#define STATE2		%xmm4
#define STATE3		%xmm5
#define STATE4		%xmm6
#define STATE		STATE1

/* input blocks */
#define IN1		%xmm1
#define IN2		%xmm7
#define IN3		%xmm8
#define IN4		%xmm9
#define IN		IN1

/* round key, chaining value and counter mode helpers */
#define KEY		%xmm2
#define IV		%xmm3
#define BSWAP_MASK	%xmm10
#define CTR		%xmm11
#define INC		%xmm12

/* pointer and length arguments as used by the entry points */
#define KEYP		%rdi
#define OUTP		%rsi
#define INP		%rdx
#define LEN		%rcx
#define HSTATE		%rcx
#define IVP		%r8
#define ICBP		%r8

/* key length and temporaries */
#define KLEN		%r9d
#define T1		%r10
#define TKEYP		T1
#define T2		%r11
#define TCTR_LOW	T2
99
100	.section .rodata
101.align 16
102.Lbswap_mask:
103	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
104
105	.text
106
/*
 * _key_expansion_128 / _key_expansion_256a:	internal ABI
 *	Advance the round key held in %xmm0 and append it to the schedule.
 * input:
 *	%xmm0:		round key to advance
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		slot for the new round key (stored with movaps)
 * output:
 *	%xmm0:		new round key, also stored at (%rcx)
 *	%rcx:		advanced by 16
 * changed:
 *	%xmm1, %xmm4, %rax (retguard)
 */
_key_expansion_128:
_key_expansion_256a:
	RETGUARD_SETUP(_key_expansion_128, rax)
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pshufd	$0b11111111,%xmm1,%xmm1	# broadcast dword 3 of the assist
	pxor	%xmm1,%xmm0
	movaps	%xmm0,(%rcx)		# append to the schedule
	add	$0x10,%rcx
	RETGUARD_CHECK(_key_expansion_128, rax)
	ret
	lfence
121
/*
 * _key_expansion_192a:	internal ABI
 *	192-bit key schedule step that appends 32 bytes to the schedule.
 * input:
 *	%xmm0:		low 128 bits of the running 192-bit key state
 *	%xmm2:		remaining 64 bits of the key state (low quadword)
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		next free slot in the schedule
 * output:
 *	%xmm0, %xmm2:	advanced key state
 *	(%rcx):		low quadword of the old %xmm2, then low quadword of
 *			the new %xmm0
 *	16(%rcx):	high quadword of the new %xmm0, then low quadword of
 *			the new %xmm2
 *	%rcx:		advanced by 32
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6, %rax (retguard)
 */
_key_expansion_192a:
	RETGUARD_SETUP(_key_expansion_192a, rax)
	/* copies of the old %xmm2, taken before it is advanced */
	movaps	%xmm2,%xmm5
	movaps	%xmm2,%xmm6
	pslldq	$4,%xmm5

	/* advance %xmm0 */
	pshufd	$0b01010101,%xmm1,%xmm1
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm0

	/* advance %xmm2 using the top dword of the new %xmm0 */
	pshufd	$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	pxor	%xmm5,%xmm2

	/* assemble and store the two output words */
	movaps	%xmm0,%xmm1
	shufps	$0b01000100,%xmm0,%xmm6
	shufps	$0b01001110,%xmm2,%xmm1
	movaps	%xmm6,(%rcx)
	movaps	%xmm1,16(%rcx)
	add	$0x20,%rcx
	RETGUARD_CHECK(_key_expansion_192a, rax)
	ret
	lfence
147
/*
 * _key_expansion_192b:	internal ABI
 *	192-bit key schedule step that appends 16 bytes to the schedule.
 * input:
 *	%xmm0:		low 128 bits of the running 192-bit key state
 *	%xmm2:		remaining 64 bits of the key state (low quadword)
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		next free slot in the schedule
 * output:
 *	%xmm0, %xmm2:	advanced key state
 *	(%rcx):		the new %xmm0
 *	%rcx:		advanced by 16
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %rax (retguard)
 */
_key_expansion_192b:
	RETGUARD_SETUP(_key_expansion_192b, rax)
	/* copy of the old %xmm2, taken before it is advanced */
	movaps	%xmm2,%xmm5
	pslldq	$4,%xmm5

	/* advance %xmm0 and append it to the schedule */
	pshufd	$0b01010101,%xmm1,%xmm1
	shufps	$0b00010000,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	shufps	$0b10001100,%xmm0,%xmm4
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm0
	movaps	%xmm0,(%rcx)
	add	$0x10,%rcx

	/* advance %xmm2 using the top dword of the new %xmm0 */
	pshufd	$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	pxor	%xmm5,%xmm2
	RETGUARD_CHECK(_key_expansion_192b, rax)
	ret
	lfence
168
/*
 * _key_expansion_256b:	internal ABI
 *	Advance the round key held in %xmm2 and append it to the schedule.
 * input:
 *	%xmm2:		round key to advance
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		slot for the new round key (stored with movaps)
 * output:
 *	%xmm2:		new round key, also stored at (%rcx)
 *	%rcx:		advanced by 16
 * changed:
 *	%xmm1, %xmm4, %rax (retguard)
 */
_key_expansion_256b:
	RETGUARD_SETUP(_key_expansion_256b, rax)
	shufps	$0b00010000,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	shufps	$0b10001100,%xmm2,%xmm4
	pxor	%xmm4,%xmm2
	pshufd	$0b10101010,%xmm1,%xmm1	# broadcast dword 2 of the assist
	pxor	%xmm1,%xmm2
	movaps	%xmm2,(%rcx)		# append to the schedule
	add	$0x10,%rcx
	RETGUARD_CHECK(_key_expansion_256b, rax)
	ret
	lfence
182
/*
 * void aesni_set_key(struct aesni_session *ses, uint8_t *key, size_t len)
 *
 * Expand the user key into an encryption and a decryption key schedule.
 * Offsets written relative to ses:
 *	  0	encryption round keys (round 0 is the first 16 key bytes)
 *	240	decryption round keys: the encryption keys in reverse order,
 *		with aesimc applied to all but the first and the last
 *	480	len, stored as a 32-bit value
 * The low byte of len selects the schedule: below 24 is handled as a
 * 128-bit key, exactly 24 as a 192-bit key, anything above as 256-bit.
 */
ENTRY(aesni_set_key)
	RETGUARD_SETUP(aesni_set_key, r11)
	pxor	%xmm4,%xmm4		# xmm4 is assumed 0 in _key_expansion_x
	lea	0x10(%rdi),%rcx		# next free round key slot
	movups	(%rsi),%xmm0		# user key (first 16 bytes)
	movaps	%xmm0,(%rdi)		# round key 0
	movl	%edx,480(%rdi)		# remember the key length
	cmp	$24,%dl
	jb	.Lset_key_128
	je	.Lset_key_192

	/* 256 bit key */
	movups	0x10(%rsi),%xmm2	# other user key
	movaps	%xmm2,(%rcx)		# round key 1
	add	$0x10,%rcx
	aeskeygenassist $0x1,%xmm2,%xmm1	# round 1
	call	_key_expansion_256a
	aeskeygenassist $0x1,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x2,%xmm2,%xmm1	# round 2
	call	_key_expansion_256a
	aeskeygenassist $0x2,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x4,%xmm2,%xmm1	# round 3
	call	_key_expansion_256a
	aeskeygenassist $0x4,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x8,%xmm2,%xmm1	# round 4
	call	_key_expansion_256a
	aeskeygenassist $0x8,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x10,%xmm2,%xmm1	# round 5
	call	_key_expansion_256a
	aeskeygenassist $0x10,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x20,%xmm2,%xmm1	# round 6
	call	_key_expansion_256a
	aeskeygenassist $0x20,%xmm0,%xmm1
	call	_key_expansion_256b
	aeskeygenassist $0x40,%xmm2,%xmm1	# round 7
	call	_key_expansion_256a
	jmp	.Lset_key_dec

.Lset_key_192:	/* 192 bit key */
	movq	0x10(%rsi),%xmm2	# other user key (8 bytes)
	aeskeygenassist $0x1,%xmm2,%xmm1	# round 1
	call	_key_expansion_192a
	aeskeygenassist $0x2,%xmm2,%xmm1	# round 2
	call	_key_expansion_192b
	aeskeygenassist $0x4,%xmm2,%xmm1	# round 3
	call	_key_expansion_192a
	aeskeygenassist $0x8,%xmm2,%xmm1	# round 4
	call	_key_expansion_192b
	aeskeygenassist $0x10,%xmm2,%xmm1	# round 5
	call	_key_expansion_192a
	aeskeygenassist $0x20,%xmm2,%xmm1	# round 6
	call	_key_expansion_192b
	aeskeygenassist $0x40,%xmm2,%xmm1	# round 7
	call	_key_expansion_192a
	aeskeygenassist $0x80,%xmm2,%xmm1	# round 8
	call	_key_expansion_192b
	jmp	.Lset_key_dec

.Lset_key_128:	/* 128 bit key */
	aeskeygenassist $0x1,%xmm0,%xmm1	# round 1
	call	_key_expansion_128
	aeskeygenassist $0x2,%xmm0,%xmm1	# round 2
	call	_key_expansion_128
	aeskeygenassist $0x4,%xmm0,%xmm1	# round 3
	call	_key_expansion_128
	aeskeygenassist $0x8,%xmm0,%xmm1	# round 4
	call	_key_expansion_128
	aeskeygenassist $0x10,%xmm0,%xmm1	# round 5
	call	_key_expansion_128
	aeskeygenassist $0x20,%xmm0,%xmm1	# round 6
	call	_key_expansion_128
	aeskeygenassist $0x40,%xmm0,%xmm1	# round 7
	call	_key_expansion_128
	aeskeygenassist $0x80,%xmm0,%xmm1	# round 8
	call	_key_expansion_128
	aeskeygenassist $0x1b,%xmm0,%xmm1	# round 9
	call	_key_expansion_128
	aeskeygenassist $0x36,%xmm0,%xmm1	# round 10
	call	_key_expansion_128

	/*
	 * Build the decryption schedule.  %rcx is stepped back onto the last
	 * encryption round key; first and last keys are swapped verbatim and
	 * the keys in between are copied in reverse order through aesimc.
	 */
.Lset_key_dec:
	sub	$0x10,%rcx		# rcx = last encryption round key
	movaps	(%rdi),%xmm0
	movaps	(%rcx),%xmm1
	movaps	%xmm0,240(%rcx)		# last decryption key = first enc key
	movaps	%xmm1,240(%rdi)		# first decryption key = last enc key
	add	$0x10,%rdi		# source: walks up from enc key 1
	lea	240-16(%rcx),%rsi	# destination: walks down
.align 4
.Lset_key_imc:
	movaps	(%rdi),%xmm0
	aesimc	%xmm0,%xmm1
	movaps	%xmm1,(%rsi)
	add	$0x10,%rdi
	sub	$0x10,%rsi
	cmp	%rcx,%rdi
	jb	.Lset_key_imc
	RETGUARD_CHECK(aesni_set_key, r11)
	ret
	lfence
286
/*
 * void aesni_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src)
 *
 * Encrypt the 16-byte block at src with the encryption schedule at the
 * start of ses and write the result to dst.  Neither src nor dst needs to
 * be aligned (movups).
 */
ENTRY(aesni_enc)
	RETGUARD_SETUP(aesni_enc, r11)
	movups	(INP),STATE		# plaintext block
	movl	480(KEYP),KLEN		# key length
	call	_aesni_enc1
	movups	STATE,(OUTP)		# ciphertext block
	RETGUARD_CHECK(aesni_enc, r11)
	ret
	lfence
299
/*
 * _aesni_enc1:		internal ABI
 *	Encrypt one block.
 * input:
 *	KEYP:		pointer to the encryption round keys (16-byte aligned)
 *	KLEN:		key length in bytes; below 24 runs 10 rounds,
 *			24 runs 12 rounds, above 24 runs 14 rounds
 *	STATE:		block to encrypt
 * output:
 *	STATE:		encrypted block
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard)
 */
_aesni_enc1:
	RETGUARD_SETUP(_aesni_enc1, rax)
	movaps	(KEYP),KEY		# round key 0
	pxor	KEY,STATE		# round 0
	/*
	 * TKEYP is biased so that the last ten rounds use the same
	 * displacements for every key length.
	 */
	lea	0x30(KEYP),TKEYP
	cmp	$24,KLEN
	jb	.Lenc1_128
	lea	0x20(TKEYP),TKEYP	# lea keeps the cmp flags for the je
	je	.Lenc1_192
	add	$0x20,TKEYP
	/* 256 bit key: two extra rounds */
	movaps	-0x60(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x50(TKEYP),KEY
	aesenc	KEY,STATE
.align 4
.Lenc1_192:	/* 192 bit key: two extra rounds */
	movaps	-0x40(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x30(TKEYP),KEY
	aesenc	KEY,STATE
.align 4
.Lenc1_128:	/* 128 bit key: the common ten rounds */
	movaps	-0x20(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	-0x10(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x10(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x20(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x30(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x40(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x50(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x60(TKEYP),KEY
	aesenc	KEY,STATE
	movaps	0x70(TKEYP),KEY
	aesenclast KEY,STATE		# last round
	RETGUARD_CHECK(_aesni_enc1, rax)
	ret
	lfence
358
/*
 * _aesni_enc4:	internal ABI
 *	Encrypt four independent blocks with the same key, interleaving the
 *	rounds so that each round key is loaded once.
 * input:
 *	KEYP:		pointer to the encryption round keys (16-byte aligned)
 *	KLEN:		key length in bytes; below 24 runs 10 rounds,
 *			24 runs 12 rounds, above 24 runs 14 rounds
 *	STATE1:		blocks to encrypt
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		encrypted blocks
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard)
 */
_aesni_enc4:
	RETGUARD_SETUP(_aesni_enc4, rax)
	movaps	(KEYP),KEY		# round key 0
	pxor	KEY,STATE1		# round 0
	pxor	KEY,STATE2
	pxor	KEY,STATE3
	pxor	KEY,STATE4
	/*
	 * TKEYP is biased so that the last ten rounds use the same
	 * displacements for every key length.
	 */
	lea	0x30(KEYP),TKEYP
	cmp	$24,KLEN
	jb	.Lenc4_128
	lea	0x20(TKEYP),TKEYP	# lea keeps the cmp flags for the je
	je	.Lenc4_192
	add	$0x20,TKEYP
	/* 256 bit key: two extra rounds */
	movaps	-0x60(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x50(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	/* no .align before the next two labels in this routine */
.Lenc4_192:	/* 192 bit key: two extra rounds */
	movaps	-0x40(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x30(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
.Lenc4_128:	/* 128 bit key: the common ten rounds */
	movaps	-0x20(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	-0x10(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x10(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x20(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x30(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x40(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x50(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x60(TKEYP),KEY
	aesenc	KEY,STATE1
	aesenc	KEY,STATE2
	aesenc	KEY,STATE3
	aesenc	KEY,STATE4
	movaps	0x70(TKEYP),KEY
	aesenclast KEY,STATE1		# last round
	aesenclast KEY,STATE2
	aesenclast KEY,STATE3
	aesenclast KEY,STATE4
	RETGUARD_CHECK(_aesni_enc4, rax)
	ret
	lfence
468
/*
 * void aesni_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src)
 *
 * Decrypt the 16-byte block at src with the decryption schedule stored at
 * offset 240 of ses and write the result to dst.  Neither src nor dst
 * needs to be aligned (movups).
 */
ENTRY(aesni_dec)
	RETGUARD_SETUP(aesni_dec, r11)
	movups	(INP),STATE		# ciphertext block
	movl	480(KEYP),KLEN		# key length
	add	$240,KEYP		# switch to the decryption schedule
	call	_aesni_dec1
	movups	STATE,(OUTP)		# plaintext block
	RETGUARD_CHECK(aesni_dec, r11)
	ret
	lfence
482
/*
 * _aesni_dec1:		internal ABI
 *	Decrypt one block.
 * input:
 *	KEYP:		pointer to the decryption round keys (16-byte aligned)
 *	KLEN:		key length in bytes; below 24 runs 10 rounds,
 *			24 runs 12 rounds, above 24 runs 14 rounds
 *	STATE:		block to decrypt
 * output:
 *	STATE:		decrypted block
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard)
 */
_aesni_dec1:
	RETGUARD_SETUP(_aesni_dec1, rax)
	movaps	(KEYP),KEY		# round key 0
	pxor	KEY,STATE		# round 0
	/*
	 * TKEYP is biased so that the last ten rounds use the same
	 * displacements for every key length.
	 */
	lea	0x30(KEYP),TKEYP
	cmp	$24,KLEN
	jb	.Ldec1_128
	lea	0x20(TKEYP),TKEYP	# lea keeps the cmp flags for the je
	je	.Ldec1_192
	add	$0x20,TKEYP
	/* 256 bit key: two extra rounds */
	movaps	-0x60(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x50(TKEYP),KEY
	aesdec	KEY,STATE
.align 4
.Ldec1_192:	/* 192 bit key: two extra rounds */
	movaps	-0x40(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x30(TKEYP),KEY
	aesdec	KEY,STATE
.align 4
.Ldec1_128:	/* 128 bit key: the common ten rounds */
	movaps	-0x20(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	-0x10(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x10(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x20(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x30(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x40(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x50(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x60(TKEYP),KEY
	aesdec	KEY,STATE
	movaps	0x70(TKEYP),KEY
	aesdeclast KEY,STATE		# last round
	RETGUARD_CHECK(_aesni_dec1, rax)
	ret
	lfence
541
/*
 * _aesni_dec4:	internal ABI
 *	Decrypt four independent blocks with the same key, interleaving the
 *	rounds so that each round key is loaded once.
 * input:
 *	KEYP:		pointer to the decryption round keys (16-byte aligned)
 *	KLEN:		key length in bytes; below 24 runs 10 rounds,
 *			24 runs 12 rounds, above 24 runs 14 rounds
 *	STATE1:		blocks to decrypt
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		decrypted blocks
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 *	%rax (retguard)
 */
_aesni_dec4:
	RETGUARD_SETUP(_aesni_dec4, rax)
	movaps	(KEYP),KEY		# round key 0
	pxor	KEY,STATE1		# round 0
	pxor	KEY,STATE2
	pxor	KEY,STATE3
	pxor	KEY,STATE4
	/*
	 * TKEYP is biased so that the last ten rounds use the same
	 * displacements for every key length.
	 */
	lea	0x30(KEYP),TKEYP
	cmp	$24,KLEN
	jb	.Ldec4_128
	lea	0x20(TKEYP),TKEYP	# lea keeps the cmp flags for the je
	je	.Ldec4_192
	add	$0x20,TKEYP
	/* 256 bit key: two extra rounds */
	movaps	-0x60(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x50(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
.align 4
.Ldec4_192:	/* 192 bit key: two extra rounds */
	movaps	-0x40(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x30(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
.align 4
.Ldec4_128:	/* 128 bit key: the common ten rounds */
	movaps	-0x20(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	-0x10(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x10(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x20(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x30(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x40(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x50(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x60(TKEYP),KEY
	aesdec	KEY,STATE1
	aesdec	KEY,STATE2
	aesdec	KEY,STATE3
	aesdec	KEY,STATE4
	movaps	0x70(TKEYP),KEY
	aesdeclast KEY,STATE1		# last round
	aesdeclast KEY,STATE2
	aesdeclast KEY,STATE3
	aesdeclast KEY,STATE4
	RETGUARD_CHECK(_aesni_dec4, rax)
	ret
	lfence
651
#if 0
/*
 * ECB mode, currently compiled out.
 *
 * Both routines process whole 16-byte blocks only: four at a time while at
 * least 64 bytes remain, then one at a time.  A trailing partial block is
 * left untouched.  LEN is an unsigned size_t, so every length test uses the
 * unsigned condition codes (jb/jae).
 */

/*
 * void aesni_ecb_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len)
 */
ENTRY(aesni_ecb_enc)
	RETGUARD_SETUP(aesni_ecb_enc, r11)
	test	LEN,LEN			# check length
	jz	.Lecb_enc_done
	movl	480(KEYP),KLEN		# key length
	cmp	$16,LEN
	jb	.Lecb_enc_done
	cmp	$64,LEN
	jb	.Lecb_enc_x1
.align 4
.Lecb_enc_x4:
	movups	(INP),STATE1
	movups	0x10(INP),STATE2
	movups	0x20(INP),STATE3
	movups	0x30(INP),STATE4
	call	_aesni_enc4
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	.Lecb_enc_x4
	cmp	$16,LEN
	jb	.Lecb_enc_done
.align 4
.Lecb_enc_x1:
	movups	(INP),STATE1
	call	_aesni_enc1
	movups	STATE1,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	.Lecb_enc_x1
.Lecb_enc_done:
	RETGUARD_CHECK(aesni_ecb_enc, r11)
	ret
	lfence

/*
 * void aesni_ecb_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len);
 */
ENTRY(aesni_ecb_dec)
	RETGUARD_SETUP(aesni_ecb_dec, r11)
	test	LEN,LEN			# check length
	jz	.Lecb_dec_done
	movl	480(KEYP),KLEN		# key length
	add	$240,KEYP		# switch to the decryption schedule
	cmp	$16,LEN
	jb	.Lecb_dec_done
	cmp	$64,LEN
	jb	.Lecb_dec_x1
.align 4
.Lecb_dec_x4:
	movups	(INP),STATE1
	movups	0x10(INP),STATE2
	movups	0x20(INP),STATE3
	movups	0x30(INP),STATE4
	call	_aesni_dec4
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	.Lecb_dec_x4
	cmp	$16,LEN
	jb	.Lecb_dec_done
.align 4
.Lecb_dec_x1:
	movups	(INP),STATE1
	call	_aesni_dec1
	movups	STATE1,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	.Lecb_dec_x1
.Lecb_dec_done:
	RETGUARD_CHECK(aesni_ecb_dec, r11)
	ret
	lfence
#endif
746
/*
 * void aesni_cbc_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * CBC-encrypt the whole 16-byte blocks of src into dst.  The chaining
 * value starts as the 16 bytes at iv; on return iv holds the last
 * ciphertext block.  If len is below 16 nothing is read or written, and
 * a trailing partial block is ignored.
 *
 * len is an unsigned size_t, so the loop condition uses jae like the
 * jb guard at entry rather than a signed comparison.
 */
ENTRY(aesni_cbc_enc)
	RETGUARD_SETUP(aesni_cbc_enc, r11)
	cmp	$16,LEN
	jb	.Lcbc_enc_done
	movl	480(KEYP),KLEN		# key length
	movups	(IVP),STATE		# load iv as initial state
.align 4
.Lcbc_enc_block:
	movups	(INP),IN		# load input
	pxor	IN,STATE		# chain with the previous ciphertext
	call	_aesni_enc1
	movups	STATE,(OUTP)		# store output
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	.Lcbc_enc_block		# unsigned: another whole block left
	movups	STATE,(IVP)		# hand the chaining value back
.Lcbc_enc_done:
	RETGUARD_CHECK(aesni_cbc_enc, r11)
	ret
	lfence
773
/*
 * void aesni_cbc_dec(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *iv)
 *
 * CBC-decrypt the whole 16-byte blocks of src into dst, four blocks at a
 * time while at least 64 bytes remain and one block at a time afterwards.
 * The chaining value starts as the 16 bytes at iv; on return iv holds the
 * last ciphertext block consumed.  If len is below 16 nothing is read or
 * written, and a trailing partial block is ignored.
 *
 * Every ciphertext block is kept in a register (IN1..IN4) before any
 * output is stored, because it is still needed as the chaining value for
 * the following block.
 *
 * len is an unsigned size_t, so the loop conditions use jae like the
 * jb guards rather than a signed comparison.
 */
ENTRY(aesni_cbc_dec)
	RETGUARD_SETUP(aesni_cbc_dec, r11)
	cmp	$16,LEN
	jb	.Lcbc_dec_done
	movl	480(KEYP),KLEN		# key length
	add	$240,KEYP		# switch to the decryption schedule
	movups	(IVP),IV
	cmp	$64,LEN
	jb	.Lcbc_dec_x1
.align 4
.Lcbc_dec_x4:	/* pipeline 4 instructions when possible */
	movups	(INP),IN1
	movups	0x10(INP),IN2
	movups	0x20(INP),IN3
	movups	0x30(INP),IN4
	movaps	IN1,STATE1
	movaps	IN2,STATE2
	movaps	IN3,STATE3
	movaps	IN4,STATE4
	call	_aesni_dec4
	pxor	IV,STATE1		# each block is chained with the
	pxor	IN1,STATE2		# ciphertext block before it
	pxor	IN2,STATE3
	pxor	IN3,STATE4
	movaps	IN4,IV			# chaining value for the next round
	movups	STATE1,(OUTP)
	movups	STATE2,0x10(OUTP)
	movups	STATE3,0x20(OUTP)
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	.Lcbc_dec_x4		# unsigned: four more blocks left
	cmp	$16,LEN
	jb	.Lcbc_dec_save_iv
.align 4
.Lcbc_dec_x1:
	movups	(INP),IN
	movaps	IN,STATE
	call	_aesni_dec1
	pxor	IV,STATE
	movups	STATE,(OUTP)
	movaps	IN,IV
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	.Lcbc_dec_x1		# unsigned: another whole block left
.Lcbc_dec_save_iv:
	movups	IV,(IVP)		# hand the chaining value back
.Lcbc_dec_done:
	RETGUARD_CHECK(aesni_cbc_dec, r11)
	ret
	lfence
833
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	CTR:		initial counter block, as loaded from memory by the
 *			caller
 * output:
 *	CTR:		the counter block with its 16 bytes reversed
 *	IV:		the input shifted left by 8 bytes (its low quadword
 *			moved to the high half, low half zero)
 *	TCTR_LOW:	low part of the byte-reversed CTR, moved into a
 *			64-bit register
 *	INC:		== 1
 *	BSWAP_MASK	== endian swapping mask
 * changed:
 *	%rax (retguard)
 *
 * The mask is addressed relative to %rip so that the reference does not
 * depend on the absolute link address of .rodata.
 */
_aesni_inc_init:
	RETGUARD_SETUP(_aesni_inc_init, rax)
	movdqa	CTR,IV
	pslldq	$8,IV
	movdqu	.Lbswap_mask(%rip),BSWAP_MASK
	pshufb	BSWAP_MASK,CTR		# counter arithmetic is done byte-reversed
	mov	$1,TCTR_LOW
	movd	TCTR_LOW,INC		# INC = 1
	movd	CTR,TCTR_LOW		# scalar shadow of the low part of CTR
	RETGUARD_CHECK(_aesni_inc_init, rax)
	ret
	lfence
858
/*
 * _aesni_inc:		internal ABI
 *	Increase the counter by 1 and produce it in memory byte order.
 * input:
 *	CTR:		counter, byte-reversed (see _aesni_inc_init)
 *	TCTR_LOW:	scalar shadow of the low part of CTR; it is a 64-bit
 *			register and is only used to detect the carry out
 *			of the low quadword
 *	INC:		== 1
 *	BSWAP_MASK	== endian swapping mask
 * output:
 *	IV:		incremented counter, byte-reversed back
 * changed:
 *	CTR:		incremented counter, still byte-reversed
 *	TCTR_LOW:	incremented along with CTR
 *	%rax (retguard)
 */
_aesni_inc:
	RETGUARD_SETUP(_aesni_inc, rax)
	add	$1,TCTR_LOW		# sets the carry flag on wrap-around
	paddq	INC,CTR			# paddq leaves the flags alone
	jnc	.Linc_no_carry
	/* propagate the carry: add 1 to the high quadword as well */
	pslldq	$8,INC
	paddq	INC,CTR
	psrldq	$8,INC			# restore INC == 1
.Linc_no_carry:
	movaps	CTR,IV
	pshufb	BSWAP_MASK,IV
	RETGUARD_CHECK(_aesni_inc, rax)
	ret
	lfence
888
/*
 * void aesni_ctr_enc(struct aesni_session *ses, uint8_t *dst, uint8_t *src,
 *     size_t len, uint8_t *icb)
 *
 * Counter mode over the whole 16-byte blocks of src, four blocks at a time
 * while at least 64 bytes remain and one block at a time afterwards.  The
 * counter block is loaded from icb and incremented BEFORE each use, so the
 * first keystream block is made from the initial counter plus one.  On
 * return the low 8 bytes of the last counter block used are stored back
 * to icb (movq).  If len is below 16 nothing is read or written, and a
 * trailing partial block is ignored.
 *
 * %r11 doubles as TCTR_LOW, so the retguard value is kept on the stack
 * for the duration of the routine.
 *
 * len is an unsigned size_t, so the loop conditions use jae like the
 * jb guards rather than a signed comparison.
 */
ENTRY(aesni_ctr_enc)
	RETGUARD_SETUP(aesni_ctr_enc, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,LEN
	jb	.Lctr_done
	movl	480(KEYP),KLEN		# key length
	movdqu	(ICBP),CTR
	call	_aesni_inc_init
	cmp	$64,LEN
	jb	.Lctr_x1
.align 4
.Lctr_x4:	/* pipeline 4 instructions when possible */
	call	_aesni_inc
	movaps	IV,STATE1
	movups	(INP),IN1
	call	_aesni_inc
	movaps	IV,STATE2
	movups	0x10(INP),IN2
	call	_aesni_inc
	movaps	IV,STATE3
	movups	0x20(INP),IN3
	call	_aesni_inc
	movaps	IV,STATE4
	movups	0x30(INP),IN4
	call	_aesni_enc4		# four keystream blocks
	pxor	IN1,STATE1
	movups	STATE1,(OUTP)
	pxor	IN2,STATE2
	movups	STATE2,0x10(OUTP)
	pxor	IN3,STATE3
	movups	STATE3,0x20(OUTP)
	pxor	IN4,STATE4
	movups	STATE4,0x30(OUTP)
	sub	$64,LEN
	add	$64,INP
	add	$64,OUTP
	cmp	$64,LEN
	jae	.Lctr_x4		# unsigned: four more blocks left
	cmp	$16,LEN
	jb	.Lctr_save
.align 4
.Lctr_x1:
	call	_aesni_inc
	movaps	IV,STATE
	movups	(INP),IN
	call	_aesni_enc1		# one keystream block
	pxor	IN,STATE
	movups	STATE,(OUTP)
	sub	$16,LEN
	add	$16,INP
	add	$16,OUTP
	cmp	$16,LEN
	jae	.Lctr_x1		# unsigned: another whole block left
.Lctr_save:
	movq	IV,(IVP)		# only the low 8 bytes are written back
.Lctr_done:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_ctr_enc, r11)
	ret
	lfence
953
/*
 * _aesni_gmac_gfmul:	internal ABI
 *	Carry-less multiply %xmm0 by %xmm1 and reduce the 256-bit product
 *	back to 128 bits.  The shift-and-xor reduction follows the sequence
 *	published in the Intel carry-less multiplication white paper for the
 *	GCM field; treat the exact polynomial as documented there.
 * input:
 *	%xmm0:		first operand (a1:a0)
 *	%xmm1:		second operand (b1:b0)
 * output:
 *	%xmm6:		reduced product
 * changed:
 *	%xmm2 - %xmm9, %rax (retguard)
 *	%xmm0 and %xmm1 are only read.
 */
_aesni_gmac_gfmul:
	RETGUARD_SETUP(_aesni_gmac_gfmul, rax)
	/* four 64x64 partial products */
	movdqa	%xmm0,%xmm3
	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm6
	pclmulqdq $0x00,%xmm1,%xmm3	# xmm3 holds a0*b0
	pclmulqdq $0x10,%xmm1,%xmm4	# xmm4 holds a0*b1
	pclmulqdq $0x01,%xmm1,%xmm5	# xmm5 holds a1*b0
	pclmulqdq $0x11,%xmm1,%xmm6	# xmm6 holds a1*b1

	/* fold the middle terms into the low and high halves */
	pxor	%xmm5,%xmm4		# xmm4 holds a0*b1 + a1*b0
	movdqa	%xmm4,%xmm5
	psrldq	$8,%xmm4
	pslldq	$8,%xmm5
	pxor	%xmm5,%xmm3
	pxor	%xmm4,%xmm6

	/*
	 * <xmm6:xmm3> now holds the 256-bit carry-less product.
	 *
	 * Shift the whole product left by one bit to cope with the fact
	 * that the bits are reversed.  Each dword is shifted on its own and
	 * the bit that falls out is carried into the next higher dword,
	 * including across the xmm3/xmm6 boundary (via xmm9).
	 */
	movdqa	%xmm3,%xmm7
	psrld	$31,%xmm7		# bits shifted out of each dword
	movdqa	%xmm7,%xmm9
	psrldq	$12,%xmm9		# top bit of xmm3, bound for xmm6
	pslldq	$4,%xmm7
	pslld	$1,%xmm3
	por	%xmm7,%xmm3

	movdqa	%xmm6,%xmm8
	psrld	$31,%xmm8
	pslldq	$4,%xmm8
	pslld	$1,%xmm6
	por	%xmm8,%xmm6
	por	%xmm9,%xmm6

	/* first phase of the reduction: per-dword left shifts by 31, 30, 25 */
	movdqa	%xmm3,%xmm7
	pslld	$31,%xmm7
	movdqa	%xmm3,%xmm8
	pslld	$30,%xmm8
	movdqa	%xmm3,%xmm9
	pslld	$25,%xmm9
	pxor	%xmm8,%xmm7		# xor the shifted versions
	pxor	%xmm9,%xmm7
	movdqa	%xmm7,%xmm8
	pslldq	$12,%xmm7
	psrldq	$4,%xmm8		# kept for the second phase
	pxor	%xmm7,%xmm3

	/* second phase of the reduction: per-dword right shifts by 1, 2, 7 */
	movdqa	%xmm3,%xmm2
	psrld	$1,%xmm2
	movdqa	%xmm3,%xmm4
	psrld	$2,%xmm4
	movdqa	%xmm3,%xmm5
	psrld	$7,%xmm5
	pxor	%xmm4,%xmm2		# xor the shifted versions
	pxor	%xmm5,%xmm2
	pxor	%xmm8,%xmm2
	pxor	%xmm2,%xmm3
	pxor	%xmm3,%xmm6		# the result is in xmm6
	RETGUARD_CHECK(_aesni_gmac_gfmul, rax)
	ret
	lfence
1022
/*
 * void aesni_gmac_update(GHASH_CTX *ghash, uint8_t *src, size_t len)
 *
 * Fold the whole 16-byte blocks of src into the running hash.  Offsets
 * used relative to ghash:
 *	 0	hash subkey (read)
 *	16	receives a copy of the updated state
 *	32	running state (read, then updated)
 * Per block: state = gfmul(state ^ block, subkey), with all values
 * byte-reversed for the multiplication.  If len is below 16 nothing is
 * read or written, and a trailing partial block is ignored.
 *
 * len is an unsigned size_t, so the loop condition uses jae like the jb
 * guard at entry rather than a signed comparison.  The mask is addressed
 * relative to %rip so that the reference does not depend on the absolute
 * link address of .rodata.
 */
ENTRY(aesni_gmac_update)
	RETGUARD_SETUP(aesni_gmac_update, r11)
	cmp	$16,%rdx
	jb	.Lgmac_update_done

	movdqu	.Lbswap_mask(%rip),BSWAP_MASK	# endianness swap mask

	movdqu	(%rdi),%xmm1		# hash subkey
	movdqu	32(%rdi),%xmm6		# initial state
	pshufb	BSWAP_MASK,%xmm1
	pshufb	BSWAP_MASK,%xmm6
.Lgmac_update_block:
	movdqu	(%rsi),%xmm2		# next input block
	pshufb	BSWAP_MASK,%xmm2
	movdqa	%xmm6,%xmm0
	pxor	%xmm2,%xmm0		# state ^ block
	call	_aesni_gmac_gfmul	# xmm6 = xmm0 * xmm1

	sub	$16,%rdx
	add	$16,%rsi
	cmp	$16,%rdx
	jae	.Lgmac_update_block	# unsigned: another whole block left

	pshufb	BSWAP_MASK,%xmm6	# back to memory byte order
	movdqu	%xmm6,16(%rdi)
	movdqu	%xmm6,32(%rdi)
.Lgmac_update_done:
	RETGUARD_CHECK(aesni_gmac_update, r11)
	ret
	lfence
1056
/*
 * void aesni_gmac_final(struct aesni_sess *ses, uint8_t *tag,
 *     uint8_t *icb, uint8_t *hashstate)
 *
 * Encrypt the 16 bytes at icb with the encryption schedule of ses, xor
 * the result with the 16 bytes at hashstate and store it to tag.
 */
ENTRY(aesni_gmac_final)
	RETGUARD_SETUP(aesni_gmac_final, r11)
	movdqu	(INP),STATE		# icb
	movl	480(KEYP),KLEN		# key length
	call	_aesni_enc1
	movdqu	(HSTATE),IN		# hash state
	pxor	IN,STATE
	movdqu	STATE,(OUTP)		# output
	RETGUARD_CHECK(aesni_gmac_final, r11)
	ret
	lfence
1072
/*
 * void aesni_xts_enc(struct aesni_xts_ctx *xts, uint8_t *dst, uint8_t *src,
 *    size_t len, uint8_t *iv)
 *
 * XTS-encrypt the whole 16-byte blocks of src into dst.  The initial
 * tweak comes from _aesni_xts_tweak and is advanced after every block by
 * _aesni_xts_tweak_exp; each block is xored with the tweak before and
 * after the block encryption.  If len is below 16 nothing is read or
 * written, and a trailing partial block is ignored.
 *
 * %r11 is used as scratch by the tweak helpers, so the retguard value is
 * kept on the stack for the duration of the routine.
 *
 * len is an unsigned size_t, so the loop condition uses jae like the
 * jb guard at entry rather than a signed comparison.
 */
ENTRY(aesni_xts_enc)
	RETGUARD_SETUP(aesni_xts_enc, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,%rcx
	jb	.Lxts_enc_done

	call	_aesni_xts_tweak	# xmm3 = initial tweak

	movl	480(KEYP),KLEN		# key length
.Lxts_enc_block:
	movups	(%rdx),%xmm0		# src
	pxor	%xmm3,%xmm0		# xor block with tweak
	call	_aesni_enc1
	pxor	%xmm3,%xmm0		# xor block with tweak
	movups	%xmm0,(%rsi)		# dst

	call	_aesni_xts_tweak_exp	# tweak for the next block

	add	$16,%rsi
	add	$16,%rdx
	sub	$16,%rcx
	cmp	$16,%rcx
	jae	.Lxts_enc_block		# unsigned: another whole block left
.Lxts_enc_done:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_xts_enc, r11)
	ret
	lfence
1105
/*
 * void aesni_xts_dec(struct aesni_xts_ctx *xts, uint8_t *dst, uint8_t *src,
 *    size_t len, uint8_t *iv)
 *
 * XTS-decrypt the whole 16-byte blocks of src into dst.  The initial
 * tweak comes from _aesni_xts_tweak (computed before KEYP is moved to the
 * decryption schedule) and is advanced after every block by
 * _aesni_xts_tweak_exp; each block is xored with the tweak before and
 * after the block decryption.  If len is below 16 nothing is read or
 * written, and a trailing partial block is ignored.
 *
 * %r11 is used as scratch by the tweak helpers, so the retguard value is
 * kept on the stack for the duration of the routine.
 *
 * len is an unsigned size_t, so the loop condition uses jae like the
 * jb guard at entry rather than a signed comparison.
 */
ENTRY(aesni_xts_dec)
	RETGUARD_SETUP(aesni_xts_dec, r11)
	RETGUARD_PUSH(r11)
	cmp	$16,%rcx
	jb	.Lxts_dec_done

	call	_aesni_xts_tweak	# xmm3 = initial tweak

	movl	480(KEYP),KLEN		# key length
	add	$240,KEYP		# decryption key
.Lxts_dec_block:
	movups	(%rdx),%xmm0		# src
	pxor	%xmm3,%xmm0		# xor block with tweak
	call	_aesni_dec1
	pxor	%xmm3,%xmm0		# xor block with tweak
	movups	%xmm0,(%rsi)		# dst

	call	_aesni_xts_tweak_exp	# tweak for the next block

	add	$16,%rsi
	add	$16,%rdx
	sub	$16,%rcx
	cmp	$16,%rcx
	jae	.Lxts_dec_block		# unsigned: another whole block left
.Lxts_dec_done:
	RETGUARD_POP(r11)
	RETGUARD_CHECK(aesni_xts_dec, r11)
	ret
	lfence
1139
/*
 * Prepare tweak as E_k2(IV). IV is specified as LE representation of a
 * 64-bit block number which we allow to be passed in directly. Since
 * we're on a 64-bit LE host the representation is already correct.
 *
 * input:
 *	%rdi:		xts context; the second key schedule starts at
 *			offset 496 and carries its own key length at +480
 *	%r8:		pointer to the 64-bit block number
 * output:
 *	%xmm3:		the tweak
 * changed:
 *	%xmm0, KEY, KLEN, %r10, %r11, %rax (retguard)
 *	KEYP is restored before returning.
 */
_aesni_xts_tweak:
	RETGUARD_SETUP(_aesni_xts_tweak, rax)
	RETGUARD_PUSH(rax)		# _aesni_enc1 uses rax for its own check
	mov	KEYP,%r11		# keep the caller's key pointer
	movq	(%r8),%xmm0		# Last 64-bits of IV are always zero.
	lea	496(%rdi),KEYP		# second key schedule
	movl	480(KEYP),KLEN
	call	_aesni_enc1
	mov	%r11,KEYP
	movdqa	%xmm0,%xmm3
	RETGUARD_POP(rax)
	RETGUARD_CHECK(_aesni_xts_tweak, rax)
	ret
	lfence
1162
/*
 * Exponentiate AES XTS tweak (in %xmm3).
 *
 * The 128-bit tweak is shifted left by one bit.  psllq shifts the two
 * quadwords separately, so the bit leaving the low quadword is put back
 * by hand, and if the top bit of the tweak was set the low word is xored
 * with 0x87.
 *
 * input/output:
 *	%xmm3:		tweak
 * changed:
 *	%xmm0, %r10, %r11, %rax (retguard)
 */
_aesni_xts_tweak_exp:
	RETGUARD_SETUP(_aesni_xts_tweak_exp, rax)
	pextrw	$7,%xmm3,%r10		# top word of the high quadword
	pextrw	$3,%xmm3,%r11		# top word of the low quadword
	psllq	$1,%xmm3		# Left shift.

	and	$0x8000,%r11		# Carry between quads.
	jz	.Lxts_exp_no_carry
	mov	$1,%r11
	pxor	%xmm0,%xmm0
	pinsrw	$4,%r11,%xmm0		# bit 0 of the high quadword
	por	%xmm0,%xmm3
.Lxts_exp_no_carry:
	and	$0x8000,%r10		# top bit of the old tweak
	jz	.Lxts_exp_done
	pextrw	$0,%xmm3,%r11
	xor	$0x87,%r11		# AES XTS alpha - GF(2^128).
	pinsrw	$0,%r11,%xmm3
.Lxts_exp_done:
	RETGUARD_CHECK(_aesni_xts_tweak_exp, rax)
	ret
	lfence
1188