xref: /openbsd-src/lib/libcrypto/sha/sha256_amd64_shani.S (revision 228e7c1edf50bd8bdd3aca0a9db75db8848ad9e0)
1/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24/*
25 * SHA-256 implementation using the Intel SHA extensions:
26 *
27 * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
28 */
29
/*
 * Register allocation: general purpose registers first, then the SSE
 * registers grouped by the role they play in the block function.
 * %xmm5 is not used.
 */

/* Arguments, as passed under the standard x86-64 ABI. */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/* Callee-saved registers: end of input and base of the K256 table. */
#define	end		%rbx
#define	k256		%rbp

/*
 * Message words (plus round constants) for the current four rounds.
 * This has to be %xmm0, which sha256rnds2 uses as its implicit operand.
 */
#define	xmsg		%xmm0

/* Working hash state - held as abef/cdgh while blocks are processed. */
#define	xhs0		%xmm1
#define	xhs1		%xmm2

/* Copy of the hash state taken at the start of each block. */
#define	xabef		%xmm3
#define	xcdgh		%xmm4

/* Message schedule: four quads of words, plus a scratch (xmsgtmp4). */
#define	xmsgtmp0	%xmm6
#define	xmsgtmp1	%xmm7
#define	xmsgtmp2	%xmm8
#define	xmsgtmp3	%xmm9
#define	xmsgtmp4	%xmm10

/* Byte shuffle mask applied to message data as it is loaded. */
#define	xshufmask	%xmm11

/* Scratch register used while rearranging the hash state words. */
#define	xtmp0		%xmm12
55
/*
 * Load the i'th 16 byte quad of the message at msg (no alignment needed),
 * reorder its bytes with xshufmask and leave the result in both the given
 * message schedule register and xmsg.
 */
#define sha256_message_schedule_load(i, msg, xmt) \
	movdqu	(i*16)(msg), xmt;					\
	pshufb	xshufmask, xmt;						\
	movdqa	xmt, xmsg;
60
/*
 * Compute the next four message schedule words into w0.
 *
 * w0 through w3 hold the previous sixteen words, oldest quad first. The
 * palignr builds the quad that straddles w2 and w3 (upper three words of
 * w2 followed by the lowest word of w3), which is added to w0 between the
 * sha256msg1 and sha256msg2 steps. Clobbers xmsgtmp4.
 */
#define sha256_message_schedule_update(w0, w1, w2, w3) \
	movdqa	w3, xmsgtmp4;						\
	palignr	$4, w2, xmsgtmp4;					\
	sha256msg1 w1, w0;						\
	paddd	xmsgtmp4, w0;						\
	sha256msg2 w3, w0;
67
/*
 * Perform four rounds of SHA-256.
 *
 * On entry xmsg holds four message schedule words; the i'th 16 byte row
 * of the K256 table is added to them. Each sha256rnds2 performs two rounds
 * using the low two words of xmsg, so the upper two words are moved down
 * with pshufd between them. xhs0 and xhs1 swap source/destination roles
 * across the pair, hence the state ends up back in the registers it
 * started in. Clobbers xmsg.
 */
#define sha256_shani_round(i) \
	paddd	(i*16)(k256), xmsg;					\
	sha256rnds2 xmsg, xhs0, xhs1;					\
	pshufd	$0x0e, xmsg, xmsg;					\
	sha256rnds2 xmsg, xhs1, xhs0;
73
/*
 * Four rounds using message words taken directly from the input: load
 * quad i of the message at msg into xmt (and xmsg), then run the rounds
 * with row i of the constants.
 */
#define sha256_shani_round_load(i, msg, xmt) \
	sha256_message_schedule_load(i, msg, xmt);			\
	sha256_shani_round(i);
77
/*
 * Four rounds using computed message schedule words: derive the next quad
 * of words into w0 from w0 through w3, copy it to xmsg, then run the
 * rounds with row i of the constants.
 */
#define sha256_shani_round_update(i, w0, w1, w2, w3) \
	sha256_message_schedule_update(w0, w1, w2, w3);			\
	movdqa	w0, xmsg;						\
	sha256_shani_round(i);
82
.text

/*
 * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Process num 64 byte blocks of message data from in, updating the eight
 * 32 bit hash state words stored in the first 32 bytes of ctx.
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Neither ctx nor in needs to be aligned - both are accessed with movdqu.
 * rbx and rbp are saved and restored. rsi, rdx, xmm0-xmm4, xmm6-xmm12 and
 * the flags are clobbered. If num is zero the function returns without
 * reading in or modifying ctx.
 */
.align 16
.globl	sha256_block_shani
.type	sha256_block_shani,@function
sha256_block_shani:
	_CET_ENDBR

	/*
	 * Nothing to do for zero blocks. The block loop tests its condition
	 * at the bottom, so without this check one block would be processed.
	 */
	testq	num, num
	jz	.Lshani_done

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp

	/* Compute end of message (num blocks of 64 bytes each). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load endian shuffle mask. */
	movdqa	shufmask(%rip), xshufmask

	/* Load current hash state from context. */
	movdqu	(0*16)(ctx), xhs0	/* dcba */
	movdqu	(1*16)(ctx), xhs1	/* hgfe */

	/*
	 * Rearrange words to construct abef/cdgh, which is the layout that
	 * sha256rnds2 operates on.
	 */
	pshufd	$0xb1, xhs0, xhs0	/* cdab */
	pshufd	$0x1b, xhs1, xhs1	/* efgh */
	movdqa	xhs0, xtmp0
	palignr	$8, xhs1, xhs0		/* abef */
	pblendw	$0xf0, xtmp0, xhs1	/* cdgh */

	jmp	.Lshani_block_loop

.align 16
.Lshani_block_loop:
	/* Save state for accumulation. */
	movdqa	xhs0, xabef
	movdqa	xhs1, xcdgh

	/*
	 * Rounds 0 through 15 (four rounds at a time) - message words come
	 * directly from the input block.
	 */
	sha256_shani_round_load(0, in, xmsgtmp0)
	sha256_shani_round_load(1, in, xmsgtmp1)
	sha256_shani_round_load(2, in, xmsgtmp2)
	sha256_shani_round_load(3, in, xmsgtmp3)

	/*
	 * Rounds 16 through 63 (four rounds at a time) - the four schedule
	 * registers rotate, with the first argument receiving the new words.
	 */
	sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	/* Accumulate hash state. */
	paddd	xabef, xhs0
	paddd	xcdgh, xhs1

	/* Advance to the next block and loop while input remains. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Rearrange words to construct dcba/hgfe. */
	pshufd	$0x1b, xhs0, xhs0	/* feba */
	pshufd	$0xb1, xhs1, xhs1	/* dchg */
	movdqa	xhs0, xtmp0
	pblendw	$0xf0, xhs1, xhs0	/* dcba */
	palignr	$8, xtmp0, xhs1		/* hgfe */

	/* Update stored hash context. */
	movdqu	xhs0, (0*16)(ctx)
	movdqu	xhs1, (1*16)(ctx)

	/* Restore callee save registers. */
	popq	%rbp
	popq	%rbx

.Lshani_done:
	ret
.size	sha256_block_shani,.-sha256_block_shani
175
.rodata

/*
 * Shuffle mask - little endian to big endian word conversion.
 *
 * Listed byte by byte in memory order: for pshufb, byte i of the mask is
 * the index of the source byte that is placed at position i, so each group
 * of four indices below reverses the bytes of one 32 bit word.
 */
.p2align	4
.type	shufmask,@object
shufmask:
.byte	0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
.byte	0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c
.size	shufmask,.-shufmask
186
/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 *
 * Sixty four 32 bit words, laid out as sixteen rows of four so that each
 * 16 byte row can be added to four message schedule words at once.
 */
.p2align	6
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5	/* K[ 0.. 3] */
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5	/* K[ 4.. 7] */
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3	/* K[ 8..11] */
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174	/* K[12..15] */
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc	/* K[16..19] */
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da	/* K[20..23] */
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7	/* K[24..27] */
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967	/* K[28..31] */
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13	/* K[32..35] */
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85	/* K[36..39] */
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3	/* K[40..43] */
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070	/* K[44..47] */
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5	/* K[48..51] */
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3	/* K[52..55] */
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208	/* K[56..59] */
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2	/* K[60..63] */
.size	K256,.-K256
210