/* $OpenBSD: sha256_amd64_generic.S,v 1.3 2024/11/16 12:34:16 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/*
 * Register allocation.
 *
 * ctx and round share %rdi: ctx is saved to the stack before the round
 * counter is first used. tmp3 shares %rdx with num: num is consumed
 * (converted to an end-of-message pointer) before any round runs.
 */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

#define	round		%rdi

/* Intermediate hash state a through h (hs4-hs7 are callee-saved r12d-r15d). */
#define	hs0		%r8d
#define	hs1		%r9d
#define	hs2		%r10d
#define	hs3		%r11d
#define	hs4		%r12d
#define	hs5		%r13d
#define	hs6		%r14d
#define	hs7		%r15d

/* Base of the SHA-256 round constant table (callee-saved). */
#define	k256		%rbp

/* Scratch registers (tmp1 is callee-saved %ebx; all are saved in the prologue). */
#define	tmp0		%eax
#define	tmp1		%ebx
#define	tmp2		%ecx
#define	tmp3		%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The message word is fetched via the global round counter (for rounds
 * 0 through 15, round == idx), converted from big endian, and stored
 * into the 16 entry circular message schedule at w[idx mod 16].
 */
#define sha256_message_schedule_load(idx, m, w, wt) \
	movl	(m, round, 4), wt;				\
	bswapl	wt;						\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
 *
 *  sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
 *  sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * The two rotates in each sigma are merged to save an instruction,
 * using ror(x, b) ^ ror(x, a) == ror(ror(x, b - a) ^ x, a) - e.g. for
 * sigma1, ror(ror(x, 19 - 17) ^ x, 17) yields ror(x, 19) ^ ror(x, 17).
 * The schedule lives in a 16 entry circular buffer, hence the
 * (idx-n)&0xf indexing; the new Wt overwrites the W(t-16) slot.
 */
#define sha256_message_schedule_update(idx, w, wt) \
	movl	(((idx-2)&0xf)*4)(w), wt;	/* sigma1 */	\
	movl	wt, tmp1;			/* sigma1 */	\
	rorl	$(19-17), tmp1;			/* sigma1 */	\
	xorl	wt, tmp1;			/* sigma1 */	\
	rorl	$17, tmp1;			/* sigma1 */	\
	shrl	$10, wt;			/* sigma1 */	\
	xorl	tmp1, wt;			/* sigma1 */	\
	\
	addl	(((idx-7)&0xf)*4)(w), wt;	/* Wt-7 */	\
	addl	(((idx-16)&0xf)*4)(w), wt;	/* Wt-16 */	\
	\
	movl	(((idx-15)&0xf)*4)(w), tmp2;	/* sigma0 */	\
	movl	tmp2, tmp3;			/* sigma0 */	\
	rorl	$(18-7), tmp2;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	rorl	$7, tmp2;			/* sigma0 */	\
	shrl	$3, tmp3;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	addl	tmp2, wt;			/* sigma0 */	\
	\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Compute a SHA-256 round:
 *
 *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
 *  T2 = Sigma0(a) + Maj(a, b, c)
 *
 *  Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
 *  Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
 *
 * The three rotates of each Sigma are merged into a rotate/xor chain,
 * e.g. Sigma1(e) = ror(ror(ror(e, 25-11) ^ e, 11-6) ^ e, 6). The
 * per-round rotation of the working variables a through h is performed
 * by rotating the register names at the call sites rather than moving
 * data between registers, hence "pending rotation". The global round
 * counter indexes the Kt constant table (and, for rounds 0 through 15,
 * the message) and is advanced at the end of the round.
 *
 * NOTE(review): the k and w parameters are unused here - k256 is
 * referenced directly and wt already carries the schedule word; call
 * sites pass matching values, so behavior is unaffected.
 */
#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
	addl	wt, h;				/* T1 Wt */	\
	addl	(k256, round, 4), h;		/* T1 Kt */	\
	\
	movl	e, tmp1;			/* T1 Sigma1 */	\
	rorl	$(25-11), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */ \
	rorl	$(11-6), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */ \
	rorl	$6, tmp1;			/* T1 Sigma1 */	\
	addl	tmp1, h;			/* T1 Sigma1 */	\
	\
	movl	f, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	andl	e, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	addl	tmp2, h;			/* T1 Ch */	\
	\
	addl	h, d;				/* d += T1 */	\
	\
	movl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(22-13), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(13-2), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$2, tmp1;			/* T2 Sigma0 */	\
	addl	tmp1, h;			/* T2 Sigma0 */	\
	\
	movl	b, tmp2;			/* T2 Maj */	\
	xorl	c, tmp2;			/* T2 Maj */	\
	andl	a, tmp2;			/* T2 Maj */	\
	movl	b, tmp3;			/* T2 Maj */	\
	andl	c, tmp3;			/* T2 Maj */	\
	xorl	tmp2, tmp3;			/* T2 Maj */	\
	addl	tmp3, h;			/* T2 Maj */	\
	\
	addq	$1, round;

/*
 * Rounds 0 through 15: load the next big endian message word into the
 * stack-resident schedule and compute one round (Wt in tmp0).
 */
#define sha256_round_load(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_load(idx, in, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

/*
 * Rounds 16 through 63: update the message schedule in place and
 * compute one round (Wt in tmp0).
 */
#define sha256_round_update(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_update(idx, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

.text

/*
 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num 64-byte message blocks from in, updating the eight
 * 32-bit hash state words at the start of ctx (assumes the SHA256_CTX
 * h array is the first member - TODO confirm against the struct).
 *
 * Stack layout after alignment:
 *   (%rsp) + 0..63    16 x 32-bit circular message schedule W
 *   (%rsp) + 64+0*8   end-of-message pointer
 *   (%rsp) + 64+1*8   saved ctx (its register is reused as the round counter)
 *   (%rsp) + 64+2*8   caller's %rsp
 */
.align 16
.globl	sha256_block_generic
.type	sha256_block_generic,@function
sha256_block_generic:
	_CET_ENDBR

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Allocate space for message schedule, context pointer and end of message. */
	movq	%rsp, %rax
	subq	$(64+3*8), %rsp
	andq	$~63, %rsp			/* 64 byte align the schedule */
	movq	%rax, (64+2*8)(%rsp)		/* save caller %rsp for epilogue */
	movq	ctx, (64+1*8)(%rsp)		/* ctx's %rdi becomes round */

	/* Compute and store end of message. */
	shlq	$6, num				/* num blocks -> bytes */
	leaq	(in, num, 1), %rbx
	movq	%rbx, (64+0*8)(%rsp)

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4
	movl	(5*4)(ctx), hs5
	movl	(6*4)(ctx), hs6
	movl	(7*4)(ctx), hs7

	jmp	.Lblock_loop0

.align 16
.Lblock_loop0:
	mov	$0, round			/* %rdi no longer holds ctx */

	/* Round 0 through 15. */
	sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	jmp	.Lblock_loop16

.align 16
.Lblock_loop16:
	/*
	 * Round 16 through 63: 16 rounds per pass, three passes total
	 * (round reaches 32, 48, then 64). The idx argument only matters
	 * modulo 16 for schedule indexing; round indexes Kt.
	 */
	sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	cmp	$64, round
	jb	.Lblock_loop16

	movq	(64+1*8)(%rsp), ctx		/* restore ctx into %rdi */

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4
	addl	(5*4)(ctx), hs5
	addl	(6*4)(ctx), hs6
	addl	(7*4)(ctx), hs7

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)
	movl	hs5, (5*4)(ctx)
	movl	hs6, (6*4)(ctx)
	movl	hs7, (7*4)(ctx)

	/* Advance to the next 64 byte block, if any. */
	addq	$64, in
	cmpq	(64+0*8)(%rsp), in
	jb	.Lblock_loop0

	movq	(64+2*8)(%rsp), %rsp		/* restore caller's stack */

	/* Restore callee save registers. */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 *
 * NOTE(review): bare .rodata is accepted by the LLVM integrated
 * assembler as shorthand for .section .rodata - confirm if this file
 * must also build with other assemblers.
 */
.rodata
.align	64		/* cache line align the table */
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256
