xref: /openbsd-src/lib/libcrypto/sha/sha1_amd64_generic.S (revision f6bb4990b68d550f8383b4fefcfc131f6b2ed563)
1*f6bb4990Sjsing/* $OpenBSD: sha1_amd64_generic.S,v 1.2 2025/01/18 02:56:07 jsing Exp $ */
2a61493a0Sjsing/*
3a61493a0Sjsing * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4a61493a0Sjsing *
5a61493a0Sjsing * Permission to use, copy, modify, and distribute this software for any
6a61493a0Sjsing * purpose with or without fee is hereby granted, provided that the above
7a61493a0Sjsing * copyright notice and this permission notice appear in all copies.
8a61493a0Sjsing *
9a61493a0Sjsing * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10a61493a0Sjsing * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11a61493a0Sjsing * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12a61493a0Sjsing * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13a61493a0Sjsing * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14a61493a0Sjsing * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15a61493a0Sjsing * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16a61493a0Sjsing */
17a61493a0Sjsing
18a61493a0Sjsing#ifdef __CET__
19a61493a0Sjsing#include <cet.h>
20a61493a0Sjsing#else
21a61493a0Sjsing#define _CET_ENDBR
22a61493a0Sjsing#endif
23a61493a0Sjsing
/*
 * Register assignments. These are preprocessor aliases, so two names that
 * expand to the same hardware register share its contents.
 */

/* Function arguments as received on entry: rdi, rsi, rdx. */
24a61493a0Sjsing#define	ctx		%rdi
25a61493a0Sjsing#define	in		%rsi
26a61493a0Sjsing#define	num		%rdx
27a61493a0Sjsing
/* Address just past the last input block, computed as in + num * 64. */
28a61493a0Sjsing#define	end		%rbp
29a61493a0Sjsing
/* Five 32-bit hash state words, loaded from ctx and stored back to it. */
30a61493a0Sjsing#define	hs0		%r8d
31a61493a0Sjsing#define	hs1		%r9d
32a61493a0Sjsing#define	hs2		%r10d
33a61493a0Sjsing#define	hs3		%r11d
34a61493a0Sjsing#define	hs4		%r12d
35a61493a0Sjsing
/*
 * Scratch registers. tmp3 (%edx) is the low half of num (%rdx): num is
 * consumed when end is computed, before any round macro writes tmp3.
 * tmp1 (%ebx), end (%rbp) and hs4 (%r12d) use registers that
 * sha1_block_generic pushes on entry and pops before returning.
 */
36a61493a0Sjsing#define	tmp0		%eax
37a61493a0Sjsing#define	tmp1		%ebx
38a61493a0Sjsing#define	tmp2		%ecx
39a61493a0Sjsing#define	tmp3		%edx
40a61493a0Sjsing
41a61493a0Sjsing/*
42a61493a0Sjsing * Load message into wt, storing a copy in the message schedule:
43a61493a0Sjsing *
44a61493a0Sjsing *  Wt = Mt
45a61493a0Sjsing */
/*
 * Operands of sha1_message_schedule_load:
 *   idx  round index; only its low four bits select the slot, giving the
 *        byte offset (idx & 0xf) * 4 into both m and w. It is substituted
 *        without parentheses, so pass a plain integer.
 *   m    register holding the base address of the message block
 *   w    register holding the base address of the message schedule
 *   wt   32-bit register that receives the word
 *
 * The word is byte-swapped (bswapl) after it is loaded and before it is
 * stored, so both wt and the schedule slot hold the swapped value.
 */
46a61493a0Sjsing#define sha1_message_schedule_load(idx, m, w, wt) \
47a61493a0Sjsing	movl	((idx&0xf)*4)(m), wt;				\
48a61493a0Sjsing	bswapl	wt;						\
49a61493a0Sjsing	movl	wt, ((idx&0xf)*4)(w);
50a61493a0Sjsing
51a61493a0Sjsing/*
52a61493a0Sjsing * Update message schedule and return current value in wt:
53a61493a0Sjsing *
54a61493a0Sjsing *  W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
55a61493a0Sjsing */
/*
 * Operands of sha1_message_schedule_update:
 *   idx  round index (16 or above at every use in this file)
 *   w    register holding the base address of the message schedule
 *   wt   32-bit register that receives the new word
 *
 * Every slot number is reduced with & 0xf, so the schedule behaves as a
 * circular window of 16 words: slot idx still holds the word written 16
 * rounds earlier (the W0 term) until the final movl overwrites it.
 */
56a61493a0Sjsing#define sha1_message_schedule_update(idx, w, wt) \
57a61493a0Sjsing	movl	(((idx-3)&0xf)*4)(w), wt;	/* W13 */	\
58a61493a0Sjsing	xorl	(((idx-8)&0xf)*4)(w), wt;	/* W8 */	\
59a61493a0Sjsing	xorl	(((idx-14)&0xf)*4)(w), wt;	/* W2 */	\
60a61493a0Sjsing	xorl	(((idx)&0xf)*4)(w), wt;		/* W0 */	\
61a61493a0Sjsing	roll	$1, wt;						\
62a61493a0Sjsing	\
63a61493a0Sjsing	movl	wt, ((idx&0xf)*4)(w);
64a61493a0Sjsing
65a61493a0Sjsing/*
66a61493a0Sjsing * Compute a SHA-1 round without logic function:
67a61493a0Sjsing *
68a61493a0Sjsing *  T = rol(a, 5) + e + Kt + Wt
69a61493a0Sjsing *
70a61493a0Sjsing * The caller is required to compute the appropriate logic function
71a61493a0Sjsing * (Ch, Maj, Parity) and add it to e.
72a61493a0Sjsing *
73a61493a0Sjsing * Upon completion b = rol(b, 30), e = T, pending rotation.
74a61493a0Sjsing */
/*
 * Operands of sha1_round:
 *   a     32-bit register, read only
 *   b     32-bit register, rotated left by 30 in place
 *   c, d  accepted but not referenced by this macro
 *   e     32-bit register that accumulates the round result
 *   kt    round constant, used as the leal displacement
 *   wt    32-bit register holding the schedule word for this round
 *
 * The leal forms e + wt + kt in a single instruction. tmp1 is overwritten
 * with rol(a, 5).
 */
75a61493a0Sjsing#define sha1_round(a, b, c, d, e, kt, wt) \
76a61493a0Sjsing	leal	kt(wt, e, 1), e;		/* Kt + Wt */	\
77a61493a0Sjsing	\
78a61493a0Sjsing	movl	a, tmp1;			/* rol(a, 5) */	\
79a61493a0Sjsing	roll	$5, tmp1;					\
80a61493a0Sjsing	addl	tmp1, e;					\
81a61493a0Sjsing	\
82a61493a0Sjsing	roll	$30, b;				/* rol(b, 30) */
83a61493a0Sjsing
84a61493a0Sjsing/*
85a61493a0Sjsing * Compute a SHA-1 round with Ch:
86a61493a0Sjsing *
87a61493a0Sjsing *  T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
88a61493a0Sjsing *
89a61493a0Sjsing *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
90a61493a0Sjsing *
91a61493a0Sjsing * Upon completion b = rol(b, 30), e = T, pending rotation.
92a61493a0Sjsing */
/*
 * Operands are passed through unchanged to sha1_round. Ch(b, c, d) is built
 * in tmp2 using the ((y ^ z) & x) ^ z form and added to e before sha1_round
 * rotates b in place. Overwrites tmp2, plus tmp1 via sha1_round.
 */
93a61493a0Sjsing#define sha1_round_ch(a, b, c, d, e, kt, wt) \
94a61493a0Sjsing	movl	c, tmp2;			/* Ch */	\
95a61493a0Sjsing	xorl	d, tmp2;			/* Ch */	\
96a61493a0Sjsing	andl	b, tmp2;			/* Ch */	\
97a61493a0Sjsing	xorl	d, tmp2;			/* Ch */	\
98a61493a0Sjsing	addl	tmp2, e;			/* Ch */	\
99a61493a0Sjsing	\
100a61493a0Sjsing	sha1_round(a, b, c, d, e, kt, wt);
101a61493a0Sjsing
102a61493a0Sjsing/*
103a61493a0Sjsing * Compute a SHA-1 round with Parity:
104a61493a0Sjsing *
105a61493a0Sjsing *  T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
106a61493a0Sjsing *
107a61493a0Sjsing *  Parity(x, y, z) = x ^ y ^ z
108a61493a0Sjsing *
109a61493a0Sjsing * Upon completion b = rol(b, 30), e = T, pending rotation.
110a61493a0Sjsing */
/*
 * Operands are passed through unchanged to sha1_round. b ^ c ^ d is built
 * in tmp2 and added to e before sha1_round rotates b in place.
 * Overwrites tmp2, plus tmp1 via sha1_round.
 */
111a61493a0Sjsing#define sha1_round_parity(a, b, c, d, e, kt, wt) \
112a61493a0Sjsing	movl	b, tmp2;			/* Parity */	\
113a61493a0Sjsing	xorl	c, tmp2;			/* Parity */	\
114a61493a0Sjsing	xorl	d, tmp2;			/* Parity */	\
115a61493a0Sjsing	addl	tmp2, e;			/* Parity */	\
116a61493a0Sjsing	\
117a61493a0Sjsing	sha1_round(a, b, c, d, e, kt, wt);
118a61493a0Sjsing
119a61493a0Sjsing/*
120a61493a0Sjsing * Compute a SHA-1 round with Maj:
121a61493a0Sjsing *
122a61493a0Sjsing *  T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
123a61493a0Sjsing *
124a61493a0Sjsing *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
125a61493a0Sjsing *
126a61493a0Sjsing * Upon completion b = rol(b, 30), e = T, pending rotation.
127a61493a0Sjsing */
/*
 * Operands are passed through unchanged to sha1_round. (c ^ d) & b is built
 * in tmp2 and c & d in tmp3; their xor is added to e before sha1_round
 * rotates b in place. Overwrites tmp2 and tmp3, plus tmp1 via sha1_round.
 * tmp3 is %edx, the low half of num, so num is lost once this macro runs.
 */
128a61493a0Sjsing#define sha1_round_maj(a, b, c, d, e, kt, wt) \
129a61493a0Sjsing	movl	c, tmp2;			/* Maj */	\
130a61493a0Sjsing	xorl	d, tmp2;			/* Maj */	\
131a61493a0Sjsing	andl	b, tmp2;			/* Maj */	\
132a61493a0Sjsing	movl	c, tmp3;			/* Maj */	\
133a61493a0Sjsing	andl	d, tmp3;			/* Maj */	\
134a61493a0Sjsing	xorl	tmp2, tmp3;			/* Maj */	\
135a61493a0Sjsing	addl	tmp3, e;			/* Maj */	\
136a61493a0Sjsing	\
137a61493a0Sjsing	sha1_round(a, b, c, d, e, kt, wt);
138a61493a0Sjsing
/*
 * Round that takes its word straight from the input: load word idx from
 * the block at in into tmp0, copying it to the schedule at %rsp, then run
 * a Ch round with constant 0x5a827999. Used for idx 0 through 15.
 */
139a61493a0Sjsing#define sha1_round1_load(idx, a, b, c, d, e) \
140a61493a0Sjsing	sha1_message_schedule_load(idx, in, %rsp, tmp0) \
141a61493a0Sjsing	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
142a61493a0Sjsing
/*
 * Derive the schedule word for idx from the schedule at %rsp into tmp0,
 * then run a Ch round with constant 0x5a827999. Used for idx 16 through 19.
 */
143a61493a0Sjsing#define sha1_round1_update(idx, a, b, c, d, e) \
144a61493a0Sjsing	sha1_message_schedule_update(idx, %rsp, tmp0) \
145a61493a0Sjsing	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
146a61493a0Sjsing
/*
 * Derive the schedule word for idx from the schedule at %rsp into tmp0,
 * then run a Parity round with constant 0x6ed9eba1. Used for idx 20
 * through 39.
 */
147a61493a0Sjsing#define sha1_round2_update(idx, a, b, c, d, e) \
148a61493a0Sjsing	sha1_message_schedule_update(idx, %rsp, tmp0) \
149a61493a0Sjsing	sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
150a61493a0Sjsing
/*
 * Derive the schedule word for idx from the schedule at %rsp into tmp0,
 * then run a Maj round with constant 0x8f1bbcdc. Used for idx 40 through
 * 59. The Maj round overwrites tmp3 (%edx).
 */
151a61493a0Sjsing#define sha1_round3_update(idx, a, b, c, d, e) \
152a61493a0Sjsing	sha1_message_schedule_update(idx, %rsp, tmp0) \
153a61493a0Sjsing	sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
154a61493a0Sjsing
/*
 * Derive the schedule word for idx from the schedule at %rsp into tmp0,
 * then run a Parity round with constant 0xca62c1d6. Used for idx 60
 * through 79.
 */
155a61493a0Sjsing#define sha1_round4_update(idx, a, b, c, d, e) \
156a61493a0Sjsing	sha1_message_schedule_update(idx, %rsp, tmp0) \
157a61493a0Sjsing	sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
158a61493a0Sjsing
159a61493a0Sjsing.text
160a61493a0Sjsing
161a61493a0Sjsing/*
162a61493a0Sjsing * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
163a61493a0Sjsing *
164a61493a0Sjsing * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
165a61493a0Sjsing */
/*
 * num counts 64-byte blocks: it is shifted left by 6 to obtain a byte
 * length, and in advances by 64 on every iteration.
 *
 * Stack frame after the prologue, relative to the 64-byte aligned %rsp:
 *   0..63   message schedule (16 words)
 *   64..71  value of %rsp before the allocation, reloaded in the epilogue
 *
 * The block loop compares in against end only at the bottom, so one block
 * is processed before the first comparison is made.
 * NOTE(review): this presumably relies on callers passing num >= 1 - confirm.
 *
 * hs0..hs4 double as the working variables. Values are not moved between
 * registers; instead each round invocation lists the registers rotated by
 * one position. After 80 rounds the rotation is back at its starting
 * point, and the previous state is still in ctx, from where it is added
 * to the registers before the new state is stored.
 */
166a61493a0Sjsing.align 16
167a61493a0Sjsing.globl	sha1_block_generic
168a61493a0Sjsing.type	sha1_block_generic,@function
169a61493a0Sjsingsha1_block_generic:
170a61493a0Sjsing	_CET_ENDBR
171a61493a0Sjsing
172a61493a0Sjsing	/* Save callee save registers. */
	/* %rbx backs tmp1, %rbp backs end, %r12 backs hs4. */
173a61493a0Sjsing	pushq	%rbx
174a61493a0Sjsing	pushq	%rbp
175a61493a0Sjsing	pushq	%r12
176a61493a0Sjsing
177a61493a0Sjsing	/* Allocate space for message schedule. */
	/*
	 * The andq rounds %rsp down by a variable amount, so the incoming
	 * %rsp is kept in %rax and stored just above the schedule.
	 */
178a61493a0Sjsing	movq	%rsp, %rax
179a61493a0Sjsing	subq	$(64+1*8), %rsp
180a61493a0Sjsing	andq	$~63, %rsp
181a61493a0Sjsing	movq	%rax, (64+0*8)(%rsp)
182a61493a0Sjsing
183*f6bb4990Sjsing	/* Compute end of message. */
	/* num is not read after this; %edx is reused as tmp3 by the rounds. */
184a61493a0Sjsing	shlq	$6, num
185*f6bb4990Sjsing	leaq	(in, num, 1), end
186a61493a0Sjsing
187a61493a0Sjsing	/* Load current hash state from context. */
188a61493a0Sjsing	movl	(0*4)(ctx), hs0
189a61493a0Sjsing	movl	(1*4)(ctx), hs1
190a61493a0Sjsing	movl	(2*4)(ctx), hs2
191a61493a0Sjsing	movl	(3*4)(ctx), hs3
192a61493a0Sjsing	movl	(4*4)(ctx), hs4
193a61493a0Sjsing
	/* Jump over any padding emitted by the .align before the loop head. */
194a61493a0Sjsing	jmp	.Lblock_loop
195a61493a0Sjsing
196a61493a0Sjsing.align 16
197a61493a0Sjsing.Lblock_loop:
198a61493a0Sjsing
199a61493a0Sjsing	/* Round 0 through 15. */
200a61493a0Sjsing	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
201a61493a0Sjsing	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
202a61493a0Sjsing	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
203a61493a0Sjsing	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
204a61493a0Sjsing	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
205a61493a0Sjsing	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
206a61493a0Sjsing	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
207a61493a0Sjsing	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
208a61493a0Sjsing	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
209a61493a0Sjsing	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
210a61493a0Sjsing	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
211a61493a0Sjsing	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
212a61493a0Sjsing	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
213a61493a0Sjsing	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
214a61493a0Sjsing	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
215a61493a0Sjsing	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)
216a61493a0Sjsing
217a61493a0Sjsing	/* Round 16 through 31. */
218a61493a0Sjsing	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
219a61493a0Sjsing	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
220a61493a0Sjsing	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
221a61493a0Sjsing	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
222a61493a0Sjsing	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
223a61493a0Sjsing	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
224a61493a0Sjsing	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
225a61493a0Sjsing	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
226a61493a0Sjsing	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
227a61493a0Sjsing	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
228a61493a0Sjsing	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
229a61493a0Sjsing	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
230a61493a0Sjsing	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
231a61493a0Sjsing	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
232a61493a0Sjsing	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
233a61493a0Sjsing	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);
234a61493a0Sjsing
235a61493a0Sjsing	/* Round 32 through 47. */
236a61493a0Sjsing	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
237a61493a0Sjsing	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
238a61493a0Sjsing	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
239a61493a0Sjsing	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
240a61493a0Sjsing	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
241a61493a0Sjsing	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
242a61493a0Sjsing	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
243a61493a0Sjsing	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
244a61493a0Sjsing	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
245a61493a0Sjsing	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
246a61493a0Sjsing	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
247a61493a0Sjsing	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
248a61493a0Sjsing	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
249a61493a0Sjsing	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
250a61493a0Sjsing	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
251a61493a0Sjsing	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);
252a61493a0Sjsing
253a61493a0Sjsing	/* Round 48 through 63. */
254a61493a0Sjsing	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
255a61493a0Sjsing	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
256a61493a0Sjsing	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
257a61493a0Sjsing	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
258a61493a0Sjsing	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
259a61493a0Sjsing	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
260a61493a0Sjsing	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
261a61493a0Sjsing	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
262a61493a0Sjsing	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
263a61493a0Sjsing	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
264a61493a0Sjsing	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
265a61493a0Sjsing	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
266a61493a0Sjsing	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
267a61493a0Sjsing	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
268a61493a0Sjsing	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
269a61493a0Sjsing	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);
270a61493a0Sjsing
271a61493a0Sjsing	/* Round 64 through 79. */
272a61493a0Sjsing	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
273a61493a0Sjsing	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
274a61493a0Sjsing	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
275a61493a0Sjsing	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
276a61493a0Sjsing	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
277a61493a0Sjsing	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
278a61493a0Sjsing	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
279a61493a0Sjsing	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
280a61493a0Sjsing	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
281a61493a0Sjsing	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
282a61493a0Sjsing	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
283a61493a0Sjsing	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
284a61493a0Sjsing	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
285a61493a0Sjsing	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
286a61493a0Sjsing	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
287a61493a0Sjsing	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);
288a61493a0Sjsing
289a61493a0Sjsing	/* Add intermediate state to hash state. */
	/* ctx still holds the state from before this block's rounds. */
290a61493a0Sjsing	addl	(0*4)(ctx), hs0
291a61493a0Sjsing	addl	(1*4)(ctx), hs1
292a61493a0Sjsing	addl	(2*4)(ctx), hs2
293a61493a0Sjsing	addl	(3*4)(ctx), hs3
294a61493a0Sjsing	addl	(4*4)(ctx), hs4
295a61493a0Sjsing
296a61493a0Sjsing	/* Store new hash state to context. */
297a61493a0Sjsing	movl	hs0, (0*4)(ctx)
298a61493a0Sjsing	movl	hs1, (1*4)(ctx)
299a61493a0Sjsing	movl	hs2, (2*4)(ctx)
300a61493a0Sjsing	movl	hs3, (3*4)(ctx)
301a61493a0Sjsing	movl	hs4, (4*4)(ctx)
302a61493a0Sjsing
	/* Advance to the next block; unsigned compare, loop while in < end. */
303a61493a0Sjsing	addq	$64, in
304a61493a0Sjsing	cmpq	end, in
305a61493a0Sjsing	jb	.Lblock_loop
306a61493a0Sjsing
	/* Reload the stack pointer saved above the message schedule. */
307a61493a0Sjsing	movq	(64+0*8)(%rsp), %rsp
308a61493a0Sjsing
309a61493a0Sjsing	/* Restore callee save registers. */
310a61493a0Sjsing	popq	%r12
311a61493a0Sjsing	popq	%rbp
312a61493a0Sjsing	popq	%rbx
313a61493a0Sjsing
314a61493a0Sjsing	ret
315