/* $OpenBSD: sha256_amd64_generic.S,v 1.3 2024/11/16 12:34:16 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * When the toolchain defines __CET__, <cet.h> supplies _CET_ENDBR.
 * Otherwise define it as empty so it can be used unconditionally at
 * function entry.
 */
#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/*
 * Register allocation.
 *
 * ctx, in and num are the three function arguments.
 */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/*
 * Round counter. This shares %rdi with ctx, so ctx has to be saved
 * elsewhere before round is written and reloaded before it is used again.
 */
#define	round		%rdi

/* Working copies of the eight 32-bit hash state words. */
#define	hs0		%r8d
#define	hs1		%r9d
#define	hs2		%r10d
#define	hs3		%r11d
#define	hs4		%r12d
#define	hs5		%r13d
#define	hs6		%r14d
#define	hs7		%r15d

/* Base address of the K256 constant table. */
#define	k256		%rbp

/*
 * Scratch registers. tmp3 is the low half of the register that holds num,
 * so num is no longer available once any macro that uses tmp3 has run.
 */
#define	tmp0		%eax
#define	tmp1		%ebx
#define	tmp2		%ecx
#define	tmp3		%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The 32-bit message word is read from m at the index held in round,
 * byte swapped, and stored in slot (idx & 0xf) of the 16 word schedule
 * at w. Only wt is modified.
 */
#define sha256_message_schedule_load(idx, m, w, wt) \
	movl	(m, round, 4), wt;				\
	bswapl	wt;						\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
 *
 *  sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
 *  sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * The schedule at w is a ring of 16 words, hence every index is reduced
 * with & 0xf; W(t-16) lives in the slot that is overwritten with Wt at
 * the end. Each pair of rotations is folded into "rotate by the
 * difference, xor, rotate by the smaller amount".
 *
 * Clobbers tmp1, tmp2, tmp3 and the flags.
 */
#define sha256_message_schedule_update(idx, w, wt) \
	movl	(((idx-2)&0xf)*4)(w), wt;	/* sigma1 */	\
	movl	wt, tmp1;			/* sigma1 */	\
	rorl	$(19-17), tmp1;			/* sigma1 */	\
	xorl	wt, tmp1;			/* sigma1 */	\
	rorl	$17, tmp1;			/* sigma1 */	\
	shrl	$10, wt;			/* sigma1 */	\
	xorl	tmp1, wt;			/* sigma1 */	\
	\
	addl	(((idx-7)&0xf)*4)(w), wt;	/* Wt-7 */	\
	addl	(((idx-16)&0xf)*4)(w), wt;	/* Wt-16 */	\
	\
	movl	(((idx-15)&0xf)*4)(w), tmp2;	/* sigma0 */	\
	movl	tmp2, tmp3;			/* sigma0 */	\
	rorl	$(18-7), tmp2;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	rorl	$7, tmp2;			/* sigma0 */	\
	shrl	$3, tmp3;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	addl	tmp2, wt;			/* sigma0 */	\
	\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Compute a SHA-256 round:
 *
 *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
 *  T2 = Sigma0(a) + Maj(a, b, c)
 *
 *  Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
 *  Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
 *
 * Kt is read from the table at k, indexed by round, and round is
 * incremented by one as the last step. The idx and w arguments are
 * accepted but not referenced by the expansion.
 *
 * Clobbers tmp1, tmp2, tmp3 and the flags.
 */
#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
	addl	wt, h;				/* T1 Wt */	\
	addl	(k, round, 4), h;		/* T1 Kt */	\
	\
	movl	e, tmp1;			/* T1 Sigma1 */	\
	rorl	$(25-11), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */ \
	rorl	$(11-6), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */ \
	rorl	$6, tmp1;			/* T1 Sigma1 */	\
	addl	tmp1, h;			/* T1 Sigma1 */	\
	\
	movl	f, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	andl	e, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	addl	tmp2, h;			/* T1 Ch */	\
	\
	addl	h, d;				/* d += T1 */	\
	\
	movl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(22-13), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(13-2), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$2, tmp1;			/* T2 Sigma0 */	\
	addl	tmp1, h;			/* T2 Sigma0 */	\
	\
	movl	b, tmp2;			/* T2 Maj */	\
	xorl	c, tmp2;			/* T2 Maj */	\
	andl	a, tmp2;			/* T2 Maj */	\
	movl	b, tmp3;			/* T2 Maj */	\
	andl	c, tmp3;			/* T2 Maj */	\
	xorl	tmp2, tmp3;			/* T2 Maj */	\
	addl	tmp3, h;			/* T2 Maj */	\
	\
	addq	$1, round;

/*
 * One round whose Wt is loaded from the message at in. The schedule
 * lives at the bottom of the stack frame (%rsp) and tmp0 carries Wt
 * into the round computation.
 */
#define sha256_round_load(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_load(idx, in, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

/*
 * One round whose Wt is derived from earlier schedule entries; the same
 * stack based schedule and tmp0 are used as for sha256_round_load.
 */
#define sha256_round_update(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_update(idx, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

.text

/*
 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num 64 byte blocks starting at in and folds each one into
 * the eight 32-bit hash state words stored at the start of ctx. Returns
 * immediately, touching nothing, when num is zero.
 *
 * Stack frame (64 byte aligned) while the block loop runs:
 *
 *   (0)(%rsp) .. (63)(%rsp)	16 word message schedule
 *   (64+0*8)(%rsp)		end of message pointer (in + num * 64)
 *   (64+1*8)(%rsp)		ctx (%rdi is reused as the round counter)
 *   (64+2*8)(%rsp)		stack pointer value to restore before popping
 */
.align 16
.globl	sha256_block_generic
.type	sha256_block_generic,@function
sha256_block_generic:
	_CET_ENDBR

	/*
	 * The block loop tests its condition at the bottom, so without
	 * this check a zero block count would still consume one block.
	 */
	testq	num, num
	jz	.Lreturn

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Allocate space for message schedule, context pointer and end of message. */
	movq	%rsp, %rax
	subq	$(64+3*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+2*8)(%rsp)
	movq	ctx, (64+1*8)(%rsp)

	/* Compute and store end of message (num is dead after this). */
	shlq	$6, num
	leaq	(in, num, 1), %rbx
	movq	%rbx, (64+0*8)(%rsp)

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4
	movl	(5*4)(ctx), hs5
	movl	(6*4)(ctx), hs6
	movl	(7*4)(ctx), hs7

	/* Jump over any padding emitted by the alignment directive. */
	jmp	.Lblock_loop0

.align 16
.Lblock_loop0:
	/* round = 0; this overwrites ctx, which is reloaded after round 63. */
	xorq	round, round

	/* Round 0 through 15. */
	sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	jmp	.Lblock_loop16

.align 16
.Lblock_loop16:
	/*
	 * Round 16 through 63. The body below covers 16 rounds and is
	 * executed three times; the schedule indices are taken modulo 16
	 * by the macros while round keeps counting for the Kt lookup.
	 */
	sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	cmpq	$64, round
	jb	.Lblock_loop16

	/* Recover ctx, which was displaced by the round counter. */
	movq	(64+1*8)(%rsp), ctx

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4
	addl	(5*4)(ctx), hs5
	addl	(6*4)(ctx), hs6
	addl	(7*4)(ctx), hs7

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)
	movl	hs5, (5*4)(ctx)
	movl	hs6, (6*4)(ctx)
	movl	hs7, (7*4)(ctx)

	/* Advance to the next block; loop until the end of message. */
	addq	$64, in
	cmpq	(64+0*8)(%rsp), in
	jb	.Lblock_loop0

	/* Drop the aligned frame. */
	movq	(64+2*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

.Lreturn:
	ret
.size	sha256_block_generic,.-sha256_block_generic

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 *
 * 64 32-bit words, one per round, indexed by the round counter.
 */
.section .rodata
.align	64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256