xref: /openbsd-src/lib/libcrypto/sha/sha256_amd64_shani.S (revision 228e7c1edf50bd8bdd3aca0a9db75db8848ad9e0)
1*228e7c1eSjsing/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */
2*228e7c1eSjsing/*
3*228e7c1eSjsing * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4*228e7c1eSjsing *
5*228e7c1eSjsing * Permission to use, copy, modify, and distribute this software for any
6*228e7c1eSjsing * purpose with or without fee is hereby granted, provided that the above
7*228e7c1eSjsing * copyright notice and this permission notice appear in all copies.
8*228e7c1eSjsing *
9*228e7c1eSjsing * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10*228e7c1eSjsing * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11*228e7c1eSjsing * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12*228e7c1eSjsing * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13*228e7c1eSjsing * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14*228e7c1eSjsing * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15*228e7c1eSjsing * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16*228e7c1eSjsing */
17*228e7c1eSjsing
18*228e7c1eSjsing#ifdef __CET__
19*228e7c1eSjsing#include <cet.h>
20*228e7c1eSjsing#else
21*228e7c1eSjsing#define _CET_ENDBR
22*228e7c1eSjsing#endif
23*228e7c1eSjsing
24*228e7c1eSjsing/*
25*228e7c1eSjsing * SHA-256 implementation using the Intel SHA extensions:
26*228e7c1eSjsing *
27*228e7c1eSjsing * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
28*228e7c1eSjsing */
29*228e7c1eSjsing
/*
 * Register aliases.
 *
 * Function arguments, as passed under the standard x86-64 ABI.
 */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/* Callee-saved general purpose registers (saved/restored by the function). */
#define	end		%rbx	/* first byte past the input */
#define	k256		%rbp	/* address of the K256 constant table */

/* Message words for the current group of rounds (sha256rnds2 uses xmm0). */
#define	xmsg		%xmm0

/* Working hash state. */
#define	xhs0		%xmm1
#define	xhs1		%xmm2

/* Copy of the hash state on block entry, added back after the rounds. */
#define	xabef		%xmm3
#define	xcdgh		%xmm4

/* Message schedule words; xmsgtmp4 is scratch for the schedule update. */
#define	xmsgtmp0	%xmm6
#define	xmsgtmp1	%xmm7
#define	xmsgtmp2	%xmm8
#define	xmsgtmp3	%xmm9
#define	xmsgtmp4	%xmm10

/* Byte shuffle mask applied to each 16 bytes of input. */
#define	xshufmask	%xmm11

/* Scratch used while rearranging the hash state words. */
#define	xtmp0		%xmm12
55*228e7c1eSjsing
/*
 * Load 16 bytes of input from offset blk*16 of src (no alignment required),
 * reorder the bytes using xshufmask and leave the result in both xmsg and
 * xdst.
 *
 * Clobbers: xmsg, xdst
 */
#define sha256_message_schedule_load(blk, src, xdst) \
	movdqu	(blk*16)(src), xmsg;					\
	pshufb	xshufmask, xmsg;					\
	movdqa	xmsg, xdst;
60*228e7c1eSjsing
/*
 * Compute the next four message schedule words in place.
 *
 * On entry w0..w3 hold the previous sixteen schedule words, oldest first
 * (w0 = W[t-16..t-13], w3 = W[t-4..t-1]). On exit w0 holds W[t..t+3]:
 * sha256msg1 performs the first part of the computation, the palignr/paddd
 * pair adds the four words W[t-7..t-4] that straddle w2 and w3, and
 * sha256msg2 completes it.
 *
 * Clobbers: w0, xmsgtmp4
 */
#define sha256_message_schedule_update(w0, w1, w2, w3) \
	sha256msg1 w1, w0;						\
	movdqa	w3, xmsgtmp4;						\
	palignr	$4, w2, xmsgtmp4;					\
	paddd	xmsgtmp4, w0;						\
	sha256msg2 w3, w0;
67*228e7c1eSjsing
/*
 * Perform four SHA-256 rounds using the four message words in xmsg and the
 * four constants at offset grp*16 of the K256 table.
 *
 * Each sha256rnds2 performs two rounds using the low two words of xmsg; the
 * pshufd moves the high two words down for the second pair. The roles of
 * xhs0 and xhs1 swap after the first sha256rnds2 and swap back after the
 * second, so on exit xhs0/xhs1 hold the state in the same layout as on
 * entry.
 *
 * Clobbers: xmsg, xhs0, xhs1
 */
#define sha256_shani_round(grp) \
	paddd	(grp*16)(k256), xmsg;					\
	sha256rnds2 xmsg, xhs0, xhs1;					\
	pshufd	$0x0e, xmsg, xmsg;					\
	sha256rnds2 xmsg, xhs1, xhs0;
73*228e7c1eSjsing
/*
 * Four rounds whose message words come straight from the input: load and
 * byte shuffle 16 bytes at offset grp*16 of src, keep a copy in xdst for the
 * later schedule updates, then run the rounds with constant group grp.
 */
#define sha256_shani_round_load(grp, src, xdst) \
	sha256_message_schedule_load(grp, src, xdst);			\
	sha256_shani_round(grp);
77*228e7c1eSjsing
/*
 * Four rounds whose message words are derived from the schedule: compute the
 * next four schedule words into w0 (from w0..w3), copy them to xmsg and run
 * the rounds with constant group grp.
 */
#define sha256_shani_round_update(grp, w0, w1, w2, w3) \
	sha256_message_schedule_update(w0, w1, w2, w3);			\
	movdqa	w0, xmsg;						\
	sha256_shani_round(grp);
82*228e7c1eSjsing
83*228e7c1eSjsing.text
84*228e7c1eSjsing
/*
 * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Process num consecutive 64 byte blocks starting at in, updating the eight
 * 32 bit hash state words stored in the first 32 bytes of ctx. When num is
 * zero the function returns without reading in or modifying ctx.
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Clobbers: rsi, rdx, flags, xmm0-xmm4, xmm6-xmm12. rbx and rbp are used
 * but saved and restored. The instructions used require the SHA, SSSE3 and
 * SSE4.1 extensions.
 */
.align 16
.globl	sha256_block_shani
.type	sha256_block_shani,@function
sha256_block_shani:
	_CET_ENDBR

	/*
	 * Nothing to do for zero blocks. The block loop below is bottom
	 * tested and would otherwise consume one block regardless.
	 */
	testq	num, num
	jz	.Lshani_done

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp

	/* Compute end of message (in + num * 64). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load endian shuffle mask. */
	movdqa	shufmask(%rip), xshufmask

	/* Load current hash state from context. */
	movdqu	(0*16)(ctx), xhs0	/* dcba */
	movdqu	(1*16)(ctx), xhs1	/* hgfe */

	/* Rearrange words to construct abef/cdgh. */
	pshufd	$0xb1, xhs0, xhs0	/* cdab */
	pshufd	$0x1b, xhs1, xhs1	/* efgh */
	movdqa	xhs0, xtmp0
	palignr	$8, xhs1, xhs0		/* abef */
	pblendw	$0xf0, xtmp0, xhs1	/* cdgh */

	/* Skip over the alignment padding in front of the loop. */
	jmp	.Lshani_block_loop

.align 16
.Lshani_block_loop:
	/* Save state for accumulation. */
	movdqa	xhs0, xabef
	movdqa	xhs1, xcdgh

	/* Rounds 0 through 15 (four rounds at a time). */
	sha256_shani_round_load(0, in, xmsgtmp0)
	sha256_shani_round_load(1, in, xmsgtmp1)
	sha256_shani_round_load(2, in, xmsgtmp2)
	sha256_shani_round_load(3, in, xmsgtmp3)

	/* Rounds 16 through 63 (four rounds at a time). */
	sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	/* Accumulate hash state. */
	paddd	xabef, xhs0
	paddd	xcdgh, xhs1

	/* Advance to the next block; pointers compare unsigned. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Rearrange words to construct dcba/hgfe. */
	pshufd	$0x1b, xhs0, xhs0	/* feba */
	pshufd	$0xb1, xhs1, xhs1	/* dchg */
	movdqa	xhs0, xtmp0
	pblendw	$0xf0, xhs1, xhs0	/* dcba */
	palignr	$8, xtmp0, xhs1		/* hgfe */

	/* Update stored hash context. */
	movdqu	xhs0, (0*16)(ctx)
	movdqu	xhs1, (1*16)(ctx)

	/* Restore callee save registers. */
	popq	%rbp
	popq	%rbx

.Lshani_done:
	ret
.size	sha256_block_shani,.-sha256_block_shani
175*228e7c1eSjsing
.section .rodata

/*
 * Shuffle mask - little endian to big endian word conversion.
 *
 * Used as the pshufb control: within each 32 bit word the four bytes are
 * reversed, while the order of the words themselves is preserved. The mask
 * is loaded with movdqa and hence must be 16 byte aligned.
 */
.align	16
.type	shufmask,@object
shufmask:
.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
.size	shufmask,.-shufmask
186*228e7c1eSjsing
/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 *
 * Sixty four 32 bit constants, stored four per row. Each row is one 16 byte
 * group that the round code adds to four message words with a single paddd
 * from a memory operand, which requires the rows to be 16 byte aligned.
 */
.align	64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5	/* K[0..3] */
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5	/* K[4..7] */
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3	/* K[8..11] */
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174	/* K[12..15] */
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc	/* K[16..19] */
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da	/* K[20..23] */
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7	/* K[24..27] */
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967	/* K[28..31] */
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13	/* K[32..35] */
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85	/* K[36..39] */
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3	/* K[40..43] */
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070	/* K[44..47] */
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5	/* K[48..51] */
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3	/* K[52..55] */
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208	/* K[56..59] */
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2	/* K[60..63] */
.size	K256,.-K256
209*228e7c1eSjsing.size	K256,.-K256
210