/* $OpenBSD: sha1_amd64_generic.S,v 1.2 2025/01/18 02:56:07 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor. Target: x86-64.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/* Function arguments. */
#define	ctx	%rdi
#define	in	%rsi
#define	num	%rdx

/* Address one past the last input block; loop termination bound. */
#define	end	%rbp

/* Working copy of the five 32 bit hash state words. */
#define	hs0	%r8d
#define	hs1	%r9d
#define	hs2	%r10d
#define	hs3	%r11d
#define	hs4	%r12d

/*
 * Scratch registers. tmp3 is the low half of %rdx and therefore aliases
 * num; num is only read while computing end, before the first round runs.
 */
#define	tmp0	%eax
#define	tmp1	%ebx
#define	tmp2	%ecx
#define	tmp3	%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The word at byte offset (idx & 0xf) * 4 of m is byte swapped into wt
 * and written to the same offset of w.
 */
#define sha1_message_schedule_load(idx, m, w, wt) \
	movl	((idx&0xf)*4)(m), wt;					\
	bswapl	wt;							\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
 *
 * The schedule w is a 16 word circular buffer indexed modulo 16, so the
 * slot for idx holds the word from 16 rounds earlier until it is
 * overwritten here.
 */
#define sha1_message_schedule_update(idx, w, wt) \
	movl	(((idx-3)&0xf)*4)(w), wt;	/* W13 */		\
	xorl	(((idx-8)&0xf)*4)(w), wt;	/* W8 */		\
	xorl	(((idx-14)&0xf)*4)(w), wt;	/* W2 */		\
	xorl	(((idx)&0xf)*4)(w), wt;		/* W0 */		\
	roll	$1, wt;							\
									\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Compute a SHA-1 round without logic function:
 *
 *  T = rol(a, 5) + e + Kt + Wt
 *
 * The caller is required to compute the appropriate logic function
 * (Ch, Maj, Parity) and add it to e.
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1. The leal folds e + Kt + Wt into a single instruction
 * and does not modify flags.
 */
#define sha1_round(a, b, c, d, e, kt, wt) \
	leal	kt(wt, e, 1), e;		/* e += Kt + Wt */	\
									\
	movl	a, tmp1;			/* rol(a, 5) */		\
	roll	$5, tmp1;						\
	addl	tmp1, e;						\
									\
	roll	$30, b;				/* rol(b, 30) */

/*
 * Compute a SHA-1 round with Ch:
 *
 *  T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
 *
 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1 and tmp2.
 */
#define sha1_round_ch(a, b, c, d, e, kt, wt) \
	movl	c, tmp2;			/* Ch */		\
	xorl	d, tmp2;			/* Ch */		\
	andl	b, tmp2;			/* Ch */		\
	xorl	d, tmp2;			/* Ch */		\
	addl	tmp2, e;			/* Ch */		\
									\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Compute a SHA-1 round with Parity:
 *
 *  T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
 *
 * Parity(x, y, z) = x ^ y ^ z
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1 and tmp2.
 */
#define sha1_round_parity(a, b, c, d, e, kt, wt) \
	movl	b, tmp2;			/* Parity */		\
	xorl	c, tmp2;			/* Parity */		\
	xorl	d, tmp2;			/* Parity */		\
	addl	tmp2, e;			/* Parity */		\
									\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Compute a SHA-1 round with Maj:
 *
 *  T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
 *
 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion b = rol(b, 30), e = T, pending rotation.
 *
 * Clobbers tmp1, tmp2 and tmp3.
 */
#define sha1_round_maj(a, b, c, d, e, kt, wt) \
	movl	c, tmp2;			/* Maj */		\
	xorl	d, tmp2;			/* Maj */		\
	andl	b, tmp2;			/* Maj */		\
	movl	c, tmp3;			/* Maj */		\
	andl	d, tmp3;			/* Maj */		\
	xorl	tmp2, tmp3;			/* Maj */		\
	addl	tmp3, e;			/* Maj */		\
									\
	sha1_round(a, b, c, d, e, kt, wt);

/*
 * Per-stage round wrappers. Each one obtains Wt in tmp0 (loaded from the
 * input for rounds 0 through 15, derived from the schedule on the stack
 * afterwards) and runs the round with the constant for that stage.
 */
#define sha1_round1_load(idx, a, b, c, d, e) \
	sha1_message_schedule_load(idx, in, %rsp, tmp0)			\
	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)

#define sha1_round1_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0)			\
	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)

#define sha1_round2_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0)			\
	sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)

#define sha1_round3_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0)			\
	sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)

#define sha1_round4_update(idx, a, b, c, d, e) \
	sha1_message_schedule_update(idx, %rsp, tmp0)			\
	sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)

.text

/*
 * void sha1_block_generic(SHA1_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes 64 byte blocks starting at in and updates the five 32 bit
 * words at offsets 0 through 16 of ctx after each block.
 *
 * Register use:
 *   rdi            ctx
 *   rsi            in, advanced by 64 after each block
 *   rdx            num on entry (scaled to bytes), then tmp3
 *   rbp            end = in + num * 64
 *   r8d - r12d     working state hs0 - hs4
 *   eax, ebx, ecx  tmp0 (Wt), tmp1, tmp2
 *
 * Stack frame (rsp is aligned down to 64 bytes):
 *   0(%rsp)  - 63(%rsp)   16 word message schedule
 *   64(%rsp)              stack pointer value prior to alignment
 *
 * rbx, rbp and r12 are callee save registers and are saved/restored here.
 *
 * The end of input test sits at the bottom of the loop, so one block is
 * always processed; callers are expected to pass num >= 1.
 */
.align 16
.globl	sha1_block_generic
.type	sha1_block_generic,@function
sha1_block_generic:
	_CET_ENDBR

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12

	/*
	 * Allocate space for message schedule. The unaligned stack pointer
	 * is kept in the slot directly above the schedule so that it can be
	 * restored on exit.
	 */
	movq	%rsp, %rax
	subq	$(64+1*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+0*8)(%rsp)

	/* Compute end of message (in + num * 64); frees rdx for tmp3. */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4

	jmp	.Lblock_loop

.align 16
.Lblock_loop:

	/*
	 * Rather than moving values between registers after each round,
	 * the register assigned to each of a, b, c, d, e rotates by one
	 * position per round; the assignment repeats every five rounds.
	 */

	/* Round 0 through 15. */
	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)

	/* Round 16 through 31. */
	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);

	/* Round 32 through 47. */
	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);

	/* Round 48 through 63. */
	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);

	/* Round 64 through 79. */
	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)

	/* Advance to the next block; unsigned compare against end. */
	addq	$64, in
	cmpq	end, in
	jb	.Lblock_loop

	/* Release the message schedule by restoring the saved rsp. */
	movq	(64+0*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret