/* $OpenBSD: sha256_amd64_generic.S,v 1.3 2024/11/16 12:34:16 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * with the C preprocessor. Target ABI: System V AMD64.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/* Function arguments as they arrive in registers. */
#define	ctx	%rdi
#define	in	%rsi
#define	num	%rdx

/*
 * Round counter (0..63). This shares %rdi with ctx, which is why ctx is
 * spilled to the stack before the first round and reloaded afterwards.
 * round32 is the 32 bit view of the same register.
 */
#define	round	%rdi
#define	round32	%edi

/* The eight 32 bit working variables, a through h. */
#define	hs0	%r8d
#define	hs1	%r9d
#define	hs2	%r10d
#define	hs3	%r11d
#define	hs4	%r12d
#define	hs5	%r13d
#define	hs6	%r14d
#define	hs7	%r15d

/* Base address of the K256 round constant table. */
#define	k256	%rbp

/*
 * Scratch registers. tmp3 is the low half of num (%rdx), so num is dead
 * once the end of message pointer has been computed.
 */
#define	tmp0	%eax
#define	tmp1	%ebx
#define	tmp2	%ecx
#define	tmp3	%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The word is read from m + round * 4 and byte swapped, since the message
 * is big endian. The schedule w is a 16 entry ring indexed by idx mod 16.
 */
#define sha256_message_schedule_load(idx, m, w, wt) \
	movl	(m, round, 4), wt;				\
	bswapl	wt;						\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
 *
 *  sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
 *  sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * Each pair of rotations is computed as ror(ror(x, hi - lo) ^ x, lo), which
 * needs a single temporary. Clobbers tmp1, tmp2, tmp3 and the flags.
 */
#define sha256_message_schedule_update(idx, w, wt) \
	movl	(((idx-2)&0xf)*4)(w), wt;	/* sigma1 */	\
	movl	wt, tmp1;			/* sigma1 */	\
	rorl	$(19-17), tmp1;			/* sigma1 */	\
	xorl	wt, tmp1;			/* sigma1 */	\
	rorl	$17, tmp1;			/* sigma1 */	\
	shrl	$10, wt;			/* sigma1 */	\
	xorl	tmp1, wt;			/* sigma1 */	\
								\
	addl	(((idx-7)&0xf)*4)(w), wt;	/* Wt-7 */	\
	addl	(((idx-16)&0xf)*4)(w), wt;	/* Wt-16 */	\
								\
	movl	(((idx-15)&0xf)*4)(w), tmp2;	/* sigma0 */	\
	movl	tmp2, tmp3;			/* sigma0 */	\
	rorl	$(18-7), tmp2;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	rorl	$7, tmp2;			/* sigma0 */	\
	shrl	$3, tmp3;			/* sigma0 */	\
	xorl	tmp3, tmp2;			/* sigma0 */	\
	addl	tmp2, wt;			/* sigma0 */	\
								\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Compute a SHA-256 round:
 *
 *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
 *  T2 = Sigma0(a) + Maj(a, b, c)
 *
 *  Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
 *  Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
 *
 * Kt is read from k + round * 4 and round is incremented at the end, so
 * the macro must be invoked once per round in order. Clobbers tmp1, tmp2,
 * tmp3 and the flags.
 */
#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
	addl	wt, h;				/* T1 Wt */	\
	addl	(k, round, 4), h;		/* T1 Kt */	\
								\
	movl	e, tmp1;			/* T1 Sigma1 */	\
	rorl	$(25-11), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */	\
	rorl	$(11-6), tmp1;			/* T1 Sigma1 */	\
	xorl	e, tmp1;			/* T1 Sigma1 */	\
	rorl	$6, tmp1;			/* T1 Sigma1 */	\
	addl	tmp1, h;			/* T1 Sigma1 */	\
								\
	movl	f, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	andl	e, tmp2;			/* T1 Ch */	\
	xorl	g, tmp2;			/* T1 Ch */	\
	addl	tmp2, h;			/* T1 Ch */	\
								\
	addl	h, d;				/* d += T1 */	\
								\
	movl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(22-13), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$(13-2), tmp1;			/* T2 Sigma0 */	\
	xorl	a, tmp1;			/* T2 Sigma0 */	\
	rorl	$2, tmp1;			/* T2 Sigma0 */	\
	addl	tmp1, h;			/* T2 Sigma0 */	\
								\
	movl	b, tmp2;			/* T2 Maj */	\
	xorl	c, tmp2;			/* T2 Maj */	\
	andl	a, tmp2;			/* T2 Maj */	\
	movl	b, tmp3;			/* T2 Maj */	\
	andl	c, tmp3;			/* T2 Maj */	\
	xorl	tmp2, tmp3;			/* T2 Maj */	\
	addl	tmp3, h;			/* T2 Maj */	\
								\
	addq	$1, round;

/* Rounds 0 through 15: Wt comes straight from the input block. */
#define sha256_round_load(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_load(idx, in, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

/* Rounds 16 through 63: Wt is derived from the message schedule. */
#define sha256_round_update(idx, a, b, c, d, e, f, g, h) \
	sha256_message_schedule_update(idx, %rsp, tmp0) \
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

.text

/*
 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num 64 byte blocks starting at in, updating the eight 32 bit
 * hash state words stored at the start of ctx. A num of zero returns
 * without touching ctx or reading from in.
 *
 * Stack frame, relative to the 64 byte aligned %rsp:
 *
 *   (0)(%rsp) .. (63)(%rsp)	message schedule, 16 x 32 bit words
 *   (64+0*8)(%rsp)		end of message pointer (in + num * 64)
 *   (64+1*8)(%rsp)		saved ctx pointer
 *   (64+2*8)(%rsp)		stack pointer prior to alignment
 *
 * Callee saved registers rbx, rbp and r12-r15 are pushed on entry and
 * restored before return.
 */
.align 16
.globl	sha256_block_generic
.type	sha256_block_generic,@function
sha256_block_generic:
	_CET_ENDBR

	/*
	 * The block loop below tests its condition at the bottom, so a zero
	 * block count has to be rejected here, before anything is modified.
	 */
	testq	num, num
	jz	.Lreturn

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Allocate space for message schedule, context pointer and end of message. */
	movq	%rsp, %rax
	subq	$(64+3*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+2*8)(%rsp)
	movq	ctx, (64+1*8)(%rsp)

	/* Compute and store end of message. */
	shlq	$6, num
	leaq	(in, num, 1), %rbx
	movq	%rbx, (64+0*8)(%rsp)

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4
	movl	(5*4)(ctx), hs5
	movl	(6*4)(ctx), hs6
	movl	(7*4)(ctx), hs7

	/* Jump over the alignment padding in front of the loop head. */
	jmp	.Lblock_loop0

.align 16
.Lblock_loop0:
	/*
	 * round = 0. The 32 bit write clears the full 64 bit register and
	 * the flags are not live here. This overwrites ctx, which was saved
	 * to the stack above.
	 */
	xorl	round32, round32

	/* Round 0 through 15. */
	sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	/* Jump over the alignment padding in front of the loop head. */
	jmp	.Lblock_loop16

.align 16
.Lblock_loop16:
	/*
	 * Round 16 through 63.
	 *
	 * The body covers 16 rounds and is executed three times. The idx
	 * arguments are only used modulo 16 to address the message schedule,
	 * while the round constant is selected by the round register, so
	 * the same code serves rounds 16-31, 32-47 and 48-63.
	 */
	sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	cmp	$64, round
	jb	.Lblock_loop16

	/* The round counter is finished with; bring ctx back into %rdi. */
	movq	(64+1*8)(%rsp), ctx

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4
	addl	(5*4)(ctx), hs5
	addl	(6*4)(ctx), hs6
	addl	(7*4)(ctx), hs7

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)
	movl	hs5, (5*4)(ctx)
	movl	hs6, (6*4)(ctx)
	movl	hs7, (7*4)(ctx)

	/* Advance to the next block; loop while in is below end of message. */
	addq	$64, in
	cmpq	(64+0*8)(%rsp), in
	jb	.Lblock_loop0

	/* Release the aligned frame by restoring the entry stack pointer. */
	movq	(64+2*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

.Lreturn:
	ret
.size	sha256_block_generic,.-sha256_block_generic

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 */
.rodata
.align 64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256