/* $OpenBSD: sha256_amd64_generic.S,v 1.3 2024/11/16 12:34:16 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * Target: x86-64, System V AMD64 calling convention.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/*
 * Register roles.
 *
 * Function arguments as they arrive.
 */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/*
 * Round counter (0..63), used as a scaled index into both the message block
 * and the K256 table.  It shares %rdi with ctx, which is why ctx is spilled
 * to the stack before the rounds and reloaded after them.
 */
#define	round		%rdi
#define	round_lo	%edi

/* Working variables a..h; their roles rotate by one on every round. */
#define	hs0		%r8d
#define	hs1		%r9d
#define	hs2		%r10d
#define	hs3		%r11d
#define	hs4		%r12d
#define	hs5		%r13d
#define	hs6		%r14d
#define	hs7		%r15d

/* Base address of the K256 constant table. */
#define	k256		%rbp

/*
 * Scratch registers.  tmp3 shares %rdx with num and tmp1 shares %rbx with
 * the register used to compute the end of message pointer; both of those
 * values are dead (or saved on the stack) before the first round runs.
 */
#define	tmp0		%eax
#define	tmp1		%ebx
#define	tmp2		%ecx
#define	tmp3		%edx

/*
 * Load message into wt, storing a copy in the message schedule:
 *
 *  Wt = Mt
 *
 * The message word is byte swapped after loading, since SHA-256 treats the
 * input as big endian 32 bit words.  The schedule is a 16 entry ring buffer
 * at w, indexed by idx modulo 16.
 */
#define sha256_message_schedule_load(idx, m, w, wt)			\
	movl	(m, round, 4), wt;					\
	bswapl	wt;							\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Update message schedule and return current value in wt:
 *
 *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
 *
 *  sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
 *  sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * The two rotations of each sigma are folded together: rotate by the
 * difference, xor in the unrotated value, then rotate by the smaller amount.
 *
 * Clobbers tmp1, tmp2 and tmp3.
 */
#define sha256_message_schedule_update(idx, w, wt)			\
	movl	(((idx-2)&0xf)*4)(w), wt;	/* sigma1 */		\
	movl	wt, tmp1;			/* sigma1 */		\
	rorl	$(19-17), tmp1;			/* sigma1 */		\
	xorl	wt, tmp1;			/* sigma1 */		\
	rorl	$17, tmp1;			/* sigma1 */		\
	shrl	$10, wt;			/* sigma1 */		\
	xorl	tmp1, wt;			/* sigma1 */		\
									\
	addl	(((idx-7)&0xf)*4)(w), wt;	/* Wt-7 */		\
	addl	(((idx-16)&0xf)*4)(w), wt;	/* Wt-16 */		\
									\
	movl	(((idx-15)&0xf)*4)(w), tmp2;	/* sigma0 */		\
	movl	tmp2, tmp3;			/* sigma0 */		\
	rorl	$(18-7), tmp2;			/* sigma0 */		\
	xorl	tmp3, tmp2;			/* sigma0 */		\
	rorl	$7, tmp2;			/* sigma0 */		\
	shrl	$3, tmp3;			/* sigma0 */		\
	xorl	tmp3, tmp2;			/* sigma0 */		\
	addl	tmp2, wt;			/* sigma0 */		\
									\
	movl	wt, ((idx&0xf)*4)(w);

/*
 * Compute a SHA-256 round:
 *
 *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
 *  T2 = Sigma0(a) + Maj(a, b, c)
 *
 *  Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
 *  Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
 *
 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
 *
 * The rotation itself is never performed on the registers; instead the
 * caller shifts the a..h argument list by one position for each round.
 *
 * k is the base of the round constant table, indexed by the round counter.
 * The idx and w arguments are accepted for symmetry with the message
 * schedule macros and are not referenced here.
 *
 * Clobbers tmp1, tmp2 and tmp3, and increments the round counter.
 */
#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt)		\
	addl	wt, h;				/* T1 Wt */		\
	addl	(k, round, 4), h;		/* T1 Kt */		\
									\
	movl	e, tmp1;			/* T1 Sigma1 */		\
	rorl	$(25-11), tmp1;			/* T1 Sigma1 */		\
	xorl	e, tmp1;			/* T1 Sigma1 */		\
	rorl	$(11-6), tmp1;			/* T1 Sigma1 */		\
	xorl	e, tmp1;			/* T1 Sigma1 */		\
	rorl	$6, tmp1;			/* T1 Sigma1 */		\
	addl	tmp1, h;			/* T1 Sigma1 */		\
									\
	movl	f, tmp2;			/* T1 Ch */		\
	xorl	g, tmp2;			/* T1 Ch */		\
	andl	e, tmp2;			/* T1 Ch */		\
	xorl	g, tmp2;			/* T1 Ch */		\
	addl	tmp2, h;			/* T1 Ch */		\
									\
	addl	h, d;				/* d += T1 */		\
									\
	movl	a, tmp1;			/* T2 Sigma0 */		\
	rorl	$(22-13), tmp1;			/* T2 Sigma0 */		\
	xorl	a, tmp1;			/* T2 Sigma0 */		\
	rorl	$(13-2), tmp1;			/* T2 Sigma0 */		\
	xorl	a, tmp1;			/* T2 Sigma0 */		\
	rorl	$2, tmp1;			/* T2 Sigma0 */		\
	addl	tmp1, h;			/* T2 Sigma0 */		\
									\
	movl	b, tmp2;			/* T2 Maj */		\
	xorl	c, tmp2;			/* T2 Maj */		\
	andl	a, tmp2;			/* T2 Maj */		\
	movl	b, tmp3;			/* T2 Maj */		\
	andl	c, tmp3;			/* T2 Maj */		\
	xorl	tmp2, tmp3;			/* T2 Maj */		\
	addl	tmp3, h;			/* T2 Maj */		\
									\
	addq	$1, round;

/* Rounds 0 through 15: Wt comes straight from the (byte swapped) message. */
#define sha256_round_load(idx, a, b, c, d, e, f, g, h)			\
	sha256_message_schedule_load(idx, in, %rsp, tmp0)		\
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

/* Rounds 16 through 63: Wt is derived from earlier schedule entries. */
#define sha256_round_update(idx, a, b, c, d, e, f, g, h)		\
	sha256_message_schedule_update(idx, %rsp, tmp0)			\
	sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)

.text

/*
 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Runs the SHA-256 compression function over num consecutive 64 byte blocks
 * starting at in, updating the eight 32 bit hash state words stored at the
 * start of ctx.  Returns immediately, leaving ctx untouched, if num is zero.
 *
 * Clobbers rax, rcx, rdx, rsi, rdi, r8-r11 and flags.  The callee save
 * registers rbx, rbp and r12-r15 are saved on entry and restored on exit.
 *
 * Stack frame, with rsp aligned down to a 64 byte boundary:
 *
 *  (0)(%rsp)       message schedule, 16 x 32 bit words
 *  (64+0*8)(%rsp)  end of message pointer
 *  (64+1*8)(%rsp)  ctx
 *  (64+2*8)(%rsp)  stack pointer prior to allocation and alignment
 */
.align 16
.globl	sha256_block_generic
.type	sha256_block_generic,@function
sha256_block_generic:
	_CET_ENDBR

	/*
	 * Nothing to do for zero blocks - the block loop below tests its
	 * condition at the bottom and would otherwise process one block.
	 */
	testq	num, num
	jz	.Ldone

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Allocate space for message schedule, context pointer and end of message. */
	movq	%rsp, %rax
	subq	$(64+3*8), %rsp
	andq	$~63, %rsp
	movq	%rax, (64+2*8)(%rsp)
	movq	ctx, (64+1*8)(%rsp)

	/* Compute and store end of message (in + num * 64). */
	shlq	$6, num
	leaq	(in, num, 1), %rbx
	movq	%rbx, (64+0*8)(%rsp)

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load current hash state from context. */
	movl	(0*4)(ctx), hs0
	movl	(1*4)(ctx), hs1
	movl	(2*4)(ctx), hs2
	movl	(3*4)(ctx), hs3
	movl	(4*4)(ctx), hs4
	movl	(5*4)(ctx), hs5
	movl	(6*4)(ctx), hs6
	movl	(7*4)(ctx), hs7

	/* Jump over the alignment padding that precedes the loop head. */
	jmp	.Lblock_loop0

.align 16
.Lblock_loop0:
	/* round = 0; the 32 bit xor also clears the upper half of the register. */
	xorl	round_lo, round_lo

	/* Round 0 through 15. */
	sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	jmp	.Lblock_loop16

.align 16
.Lblock_loop16:
	/*
	 * Round 16 through 63.
	 *
	 * Sixteen rounds are unrolled here and the loop body runs three
	 * times.  The schedule indices are taken modulo 16, so the same
	 * expansion serves rounds 16-31, 32-47 and 48-63; the round counter
	 * register selects the matching K256 entry.
	 */
	sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
	sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
	sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
	sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
	sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
	sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
	sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
	sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
	sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)

	cmpq	$64, round
	jb	.Lblock_loop16

	/* The round counter is done with %rdi; bring the context pointer back. */
	movq	(64+1*8)(%rsp), ctx

	/* Add intermediate state to hash state. */
	addl	(0*4)(ctx), hs0
	addl	(1*4)(ctx), hs1
	addl	(2*4)(ctx), hs2
	addl	(3*4)(ctx), hs3
	addl	(4*4)(ctx), hs4
	addl	(5*4)(ctx), hs5
	addl	(6*4)(ctx), hs6
	addl	(7*4)(ctx), hs7

	/* Store new hash state to context. */
	movl	hs0, (0*4)(ctx)
	movl	hs1, (1*4)(ctx)
	movl	hs2, (2*4)(ctx)
	movl	hs3, (3*4)(ctx)
	movl	hs4, (4*4)(ctx)
	movl	hs5, (5*4)(ctx)
	movl	hs6, (6*4)(ctx)
	movl	hs7, (7*4)(ctx)

	/* Advance to the next block; loop while in is below end of message. */
	addq	$64, in
	cmpq	(64+0*8)(%rsp), in
	jb	.Lblock_loop0

	/* Release the aligned frame by restoring the saved stack pointer. */
	movq	(64+2*8)(%rsp), %rsp

	/* Restore callee save registers. */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

.Ldone:
	ret
.size	sha256_block_generic,.-sha256_block_generic

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 */
.section .rodata
.align 64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256