/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/*
 * SHA-256 implementation using the Intel SHA extensions:
 *
 * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
 *
 * Syntax: AT&T (source, destination), run through the C preprocessor.
 * ABI:    System V AMD64.
 * ISA:    SHA (sha256rnds2/sha256msg1/sha256msg2), SSSE3 (pshufb, palignr)
 *         and SSE4.1 (pblendw).
 */

/* Function arguments. */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/* One past the last input byte (callee-saved register, saved on entry). */
#define	end		%rbx

/* Base address of the K256 table (callee-saved register, saved on entry). */
#define	k256		%rbp

/*
 * Message words plus round constants for the current four rounds.
 * This has to be %xmm0, which sha256rnds2 uses as its implicit operand.
 */
#define	xmsg		%xmm0

/* Working hash state - these swap between abef and cdgh every two rounds. */
#define	xhs0		%xmm1
#define	xhs1		%xmm2

/* Copy of the hash state at the start of the block, for accumulation. */
#define	xabef		%xmm3
#define	xcdgh		%xmm4

/* Sliding window of the sixteen most recent message schedule words. */
#define	xmsgtmp0	%xmm6
#define	xmsgtmp1	%xmm7
#define	xmsgtmp2	%xmm8
#define	xmsgtmp3	%xmm9
/* Scratch register used while updating the message schedule. */
#define	xmsgtmp4	%xmm10

/* Byte shuffle mask used to convert input words to host byte order. */
#define	xshufmask	%xmm11

/* Scratch register used while rearranging hash state words. */
#define	xtmp0		%xmm12

/*
 * Load four message words (16 bytes at offset idx*16 from m), byte swap
 * each word and leave the result in both xmsg and xmsgtmp. The load is
 * unaligned, so no alignment is required of the input buffer.
 */
#define sha256_message_schedule_load(idx, m, xmsgtmp) \
	movdqu	(idx*16)(m), xmsg; \
	pshufb	xshufmask, xmsg; \
	movdqa	xmsg, xmsgtmp;

/*
 * Compute the next four message schedule words into xmt0, given the
 * previous sixteen words in xmt0 (oldest) through xmt3 (newest):
 *
 *   sha256msg1 - W[t-16] + sigma0(W[t-15])
 *   palignr    - extract W[t-7] for each lane from xmt2/xmt3
 *   paddd      - add W[t-7]
 *   sha256msg2 - add sigma1(W[t-2]) and complete the four words
 *
 * Clobbers xmsgtmp4.
 */
#define sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \
	sha256msg1 xmt1, xmt0; \
	movdqa	xmt3, xmsgtmp4; \
	palignr	$4, xmt2, xmsgtmp4; \
	paddd	xmsgtmp4, xmt0; \
	sha256msg2 xmt3, xmt0;

/*
 * Perform four rounds using the four message words in xmsg and the four
 * round constants at offset idx*16 in K256. sha256rnds2 consumes the low
 * two words of xmsg, so the high two words are moved down (pshufd $0x0e)
 * between the two invocations. On entry and on exit xhs0 holds abef and
 * xhs1 holds cdgh. Clobbers xmsg.
 */
#define sha256_shani_round(idx) \
	paddd	(idx*16)(k256), xmsg; \
	sha256rnds2 xmsg, xhs0, xhs1; \
	pshufd	$0x0e, xmsg, xmsg; \
	sha256rnds2 xmsg, xhs1, xhs0;

/* Four rounds with message words loaded directly from the input block. */
#define sha256_shani_round_load(idx, m, xmsgtmp) \
	sha256_message_schedule_load(idx, m, xmsgtmp); \
	sha256_shani_round(idx);

/* Four rounds with message words derived from the message schedule. */
#define sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \
	sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3); \
	movdqa	xmt0, xmsg; \
	sha256_shani_round(idx);

.text

/*
 * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num 64 byte blocks starting at in, updating the eight 32 bit
 * hash state words stored in the first 32 bytes of ctx. Returns without
 * touching ctx or in when num is zero.
 *
 * Clobbers: rsi, rdx, xmm0-xmm4, xmm6-xmm12, flags.
 * Preserves: rbx and rbp (saved and restored on the stack).
 */
.align	16
.globl	sha256_block_shani
.type	sha256_block_shani,@function
sha256_block_shani:
	_CET_ENDBR

	/*
	 * The block loop below always runs at least once, so bail out early
	 * if there is nothing to process (no registers have been saved yet).
	 */
	testq	num, num
	jz	.Lshani_done

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp

	/* Compute end of message (in + num * 64). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load endian shuffle mask. */
	movdqa	shufmask(%rip), xshufmask

	/* Load current hash state from context. */
	movdqu	(0*16)(ctx), xhs0		/* dcba */
	movdqu	(1*16)(ctx), xhs1		/* hgfe */

	/* Rearrange words to construct abef/cdgh. */
	pshufd	$0xb1, xhs0, xhs0		/* cdab */
	pshufd	$0x1b, xhs1, xhs1		/* efgh */
	movdqa	xhs0, xtmp0
	palignr	$8, xhs1, xhs0			/* abef */
	pblendw	$0xf0, xtmp0, xhs1		/* cdgh */

	/* Jump over any padding emitted to align the loop entry. */
	jmp	.Lshani_block_loop

.align	16
.Lshani_block_loop:
	/* Save state for accumulation. */
	movdqa	xhs0, xabef
	movdqa	xhs1, xcdgh

	/* Rounds 0 through 15 (four rounds at a time). */
	sha256_shani_round_load(0, in, xmsgtmp0)
	sha256_shani_round_load(1, in, xmsgtmp1)
	sha256_shani_round_load(2, in, xmsgtmp2)
	sha256_shani_round_load(3, in, xmsgtmp3)

	/* Rounds 16 through 63 (four rounds at a time). */
	sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	/* Accumulate hash state. */
	paddd	xabef, xhs0
	paddd	xcdgh, xhs1

	/* Advance to the next block; unsigned compare since these are pointers. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Rearrange words to construct dcba/hgfe. */
	pshufd	$0x1b, xhs0, xhs0		/* feba */
	pshufd	$0xb1, xhs1, xhs1		/* dchg */
	movdqa	xhs0, xtmp0
	pblendw	$0xf0, xhs1, xhs0		/* dcba */
	palignr	$8, xtmp0, xhs1			/* hgfe */

	/* Update stored hash context. */
	movdqu	xhs0, (0*16)(ctx)
	movdqu	xhs1, (1*16)(ctx)

	/* Restore callee save registers. */
	popq	%rbp
	popq	%rbx

.Lshani_done:
	ret
.size	sha256_block_shani,.-sha256_block_shani

.section .rodata

/*
 * Shuffle mask - little endian to big endian word conversion.
 */
.align	16
.type	shufmask,@object
shufmask:
.octa	0x0c0d0e0f08090a0b0405060700010203
.size	shufmask,.-shufmask

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 */
.align	64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256