/* $OpenBSD: sha256_amd64_shani.S,v 1.1 2024/11/16 15:31:36 jsing Exp $ */
/*
 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifdef __CET__
#include <cet.h>
#else
#define _CET_ENDBR
#endif

/*
 * SHA-256 implementation using the Intel SHA extensions:
 *
 * https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
 *
 * This file is AT&T syntax and is run through the C preprocessor, so the
 * register aliases below are expanded wherever they appear as a bare token
 * outside of a comment.
 */

/* Function arguments (System V AMD64 ABI). */
#define	ctx		%rdi
#define	in		%rsi
#define	num		%rdx

/* Address one past the last input block (callee-saved, see prologue). */
#define	end		%rbx

/* Base address of the K256 round constant table (callee-saved). */
#define	k256		%rbp

/* Message/constant operand; sha256rnds2 reads %xmm0 implicitly. */
#define	xmsg		%xmm0

/* Working hash state, held as abef/cdgh between groups of four rounds. */
#define	xhs0		%xmm1
#define	xhs1		%xmm2

/* Copy of the state at the start of a block, for the final accumulation. */
#define	xabef		%xmm3
#define	xcdgh		%xmm4

/* Rolling window of message schedule words, plus one scratch register. */
#define	xmsgtmp0	%xmm6
#define	xmsgtmp1	%xmm7
#define	xmsgtmp2	%xmm8
#define	xmsgtmp3	%xmm9
#define	xmsgtmp4	%xmm10

/* Byte shuffle mask used to byte swap each 32-bit word of the input. */
#define	xshufmask	%xmm11

/* Scratch register for the state word rearrangement. */
#define	xtmp0		%xmm12

/*
 * Load the 16 bytes at offset idx*16 from m (unaligned load), byte swap each
 * 32-bit word via xshufmask and leave the result in both xmsg and xmsgtmp.
 */
#define	sha256_message_schedule_load(idx, m, xmsgtmp) \
	movdqu	(idx*16)(m), xmsg;				\
	pshufb	xshufmask, xmsg;				\
	movdqa	xmsg, xmsgtmp;

/*
 * Replace the oldest four schedule words (xmt0) with the next four, using
 * the three following groups xmt1, xmt2 and xmt3. The palignr extracts the
 * four words that straddle xmt2/xmt3 (top word of xmt2, low three of xmt3).
 *
 * Clobbers xmsgtmp4.
 */
#define	sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3) \
	sha256msg1 xmt1, xmt0;					\
	movdqa	xmt3, xmsgtmp4;					\
	palignr	$4, xmt2, xmsgtmp4;				\
	paddd	xmsgtmp4, xmt0;					\
	sha256msg2 xmt3, xmt0;

/*
 * Perform four rounds. On entry xmsg holds four schedule words; the four
 * constants at offset idx*16 in the K256 table are added to them (the memory
 * operand of paddd must be 16 byte aligned, which the alignment of K256
 * provides). Each sha256rnds2 consumes the low two words of xmsg, so the
 * pshufd moves the upper two words down for the second pair of rounds. The
 * two state registers swap source/destination roles between the two
 * sha256rnds2 instructions and are back in their original roles afterwards.
 *
 * Clobbers xmsg.
 */
#define	sha256_shani_round(idx) \
	paddd	(idx*16)(k256), xmsg;				\
	sha256rnds2 xmsg, xhs0, xhs1;				\
	pshufd	$0x0e, xmsg, xmsg;				\
	sha256rnds2 xmsg, xhs1, xhs0;

/* Four rounds using schedule words loaded directly from the input block. */
#define	sha256_shani_round_load(idx, m, xmsgtmp) \
	sha256_message_schedule_load(idx, m, xmsgtmp);		\
	sha256_shani_round(idx);

/* Four rounds using freshly computed schedule words (left in xmt0). */
#define	sha256_shani_round_update(idx, xmt0, xmt1, xmt2, xmt3) \
	sha256_message_schedule_update(xmt0, xmt1, xmt2, xmt3);	\
	movdqa	xmt0, xmsg;					\
	sha256_shani_round(idx);

.text

/*
 * void sha256_block_shani(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
 *
 * Processes num consecutive 64 byte blocks starting at in and updates the
 * eight 32-bit hash state words stored in the first 32 bytes of ctx. If num
 * is zero the function returns without reading the input or touching ctx.
 *
 * Neither ctx nor the input needs to be 16 byte aligned (both are accessed
 * with movdqu).
 *
 * Clobbers: rsi (advanced past the input), rdx (scaled to a byte count),
 * xmm0-xmm4, xmm6-xmm12 and the flags. rbx and rbp are used but are saved
 * and restored.
 */
.align 16
.globl	sha256_block_shani
.type	sha256_block_shani,@function
sha256_block_shani:
	_CET_ENDBR

	/*
	 * Nothing to do for a zero block count. The block loop below only
	 * tests its exit condition after a block has been processed, so
	 * without this check 64 bytes would be read from the input and the
	 * stored state would be modified.
	 */
	testq	num, num
	jz	.Lshani_done

	/* Save callee save registers. */
	pushq	%rbx
	pushq	%rbp

	/* Compute end of message (in + num * 64). */
	shlq	$6, num
	leaq	(in, num, 1), end

	/* Address of SHA-256 constants. */
	leaq	K256(%rip), k256

	/* Load endian shuffle mask. */
	movdqa	shufmask(%rip), xshufmask

	/* Load current hash state from context. */
	movdqu	(0*16)(ctx), xhs0	/* dcba */
	movdqu	(1*16)(ctx), xhs1	/* hgfe */

	/* Rearrange words to construct abef/cdgh. */
	pshufd	$0xb1, xhs0, xhs0	/* cdab */
	pshufd	$0x1b, xhs1, xhs1	/* efgh */
	movdqa	xhs0, xtmp0
	palignr	$8, xhs1, xhs0		/* abef */
	pblendw	$0xf0, xtmp0, xhs1	/* cdgh */

	/* Jump over the alignment padding that precedes the loop entry. */
	jmp	.Lshani_block_loop

.align 16
.Lshani_block_loop:
	/* Save state for accumulation. */
	movdqa	xhs0, xabef
	movdqa	xhs1, xcdgh

	/* Rounds 0 through 15 (four rounds at a time). */
	sha256_shani_round_load(0, in, xmsgtmp0)
	sha256_shani_round_load(1, in, xmsgtmp1)
	sha256_shani_round_load(2, in, xmsgtmp2)
	sha256_shani_round_load(3, in, xmsgtmp3)

	/* Rounds 16 through 63 (four rounds at a time). */
	sha256_shani_round_update(4, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(5, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(6, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(7, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(8, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(9, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(10, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(11, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	sha256_shani_round_update(12, xmsgtmp0, xmsgtmp1, xmsgtmp2, xmsgtmp3)
	sha256_shani_round_update(13, xmsgtmp1, xmsgtmp2, xmsgtmp3, xmsgtmp0)
	sha256_shani_round_update(14, xmsgtmp2, xmsgtmp3, xmsgtmp0, xmsgtmp1)
	sha256_shani_round_update(15, xmsgtmp3, xmsgtmp0, xmsgtmp1, xmsgtmp2)

	/* Accumulate hash state. */
	paddd	xabef, xhs0
	paddd	xcdgh, xhs1

	/* Advance to the next block; unsigned compare against the end. */
	addq	$64, in
	cmpq	end, in
	jb	.Lshani_block_loop

	/* Rearrange words to construct dcba/hgfe. */
	pshufd	$0x1b, xhs0, xhs0	/* feba */
	pshufd	$0xb1, xhs1, xhs1	/* dchg */
	movdqa	xhs0, xtmp0
	pblendw	$0xf0, xhs1, xhs0	/* dcba */
	palignr	$8, xtmp0, xhs1		/* hgfe */

	/* Update stored hash context. */
	movdqu	xhs0, (0*16)(ctx)
	movdqu	xhs1, (1*16)(ctx)

	/* Restore callee save registers. */
	popq	%rbp
	popq	%rbx

.Lshani_done:
	ret
.size	sha256_block_shani,.-sha256_block_shani

.rodata

/*
 * Shuffle mask - little endian to big endian word conversion.
 *
 * Loaded with movdqa, hence the 16 byte alignment.
 */
.align 16
.type	shufmask,@object
shufmask:
.octa	0x0c0d0e0f08090a0b0405060700010203
.size	shufmask,.-shufmask

/*
 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
 *
 * Each row below is one 16 byte group, consumed by a single four round step
 * as a memory operand to paddd.
 */
.align 64
.type	K256,@object
K256:
.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256