1 /* sha1.c - Functions to compute SHA1 message digest of files or 2 memory blocks according to the NIST specification FIPS-180-1. 3 4 Copyright (C) 2000-2024 Free Software Foundation, Inc. 5 6 This program is free software; you can redistribute it and/or modify it 7 under the terms of the GNU General Public License as published by the 8 Free Software Foundation; either version 2, or (at your option) any 9 later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 /* Written by Scott G. Miller 21 Credits: 22 Robert Klep <robert@ilse.nl> -- Expansion function fix 23 */ 24 25 #include <config.h> 26 27 #include "sha1.h" 28 29 #include <stddef.h> 30 #include <string.h> 31 32 #ifdef HAVE_X86_SHA1_HW_SUPPORT 33 # include <x86intrin.h> 34 # include <cpuid.h> 35 #endif 36 37 #if USE_UNLOCKED_IO 38 # include "unlocked-io.h" 39 #endif 40 41 #ifdef WORDS_BIGENDIAN 42 # define SWAP(n) (n) 43 #else 44 # define SWAP(n) \ 45 (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24)) 46 #endif 47 48 #define BLOCKSIZE 4096 49 #if BLOCKSIZE % 64 != 0 50 # error "invalid BLOCKSIZE" 51 #endif 52 53 /* This array contains the bytes used to pad the buffer to the next 54 64-byte boundary. (RFC 1321, 3.1: Step 1) */ 55 static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ }; 56 57 58 /* Take a pointer to a 160 bit block of data (five 32 bit ints) and 59 initialize it to the start constants of the SHA1 algorithm. This 60 must be called before using hash in the call to sha1_hash. 
*/ 61 void 62 sha1_init_ctx (struct sha1_ctx *ctx) 63 { 64 ctx->A = 0x67452301; 65 ctx->B = 0xefcdab89; 66 ctx->C = 0x98badcfe; 67 ctx->D = 0x10325476; 68 ctx->E = 0xc3d2e1f0; 69 70 ctx->total[0] = ctx->total[1] = 0; 71 ctx->buflen = 0; 72 } 73 74 /* Put result from CTX in first 20 bytes following RESBUF. The result 75 must be in little endian byte order. 76 77 IMPORTANT: On some systems it is required that RESBUF is correctly 78 aligned for a 32-bit value. */ 79 void * 80 sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf) 81 { 82 ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A); 83 ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B); 84 ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C); 85 ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D); 86 ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E); 87 88 return resbuf; 89 } 90 91 /* Process the remaining bytes in the internal buffer and the usual 92 prolog according to the standard and write the result to RESBUF. 93 94 IMPORTANT: On some systems it is required that RESBUF is correctly 95 aligned for a 32-bit value. */ 96 void * 97 sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf) 98 { 99 /* Take yet unprocessed bytes into account. */ 100 sha1_uint32 bytes = ctx->buflen; 101 size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4; 102 103 /* Now count remaining bytes. */ 104 ctx->total[0] += bytes; 105 if (ctx->total[0] < bytes) 106 ++ctx->total[1]; 107 108 /* Put the 64-bit file length in *bits* at the end of the buffer. */ 109 ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29)); 110 ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3); 111 112 memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes); 113 114 /* Process last bytes. */ 115 sha1_process_block (ctx->buffer, size * 4, ctx); 116 117 return sha1_read_ctx (ctx, resbuf); 118 } 119 120 /* Compute SHA1 message digest for bytes read from STREAM. The 121 resulting message digest number will be written into the 16 bytes 122 beginning at RESBLOCK. 
*/ 123 int 124 sha1_stream (FILE *stream, void *resblock) 125 { 126 struct sha1_ctx ctx; 127 char buffer[BLOCKSIZE + 72]; 128 size_t sum; 129 130 /* Initialize the computation context. */ 131 sha1_init_ctx (&ctx); 132 133 /* Iterate over full file contents. */ 134 while (1) 135 { 136 /* We read the file in blocks of BLOCKSIZE bytes. One call of the 137 computation function processes the whole buffer so that with the 138 next round of the loop another block can be read. */ 139 size_t n; 140 sum = 0; 141 142 /* Read block. Take care for partial reads. */ 143 while (1) 144 { 145 n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); 146 147 sum += n; 148 149 if (sum == BLOCKSIZE) 150 break; 151 152 if (n == 0) 153 { 154 /* Check for the error flag IFF N == 0, so that we don't 155 exit the loop after a partial read due to e.g., EAGAIN 156 or EWOULDBLOCK. */ 157 if (ferror (stream)) 158 return 1; 159 goto process_partial_block; 160 } 161 162 /* We've read at least one byte, so ignore errors. But always 163 check for EOF, since feof may be true even though N > 0. 164 Otherwise, we could end up calling fread after EOF. */ 165 if (feof (stream)) 166 goto process_partial_block; 167 } 168 169 /* Process buffer with BLOCKSIZE bytes. Note that 170 BLOCKSIZE % 64 == 0 171 */ 172 sha1_process_block (buffer, BLOCKSIZE, &ctx); 173 } 174 175 process_partial_block:; 176 177 /* Process any remaining bytes. */ 178 if (sum > 0) 179 sha1_process_bytes (buffer, sum, &ctx); 180 181 /* Construct result in desired memory. */ 182 sha1_finish_ctx (&ctx, resblock); 183 return 0; 184 } 185 186 /* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The 187 result is always in little endian byte order, so that a byte-wise 188 output yields to the wanted ASCII representation of the message 189 digest. */ 190 void * 191 sha1_buffer (const char *buffer, size_t len, void *resblock) 192 { 193 struct sha1_ctx ctx; 194 195 /* Initialize the computation context. 
*/ 196 sha1_init_ctx (&ctx); 197 198 /* Process whole buffer but last len % 64 bytes. */ 199 sha1_process_bytes (buffer, len, &ctx); 200 201 /* Put result in desired memory area. */ 202 return sha1_finish_ctx (&ctx, resblock); 203 } 204 205 void 206 sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) 207 { 208 /* When we already have some bits in our internal buffer concatenate 209 both inputs first. */ 210 if (ctx->buflen != 0) 211 { 212 size_t left_over = ctx->buflen; 213 size_t add = 128 - left_over > len ? len : 128 - left_over; 214 215 memcpy (&((char *) ctx->buffer)[left_over], buffer, add); 216 ctx->buflen += add; 217 218 if (ctx->buflen > 64) 219 { 220 sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx); 221 222 ctx->buflen &= 63; 223 /* The regions in the following copy operation cannot overlap. */ 224 memcpy (ctx->buffer, 225 &((char *) ctx->buffer)[(left_over + add) & ~63], 226 ctx->buflen); 227 } 228 229 buffer = (const char *) buffer + add; 230 len -= add; 231 } 232 233 /* Process available complete blocks. */ 234 if (len >= 64) 235 { 236 #if !_STRING_ARCH_unaligned 237 # define alignof(type) offsetof (struct { char c; type x; }, x) 238 # define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0) 239 if (UNALIGNED_P (buffer)) 240 while (len > 64) 241 { 242 sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); 243 buffer = (const char *) buffer + 64; 244 len -= 64; 245 } 246 else 247 #endif 248 { 249 sha1_process_block (buffer, len & ~63, ctx); 250 buffer = (const char *) buffer + (len & ~63); 251 len &= 63; 252 } 253 } 254 255 /* Move remaining bytes in internal buffer. 
*/ 256 if (len > 0) 257 { 258 size_t left_over = ctx->buflen; 259 260 memcpy (&((char *) ctx->buffer)[left_over], buffer, len); 261 left_over += len; 262 if (left_over >= 64) 263 { 264 sha1_process_block (ctx->buffer, 64, ctx); 265 left_over -= 64; 266 memmove (ctx->buffer, &ctx->buffer[16], left_over); 267 } 268 ctx->buflen = left_over; 269 } 270 } 271 272 /* --- Code below is the primary difference between md5.c and sha1.c --- */ 273 274 /* SHA1 round constants */ 275 #define K1 0x5a827999 276 #define K2 0x6ed9eba1 277 #define K3 0x8f1bbcdc 278 #define K4 0xca62c1d6 279 280 /* Round functions. Note that F2 is the same as F4. */ 281 #define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) ) 282 #define F2(B,C,D) (B ^ C ^ D) 283 #define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) ) 284 #define F4(B,C,D) (B ^ C ^ D) 285 286 /* Process LEN bytes of BUFFER, accumulating context into CTX. 287 It is assumed that LEN % 64 == 0. 288 Most of this code comes from GnuPG's cipher/sha1.c. */ 289 290 void 291 sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx) 292 { 293 const sha1_uint32 *words = (const sha1_uint32*) buffer; 294 size_t nwords = len / sizeof (sha1_uint32); 295 const sha1_uint32 *endp = words + nwords; 296 sha1_uint32 x[16]; 297 sha1_uint32 a = ctx->A; 298 sha1_uint32 b = ctx->B; 299 sha1_uint32 c = ctx->C; 300 sha1_uint32 d = ctx->D; 301 sha1_uint32 e = ctx->E; 302 303 /* First increment the byte count. RFC 1321 specifies the possible 304 length of the file up to 2^64 bits. Here we only compute the 305 number of bytes. Do a double word increment. 
*/ 306 ctx->total[0] += len; 307 ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len); 308 309 #define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n)))) 310 311 #define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \ 312 ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \ 313 , (x[I&0x0f] = rol(tm, 1)) ) 314 315 #define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ 316 + F( B, C, D ) \ 317 + K \ 318 + M; \ 319 B = rol( B, 30 ); \ 320 } while(0) 321 322 while (words < endp) 323 { 324 sha1_uint32 tm; 325 int t; 326 for (t = 0; t < 16; t++) 327 { 328 x[t] = SWAP (*words); 329 words++; 330 } 331 332 R( a, b, c, d, e, F1, K1, x[ 0] ); 333 R( e, a, b, c, d, F1, K1, x[ 1] ); 334 R( d, e, a, b, c, F1, K1, x[ 2] ); 335 R( c, d, e, a, b, F1, K1, x[ 3] ); 336 R( b, c, d, e, a, F1, K1, x[ 4] ); 337 R( a, b, c, d, e, F1, K1, x[ 5] ); 338 R( e, a, b, c, d, F1, K1, x[ 6] ); 339 R( d, e, a, b, c, F1, K1, x[ 7] ); 340 R( c, d, e, a, b, F1, K1, x[ 8] ); 341 R( b, c, d, e, a, F1, K1, x[ 9] ); 342 R( a, b, c, d, e, F1, K1, x[10] ); 343 R( e, a, b, c, d, F1, K1, x[11] ); 344 R( d, e, a, b, c, F1, K1, x[12] ); 345 R( c, d, e, a, b, F1, K1, x[13] ); 346 R( b, c, d, e, a, F1, K1, x[14] ); 347 R( a, b, c, d, e, F1, K1, x[15] ); 348 R( e, a, b, c, d, F1, K1, M(16) ); 349 R( d, e, a, b, c, F1, K1, M(17) ); 350 R( c, d, e, a, b, F1, K1, M(18) ); 351 R( b, c, d, e, a, F1, K1, M(19) ); 352 R( a, b, c, d, e, F2, K2, M(20) ); 353 R( e, a, b, c, d, F2, K2, M(21) ); 354 R( d, e, a, b, c, F2, K2, M(22) ); 355 R( c, d, e, a, b, F2, K2, M(23) ); 356 R( b, c, d, e, a, F2, K2, M(24) ); 357 R( a, b, c, d, e, F2, K2, M(25) ); 358 R( e, a, b, c, d, F2, K2, M(26) ); 359 R( d, e, a, b, c, F2, K2, M(27) ); 360 R( c, d, e, a, b, F2, K2, M(28) ); 361 R( b, c, d, e, a, F2, K2, M(29) ); 362 R( a, b, c, d, e, F2, K2, M(30) ); 363 R( e, a, b, c, d, F2, K2, M(31) ); 364 R( d, e, a, b, c, F2, K2, M(32) ); 365 R( c, d, e, a, b, F2, K2, M(33) ); 366 R( b, c, d, e, a, F2, K2, M(34) ); 367 R( a, b, c, d, e, F2, K2, M(35) ); 368 R( 
e, a, b, c, d, F2, K2, M(36) ); 369 R( d, e, a, b, c, F2, K2, M(37) ); 370 R( c, d, e, a, b, F2, K2, M(38) ); 371 R( b, c, d, e, a, F2, K2, M(39) ); 372 R( a, b, c, d, e, F3, K3, M(40) ); 373 R( e, a, b, c, d, F3, K3, M(41) ); 374 R( d, e, a, b, c, F3, K3, M(42) ); 375 R( c, d, e, a, b, F3, K3, M(43) ); 376 R( b, c, d, e, a, F3, K3, M(44) ); 377 R( a, b, c, d, e, F3, K3, M(45) ); 378 R( e, a, b, c, d, F3, K3, M(46) ); 379 R( d, e, a, b, c, F3, K3, M(47) ); 380 R( c, d, e, a, b, F3, K3, M(48) ); 381 R( b, c, d, e, a, F3, K3, M(49) ); 382 R( a, b, c, d, e, F3, K3, M(50) ); 383 R( e, a, b, c, d, F3, K3, M(51) ); 384 R( d, e, a, b, c, F3, K3, M(52) ); 385 R( c, d, e, a, b, F3, K3, M(53) ); 386 R( b, c, d, e, a, F3, K3, M(54) ); 387 R( a, b, c, d, e, F3, K3, M(55) ); 388 R( e, a, b, c, d, F3, K3, M(56) ); 389 R( d, e, a, b, c, F3, K3, M(57) ); 390 R( c, d, e, a, b, F3, K3, M(58) ); 391 R( b, c, d, e, a, F3, K3, M(59) ); 392 R( a, b, c, d, e, F4, K4, M(60) ); 393 R( e, a, b, c, d, F4, K4, M(61) ); 394 R( d, e, a, b, c, F4, K4, M(62) ); 395 R( c, d, e, a, b, F4, K4, M(63) ); 396 R( b, c, d, e, a, F4, K4, M(64) ); 397 R( a, b, c, d, e, F4, K4, M(65) ); 398 R( e, a, b, c, d, F4, K4, M(66) ); 399 R( d, e, a, b, c, F4, K4, M(67) ); 400 R( c, d, e, a, b, F4, K4, M(68) ); 401 R( b, c, d, e, a, F4, K4, M(69) ); 402 R( a, b, c, d, e, F4, K4, M(70) ); 403 R( e, a, b, c, d, F4, K4, M(71) ); 404 R( d, e, a, b, c, F4, K4, M(72) ); 405 R( c, d, e, a, b, F4, K4, M(73) ); 406 R( b, c, d, e, a, F4, K4, M(74) ); 407 R( a, b, c, d, e, F4, K4, M(75) ); 408 R( e, a, b, c, d, F4, K4, M(76) ); 409 R( d, e, a, b, c, F4, K4, M(77) ); 410 R( c, d, e, a, b, F4, K4, M(78) ); 411 R( b, c, d, e, a, F4, K4, M(79) ); 412 413 a = ctx->A += a; 414 b = ctx->B += b; 415 c = ctx->C += c; 416 d = ctx->D += d; 417 e = ctx->E += e; 418 } 419 } 420 421 #if defined(HAVE_X86_SHA1_HW_SUPPORT) 422 /* HW specific version of sha1_process_bytes. 
*/ 423 424 static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *); 425 426 static void 427 sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) 428 { 429 /* When we already have some bits in our internal buffer concatenate 430 both inputs first. */ 431 if (ctx->buflen != 0) 432 { 433 size_t left_over = ctx->buflen; 434 size_t add = 128 - left_over > len ? len : 128 - left_over; 435 436 memcpy (&((char *) ctx->buffer)[left_over], buffer, add); 437 ctx->buflen += add; 438 439 if (ctx->buflen > 64) 440 { 441 sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx); 442 443 ctx->buflen &= 63; 444 /* The regions in the following copy operation cannot overlap. */ 445 memcpy (ctx->buffer, 446 &((char *) ctx->buffer)[(left_over + add) & ~63], 447 ctx->buflen); 448 } 449 450 buffer = (const char *) buffer + add; 451 len -= add; 452 } 453 454 /* Process available complete blocks. */ 455 if (len >= 64) 456 { 457 #if !_STRING_ARCH_unaligned 458 # define alignof(type) offsetof (struct { char c; type x; }, x) 459 # define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0) 460 if (UNALIGNED_P (buffer)) 461 while (len > 64) 462 { 463 sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); 464 buffer = (const char *) buffer + 64; 465 len -= 64; 466 } 467 else 468 #endif 469 { 470 sha1_hw_process_block (buffer, len & ~63, ctx); 471 buffer = (const char *) buffer + (len & ~63); 472 len &= 63; 473 } 474 } 475 476 /* Move remaining bytes in internal buffer. */ 477 if (len > 0) 478 { 479 size_t left_over = ctx->buflen; 480 481 memcpy (&((char *) ctx->buffer)[left_over], buffer, len); 482 left_over += len; 483 if (left_over >= 64) 484 { 485 sha1_hw_process_block (ctx->buffer, 64, ctx); 486 left_over -= 64; 487 memmove (ctx->buffer, &ctx->buffer[16], left_over); 488 } 489 ctx->buflen = left_over; 490 } 491 } 492 493 /* Process LEN bytes of BUFFER, accumulating context into CTX. 494 Using CPU specific intrinsics. 
*/

/* Every loop iteration below consumes 64 bytes (four 16-byte loads), so
   LEN has to be a multiple of 64; all callers in this file pass such
   lengths.  The loads and stores are the unaligned variants, so BUFFER
   needs no particular alignment here.  LEN is also added to the byte
   counter ctx->total.

   The preprocessor checks inside this definition repeat the condition of
   the enclosing #if.  */
#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html  */
  const __m128i *words = (const __m128i *) buffer;
  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
  /* Shuffle control that reverses all 16 bytes of a vector: result byte
     I is taken from source byte 15 - I.  */
  const __m128i shuf_mask
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  /* Compile-time assertion (negative array size on failure) that A, B, C
     and D are laid out back to back in struct sha1_ctx, because they are
     loaded and stored below with a single 128-bit access at &ctx->A.  */
  char check[((offsetof (struct sha1_ctx, B)
               == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, C)
                  == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
              && (offsetof (struct sha1_ctx, D)
                  == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
             ? 1 : -1];

  /* First increment the byte count.  RFC 1321 specifies the possible
     length of the file up to 2^64 bits.  Here we only compute the
     number of bytes.  Do a double word increment.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Reference CHECK so that it does not count as unused.  */
  (void) &check[0];
  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  /* E goes into the most significant 32-bit lane.  */
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  /* 0x1b reverses the order of the four 32-bit lanes.  */
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  while (words < endp)
    {
      /* Remember the chaining value; it is added back in at the end of
         the block.  */
      abcd_save = abcd;
      e0_save = e0;

      /* Each group below performs four rounds (the numbers name the
         rounds).  The last argument of _mm_sha1rnds4_epu32 changes every
         20 rounds: 0 for rounds 0..19, 1 for 20..39, 2 for 40..59 and
         3 for 60..79.  E0 and E1 alternate as the E input, and the
         msg1/xor/msg2 steps build the message words for later groups.  */

      /* 0..3 */
      msg0 = _mm_loadu_si128 (words);
      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* 4..7 */
      msg1 = _mm_loadu_si128 (words + 1);
      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* 8..11 */
      msg2 = _mm_loadu_si128 (words + 2);
      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 12..15 */
      msg3 = _mm_loadu_si128 (words + 3);
      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 16..19 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 20..23 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 24..27 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 28..31 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 32..35 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 36..39 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 40..43 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 44..47 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 48..51 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 52..55 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 56..59 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 60..63 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 64..67 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 68..71 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 72..75 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* 76..79 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Finalize.  */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);

      /* Advance by four 16-byte vectors, i.e. one 64-byte block.  */
      words = words + 4;
    }

  /* Undo the lane reversal done on entry and write the state back.  */
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
#endif

/* Return sha1_process_bytes or some hardware optimized version thereof
   depending on current CPU.
*/

/* The portable sha1_process_bytes is the default answer.  When the file
   is built with HAVE_X86_SHA1_HW_SUPPORT, the CPU is queried at run time
   and sha1_hw_process_bytes is returned instead if CPUID leaf 7
   (subleaf 0) reports the SHA bit in EBX and leaf 1 reports the SSE4.1
   bit in ECX.  Leaf 1 is only queried once the SHA test has passed.  */
sha1_process_bytes_fn
sha1_choose_process_bytes (void)
{
  sha1_process_bytes_fn chosen = sha1_process_bytes;
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  unsigned int eax, ebx, ecx, edx;

  if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
      && (ebx & bit_SHA) != 0)
    {
      if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)
          && (ecx & bit_SSE4_1) != 0)
        chosen = sha1_hw_process_bytes;
    }
#endif
  return chosen;
}