xref: /netbsd-src/external/gpl3/binutils/dist/libiberty/sha1.c (revision cb63e24e8d6aae7ddac1859a9015f48b1d8bd90e)
1 /* sha1.c - Functions to compute SHA1 message digest of files or
2    memory blocks according to the NIST specification FIPS-180-1.
3 
4    Copyright (C) 2000-2024 Free Software Foundation, Inc.
5 
6    This program is free software; you can redistribute it and/or modify it
7    under the terms of the GNU General Public License as published by the
8    Free Software Foundation; either version 2, or (at your option) any
9    later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 /* Written by Scott G. Miller
21    Credits:
22       Robert Klep <robert@ilse.nl>  -- Expansion function fix
23 */
24 
25 #include <config.h>
26 
27 #include "sha1.h"
28 
29 #include <stddef.h>
30 #include <string.h>
31 
32 #ifdef HAVE_X86_SHA1_HW_SUPPORT
33 # include <x86intrin.h>
34 # include <cpuid.h>
35 #endif
36 
37 #if USE_UNLOCKED_IO
38 # include "unlocked-io.h"
39 #endif
40 
#ifdef WORDS_BIGENDIAN
/* Big-endian hosts already hold 32-bit words in SHA-1's byte order.  */
# define SWAP(n) (n)
#else
/* Byte-reverse a 32-bit word (little-endian host <-> big-endian wire).  */
# define SWAP(n) \
    (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
#endif
47 
48 #define BLOCKSIZE 4096
49 #if BLOCKSIZE % 64 != 0
50 # error "invalid BLOCKSIZE"
51 #endif
52 
/* This array contains the bytes used to pad the buffer to the next
   64-byte boundary: a single 0x80 marker byte followed by zeros.
   (Same padding scheme as RFC 1321, 3.1: Step 1; here per FIPS-180-1.)  */
static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ...  */ };
56 
57 
58 /* Take a pointer to a 160 bit block of data (five 32 bit ints) and
59    initialize it to the start constants of the SHA1 algorithm.  This
60    must be called before using hash in the call to sha1_hash.  */
61 void
sha1_init_ctx(struct sha1_ctx * ctx)62 sha1_init_ctx (struct sha1_ctx *ctx)
63 {
64   ctx->A = 0x67452301;
65   ctx->B = 0xefcdab89;
66   ctx->C = 0x98badcfe;
67   ctx->D = 0x10325476;
68   ctx->E = 0xc3d2e1f0;
69 
70   ctx->total[0] = ctx->total[1] = 0;
71   ctx->buflen = 0;
72 }
73 
74 /* Put result from CTX in first 20 bytes following RESBUF.  The result
75    must be in little endian byte order.
76 
77    IMPORTANT: On some systems it is required that RESBUF is correctly
78    aligned for a 32-bit value.  */
79 void *
sha1_read_ctx(const struct sha1_ctx * ctx,void * resbuf)80 sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf)
81 {
82   ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A);
83   ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B);
84   ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C);
85   ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D);
86   ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E);
87 
88   return resbuf;
89 }
90 
/* Process the remaining bytes in the internal buffer, append the
   padding and 64-bit bit-count epilog required by the standard, and
   write the 20-byte result to RESBUF.  Returns RESBUF.

   IMPORTANT: On some systems it is required that RESBUF is correctly
   aligned for a 32-bit value.  */
void *
sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf)
{
  /* Take yet unprocessed bytes into account.  */
  sha1_uint32 bytes = ctx->buflen;
  /* SIZE is the padded length in 32-bit words: one 64-byte block if
     the 0x80 marker plus the 8-byte length still fit (bytes < 56),
     otherwise two blocks.  */
  size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4;

  /* Now count remaining bytes.  */
  ctx->total[0] += bytes;
  if (ctx->total[0] < bytes)
    ++ctx->total[1];

  /* Put the 64-bit file length in *bits* at the end of the buffer.
     (Written before the fill below, which deliberately stops short of
     these two words.)  */
  ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29));
  ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3);

  /* Pad with 0x80 followed by zeros up to the length words.  */
  memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes);

  /* Process last bytes.  */
  sha1_process_block (ctx->buffer, size * 4, ctx);

  return sha1_read_ctx (ctx, resbuf);
}
119 
/* Compute SHA1 message digest for bytes read from STREAM.  The
   resulting message digest number will be written into the 20 bytes
   beginning at RESBLOCK.  Returns 0 on success, 1 on read error.  */
int
sha1_stream (FILE *stream, void *resblock)
{
  struct sha1_ctx ctx;
  /* Only BLOCKSIZE bytes are ever read into this buffer; the extra 72
     bytes of slack are historical — TODO confirm they can be dropped.  */
  char buffer[BLOCKSIZE + 72];
  size_t sum;

  /* Initialize the computation context.  */
  sha1_init_ctx (&ctx);

  /* Iterate over full file contents.  */
  while (1)
    {
      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
	 computation function processes the whole buffer so that with the
	 next round of the loop another block can be read.  */
      size_t n;
      sum = 0;

      /* Read block.  Take care for partial reads.  */
      while (1)
	{
	  n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);

	  sum += n;

	  if (sum == BLOCKSIZE)
	    break;

	  if (n == 0)
	    {
	      /* Check for the error flag IFF N == 0, so that we don't
		 exit the loop after a partial read due to e.g., EAGAIN
		 or EWOULDBLOCK.  */
	      if (ferror (stream))
		return 1;
	      goto process_partial_block;
	    }

	  /* We've read at least one byte, so ignore errors.  But always
	     check for EOF, since feof may be true even though N > 0.
	     Otherwise, we could end up calling fread after EOF.  */
	  if (feof (stream))
	    goto process_partial_block;
	}

      /* Process buffer with BLOCKSIZE bytes.  Note that
			BLOCKSIZE % 64 == 0
       */
      sha1_process_block (buffer, BLOCKSIZE, &ctx);
    }

 process_partial_block:;

  /* Process any remaining bytes (SUM < BLOCKSIZE at end of stream).  */
  if (sum > 0)
    sha1_process_bytes (buffer, sum, &ctx);

  /* Construct result in desired memory.  */
  sha1_finish_ctx (&ctx, resblock);
  return 0;
}
185 
186 /* Compute SHA1 message digest for LEN bytes beginning at BUFFER.  The
187    result is always in little endian byte order, so that a byte-wise
188    output yields to the wanted ASCII representation of the message
189    digest.  */
190 void *
sha1_buffer(const char * buffer,size_t len,void * resblock)191 sha1_buffer (const char *buffer, size_t len, void *resblock)
192 {
193   struct sha1_ctx ctx;
194 
195   /* Initialize the computation context.  */
196   sha1_init_ctx (&ctx);
197 
198   /* Process whole buffer but last len % 64 bytes.  */
199   sha1_process_bytes (buffer, len, &ctx);
200 
201   /* Put result in desired memory area.  */
202   return sha1_finish_ctx (&ctx, resblock);
203 }
204 
/* Feed LEN bytes at BUFFER into the hash state CTX.  Whole 64-byte
   blocks are compressed immediately; any tail is kept in CTX's
   internal buffer until more data or sha1_finish_ctx arrives.  */
void
sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  /* When we already have some bits in our internal buffer concatenate
     both inputs first.  The internal buffer holds up to 128 bytes
     (two blocks).  */
  if (ctx->buflen != 0)
    {
      size_t left_over = ctx->buflen;
      size_t add = 128 - left_over > len ? len : 128 - left_over;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
      ctx->buflen += add;

      if (ctx->buflen > 64)
	{
	  /* Compress every complete 64-byte block now buffered.  */
	  sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx);

	  ctx->buflen &= 63;
	  /* The regions in the following copy operation cannot overlap.  */
	  memcpy (ctx->buffer,
		  &((char *) ctx->buffer)[(left_over + add) & ~63],
		  ctx->buflen);
	}

      buffer = (const char *) buffer + add;
      len -= add;
    }

  /* Process available complete blocks.  */
  if (len >= 64)
    {
#if !_STRING_ARCH_unaligned
# define alignof(type) offsetof (struct { char c; type x; }, x)
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
      /* When BUFFER is misaligned for 32-bit loads, bounce each block
	 through the (aligned) context buffer.  Note `len > 64', not
	 `>=': a final exact block falls through to the copy below.  */
      if (UNALIGNED_P (buffer))
	while (len > 64)
	  {
	    sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
	    buffer = (const char *) buffer + 64;
	    len -= 64;
	  }
      else
#endif
	{
	  sha1_process_block (buffer, len & ~63, ctx);
	  buffer = (const char *) buffer + (len & ~63);
	  len &= 63;
	}
    }

  /* Move remaining bytes in internal buffer.  */
  if (len > 0)
    {
      size_t left_over = ctx->buflen;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
      left_over += len;
      if (left_over >= 64)
	{
	  sha1_process_block (ctx->buffer, 64, ctx);
	  left_over -= 64;
	  /* &ctx->buffer[16] is byte offset 64: BUFFER is an array of
	     32-bit words.  */
	  memmove (ctx->buffer, &ctx->buffer[16], left_over);
	}
      ctx->buflen = left_over;
    }
}
271 
/* --- Code below is the primary difference between md5.c and sha1.c --- */

/* SHA1 round constants (FIPS-180-1): one per group of 20 rounds.  */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* Round functions.  Note that F2 is the same as F4.  F1 ("choose")
   and F3 ("majority") are algebraically equivalent to the FIPS-180-1
   forms but need fewer boolean operations.  */
#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) )
#define F2(B,C,D) (B ^ C ^ D)
#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) )
#define F4(B,C,D) (B ^ C ^ D)
285 
/* Process LEN bytes of BUFFER, accumulating context into CTX.
   It is assumed that LEN % 64 == 0.
   Most of this code comes from GnuPG's cipher/sha1.c.  */

void
sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  const sha1_uint32 *words = (const sha1_uint32*) buffer;
  size_t nwords = len / sizeof (sha1_uint32);
  const sha1_uint32 *endp = words + nwords;
  /* X is a sliding 16-word window over the 80-word message schedule.  */
  sha1_uint32 x[16];
  sha1_uint32 a = ctx->A;
  sha1_uint32 b = ctx->B;
  sha1_uint32 c = ctx->C;
  sha1_uint32 d = ctx->D;
  sha1_uint32 e = ctx->E;

  /* First increment the byte count.  The standard specifies the
     possible length of the input up to 2^64 bits.  Here we only
     compute the number of bytes.  Do a double word increment.
     (The double shift keeps `len >> 32' well-defined when size_t is
     only 32 bits wide.)  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Rotate a 32-bit value left by N bits.  */
#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))

  /* Schedule expansion, in place in the 16-word window:
     W[I] = rol (W[I-3] ^ W[I-8] ^ W[I-14] ^ W[I-16], 1).  */
#define M(I) ( tm =   x[I&0x0f] ^ x[(I-14)&0x0f] \
		    ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
	       , (x[I&0x0f] = rol(tm, 1)) )

  /* One SHA-1 round; the caller rotates the register roles instead of
     shuffling values, so only E and B are updated here.  */
#define R(A,B,C,D,E,F,K,M)  do { E += rol( A, 5 )     \
				      + F( B, C, D )  \
				      + K	      \
				      + M;	      \
				 B = rol( B, 30 );    \
			       } while(0)

  while (words < endp)
    {
      sha1_uint32 tm;
      int t;
      /* Load one 64-byte block as 16 big-endian words.  */
      for (t = 0; t < 16; t++)
	{
	  x[t] = SWAP (*words);
	  words++;
	}

      /* 80 rounds, fully unrolled; 20 per round function/constant.  */
      R( a, b, c, d, e, F1, K1, x[ 0] );
      R( e, a, b, c, d, F1, K1, x[ 1] );
      R( d, e, a, b, c, F1, K1, x[ 2] );
      R( c, d, e, a, b, F1, K1, x[ 3] );
      R( b, c, d, e, a, F1, K1, x[ 4] );
      R( a, b, c, d, e, F1, K1, x[ 5] );
      R( e, a, b, c, d, F1, K1, x[ 6] );
      R( d, e, a, b, c, F1, K1, x[ 7] );
      R( c, d, e, a, b, F1, K1, x[ 8] );
      R( b, c, d, e, a, F1, K1, x[ 9] );
      R( a, b, c, d, e, F1, K1, x[10] );
      R( e, a, b, c, d, F1, K1, x[11] );
      R( d, e, a, b, c, F1, K1, x[12] );
      R( c, d, e, a, b, F1, K1, x[13] );
      R( b, c, d, e, a, F1, K1, x[14] );
      R( a, b, c, d, e, F1, K1, x[15] );
      R( e, a, b, c, d, F1, K1, M(16) );
      R( d, e, a, b, c, F1, K1, M(17) );
      R( c, d, e, a, b, F1, K1, M(18) );
      R( b, c, d, e, a, F1, K1, M(19) );
      R( a, b, c, d, e, F2, K2, M(20) );
      R( e, a, b, c, d, F2, K2, M(21) );
      R( d, e, a, b, c, F2, K2, M(22) );
      R( c, d, e, a, b, F2, K2, M(23) );
      R( b, c, d, e, a, F2, K2, M(24) );
      R( a, b, c, d, e, F2, K2, M(25) );
      R( e, a, b, c, d, F2, K2, M(26) );
      R( d, e, a, b, c, F2, K2, M(27) );
      R( c, d, e, a, b, F2, K2, M(28) );
      R( b, c, d, e, a, F2, K2, M(29) );
      R( a, b, c, d, e, F2, K2, M(30) );
      R( e, a, b, c, d, F2, K2, M(31) );
      R( d, e, a, b, c, F2, K2, M(32) );
      R( c, d, e, a, b, F2, K2, M(33) );
      R( b, c, d, e, a, F2, K2, M(34) );
      R( a, b, c, d, e, F2, K2, M(35) );
      R( e, a, b, c, d, F2, K2, M(36) );
      R( d, e, a, b, c, F2, K2, M(37) );
      R( c, d, e, a, b, F2, K2, M(38) );
      R( b, c, d, e, a, F2, K2, M(39) );
      R( a, b, c, d, e, F3, K3, M(40) );
      R( e, a, b, c, d, F3, K3, M(41) );
      R( d, e, a, b, c, F3, K3, M(42) );
      R( c, d, e, a, b, F3, K3, M(43) );
      R( b, c, d, e, a, F3, K3, M(44) );
      R( a, b, c, d, e, F3, K3, M(45) );
      R( e, a, b, c, d, F3, K3, M(46) );
      R( d, e, a, b, c, F3, K3, M(47) );
      R( c, d, e, a, b, F3, K3, M(48) );
      R( b, c, d, e, a, F3, K3, M(49) );
      R( a, b, c, d, e, F3, K3, M(50) );
      R( e, a, b, c, d, F3, K3, M(51) );
      R( d, e, a, b, c, F3, K3, M(52) );
      R( c, d, e, a, b, F3, K3, M(53) );
      R( b, c, d, e, a, F3, K3, M(54) );
      R( a, b, c, d, e, F3, K3, M(55) );
      R( e, a, b, c, d, F3, K3, M(56) );
      R( d, e, a, b, c, F3, K3, M(57) );
      R( c, d, e, a, b, F3, K3, M(58) );
      R( b, c, d, e, a, F3, K3, M(59) );
      R( a, b, c, d, e, F4, K4, M(60) );
      R( e, a, b, c, d, F4, K4, M(61) );
      R( d, e, a, b, c, F4, K4, M(62) );
      R( c, d, e, a, b, F4, K4, M(63) );
      R( b, c, d, e, a, F4, K4, M(64) );
      R( a, b, c, d, e, F4, K4, M(65) );
      R( e, a, b, c, d, F4, K4, M(66) );
      R( d, e, a, b, c, F4, K4, M(67) );
      R( c, d, e, a, b, F4, K4, M(68) );
      R( b, c, d, e, a, F4, K4, M(69) );
      R( a, b, c, d, e, F4, K4, M(70) );
      R( e, a, b, c, d, F4, K4, M(71) );
      R( d, e, a, b, c, F4, K4, M(72) );
      R( c, d, e, a, b, F4, K4, M(73) );
      R( b, c, d, e, a, F4, K4, M(74) );
      R( a, b, c, d, e, F4, K4, M(75) );
      R( e, a, b, c, d, F4, K4, M(76) );
      R( d, e, a, b, c, F4, K4, M(77) );
      R( c, d, e, a, b, F4, K4, M(78) );
      R( b, c, d, e, a, F4, K4, M(79) );

      /* Add this block's result back into the chaining state.  */
      a = ctx->A += a;
      b = ctx->B += b;
      c = ctx->C += c;
      d = ctx->D += d;
      e = ctx->E += e;
    }
}
420 
#if defined(HAVE_X86_SHA1_HW_SUPPORT)
/* HW specific version of sha1_process_bytes.  Identical buffering
   logic to sha1_process_bytes above, but dispatching to the
   SHA-extension block function.  */

static void sha1_hw_process_block (const void *, size_t, struct sha1_ctx *);

static void
sha1_hw_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  /* When we already have some bits in our internal buffer concatenate
     both inputs first.  The internal buffer holds up to 128 bytes.  */
  if (ctx->buflen != 0)
    {
      size_t left_over = ctx->buflen;
      size_t add = 128 - left_over > len ? len : 128 - left_over;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
      ctx->buflen += add;

      if (ctx->buflen > 64)
	{
	  /* Compress every complete 64-byte block now buffered.  */
	  sha1_hw_process_block (ctx->buffer, ctx->buflen & ~63, ctx);

	  ctx->buflen &= 63;
	  /* The regions in the following copy operation cannot overlap.  */
	  memcpy (ctx->buffer,
		  &((char *) ctx->buffer)[(left_over + add) & ~63],
		  ctx->buflen);
	}

      buffer = (const char *) buffer + add;
      len -= add;
    }

  /* Process available complete blocks.  */
  if (len >= 64)
    {
#if !_STRING_ARCH_unaligned
# define alignof(type) offsetof (struct { char c; type x; }, x)
# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
      /* Misaligned input is bounced through the aligned context
	 buffer; a final exact block falls through to the copy below.  */
      if (UNALIGNED_P (buffer))
	while (len > 64)
	  {
	    sha1_hw_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
	    buffer = (const char *) buffer + 64;
	    len -= 64;
	  }
      else
#endif
	{
	  sha1_hw_process_block (buffer, len & ~63, ctx);
	  buffer = (const char *) buffer + (len & ~63);
	  len &= 63;
	}
    }

  /* Move remaining bytes in internal buffer.  */
  if (len > 0)
    {
      size_t left_over = ctx->buflen;

      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
      left_over += len;
      if (left_over >= 64)
	{
	  sha1_hw_process_block (ctx->buffer, 64, ctx);
	  left_over -= 64;
	  /* &ctx->buffer[16] is byte offset 64 (array of 32-bit words).  */
	  memmove (ctx->buffer, &ctx->buffer[16], left_over);
	}
      ctx->buflen = left_over;
    }
}
492 
/* Process LEN bytes of BUFFER, accumulating context into CTX.
   LEN must be a multiple of 64.  Uses the x86 SHA extensions
   (_mm_sha1* intrinsics); four rounds are computed per
   _mm_sha1rnds4_epu32, with the immediate selecting the round
   function/constant group.  */

#ifdef HAVE_X86_SHA1_HW_SUPPORT
__attribute__((__target__ ("sse4.1,sha")))
#endif
static void
sha1_hw_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
#ifdef HAVE_X86_SHA1_HW_SUPPORT
  /* Implemented from
     https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html  */
  const __m128i *words = (const __m128i *) buffer;
  const __m128i *endp = (const __m128i *) ((const char *) buffer + len);
  __m128i abcd, abcd_save, e0, e0_save, e1, msg0, msg1, msg2, msg3;
  /* Mask for _mm_shuffle_epi8 to byte-swap each loaded word into
     big-endian order.  */
  const __m128i shuf_mask
    = _mm_set_epi64x (0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
  /* Compile-time assertion (negative array size on failure) that the
     A..D state words are contiguous, so they can be loaded/stored as
     one 128-bit vector.  */
  char check[((offsetof (struct sha1_ctx, B)
	     == offsetof (struct sha1_ctx, A) + sizeof (ctx->A))
		   && (offsetof (struct sha1_ctx, C)
		       == offsetof (struct sha1_ctx, A) + 2 * sizeof (ctx->A))
		   && (offsetof (struct sha1_ctx, D)
		       == offsetof (struct sha1_ctx, A) + 3 * sizeof (ctx->A)))
		  ? 1 : -1];

  /* First increment the byte count.  The standard specifies the
     possible length of the input up to 2^64 bits.  Here we only
     compute the number of bytes.  Do a double word increment.  */
  ctx->total[0] += len;
  ctx->total[1] += ((len >> 31) >> 1) + (ctx->total[0] < len);

  /* Silence unused-variable warnings for the assertion array.  */
  (void) &check[0];
  abcd = _mm_loadu_si128 ((const __m128i *) &ctx->A);
  e0 = _mm_set_epi32 (ctx->E, 0, 0, 0);
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */

  while (words < endp)
    {
      /* Save chaining state so it can be added back after 80 rounds.  */
      abcd_save = abcd;
      e0_save = e0;

      /* 0..3 */
      msg0 = _mm_loadu_si128 (words);
      msg0 = _mm_shuffle_epi8 (msg0, shuf_mask);
      e0 = _mm_add_epi32 (e0, msg0);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);

      /* 4..7 */
      msg1 = _mm_loadu_si128 (words + 1);
      msg1 = _mm_shuffle_epi8 (msg1, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);

      /* 8..11 */
      msg2 = _mm_loadu_si128 (words + 2);
      msg2 = _mm_shuffle_epi8 (msg2, shuf_mask);
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 12..15 */
      msg3 = _mm_loadu_si128 (words + 3);
      msg3 = _mm_shuffle_epi8 (msg3, shuf_mask);
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 0);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 16..19 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 0);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 20..23 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 24..27 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 28..31 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 32..35 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 1);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 36..39 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 1);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 40..43 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 44..47 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 48..51 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 52..55 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 2);
      msg0 = _mm_sha1msg1_epu32 (msg0, msg1);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 56..59 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 2);
      msg1 = _mm_sha1msg1_epu32 (msg1, msg2);
      msg0 = _mm_xor_si128 (msg0, msg2);

      /* 60..63 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      msg0 = _mm_sha1msg2_epu32 (msg0, msg3);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg2 = _mm_sha1msg1_epu32 (msg2, msg3);
      msg1 = _mm_xor_si128 (msg1, msg3);

      /* 64..67 */
      e0 = _mm_sha1nexte_epu32 (e0, msg0);
      e1 = abcd;
      msg1 = _mm_sha1msg2_epu32 (msg1, msg0);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);
      msg3 = _mm_sha1msg1_epu32 (msg3, msg0);
      msg2 = _mm_xor_si128 (msg2, msg0);

      /* 68..71 */
      e1 = _mm_sha1nexte_epu32 (e1, msg1);
      e0 = abcd;
      msg2 = _mm_sha1msg2_epu32 (msg2, msg1);
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);
      msg3 = _mm_xor_si128 (msg3, msg1);

      /* 72..75 */
      e0 = _mm_sha1nexte_epu32 (e0, msg2);
      e1 = abcd;
      msg3 = _mm_sha1msg2_epu32 (msg3, msg2);
      abcd = _mm_sha1rnds4_epu32 (abcd, e0, 3);

      /* 76..79 */
      e1 = _mm_sha1nexte_epu32 (e1, msg3);
      e0 = abcd;
      abcd = _mm_sha1rnds4_epu32 (abcd, e1, 3);

      /* Finalize: fold this block's result into the chaining state. */
      e0 = _mm_sha1nexte_epu32 (e0, e0_save);
      abcd = _mm_add_epi32 (abcd, abcd_save);

      words = words + 4;
    }

  /* Store the state back, undoing the word shuffle used internally.  */
  abcd = _mm_shuffle_epi32 (abcd, 0x1b); /* 0, 1, 2, 3 */
  _mm_storeu_si128 ((__m128i *) &ctx->A, abcd);
  ctx->E = _mm_extract_epi32 (e0, 3);
#endif
}
#endif
703 
704 /* Return sha1_process_bytes or some hardware optimized version thereof
705    depending on current CPU.  */
706 
707 sha1_process_bytes_fn
sha1_choose_process_bytes(void)708 sha1_choose_process_bytes (void)
709 {
710 #ifdef HAVE_X86_SHA1_HW_SUPPORT
711   unsigned int eax, ebx, ecx, edx;
712   if (__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx)
713       && (ebx & bit_SHA) != 0
714       && __get_cpuid (1, &eax, &ebx, &ecx, &edx)
715       && (ecx & bit_SSE4_1) != 0)
716     return sha1_hw_process_bytes;
717 #endif
718   return sha1_process_bytes;
719 }
720