// /freebsd-src/contrib/llvm-project/llvm/lib/Support/BLAKE3/blake3_sse41.c
// (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
#include "blake3_impl.h"

#include <immintrin.h>

#define DEGREE 4

#define _mm_shuffle_ps2(a, b, c)                                               \
  (_mm_castps_si128(                                                           \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
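
// _mm_shuffle_ps2 builds a two-source shuffle of 32-bit lanes out of the
// float shuffle _mm_shuffle_ps, since SSE4.1 has no integer shuffle that
// takes two source vectors. The casts only reinterpret bits; the selector
// (c) picks two lanes from (a) for the low half of the result and two lanes
// from (b) for the high half.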

INLINE __m128i loadu(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE void storeu(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

INLINE __m128i rot16(__m128i x) {
  return _mm_shuffle_epi8(
      x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}

INLINE __m128i rot12(__m128i x) {
  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}

INLINE __m128i rot8(__m128i x) {
  return _mm_shuffle_epi8(
      x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}

INLINE __m128i rot7(__m128i x) {
  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
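
// All four rotations are right-rotates of each 32-bit lane. The byte-aligned
// amounts (16 and 8) are done with a single pshufb byte shuffle; the other
// two combine a right and a left shift. XOR is interchangeable with OR here
// because the two shifted values have no overlapping set bits:
//
//   rotr32(x, n) == (x >> n) | (x << (32 - n)) == (x >> n) ^ (x << (32 - n))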

INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot16(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot12(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot8(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot7(*row1);
}
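
// g1 and g2 are the two halves of the BLAKE3 G function, applied to all four
// columns of the 4x4 state at once: each row vector holds one state word per
// column, so a single sequence of vector ops mixes four (column, message
// word) pairs in parallel.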

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
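
// diagonalize rotates the lanes of row0, row3, and row2 by one, two, and
// three positions respectively (all in the same direction), so that the
// column-oriented g1/g2 sequence now mixes the state's diagonals instead.
// undiagonalize applies the inverse rotations.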

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu((uint8_t *)&cv[0]);
  rows[1] = loadu((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;
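
  // The permuted words t0..t3 become the message inputs m0..m3 for the next
  // round, so the same fixed permutation composes from round to round.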

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}
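
// compress_pre runs all seven rounds and leaves the 16-word state in
// rows[0..3]. The two callers below finalize it differently: the in-place
// variant folds the state down into the 8-word chaining value, while the XOF
// variant also XORs the old chaining value into the upper half to produce a
// full 64 bytes of output.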

void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), &out[0]);
  storeu(xorv(rows[1], rows[3]), &out[16]);
  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}

INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}
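
// round_fn is the 4-way analogue of one compress_pre round. Here the state
// is transposed: v[i] holds word i of four independent states, one per lane,
// so the G functions are written out as straight-line lane-parallel code and
// MSG_SCHEDULE supplies the per-round message word order instead of the
// in-register shuffles used above.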

INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}
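
// Viewed as a 4x4 matrix of 32-bit words (one row per vector), this is a
// transpose. For rows a = {a0,a1,a2,a3} through d = {d0,d1,d2,d3}, the first
// stage produces e.g. ab_01 = {a0,b0,a1,b1}, and the second stage gathers
// original column i into vecs[i] = {ai,bi,ci,di}.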

INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}
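
// Each transpose_vecs call above transposes one 4x4 group of words, so on
// return out[i] holds message word i from each of the four inputs, one per
// lane. The prefetch warms the cache 256 bytes (four 64-byte blocks) ahead
// of the current read position in each input.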

INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}
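
// The block counter is 64-bit but the lanes are 32-bit, so the carry out of
// the low-word add is recovered manually: a lane overflowed exactly when
// l < add1 as unsigned values, and XORing both sides with 0x80000000 turns
// the signed pcmpgtd into that unsigned comparison. A lane that overflowed
// compares as -1 (all ones), so subtracting the comparison mask increments
// the corresponding high word.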

static
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0],       h_vecs[1],        h_vecs[2],     h_vecs[3],
        h_vecs[4],       h_vecs[5],        h_vecs[6],     h_vecs[7],
        set1(IV[0]),     set1(IV[1]),      set1(IV[2]),   set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
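
// Until the final pair of transposes, the four outputs live interleaved
// across the h_vecs lanes. Afterwards h_vecs[i] and h_vecs[4 + i] hold the
// first and second 16 bytes of input i's chaining value, which is why the
// stores above alternate between the two groups: each 32-byte output lands
// contiguously.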

INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
                           const uint32_t key[8], uint64_t counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
                                   block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= DEGREE) {
    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
                       flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
                   flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
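
// blake3_hash_many_sse41 is the entry point used by the dispatcher: it peels
// off groups of DEGREE (4) inputs for the SIMD path and handles any
// remainder serially. As a rough, hypothetical sketch of how the
// chunk-hashing layer drives it (the names and values here are assumptions,
// not taken from this file):
//
//   const uint8_t *chunks[8] = {/* 8 pointers to 1024-byte chunks */};
//   uint8_t cvs[8 * BLAKE3_OUT_LEN];
//   blake3_hash_many_sse41(chunks, 8, 16 /* blocks per chunk */, key_words,
//                          /*counter=*/0, /*increment_counter=*/true,
//                          /*flags=*/0, CHUNK_START, CHUNK_END, cvs);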