#include "blake3_impl.h"

#include <immintrin.h>

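// Each SSE2 vector holds four 32-bit words, so this backend hashes four
// independent inputs in parallel (one input per lane).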
#define DEGREE 4

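// SSE2 has no two-source shuffle for 32-bit integer lanes, so this macro
// routes through the float domain: _mm_shuffle_ps selects two lanes from each
// of its operands, and the casts are free at the register level.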
#define _mm_shuffle_ps2(a, b, c)                                              \
  (_mm_castps_si128(                                                          \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

INLINE __m128i loadu(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE void storeu(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

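// Rotating every 32-bit lane right by 16 is just a swap of its two 16-bit
// halves, which SSE2 can do with a pair of 16-bit shuffles (0xB1 swaps
// adjacent words), avoiding the shift/shift/xor sequence the other rotations
// need.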
INLINE __m128i rot16(__m128i x) {
  return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
}

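// The remaining rotation amounts have no single-shuffle equivalent, so they
// use the standard rotate idiom: (x >> n) ^ (x << (32 - n)).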
INLINE __m128i rot12(__m128i x) {
  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}

INLINE __m128i rot8(__m128i x) {
  return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}

INLINE __m128i rot7(__m128i x) {
  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}

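// g1 and g2 are the first and second halves of the BLAKE3 G function: each
// mixes one vector of four message words into the state rows and applies half
// of the add/xor/rotate schedule (rot16/rot12 in g1, rot8/rot7 in g2).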
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot16(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot12(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot8(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot7(*row1);
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}

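// _mm_blend_epi16 is an SSE4.1 instruction. This helper emulates it in plain
// SSE2 by expanding the 8-bit immediate into a full per-word mask and
// selecting with and/andnot/or.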
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
  const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
  __m128i mask = _mm_set1_epi16(imm8);
  mask = _mm_and_si128(mask, bits);
  mask = _mm_cmpeq_epi16(mask, bits);
  return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu((uint8_t *)&cv[0]);
  rows[1] = loadu((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

void blake3_compress_in_place_sse2(uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len, uint64_t counter,
                                   uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

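// The XOF variant produces 64 bytes: the first 32 are the same feed-forward
// result as compress_in_place, and the second 32 xor the untouched rows 2 and
// 3 with the input chaining value, per BLAKE3's extended-output definition.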
void blake3_compress_xof_sse2(const uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), &out[0]);
  storeu(xorv(rows[1], rows[3]), &out[16]);
  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}

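// One full round of the 4-way parallel compression. Here v[] holds four
// transposed states (v[i] is word i of all four states), so the column and
// diagonal steps are expressed by permuting register indices rather than by
// shuffling data within registers.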
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);

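  // Diagonal step: the same G mixing, but the rotated index pairing (v[0]
  // with v[5], v[1] with v[6], ...) walks the diagonals of the implicit 4x4
  // state.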
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}

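// 4x4 transpose of 32-bit words across four vectors: given rows
// a = {a0,a1,a2,a3} through d = {d0,d1,d2,d3}, this produces {a0,b0,c0,d0},
// {a1,b1,c1,d1}, and so on.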
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}

INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
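  // Prefetch 256 bytes (four blocks) ahead on each input, so later blocks are
  // already in cache by the time they are transposed.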
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}

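// Builds the per-lane counter words. SSE2 has only signed compares, so the
// unsigned overflow check flips the sign bit of both operands before the
// signed compare; where the low word wrapped, the all-ones carry mask is
// subtracted from (i.e. one is added to) the high word.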
INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}

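// Hashes four inputs of `blocks` blocks each in parallel. The eight chaining
// value words live transposed in h_vecs (one vector per word, one lane per
// input) for the whole loop, and are only transposed back into byte order
// when the results are stored.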
static void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              bool increment_counter, uint8_t flags,
                              uint8_t flags_start, uint8_t flags_end,
                              uint8_t *out) {
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

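    // Standard BLAKE3 state layout, vectorized across the four inputs:
    // words 0-7 are the chaining value, 8-11 the IV constants, and 12-15
    // the counter, block length, and flags.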
    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

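// Serial fallback for a single input: compress each block in place, chaining
// the CV from block to block.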
INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          uint8_t flags, uint8_t flags_start,
                          uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
                                  block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

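// Top-level entry point: hash inputs four at a time while enough remain, then
// fall back to hashing the stragglers one by one.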
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= DEGREE) {
    blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}