#include "blake3_impl.h"

#include <immintrin.h>

#define _mm_shuffle_ps2(a, b, c)                                              \
  (_mm_castps_si128(                                                          \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

INLINE __m128i loadu_128(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE __m256i loadu_256(const uint8_t src[32]) {
  return _mm256_loadu_si256((const __m256i *)src);
}

INLINE __m512i loadu_512(const uint8_t src[64]) {
  return _mm512_loadu_si512((const __m512i *)src);
}

INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
  _mm256_storeu_si256((__m256i *)dest, src);
}

INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }

INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }

INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }

INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }

INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }

INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }

INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }

INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }

INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }

INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }

INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }

INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }

INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }

INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }

INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }

INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }

/*
 * ----------------------------------------------------------------------------
 * compress_avx512
 * ----------------------------------------------------------------------------
 */

INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = add_128(add_128(*row0, m), *row1);
  *row3 = xor_128(*row3, *row0);
  *row3 = rot16_128(*row3);
  *row2 = add_128(*row2, *row3);
  *row1 = xor_128(*row1, *row2);
  *row1 = rot12_128(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = add_128(add_128(*row0, m), *row1);
  *row3 = xor_128(*row3, *row0);
  *row3 = rot8_128(*row3);
  *row2 = add_128(*row2, *row3);
  *row1 = xor_128(*row1, *row2);
  *row1 = rot7_128(*row1);
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
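//
// In each round below, the g1()/g2() pair before diagonalize() mixes the
// columns of the 4x4 state (lane i of rows[0..3] holds state words i, 4+i,
// 8+i, and 12+i). diagonalize() then rotates rows 0, 2, and 3 so that the
// second g1()/g2() pair mixes the diagonals, and undiagonalize() restores the
// column layout for the next round. Keeping row1 fixed (instead of row0) is
// the optimization referenced above; the message shuffles are pre-adjusted to
// compensate.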
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu_128((uint8_t *)&cv[0]);
  rows[1] = loadu_128((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
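  // (For reference, BLAKE3's per-round message permutation is
  // 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 — see the
  // MSG_SCHEDULE table in blake3_impl.h. The shuffles below apply it within
  // the pre-grouped register layout m0..m3 established in round 1.)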
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
  storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
  storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
}

void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

/*
 * ----------------------------------------------------------------------------
 * hash4_avx512
 * ----------------------------------------------------------------------------
 */

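// One full BLAKE3 round over four states in parallel. v[0..15] hold the 16
// state words, with lane i of each vector belonging to input i, and m[0..15]
// hold the 16 message words in the same layout. The first half of the
// function is the column step; the second half is the diagonal step, where
// the diagonals are selected by indexing (e.g. v[0] pairs with v[5], v[10],
// and v[15]) rather than by shuffling.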
INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}

INLINE void transpose_vecs_128(__m128i vecs[4]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
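  // The net effect is a 4x4 transpose of 32-bit words: with inputs
  // a = {a0,a1,a2,a3}, b, c, d, the outputs are {a0,b0,c0,d0}, {a1,b1,c1,d1},
  // {a2,b2,c2,d2}, {a3,b3,c3,d3}.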
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}

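// Load one 64-byte message block from each of the four inputs (starting at
// block_offset) and transpose, so that out[w] holds message word w from each
// of the four inputs, one per lane. The prefetch pulls in data 256 bytes
// ahead for each input.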
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, __m128i out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}

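// Split the per-input block counters into 32-bit low and high halves. When
// increment_counter is true, the four lanes get counter+0 through counter+3;
// otherwise all four lanes get the same counter value.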
INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           __m128i *out_lo, __m128i *out_hi) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  __m256i mask_vec = _mm256_set1_epi64x(mask);
  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
  deltas = _mm256_and_si256(mask_vec, deltas);
  __m256i counters =
      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
  *out_lo = _mm256_cvtepi64_epi32(counters);
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
}

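// Hash `blocks` consecutive 64-byte blocks from each of four inputs in
// parallel, carrying each input's chaining value forward across blocks, and
// write the four resulting 32-byte chaining values to `out` back to back.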
static
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
                         const uint32_t key[8], uint64_t counter,
                         bool increment_counter, uint8_t flags,
                         uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m128i h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1_128(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
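  // Each output is 32 bytes: h_vecs[i] holds words 0-3 and h_vecs[i+4] holds
  // words 4-7 of output i, which is why the stores below interleave the two
  // groups.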
  storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

/*
 * ----------------------------------------------------------------------------
 * hash8_avx512
 * ----------------------------------------------------------------------------
 */

INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[15] = rot16_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot12_256(v[4]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_256(v[0], v[4]);
  v[1] = add_256(v[1], v[5]);
  v[2] = add_256(v[2], v[6]);
  v[3] = add_256(v[3], v[7]);
  v[12] = xor_256(v[12], v[0]);
  v[13] = xor_256(v[13], v[1]);
  v[14] = xor_256(v[14], v[2]);
  v[15] = xor_256(v[15], v[3]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[15] = rot8_256(v[15]);
  v[8] = add_256(v[8], v[12]);
  v[9] = add_256(v[9], v[13]);
  v[10] = add_256(v[10], v[14]);
  v[11] = add_256(v[11], v[15]);
  v[4] = xor_256(v[4], v[8]);
  v[5] = xor_256(v[5], v[9]);
  v[6] = xor_256(v[6], v[10]);
  v[7] = xor_256(v[7], v[11]);
  v[4] = rot7_256(v[4]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);

  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot16_256(v[15]);
  v[12] = rot16_256(v[12]);
  v[13] = rot16_256(v[13]);
  v[14] = rot16_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot12_256(v[5]);
  v[6] = rot12_256(v[6]);
  v[7] = rot12_256(v[7]);
  v[4] = rot12_256(v[4]);
  v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_256(v[0], v[5]);
  v[1] = add_256(v[1], v[6]);
  v[2] = add_256(v[2], v[7]);
  v[3] = add_256(v[3], v[4]);
  v[15] = xor_256(v[15], v[0]);
  v[12] = xor_256(v[12], v[1]);
  v[13] = xor_256(v[13], v[2]);
  v[14] = xor_256(v[14], v[3]);
  v[15] = rot8_256(v[15]);
  v[12] = rot8_256(v[12]);
  v[13] = rot8_256(v[13]);
  v[14] = rot8_256(v[14]);
  v[10] = add_256(v[10], v[15]);
  v[11] = add_256(v[11], v[12]);
  v[8] = add_256(v[8], v[13]);
  v[9] = add_256(v[9], v[14]);
  v[5] = xor_256(v[5], v[10]);
  v[6] = xor_256(v[6], v[11]);
  v[7] = xor_256(v[7], v[8]);
  v[4] = xor_256(v[4], v[9]);
  v[5] = rot7_256(v[5]);
  v[6] = rot7_256(v[6]);
  v[7] = rot7_256(v[7]);
  v[4] = rot7_256(v[4]);
}

INLINE void transpose_vecs_256(__m256i vecs[8]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
  // is 22/33/66/77.
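  // Together with the 64-bit and 128-bit interleaves below, this performs an
  // 8x8 transpose of 32-bit words across the eight vectors.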
  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

  // Interleave 128-bit lanes.
  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}

INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
                                size_t block_offset, __m256i out[16]) {
  out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
  out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
  out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
  out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
  out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
  out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
  out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
  out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
  out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
  out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
  out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
  out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
  out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
  out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
  for (size_t i = 0; i < 8; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_256(&out[0]);
  transpose_vecs_256(&out[8]);
}

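// Eight-way analogue of load_counters4: lane i of out_lo/out_hi receives the
// low and high 32 bits of counter+i (or of counter itself when
// increment_counter is false).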
load_counters8(uint64_t counter,bool increment_counter,__m256i * out_lo,__m256i * out_hi)734*81ad6265SDimitry Andric INLINE void load_counters8(uint64_t counter, bool increment_counter,
735*81ad6265SDimitry Andric __m256i *out_lo, __m256i *out_hi) {
736*81ad6265SDimitry Andric uint64_t mask = (increment_counter ? ~0 : 0);
737*81ad6265SDimitry Andric __m512i mask_vec = _mm512_set1_epi64(mask);
738*81ad6265SDimitry Andric __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
739*81ad6265SDimitry Andric deltas = _mm512_and_si512(mask_vec, deltas);
740*81ad6265SDimitry Andric __m512i counters =
741*81ad6265SDimitry Andric _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
742*81ad6265SDimitry Andric *out_lo = _mm512_cvtepi64_epi32(counters);
743*81ad6265SDimitry Andric *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
744*81ad6265SDimitry Andric }
745*81ad6265SDimitry Andric
746*81ad6265SDimitry Andric static
blake3_hash8_avx512(const uint8_t * const * inputs,size_t blocks,const uint32_t key[8],uint64_t counter,bool increment_counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t * out)747*81ad6265SDimitry Andric void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
748*81ad6265SDimitry Andric const uint32_t key[8], uint64_t counter,
749*81ad6265SDimitry Andric bool increment_counter, uint8_t flags,
750*81ad6265SDimitry Andric uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
751*81ad6265SDimitry Andric __m256i h_vecs[8] = {
752*81ad6265SDimitry Andric set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
753*81ad6265SDimitry Andric set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
754*81ad6265SDimitry Andric };
755*81ad6265SDimitry Andric __m256i counter_low_vec, counter_high_vec;
756*81ad6265SDimitry Andric load_counters8(counter, increment_counter, &counter_low_vec,
757*81ad6265SDimitry Andric &counter_high_vec);
758*81ad6265SDimitry Andric uint8_t block_flags = flags | flags_start;
759*81ad6265SDimitry Andric
760*81ad6265SDimitry Andric for (size_t block = 0; block < blocks; block++) {
761*81ad6265SDimitry Andric if (block + 1 == blocks) {
762*81ad6265SDimitry Andric block_flags |= flags_end;
763*81ad6265SDimitry Andric }
764*81ad6265SDimitry Andric __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
765*81ad6265SDimitry Andric __m256i block_flags_vec = set1_256(block_flags);
766*81ad6265SDimitry Andric __m256i msg_vecs[16];
767*81ad6265SDimitry Andric transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
768*81ad6265SDimitry Andric
769*81ad6265SDimitry Andric __m256i v[16] = {
770*81ad6265SDimitry Andric h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
771*81ad6265SDimitry Andric h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
772*81ad6265SDimitry Andric set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
773*81ad6265SDimitry Andric counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
774*81ad6265SDimitry Andric };
775*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 0);
776*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 1);
777*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 2);
778*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 3);
779*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 4);
780*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 5);
781*81ad6265SDimitry Andric round_fn8(v, msg_vecs, 6);
782*81ad6265SDimitry Andric h_vecs[0] = xor_256(v[0], v[8]);
783*81ad6265SDimitry Andric h_vecs[1] = xor_256(v[1], v[9]);
784*81ad6265SDimitry Andric h_vecs[2] = xor_256(v[2], v[10]);
785*81ad6265SDimitry Andric h_vecs[3] = xor_256(v[3], v[11]);
786*81ad6265SDimitry Andric h_vecs[4] = xor_256(v[4], v[12]);
787*81ad6265SDimitry Andric h_vecs[5] = xor_256(v[5], v[13]);
788*81ad6265SDimitry Andric h_vecs[6] = xor_256(v[6], v[14]);
789*81ad6265SDimitry Andric h_vecs[7] = xor_256(v[7], v[15]);
790*81ad6265SDimitry Andric
791*81ad6265SDimitry Andric block_flags = flags;
792*81ad6265SDimitry Andric }
793*81ad6265SDimitry Andric
794*81ad6265SDimitry Andric transpose_vecs_256(h_vecs);
795*81ad6265SDimitry Andric storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
796*81ad6265SDimitry Andric storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
797*81ad6265SDimitry Andric storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
798*81ad6265SDimitry Andric storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
799*81ad6265SDimitry Andric storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
800*81ad6265SDimitry Andric storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
801*81ad6265SDimitry Andric storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
802*81ad6265SDimitry Andric storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
803*81ad6265SDimitry Andric }
804*81ad6265SDimitry Andric
805*81ad6265SDimitry Andric /*
806*81ad6265SDimitry Andric * ----------------------------------------------------------------------------
807*81ad6265SDimitry Andric * hash16_avx512
808*81ad6265SDimitry Andric * ----------------------------------------------------------------------------
809*81ad6265SDimitry Andric */
810*81ad6265SDimitry Andric
INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_512(v[0], v[4]);
  v[1] = add_512(v[1], v[5]);
  v[2] = add_512(v[2], v[6]);
  v[3] = add_512(v[3], v[7]);
  v[12] = xor_512(v[12], v[0]);
  v[13] = xor_512(v[13], v[1]);
  v[14] = xor_512(v[14], v[2]);
  v[15] = xor_512(v[15], v[3]);
  v[12] = rot16_512(v[12]);
  v[13] = rot16_512(v[13]);
  v[14] = rot16_512(v[14]);
  v[15] = rot16_512(v[15]);
  v[8] = add_512(v[8], v[12]);
  v[9] = add_512(v[9], v[13]);
  v[10] = add_512(v[10], v[14]);
  v[11] = add_512(v[11], v[15]);
  v[4] = xor_512(v[4], v[8]);
  v[5] = xor_512(v[5], v[9]);
  v[6] = xor_512(v[6], v[10]);
  v[7] = xor_512(v[7], v[11]);
  v[4] = rot12_512(v[4]);
  v[5] = rot12_512(v[5]);
  v[6] = rot12_512(v[6]);
  v[7] = rot12_512(v[7]);
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_512(v[0], v[4]);
  v[1] = add_512(v[1], v[5]);
  v[2] = add_512(v[2], v[6]);
  v[3] = add_512(v[3], v[7]);
  v[12] = xor_512(v[12], v[0]);
  v[13] = xor_512(v[13], v[1]);
  v[14] = xor_512(v[14], v[2]);
  v[15] = xor_512(v[15], v[3]);
  v[12] = rot8_512(v[12]);
  v[13] = rot8_512(v[13]);
  v[14] = rot8_512(v[14]);
  v[15] = rot8_512(v[15]);
  v[8] = add_512(v[8], v[12]);
  v[9] = add_512(v[9], v[13]);
  v[10] = add_512(v[10], v[14]);
  v[11] = add_512(v[11], v[15]);
  v[4] = xor_512(v[4], v[8]);
  v[5] = xor_512(v[5], v[9]);
  v[6] = xor_512(v[6], v[10]);
  v[7] = xor_512(v[7], v[11]);
  v[4] = rot7_512(v[4]);
  v[5] = rot7_512(v[5]);
  v[6] = rot7_512(v[6]);
  v[7] = rot7_512(v[7]);

  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_512(v[0], v[5]);
  v[1] = add_512(v[1], v[6]);
  v[2] = add_512(v[2], v[7]);
  v[3] = add_512(v[3], v[4]);
  v[15] = xor_512(v[15], v[0]);
  v[12] = xor_512(v[12], v[1]);
  v[13] = xor_512(v[13], v[2]);
  v[14] = xor_512(v[14], v[3]);
  v[15] = rot16_512(v[15]);
  v[12] = rot16_512(v[12]);
  v[13] = rot16_512(v[13]);
  v[14] = rot16_512(v[14]);
  v[10] = add_512(v[10], v[15]);
  v[11] = add_512(v[11], v[12]);
  v[8] = add_512(v[8], v[13]);
  v[9] = add_512(v[9], v[14]);
  v[5] = xor_512(v[5], v[10]);
  v[6] = xor_512(v[6], v[11]);
  v[7] = xor_512(v[7], v[8]);
  v[4] = xor_512(v[4], v[9]);
  v[5] = rot12_512(v[5]);
  v[6] = rot12_512(v[6]);
  v[7] = rot12_512(v[7]);
  v[4] = rot12_512(v[4]);
  v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_512(v[0], v[5]);
  v[1] = add_512(v[1], v[6]);
  v[2] = add_512(v[2], v[7]);
  v[3] = add_512(v[3], v[4]);
  v[15] = xor_512(v[15], v[0]);
  v[12] = xor_512(v[12], v[1]);
  v[13] = xor_512(v[13], v[2]);
  v[14] = xor_512(v[14], v[3]);
  v[15] = rot8_512(v[15]);
  v[12] = rot8_512(v[12]);
  v[13] = rot8_512(v[13]);
  v[14] = rot8_512(v[14]);
  v[10] = add_512(v[10], v[15]);
  v[11] = add_512(v[11], v[12]);
  v[8] = add_512(v[8], v[13]);
  v[9] = add_512(v[9], v[14]);
  v[5] = xor_512(v[5], v[10]);
  v[6] = xor_512(v[6], v[11]);
  v[7] = xor_512(v[7], v[8]);
  v[4] = xor_512(v[4], v[9]);
  v[5] = rot7_512(v[5]);
  v[6] = rot7_512(v[6]);
  v[7] = rot7_512(v[7]);
  v[4] = rot7_512(v[4]);
}

// 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
#define LO_IMM8 0x88

INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
  return _mm512_shuffle_i32x4(a, b, LO_IMM8);
}

// 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
#define HI_IMM8 0xdd

INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
  return _mm512_shuffle_i32x4(a, b, HI_IMM8);
}

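// Transpose a 16x16 matrix of 32-bit words in place, so that row vector i on
// input becomes column i on output. Built from 32-bit and 64-bit unpacks
// followed by two levels of 128-bit lane shuffles.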
INLINE void transpose_vecs_512(__m512i vecs[16]) {
  // Interleave 32-bit lanes. The _0 unpack is lanes
  // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
  // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
  __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
  __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
  __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
  __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
  __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
  __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
  __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
  __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
  __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
  __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
  __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
  __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
  __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
  __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);

  // Interleave 64-bit lanes. The _0 unpack is lanes
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
  // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
  __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
  __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
  __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
  __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
  __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
  __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
  __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
  __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
  __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
  __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
  __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
  __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
  __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
  __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
  __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
  __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);

  // Interleave 128-bit lanes. The _0 unpack is
  // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
  // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
  __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
  __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
  __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
  __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
  __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
  __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
  __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
  __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
  __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
  __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
  __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
  __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
  __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
  __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
  __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
  __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);

  // Interleave 128-bit lanes again for the final outputs.
  vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
  vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
  vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
  vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
  vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
  vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
  vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
  vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
  vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
  vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
  vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
  vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
  vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
  vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
  vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
  vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
}

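// Load one 64-byte message block from each of the 16 inputs and transpose
// them so that out[i] holds message word i from every input, ready for
// round_fn16.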
INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
                                 size_t block_offset, __m512i out[16]) {
  out[0] = loadu_512(&inputs[0][block_offset]);
  out[1] = loadu_512(&inputs[1][block_offset]);
  out[2] = loadu_512(&inputs[2][block_offset]);
  out[3] = loadu_512(&inputs[3][block_offset]);
  out[4] = loadu_512(&inputs[4][block_offset]);
  out[5] = loadu_512(&inputs[5][block_offset]);
  out[6] = loadu_512(&inputs[6][block_offset]);
  out[7] = loadu_512(&inputs[7][block_offset]);
  out[8] = loadu_512(&inputs[8][block_offset]);
  out[9] = loadu_512(&inputs[9][block_offset]);
  out[10] = loadu_512(&inputs[10][block_offset]);
  out[11] = loadu_512(&inputs[11][block_offset]);
  out[12] = loadu_512(&inputs[12][block_offset]);
  out[13] = loadu_512(&inputs[13][block_offset]);
  out[14] = loadu_512(&inputs[14][block_offset]);
  out[15] = loadu_512(&inputs[15][block_offset]);
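  // Prefetch 256 bytes (four blocks) ahead in each input, so the data for
  // upcoming blocks is already in cache by the time we load it.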
  for (size_t i = 0; i < 16; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs_512(out);
}

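// Build the per-input 64-bit block counters as two vectors of 32-bit halves.
// When increment_counter is set, lane i gets counter + i; otherwise every lane
// gets the same counter. The unsigned compare detects which low-word additions
// wrapped, and the masked add propagates that carry into the high words.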
INLINE void load_counters16(uint64_t counter, bool increment_counter,
                            __m512i *out_lo, __m512i *out_hi) {
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
  const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0);
  const __m512i add1 = _mm512_and_si512(mask, add0);
  __m512i l = _mm512_add_epi32(_mm512_set1_epi32((int32_t)counter), add1);
  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32((int32_t)(counter >> 32)),
                                    carry,
                                    _mm512_set1_epi32((int32_t)(counter >> 32)),
                                    _mm512_set1_epi32(1));
  *out_lo = l;
  *out_hi = h;
}

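// Hash the same number of blocks from each of 16 inputs in parallel and write
// the 16 resulting 32-byte chaining values to out, one after another.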
static
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          bool increment_counter, uint8_t flags,
                          uint8_t flags_start, uint8_t flags_end,
                          uint8_t *out) {
  __m512i h_vecs[8] = {
      set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
      set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
  };
  __m512i counter_low_vec, counter_high_vec;
  load_counters16(counter, increment_counter, &counter_low_vec,
                  &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

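  // flags_start applies only to the first block of each input and flags_end
  // only to the last; every block also carries the common flags.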
  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
    __m512i block_flags_vec = set1_512(block_flags);
    __m512i msg_vecs[16];
    transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m512i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn16(v, msg_vecs, 0);
    round_fn16(v, msg_vecs, 1);
    round_fn16(v, msg_vecs, 2);
    round_fn16(v, msg_vecs, 3);
    round_fn16(v, msg_vecs, 4);
    round_fn16(v, msg_vecs, 5);
    round_fn16(v, msg_vecs, 6);
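    // After the 7 rounds, the new chaining value is the first half of the
    // state XORed with the second half.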
    h_vecs[0] = xor_512(v[0], v[8]);
    h_vecs[1] = xor_512(v[1], v[9]);
    h_vecs[2] = xor_512(v[2], v[10]);
    h_vecs[3] = xor_512(v[3], v[11]);
    h_vecs[4] = xor_512(v[4], v[12]);
    h_vecs[5] = xor_512(v[5], v[13]);
    h_vecs[6] = xor_512(v[6], v[14]);
    h_vecs[7] = xor_512(v[7], v[15]);

    block_flags = flags;
  }

  // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
  // state vectors. Pad the matrix with zeros. After transposition, store the
  // lower half of each vector.
  __m512i padded[16] = {
      h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
      h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
      set1_512(0), set1_512(0), set1_512(0), set1_512(0),
  };
  transpose_vecs_512(padded);
  _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
  _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
  _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
  _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
  _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
  _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
  _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
  _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
  _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
  _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
  _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
  _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
  _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
  _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
  _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
  _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
}

/*
 * ----------------------------------------------------------------------------
 * hash_many_avx512
 * ----------------------------------------------------------------------------
 */

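// Hash a single input serially: compress each block in turn with the
// single-block AVX-512 routine, carrying the chaining value forward, and write
// the final 32-byte chaining value to out.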
INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
                            const uint32_t key[8], uint64_t counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
                                    block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

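// Public entry point: hash num_inputs equal-length inputs, peeling them off in
// the widest batches the data still allows (16-way, then 8-way, then 4-way,
// then one at a time), advancing the counter per input when requested.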
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 16) {
    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
                         flags_start, flags_end, out);
    if (increment_counter) {
      counter += 16;
    }
    inputs += 16;
    num_inputs -= 16;
    out = &out[16 * BLAKE3_OUT_LEN];
  }
  while (num_inputs >= 8) {
    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
      counter += 8;
    }
    inputs += 8;
    num_inputs -= 8;
    out = &out[8 * BLAKE3_OUT_LEN];
  }
  while (num_inputs >= 4) {
    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
                        flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    inputs += 4;
    num_inputs -= 4;
    out = &out[4 * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
                    flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}