Lines matching the full-text search "+full:3 +full:v" (source lines containing both tokens) in the BLAKE3 AVX-512 C implementation. Each entry gives the source line number, the matched line, and the enclosing function.
105 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); in diagonalize()
106 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in diagonalize()
107 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); in diagonalize()
111 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); in undiagonalize()
112 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in undiagonalize()
113 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); in undiagonalize()
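A scalar model of what these shuffles do, assuming only the standard _mm_shuffle_epi32 semantics (the helper below is illustrative and not part of this file): diagonalize() rotates rows 0, 2 and 3 of the 4x4 state so that each diagonal of the original layout lines up in a single lane, letting the column-oriented g1()/g2() code mix the diagonals, and undiagonalize() applies the inverse rotations.

    #include <stdint.h>

    /* Scalar model of _mm_shuffle_epi32: lane i of the result is lane
       ((imm >> (2 * i)) & 3) of the input; _MM_SHUFFLE(d, c, b, a) packs
       the selectors so that lane0<-a, lane1<-b, lane2<-c, lane3<-d. */
    static void shuffle_epi32_model(uint32_t out[4], const uint32_t in[4],
                                    unsigned imm) {
      for (int i = 0; i < 4; i++) {
        out[i] = in[(imm >> (2 * i)) & 3];
      }
    }

    /* With _MM_SHUFFLE(2, 1, 0, 3), {w0, w1, w2, w3} becomes
       {w3, w0, w1, w2}: every word moves up one lane.
       _MM_SHUFFLE(1, 0, 3, 2) rotates by two lanes, and
       _MM_SHUFFLE(0, 3, 2, 1) rotates one lane the other way, which is
       why the two functions undo each other. */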
121 rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); in compress_pre()
122 rows[3] = set4(counter_low(counter), counter_high(counter), in compress_pre()
128 __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]); in compress_pre()
135 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
136 t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1 in compress_pre()
137 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
138 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
140 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 in compress_pre()
141 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
142 t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9 in compress_pre()
143 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 in compress_pre()
144 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
145 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
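_mm_shuffle_ps2 is not a standard intrinsic. Judging from the word-index comments in round 1 (e.g. "7 5 3 1", with m0 holding words 3..0 and m1 holding words 7..4), it behaves like _mm_shuffle_ps applied to integer vectors: the low two result lanes come from the first argument and the high two from the second. A hedged sketch of such a wrapper (the real definition in the source may differ):

    #include <immintrin.h>

    /* Illustrative wrapper: reinterpret the __m128i inputs as floats,
       shuffle, and reinterpret back. With _MM_SHUFFLE(3, 1, 3, 1) this
       picks lanes 1 and 3 of `a` and lanes 1 and 3 of `b`, i.e. the odd
       message words, matching the "7 5 3 1" comment above. */
    #define _mm_shuffle_ps2(a, b, c)                                        \
      (_mm_castps_si128(                                                    \
          _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))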
153 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
154 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
155 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
156 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
157 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
159 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
160 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
163 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
164 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
167 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
168 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
169 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
175 // Round 3 in compress_pre()
176 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
177 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
178 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
179 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
180 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
182 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
183 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
186 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
187 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
190 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
191 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
192 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
199 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
200 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
201 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
202 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
203 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
205 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
206 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
209 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
210 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
213 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
214 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
215 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
222 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
223 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
224 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
225 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
226 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
228 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
229 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
232 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
233 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
236 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
237 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
238 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
245 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
246 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
247 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
248 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
249 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
251 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
252 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
255 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
256 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
259 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
260 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
261 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
268 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
269 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
270 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
271 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
272 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
274 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
275 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
278 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
279 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
282 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
283 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
284 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
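Each of the seven rounds above is the same quarter-round applied four times to the columns and, after diagonalize(), four times to the diagonals; rounds 2 through 7 carry identical shuffle code because BLAKE3 applies the same message-word permutation every round. The rotation amounts are the 16/12/8/7 visible in the rot*_128 calls further down, with g1() covering the steps up to the 12-bit rotation and g2() the rest (their bodies are not part of this excerpt). A scalar sketch of that quarter-round, i.e. the standard BLAKE3 G function rather than code from this file:

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t w, int c) {
      return (w >> c) | (w << (32 - c));
    }

    /* Scalar BLAKE3 quarter-round. In the SIMD code each 32-bit lane of
       rows[0..3] holds one of the four parallel a/b/c/d slots, so one
       g1()+g2() pair performs four of these calls at once. */
    static void g(uint32_t v[16], int a, int b, int c, int d,
                  uint32_t mx, uint32_t my) {
      v[a] = v[a] + v[b] + mx;
      v[d] = rotr32(v[d] ^ v[a], 16);
      v[c] = v[c] + v[d];
      v[b] = rotr32(v[b] ^ v[c], 12);
      v[a] = v[a] + v[b] + my;
      v[d] = rotr32(v[d] ^ v[a], 8);
      v[c] = v[c] + v[d];
      v[b] = rotr32(v[b] ^ v[c], 7);
    }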
294 storeu_128(xor_128(rows[1], rows[3]), &out[16]); in blake3_compress_xof_avx512()
296 storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]); in blake3_compress_xof_avx512()
306 storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]); in blake3_compress_in_place_avx512()
315 INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) { in round_fn4()
316 v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn4()
317 v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn4()
318 v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn4()
319 v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn4()
320 v[0] = add_128(v[0], v[4]); in round_fn4()
321 v[1] = add_128(v[1], v[5]); in round_fn4()
322 v[2] = add_128(v[2], v[6]); in round_fn4()
323 v[3] = add_128(v[3], v[7]); in round_fn4()
324 v[12] = xor_128(v[12], v[0]); in round_fn4()
325 v[13] = xor_128(v[13], v[1]); in round_fn4()
326 v[14] = xor_128(v[14], v[2]); in round_fn4()
327 v[15] = xor_128(v[15], v[3]); in round_fn4()
328 v[12] = rot16_128(v[12]); in round_fn4()
329 v[13] = rot16_128(v[13]); in round_fn4()
330 v[14] = rot16_128(v[14]); in round_fn4()
331 v[15] = rot16_128(v[15]); in round_fn4()
332 v[8] = add_128(v[8], v[12]); in round_fn4()
333 v[9] = add_128(v[9], v[13]); in round_fn4()
334 v[10] = add_128(v[10], v[14]); in round_fn4()
335 v[11] = add_128(v[11], v[15]); in round_fn4()
336 v[4] = xor_128(v[4], v[8]); in round_fn4()
337 v[5] = xor_128(v[5], v[9]); in round_fn4()
338 v[6] = xor_128(v[6], v[10]); in round_fn4()
339 v[7] = xor_128(v[7], v[11]); in round_fn4()
340 v[4] = rot12_128(v[4]); in round_fn4()
341 v[5] = rot12_128(v[5]); in round_fn4()
342 v[6] = rot12_128(v[6]); in round_fn4()
343 v[7] = rot12_128(v[7]); in round_fn4()
344 v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn4()
345 v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn4()
346 v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn4()
347 v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn4()
348 v[0] = add_128(v[0], v[4]); in round_fn4()
349 v[1] = add_128(v[1], v[5]); in round_fn4()
350 v[2] = add_128(v[2], v[6]); in round_fn4()
351 v[3] = add_128(v[3], v[7]); in round_fn4()
352 v[12] = xor_128(v[12], v[0]); in round_fn4()
353 v[13] = xor_128(v[13], v[1]); in round_fn4()
354 v[14] = xor_128(v[14], v[2]); in round_fn4()
355 v[15] = xor_128(v[15], v[3]); in round_fn4()
356 v[12] = rot8_128(v[12]); in round_fn4()
357 v[13] = rot8_128(v[13]); in round_fn4()
358 v[14] = rot8_128(v[14]); in round_fn4()
359 v[15] = rot8_128(v[15]); in round_fn4()
360 v[8] = add_128(v[8], v[12]); in round_fn4()
361 v[9] = add_128(v[9], v[13]); in round_fn4()
362 v[10] = add_128(v[10], v[14]); in round_fn4()
363 v[11] = add_128(v[11], v[15]); in round_fn4()
364 v[4] = xor_128(v[4], v[8]); in round_fn4()
365 v[5] = xor_128(v[5], v[9]); in round_fn4()
366 v[6] = xor_128(v[6], v[10]); in round_fn4()
367 v[7] = xor_128(v[7], v[11]); in round_fn4()
368 v[4] = rot7_128(v[4]); in round_fn4()
369 v[5] = rot7_128(v[5]); in round_fn4()
370 v[6] = rot7_128(v[6]); in round_fn4()
371 v[7] = rot7_128(v[7]); in round_fn4()
373 v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn4()
374 v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn4()
375 v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn4()
376 v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn4()
377 v[0] = add_128(v[0], v[5]); in round_fn4()
378 v[1] = add_128(v[1], v[6]); in round_fn4()
379 v[2] = add_128(v[2], v[7]); in round_fn4()
380 v[3] = add_128(v[3], v[4]); in round_fn4()
381 v[15] = xor_128(v[15], v[0]); in round_fn4()
382 v[12] = xor_128(v[12], v[1]); in round_fn4()
383 v[13] = xor_128(v[13], v[2]); in round_fn4()
384 v[14] = xor_128(v[14], v[3]); in round_fn4()
385 v[15] = rot16_128(v[15]); in round_fn4()
386 v[12] = rot16_128(v[12]); in round_fn4()
387 v[13] = rot16_128(v[13]); in round_fn4()
388 v[14] = rot16_128(v[14]); in round_fn4()
389 v[10] = add_128(v[10], v[15]); in round_fn4()
390 v[11] = add_128(v[11], v[12]); in round_fn4()
391 v[8] = add_128(v[8], v[13]); in round_fn4()
392 v[9] = add_128(v[9], v[14]); in round_fn4()
393 v[5] = xor_128(v[5], v[10]); in round_fn4()
394 v[6] = xor_128(v[6], v[11]); in round_fn4()
395 v[7] = xor_128(v[7], v[8]); in round_fn4()
396 v[4] = xor_128(v[4], v[9]); in round_fn4()
397 v[5] = rot12_128(v[5]); in round_fn4()
398 v[6] = rot12_128(v[6]); in round_fn4()
399 v[7] = rot12_128(v[7]); in round_fn4()
400 v[4] = rot12_128(v[4]); in round_fn4()
401 v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn4()
402 v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn4()
403 v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn4()
404 v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn4()
405 v[0] = add_128(v[0], v[5]); in round_fn4()
406 v[1] = add_128(v[1], v[6]); in round_fn4()
407 v[2] = add_128(v[2], v[7]); in round_fn4()
408 v[3] = add_128(v[3], v[4]); in round_fn4()
409 v[15] = xor_128(v[15], v[0]); in round_fn4()
410 v[12] = xor_128(v[12], v[1]); in round_fn4()
411 v[13] = xor_128(v[13], v[2]); in round_fn4()
412 v[14] = xor_128(v[14], v[3]); in round_fn4()
413 v[15] = rot8_128(v[15]); in round_fn4()
414 v[12] = rot8_128(v[12]); in round_fn4()
415 v[13] = rot8_128(v[13]); in round_fn4()
416 v[14] = rot8_128(v[14]); in round_fn4()
417 v[10] = add_128(v[10], v[15]); in round_fn4()
418 v[11] = add_128(v[11], v[12]); in round_fn4()
419 v[8] = add_128(v[8], v[13]); in round_fn4()
420 v[9] = add_128(v[9], v[14]); in round_fn4()
421 v[5] = xor_128(v[5], v[10]); in round_fn4()
422 v[6] = xor_128(v[6], v[11]); in round_fn4()
423 v[7] = xor_128(v[7], v[8]); in round_fn4()
424 v[4] = xor_128(v[4], v[9]); in round_fn4()
425 v[5] = rot7_128(v[5]); in round_fn4()
426 v[6] = rot7_128(v[6]); in round_fn4()
427 v[7] = rot7_128(v[7]); in round_fn4()
428 v[4] = rot7_128(v[4]); in round_fn4()
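Put back in scalar terms, the index pattern above is the usual BLAKE3 round: four column mixes followed by four diagonal mixes, with MSG_SCHEDULE[r] choosing the message words. Here each v[i] is a vector holding state word i of four independent inputs, so the scalar rendering below (illustrative; it reuses the hypothetical g() and rotr32() helpers sketched earlier) describes what happens in every lane:

    /* One scalar round matching round_fn4(): v[0] is mixed with
       v[4]/v[8]/v[12] in the column step and with v[5]/v[10]/v[15] in
       the diagonal step, and so on for the other columns and diagonals.
       `s` stands for MSG_SCHEDULE[r]. */
    static void round_scalar(uint32_t v[16], const uint32_t m[16],
                             const uint8_t s[16]) {
      /* Columns. */
      g(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
      g(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
      g(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
      g(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
      /* Diagonals. */
      g(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
      g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
      g(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
      g(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
    }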
437 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs_128()
438 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs_128()
449 vecs[3] = abcd_3; in transpose_vecs_128()
457 out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs4()
461 out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]); in transpose_msg_vecs4()
465 out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs4()
466 out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs4()
467 out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs4()
468 out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs4()
469 out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs4()
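These sixteen loads pull the current 64-byte block of each of the four inputs in 16-byte pieces; after the 4x4 word transposes (transpose_vecs_128, partly shown above) the vectors are word-sliced, so message vector w carries word w of all four inputs. A scalar model of the resulting layout (illustrative; the array shape and names are assumptions, and the native-endian load matches x86's little-endian order):

    #include <stdint.h>
    #include <string.h>

    /* out[w][lane] = 32-bit message word w (bytes 4*w .. 4*w+3 of the
       64-byte block) taken from input `lane`. */
    static void transpose_msg_words_model(uint32_t out[16][4],
                                          const uint8_t *const inputs[4],
                                          size_t block_offset) {
      for (size_t w = 0; w < 16; w++) {
        for (size_t lane = 0; lane < 4; lane++) {
          uint32_t word;
          memcpy(&word, &inputs[lane][block_offset + 4 * w], sizeof(word));
          out[w][lane] = word; /* native-endian load */
        }
      }
    }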
483 __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3); in load_counters4()
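The deltas vector gives each lane its own chunk counter. In scalar terms the intent appears to be the following (parameter and helper names here are assumptions): lane i uses counter + i when increment_counter is set, otherwise every lane uses the same counter, and each 64-bit value is split into the 32-bit halves that become state words 12 and 13.

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative scalar equivalent of the counter setup. */
    static void load_counters4_model(uint64_t counter, bool increment_counter,
                                     uint32_t lo[4], uint32_t hi[4]) {
      for (uint64_t i = 0; i < 4; i++) {
        uint64_t t = counter + (increment_counter ? i : 0);
        lo[i] = (uint32_t)t;         /* counter_low,  state word 12 */
        hi[i] = (uint32_t)(t >> 32); /* counter_high, state word 13 */
      }
    }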
497 set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), in blake3_hash4_avx512()
514 __m128i v[16] = { in blake3_hash4_avx512() local
515 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash4_avx512()
517 set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), in blake3_hash4_avx512()
520 round_fn4(v, msg_vecs, 0); in blake3_hash4_avx512()
521 round_fn4(v, msg_vecs, 1); in blake3_hash4_avx512()
522 round_fn4(v, msg_vecs, 2); in blake3_hash4_avx512()
523 round_fn4(v, msg_vecs, 3); in blake3_hash4_avx512()
524 round_fn4(v, msg_vecs, 4); in blake3_hash4_avx512()
525 round_fn4(v, msg_vecs, 5); in blake3_hash4_avx512()
526 round_fn4(v, msg_vecs, 6); in blake3_hash4_avx512()
527 h_vecs[0] = xor_128(v[0], v[8]); in blake3_hash4_avx512()
528 h_vecs[1] = xor_128(v[1], v[9]); in blake3_hash4_avx512()
529 h_vecs[2] = xor_128(v[2], v[10]); in blake3_hash4_avx512()
530 h_vecs[3] = xor_128(v[3], v[11]); in blake3_hash4_avx512()
531 h_vecs[4] = xor_128(v[4], v[12]); in blake3_hash4_avx512()
532 h_vecs[5] = xor_128(v[5], v[13]); in blake3_hash4_avx512()
533 h_vecs[6] = xor_128(v[6], v[14]); in blake3_hash4_avx512()
534 h_vecs[7] = xor_128(v[7], v[15]); in blake3_hash4_avx512()
546 storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]); in blake3_hash4_avx512()
549 storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]); in blake3_hash4_avx512()
559 INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) { in round_fn8()
560 v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn8()
561 v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn8()
562 v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn8()
563 v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn8()
564 v[0] = add_256(v[0], v[4]); in round_fn8()
565 v[1] = add_256(v[1], v[5]); in round_fn8()
566 v[2] = add_256(v[2], v[6]); in round_fn8()
567 v[3] = add_256(v[3], v[7]); in round_fn8()
568 v[12] = xor_256(v[12], v[0]); in round_fn8()
569 v[13] = xor_256(v[13], v[1]); in round_fn8()
570 v[14] = xor_256(v[14], v[2]); in round_fn8()
571 v[15] = xor_256(v[15], v[3]); in round_fn8()
572 v[12] = rot16_256(v[12]); in round_fn8()
573 v[13] = rot16_256(v[13]); in round_fn8()
574 v[14] = rot16_256(v[14]); in round_fn8()
575 v[15] = rot16_256(v[15]); in round_fn8()
576 v[8] = add_256(v[8], v[12]); in round_fn8()
577 v[9] = add_256(v[9], v[13]); in round_fn8()
578 v[10] = add_256(v[10], v[14]); in round_fn8()
579 v[11] = add_256(v[11], v[15]); in round_fn8()
580 v[4] = xor_256(v[4], v[8]); in round_fn8()
581 v[5] = xor_256(v[5], v[9]); in round_fn8()
582 v[6] = xor_256(v[6], v[10]); in round_fn8()
583 v[7] = xor_256(v[7], v[11]); in round_fn8()
584 v[4] = rot12_256(v[4]); in round_fn8()
585 v[5] = rot12_256(v[5]); in round_fn8()
586 v[6] = rot12_256(v[6]); in round_fn8()
587 v[7] = rot12_256(v[7]); in round_fn8()
588 v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn8()
589 v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn8()
590 v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn8()
591 v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn8()
592 v[0] = add_256(v[0], v[4]); in round_fn8()
593 v[1] = add_256(v[1], v[5]); in round_fn8()
594 v[2] = add_256(v[2], v[6]); in round_fn8()
595 v[3] = add_256(v[3], v[7]); in round_fn8()
596 v[12] = xor_256(v[12], v[0]); in round_fn8()
597 v[13] = xor_256(v[13], v[1]); in round_fn8()
598 v[14] = xor_256(v[14], v[2]); in round_fn8()
599 v[15] = xor_256(v[15], v[3]); in round_fn8()
600 v[12] = rot8_256(v[12]); in round_fn8()
601 v[13] = rot8_256(v[13]); in round_fn8()
602 v[14] = rot8_256(v[14]); in round_fn8()
603 v[15] = rot8_256(v[15]); in round_fn8()
604 v[8] = add_256(v[8], v[12]); in round_fn8()
605 v[9] = add_256(v[9], v[13]); in round_fn8()
606 v[10] = add_256(v[10], v[14]); in round_fn8()
607 v[11] = add_256(v[11], v[15]); in round_fn8()
608 v[4] = xor_256(v[4], v[8]); in round_fn8()
609 v[5] = xor_256(v[5], v[9]); in round_fn8()
610 v[6] = xor_256(v[6], v[10]); in round_fn8()
611 v[7] = xor_256(v[7], v[11]); in round_fn8()
612 v[4] = rot7_256(v[4]); in round_fn8()
613 v[5] = rot7_256(v[5]); in round_fn8()
614 v[6] = rot7_256(v[6]); in round_fn8()
615 v[7] = rot7_256(v[7]); in round_fn8()
617 v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn8()
618 v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn8()
619 v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn8()
620 v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn8()
621 v[0] = add_256(v[0], v[5]); in round_fn8()
622 v[1] = add_256(v[1], v[6]); in round_fn8()
623 v[2] = add_256(v[2], v[7]); in round_fn8()
624 v[3] = add_256(v[3], v[4]); in round_fn8()
625 v[15] = xor_256(v[15], v[0]); in round_fn8()
626 v[12] = xor_256(v[12], v[1]); in round_fn8()
627 v[13] = xor_256(v[13], v[2]); in round_fn8()
628 v[14] = xor_256(v[14], v[3]); in round_fn8()
629 v[15] = rot16_256(v[15]); in round_fn8()
630 v[12] = rot16_256(v[12]); in round_fn8()
631 v[13] = rot16_256(v[13]); in round_fn8()
632 v[14] = rot16_256(v[14]); in round_fn8()
633 v[10] = add_256(v[10], v[15]); in round_fn8()
634 v[11] = add_256(v[11], v[12]); in round_fn8()
635 v[8] = add_256(v[8], v[13]); in round_fn8()
636 v[9] = add_256(v[9], v[14]); in round_fn8()
637 v[5] = xor_256(v[5], v[10]); in round_fn8()
638 v[6] = xor_256(v[6], v[11]); in round_fn8()
639 v[7] = xor_256(v[7], v[8]); in round_fn8()
640 v[4] = xor_256(v[4], v[9]); in round_fn8()
641 v[5] = rot12_256(v[5]); in round_fn8()
642 v[6] = rot12_256(v[6]); in round_fn8()
643 v[7] = rot12_256(v[7]); in round_fn8()
644 v[4] = rot12_256(v[4]); in round_fn8()
645 v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn8()
646 v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn8()
647 v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn8()
648 v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn8()
649 v[0] = add_256(v[0], v[5]); in round_fn8()
650 v[1] = add_256(v[1], v[6]); in round_fn8()
651 v[2] = add_256(v[2], v[7]); in round_fn8()
652 v[3] = add_256(v[3], v[4]); in round_fn8()
653 v[15] = xor_256(v[15], v[0]); in round_fn8()
654 v[12] = xor_256(v[12], v[1]); in round_fn8()
655 v[13] = xor_256(v[13], v[2]); in round_fn8()
656 v[14] = xor_256(v[14], v[3]); in round_fn8()
657 v[15] = rot8_256(v[15]); in round_fn8()
658 v[12] = rot8_256(v[12]); in round_fn8()
659 v[13] = rot8_256(v[13]); in round_fn8()
660 v[14] = rot8_256(v[14]); in round_fn8()
661 v[10] = add_256(v[10], v[15]); in round_fn8()
662 v[11] = add_256(v[11], v[12]); in round_fn8()
663 v[8] = add_256(v[8], v[13]); in round_fn8()
664 v[9] = add_256(v[9], v[14]); in round_fn8()
665 v[5] = xor_256(v[5], v[10]); in round_fn8()
666 v[6] = xor_256(v[6], v[11]); in round_fn8()
667 v[7] = xor_256(v[7], v[8]); in round_fn8()
668 v[4] = xor_256(v[4], v[9]); in round_fn8()
669 v[5] = rot7_256(v[5]); in round_fn8()
670 v[6] = rot7_256(v[6]); in round_fn8()
671 v[7] = rot7_256(v[7]); in round_fn8()
672 v[4] = rot7_256(v[4]); in round_fn8()
680 __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs_256()
681 __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs_256()
702 vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20); in transpose_vecs_256()
714 out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]); in transpose_msg_vecs8()
722 out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]); in transpose_msg_vecs8()
738 __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); in load_counters8()
752 set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]), in blake3_hash8_avx512()
769 __m256i v[16] = { in blake3_hash8_avx512() local
770 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash8_avx512()
772 set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]), in blake3_hash8_avx512()
775 round_fn8(v, msg_vecs, 0); in blake3_hash8_avx512()
776 round_fn8(v, msg_vecs, 1); in blake3_hash8_avx512()
777 round_fn8(v, msg_vecs, 2); in blake3_hash8_avx512()
778 round_fn8(v, msg_vecs, 3); in blake3_hash8_avx512()
779 round_fn8(v, msg_vecs, 4); in blake3_hash8_avx512()
780 round_fn8(v, msg_vecs, 5); in blake3_hash8_avx512()
781 round_fn8(v, msg_vecs, 6); in blake3_hash8_avx512()
782 h_vecs[0] = xor_256(v[0], v[8]); in blake3_hash8_avx512()
783 h_vecs[1] = xor_256(v[1], v[9]); in blake3_hash8_avx512()
784 h_vecs[2] = xor_256(v[2], v[10]); in blake3_hash8_avx512()
785 h_vecs[3] = xor_256(v[3], v[11]); in blake3_hash8_avx512()
786 h_vecs[4] = xor_256(v[4], v[12]); in blake3_hash8_avx512()
787 h_vecs[5] = xor_256(v[5], v[13]); in blake3_hash8_avx512()
788 h_vecs[6] = xor_256(v[6], v[14]); in blake3_hash8_avx512()
789 h_vecs[7] = xor_256(v[7], v[15]); in blake3_hash8_avx512()
798 storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]); in blake3_hash8_avx512()
811 INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) { in round_fn16()
812 v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn16()
813 v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn16()
814 v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn16()
815 v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn16()
816 v[0] = add_512(v[0], v[4]); in round_fn16()
817 v[1] = add_512(v[1], v[5]); in round_fn16()
818 v[2] = add_512(v[2], v[6]); in round_fn16()
819 v[3] = add_512(v[3], v[7]); in round_fn16()
820 v[12] = xor_512(v[12], v[0]); in round_fn16()
821 v[13] = xor_512(v[13], v[1]); in round_fn16()
822 v[14] = xor_512(v[14], v[2]); in round_fn16()
823 v[15] = xor_512(v[15], v[3]); in round_fn16()
824 v[12] = rot16_512(v[12]); in round_fn16()
825 v[13] = rot16_512(v[13]); in round_fn16()
826 v[14] = rot16_512(v[14]); in round_fn16()
827 v[15] = rot16_512(v[15]); in round_fn16()
828 v[8] = add_512(v[8], v[12]); in round_fn16()
829 v[9] = add_512(v[9], v[13]); in round_fn16()
830 v[10] = add_512(v[10], v[14]); in round_fn16()
831 v[11] = add_512(v[11], v[15]); in round_fn16()
832 v[4] = xor_512(v[4], v[8]); in round_fn16()
833 v[5] = xor_512(v[5], v[9]); in round_fn16()
834 v[6] = xor_512(v[6], v[10]); in round_fn16()
835 v[7] = xor_512(v[7], v[11]); in round_fn16()
836 v[4] = rot12_512(v[4]); in round_fn16()
837 v[5] = rot12_512(v[5]); in round_fn16()
838 v[6] = rot12_512(v[6]); in round_fn16()
839 v[7] = rot12_512(v[7]); in round_fn16()
840 v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn16()
841 v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn16()
842 v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn16()
843 v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn16()
844 v[0] = add_512(v[0], v[4]); in round_fn16()
845 v[1] = add_512(v[1], v[5]); in round_fn16()
846 v[2] = add_512(v[2], v[6]); in round_fn16()
847 v[3] = add_512(v[3], v[7]); in round_fn16()
848 v[12] = xor_512(v[12], v[0]); in round_fn16()
849 v[13] = xor_512(v[13], v[1]); in round_fn16()
850 v[14] = xor_512(v[14], v[2]); in round_fn16()
851 v[15] = xor_512(v[15], v[3]); in round_fn16()
852 v[12] = rot8_512(v[12]); in round_fn16()
853 v[13] = rot8_512(v[13]); in round_fn16()
854 v[14] = rot8_512(v[14]); in round_fn16()
855 v[15] = rot8_512(v[15]); in round_fn16()
856 v[8] = add_512(v[8], v[12]); in round_fn16()
857 v[9] = add_512(v[9], v[13]); in round_fn16()
858 v[10] = add_512(v[10], v[14]); in round_fn16()
859 v[11] = add_512(v[11], v[15]); in round_fn16()
860 v[4] = xor_512(v[4], v[8]); in round_fn16()
861 v[5] = xor_512(v[5], v[9]); in round_fn16()
862 v[6] = xor_512(v[6], v[10]); in round_fn16()
863 v[7] = xor_512(v[7], v[11]); in round_fn16()
864 v[4] = rot7_512(v[4]); in round_fn16()
865 v[5] = rot7_512(v[5]); in round_fn16()
866 v[6] = rot7_512(v[6]); in round_fn16()
867 v[7] = rot7_512(v[7]); in round_fn16()
869 v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn16()
870 v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn16()
871 v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn16()
872 v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn16()
873 v[0] = add_512(v[0], v[5]); in round_fn16()
874 v[1] = add_512(v[1], v[6]); in round_fn16()
875 v[2] = add_512(v[2], v[7]); in round_fn16()
876 v[3] = add_512(v[3], v[4]); in round_fn16()
877 v[15] = xor_512(v[15], v[0]); in round_fn16()
878 v[12] = xor_512(v[12], v[1]); in round_fn16()
879 v[13] = xor_512(v[13], v[2]); in round_fn16()
880 v[14] = xor_512(v[14], v[3]); in round_fn16()
881 v[15] = rot16_512(v[15]); in round_fn16()
882 v[12] = rot16_512(v[12]); in round_fn16()
883 v[13] = rot16_512(v[13]); in round_fn16()
884 v[14] = rot16_512(v[14]); in round_fn16()
885 v[10] = add_512(v[10], v[15]); in round_fn16()
886 v[11] = add_512(v[11], v[12]); in round_fn16()
887 v[8] = add_512(v[8], v[13]); in round_fn16()
888 v[9] = add_512(v[9], v[14]); in round_fn16()
889 v[5] = xor_512(v[5], v[10]); in round_fn16()
890 v[6] = xor_512(v[6], v[11]); in round_fn16()
891 v[7] = xor_512(v[7], v[8]); in round_fn16()
892 v[4] = xor_512(v[4], v[9]); in round_fn16()
893 v[5] = rot12_512(v[5]); in round_fn16()
894 v[6] = rot12_512(v[6]); in round_fn16()
895 v[7] = rot12_512(v[7]); in round_fn16()
896 v[4] = rot12_512(v[4]); in round_fn16()
897 v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn16()
898 v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn16()
899 v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn16()
900 v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn16()
901 v[0] = add_512(v[0], v[5]); in round_fn16()
902 v[1] = add_512(v[1], v[6]); in round_fn16()
903 v[2] = add_512(v[2], v[7]); in round_fn16()
904 v[3] = add_512(v[3], v[4]); in round_fn16()
905 v[15] = xor_512(v[15], v[0]); in round_fn16()
906 v[12] = xor_512(v[12], v[1]); in round_fn16()
907 v[13] = xor_512(v[13], v[2]); in round_fn16()
908 v[14] = xor_512(v[14], v[3]); in round_fn16()
909 v[15] = rot8_512(v[15]); in round_fn16()
910 v[12] = rot8_512(v[12]); in round_fn16()
911 v[13] = rot8_512(v[13]); in round_fn16()
912 v[14] = rot8_512(v[14]); in round_fn16()
913 v[10] = add_512(v[10], v[15]); in round_fn16()
914 v[11] = add_512(v[11], v[12]); in round_fn16()
915 v[8] = add_512(v[8], v[13]); in round_fn16()
916 v[9] = add_512(v[9], v[14]); in round_fn16()
917 v[5] = xor_512(v[5], v[10]); in round_fn16()
918 v[6] = xor_512(v[6], v[11]); in round_fn16()
919 v[7] = xor_512(v[7], v[8]); in round_fn16()
920 v[4] = xor_512(v[4], v[9]); in round_fn16()
921 v[5] = rot7_512(v[5]); in round_fn16()
922 v[6] = rot7_512(v[6]); in round_fn16()
923 v[7] = rot7_512(v[7]); in round_fn16()
924 v[4] = rot7_512(v[4]); in round_fn16()
944 // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15. in transpose_vecs_512()
947 __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs_512()
948 __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs_512()
966 // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15. in transpose_vecs_512()
1008 vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3); in transpose_vecs_512()
1028 out[3] = loadu_512(&inputs[3][block_offset]); in transpose_msg_vecs16()
1050 const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); in load_counters16()
1066 set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]), in blake3_hash16_avx512()
1083 __m512i v[16] = { in blake3_hash16_avx512() local
1084 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash16_avx512()
1086 set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]), in blake3_hash16_avx512()
1089 round_fn16(v, msg_vecs, 0); in blake3_hash16_avx512()
1090 round_fn16(v, msg_vecs, 1); in blake3_hash16_avx512()
1091 round_fn16(v, msg_vecs, 2); in blake3_hash16_avx512()
1092 round_fn16(v, msg_vecs, 3); in blake3_hash16_avx512()
1093 round_fn16(v, msg_vecs, 4); in blake3_hash16_avx512()
1094 round_fn16(v, msg_vecs, 5); in blake3_hash16_avx512()
1095 round_fn16(v, msg_vecs, 6); in blake3_hash16_avx512()
1096 h_vecs[0] = xor_512(v[0], v[8]); in blake3_hash16_avx512()
1097 h_vecs[1] = xor_512(v[1], v[9]); in blake3_hash16_avx512()
1098 h_vecs[2] = xor_512(v[2], v[10]); in blake3_hash16_avx512()
1099 h_vecs[3] = xor_512(v[3], v[11]); in blake3_hash16_avx512()
1100 h_vecs[4] = xor_512(v[4], v[12]); in blake3_hash16_avx512()
1101 h_vecs[5] = xor_512(v[5], v[13]); in blake3_hash16_avx512()
1102 h_vecs[6] = xor_512(v[6], v[14]); in blake3_hash16_avx512()
1103 h_vecs[7] = xor_512(v[7], v[15]); in blake3_hash16_avx512()
1112 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash16_avx512()
1121 _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3])); in blake3_hash16_avx512()