Lines matching +full:0 +full:v (all hits are in blake3_sse41.c; each entry shows the source line number, the matched code, and the enclosing function)
32 x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); in rot16()
41 x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); in rot8()
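
The two _mm_set_epi8 byte patterns above implement 32-bit rotations without shift instructions: within every 32-bit lane they permute bytes so the lane is rotated right by 16 (rot16) or 8 (rot8) bits. A scalar model, as a sketch (my illustration, not part of the matched file):

  #include <stdint.h>

  // Each shuffled lane w equals rotr32(w, 16) after rot16(), and
  // rotr32(w, 8) after rot8(); a byte shuffle is cheaper than the
  // two-shifts-plus-OR a general rotate would need.
  static inline uint32_t rotr32(uint32_t w, uint32_t c) {
    return (w >> c) | (w << (32 - c));
  }
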
72 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); in diagonalize()
73 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in diagonalize()
74 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); in diagonalize()
78 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); in undiagonalize()
79 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in undiagonalize()
80 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); in undiagonalize()
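
These shuffles rotate three of the four state rows by one, two, and one lane so that each diagonal of the 4x4 BLAKE3 state lines up in a single SIMD column; row1 is left in place. A lane-level annotation (mine), assuming the usual layout row0 = {v0..v3} through row3 = {v12..v15}:

  // After diagonalize():
  //   row0 = {v3,  v0,  v1,  v2 }
  //   row1 = {v4,  v5,  v6,  v7 }   (unchanged)
  //   row2 = {v9,  v10, v11, v8 }
  //   row3 = {v14, v15, v12, v13}
  // Reading down column 1 gives the main diagonal (v0, v5, v10, v15), so the
  // same column-wise g1/g2 code also performs the diagonal half-round;
  // undiagonalize() applies the inverse shuffles afterward.
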
86 rows[0] = loadu((uint8_t *)&cv[0]); in compress_pre()
88 rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); in compress_pre()
92 __m128i m0 = loadu(&block[sizeof(__m128i) * 0]); in compress_pre()
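
State setup: rows[0..1] receive the 8-word chaining value, rows[2] the first four IV constants, and rows[3] (on neighboring lines that simply did not match this query) the counter, block length, and flags; m0..m3 hold the 64-byte block. A sketch of the elided loads, following the upstream layout:

  // rows[1] = loadu((uint8_t *)&cv[4]);   // second half of the CV
  // rows[3] = set4(counter_low(counter), counter_high(counter),
  //                (uint32_t)block_len, (uint32_t)flags);
  // m1..m3  = loadu(&block[sizeof(__m128i) * 1]);  // likewise * 2, * 3
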
101 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 in compress_pre()
102 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
104 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
105 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
106 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 in compress_pre()
107 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 in compress_pre()
108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
110 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 in compress_pre()
111 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
112 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
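
That was one full round: g1/g2 down the columns, diagonalize, g1/g2 down the diagonals, undiagonalize. The t0..t3 shuffles gather the round's message words four at a time (the end-of-line comments show which words, e.g. "6 4 2 0"); _mm_shuffle_ps2 is the file's helper for shuffling integer vectors through _mm_shuffle_ps casts. For reference, a scalar model of the G function that g1 and g2 split between them (my sketch, reusing rotr32 from above; mx/my are the round's two message words for that column):

  static inline void g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                       uint32_t mx, uint32_t my) {
    *a = *a + *b + mx;         // g1 half: first word, rot16, rot12
    *d = rotr32(*d ^ *a, 16);
    *c = *c + *d;
    *b = rotr32(*b ^ *c, 12);
    *a = *a + *b + my;         // g2 half: second word, rot8, rot7
    *d = rotr32(*d ^ *a, 8);
    *c = *c + *d;
    *b = rotr32(*b ^ *c, 7);
  }
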
121 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
122 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
124 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
125 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
126 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
127 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
129 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
130 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
131 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
134 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
135 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
136 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
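
This block was round 2; rounds 3 through 7 (ending at source lines 159, 182, 205, 228, and 251) repeat it verbatim below. That is not an extraction artifact: BLAKE3 derives each round's message order by re-applying one fixed permutation to the previous round's words, so the identical in-register data movement is correct every round.

  // The fixed permutation applied between rounds (from the BLAKE3 spec):
  static const uint8_t MSG_PERMUTATION[16] = {2, 6,  3,  10, 7, 0,  4,  13,
                                              1, 11, 12, 5,  9, 14, 15, 8};
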
144 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
145 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
147 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
148 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
149 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
150 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
152 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
153 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
154 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
157 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
158 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
159 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
167 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
168 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
170 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
171 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
172 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
173 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
175 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
176 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
177 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
180 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
181 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
182 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
190 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
191 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
193 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
194 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
195 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
196 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
198 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
199 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
200 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
203 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
204 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
205 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
213 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
214 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
216 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
217 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
218 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
219 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
221 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
222 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
223 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
226 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
227 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
228 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
236 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
237 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
239 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3)); in compress_pre()
240 t1 = _mm_blend_epi16(tt, t1, 0xCC); in compress_pre()
241 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
242 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
244 tt = _mm_blend_epi16(t2, m2, 0xC0); in compress_pre()
245 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
246 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
249 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
250 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
251 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
260 storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); in blake3_compress_in_place_sse41()
270 storeu(xorv(rows[0], rows[2]), &out[0]); in blake3_compress_xof_sse41()
272 storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); in blake3_compress_xof_sse41()
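
Output construction: the in-place variant overwrites the 32-byte chaining value, while the XOF variant emits 64 bytes, the second half derived from the pre-compression CV. The symmetric stores for rows[1]/rows[3] sit on the unmatched neighboring lines; the layout, following upstream, is:

  // compress_in_place: cv'         = rows[0..1] ^ rows[2..3]   (32 bytes)
  // compress_xof:      out[ 0..31] = rows[0..1] ^ rows[2..3]
  //                    out[32..63] = rows[2..3] ^ original cv
  // The extra XOR against the input cv is what widens the result to
  // 64 bytes for extendable-output mode.
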
276 INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { in round_fn()
277 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn()
278 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn()
279 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn()
280 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn()
281 v[0] = addv(v[0], v[4]); in round_fn()
282 v[1] = addv(v[1], v[5]); in round_fn()
283 v[2] = addv(v[2], v[6]); in round_fn()
284 v[3] = addv(v[3], v[7]); in round_fn()
285 v[12] = xorv(v[12], v[0]); in round_fn()
286 v[13] = xorv(v[13], v[1]); in round_fn()
287 v[14] = xorv(v[14], v[2]); in round_fn()
288 v[15] = xorv(v[15], v[3]); in round_fn()
289 v[12] = rot16(v[12]); in round_fn()
290 v[13] = rot16(v[13]); in round_fn()
291 v[14] = rot16(v[14]); in round_fn()
292 v[15] = rot16(v[15]); in round_fn()
293 v[8] = addv(v[8], v[12]); in round_fn()
294 v[9] = addv(v[9], v[13]); in round_fn()
295 v[10] = addv(v[10], v[14]); in round_fn()
296 v[11] = addv(v[11], v[15]); in round_fn()
297 v[4] = xorv(v[4], v[8]); in round_fn()
298 v[5] = xorv(v[5], v[9]); in round_fn()
299 v[6] = xorv(v[6], v[10]); in round_fn()
300 v[7] = xorv(v[7], v[11]); in round_fn()
301 v[4] = rot12(v[4]); in round_fn()
302 v[5] = rot12(v[5]); in round_fn()
303 v[6] = rot12(v[6]); in round_fn()
304 v[7] = rot12(v[7]); in round_fn()
305 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn()
306 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn()
307 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn()
308 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn()
309 v[0] = addv(v[0], v[4]); in round_fn()
310 v[1] = addv(v[1], v[5]); in round_fn()
311 v[2] = addv(v[2], v[6]); in round_fn()
312 v[3] = addv(v[3], v[7]); in round_fn()
313 v[12] = xorv(v[12], v[0]); in round_fn()
314 v[13] = xorv(v[13], v[1]); in round_fn()
315 v[14] = xorv(v[14], v[2]); in round_fn()
316 v[15] = xorv(v[15], v[3]); in round_fn()
317 v[12] = rot8(v[12]); in round_fn()
318 v[13] = rot8(v[13]); in round_fn()
319 v[14] = rot8(v[14]); in round_fn()
320 v[15] = rot8(v[15]); in round_fn()
321 v[8] = addv(v[8], v[12]); in round_fn()
322 v[9] = addv(v[9], v[13]); in round_fn()
323 v[10] = addv(v[10], v[14]); in round_fn()
324 v[11] = addv(v[11], v[15]); in round_fn()
325 v[4] = xorv(v[4], v[8]); in round_fn()
326 v[5] = xorv(v[5], v[9]); in round_fn()
327 v[6] = xorv(v[6], v[10]); in round_fn()
328 v[7] = xorv(v[7], v[11]); in round_fn()
329 v[4] = rot7(v[4]); in round_fn()
330 v[5] = rot7(v[5]); in round_fn()
331 v[6] = rot7(v[6]); in round_fn()
332 v[7] = rot7(v[7]); in round_fn()
334 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn()
335 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn()
336 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn()
337 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn()
338 v[0] = addv(v[0], v[5]); in round_fn()
339 v[1] = addv(v[1], v[6]); in round_fn()
340 v[2] = addv(v[2], v[7]); in round_fn()
341 v[3] = addv(v[3], v[4]); in round_fn()
342 v[15] = xorv(v[15], v[0]); in round_fn()
343 v[12] = xorv(v[12], v[1]); in round_fn()
344 v[13] = xorv(v[13], v[2]); in round_fn()
345 v[14] = xorv(v[14], v[3]); in round_fn()
346 v[15] = rot16(v[15]); in round_fn()
347 v[12] = rot16(v[12]); in round_fn()
348 v[13] = rot16(v[13]); in round_fn()
349 v[14] = rot16(v[14]); in round_fn()
350 v[10] = addv(v[10], v[15]); in round_fn()
351 v[11] = addv(v[11], v[12]); in round_fn()
352 v[8] = addv(v[8], v[13]); in round_fn()
353 v[9] = addv(v[9], v[14]); in round_fn()
354 v[5] = xorv(v[5], v[10]); in round_fn()
355 v[6] = xorv(v[6], v[11]); in round_fn()
356 v[7] = xorv(v[7], v[8]); in round_fn()
357 v[4] = xorv(v[4], v[9]); in round_fn()
358 v[5] = rot12(v[5]); in round_fn()
359 v[6] = rot12(v[6]); in round_fn()
360 v[7] = rot12(v[7]); in round_fn()
361 v[4] = rot12(v[4]); in round_fn()
362 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn()
363 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn()
364 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn()
365 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn()
366 v[0] = addv(v[0], v[5]); in round_fn()
367 v[1] = addv(v[1], v[6]); in round_fn()
368 v[2] = addv(v[2], v[7]); in round_fn()
369 v[3] = addv(v[3], v[4]); in round_fn()
370 v[15] = xorv(v[15], v[0]); in round_fn()
371 v[12] = xorv(v[12], v[1]); in round_fn()
372 v[13] = xorv(v[13], v[2]); in round_fn()
373 v[14] = xorv(v[14], v[3]); in round_fn()
374 v[15] = rot8(v[15]); in round_fn()
375 v[12] = rot8(v[12]); in round_fn()
376 v[13] = rot8(v[13]); in round_fn()
377 v[14] = rot8(v[14]); in round_fn()
378 v[10] = addv(v[10], v[15]); in round_fn()
379 v[11] = addv(v[11], v[12]); in round_fn()
380 v[8] = addv(v[8], v[13]); in round_fn()
381 v[9] = addv(v[9], v[14]); in round_fn()
382 v[5] = xorv(v[5], v[10]); in round_fn()
383 v[6] = xorv(v[6], v[11]); in round_fn()
384 v[7] = xorv(v[7], v[8]); in round_fn()
385 v[4] = xorv(v[4], v[9]); in round_fn()
386 v[5] = rot7(v[5]); in round_fn()
387 v[6] = rot7(v[6]); in round_fn()
388 v[7] = rot7(v[7]); in round_fn()
389 v[4] = rot7(v[4]); in round_fn()
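
This round_fn is the transposed 4-way form used below by blake3_hash4_sse41: v[i] holds state word i for four independent inputs, one per SSE lane, so no diagonalize()/undiagonalize() shuffles are needed; the diagonal half-round just re-pairs the indices, as the listing above shows:

  // First half, columns:    (v0,v4,v8,v12)  (v1,v5,v9,v13)
  //                         (v2,v6,v10,v14) (v3,v7,v11,v15)
  // Second half, diagonals: (v0,v5,v10,v15) (v1,v6,v11,v12)
  //                         (v2,v7,v8,v13)  (v3,v4,v9,v14)
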
396 __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); in transpose_vecs()
397 __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); in transpose_vecs()
407 vecs[0] = abcd_0; in transpose_vecs()
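
transpose_vecs is a standard 4x4 transpose of 32-bit lanes done in two interleaving levels; only the ab-pair unpacks and the first writeback matched the query. The whole data movement, as an annotation:

  // ab_01 = [a0 b0 a1 b1]   ab_23 = [a2 b2 a3 b3]
  // cd_01 = [c0 d0 c1 d1]   cd_23 = [c2 d2 c3 d3]
  // Interleaving those pairwise at 64-bit width yields
  // abcd_0 = [a0 b0 c0 d0] ... abcd_3 = [a3 b3 c3 d3], i.e. rows -> columns.
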
415 out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
416 out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
417 out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
418 out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
419 out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]); in transpose_msg_vecs()
423 out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
427 out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs()
431 for (size_t i = 0; i < 4; ++i) { in transpose_msg_vecs()
434 transpose_vecs(&out[0]); in transpose_msg_vecs()
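
transpose_msg_vecs gathers the block input-major, prefetches upcoming input bytes in the small loop at source line 431, and then transposes each group of four vectors (the calls for &out[4], &out[8], and &out[12] follow the matched one). Indexing annotation:

  // Before: out[4*g + i] = 16-byte group g of input i's current block.
  // After transpose_vecs(&out[4*g]): out[4*g + w] holds message word 4g+w
  // of all four inputs, one per lane -- the layout round_fn() consumes.
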
443 const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); in load_counters()
446 __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), in load_counters()
447 _mm_xor_si128(l, _mm_set1_epi32(0x80000000))); in load_counters()
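
load_counters materializes counter+0..3 across the four lanes. SSE4.1 has no unsigned 32-bit compare, so carry out of the low word is detected by flipping the sign bit of both operands of a signed compare. A scalar model of the trick (my sketch):

  // add1 is the per-lane increment (0..3, or all zeros when the counter is
  // not incremented); l = low + add1. Unsigned "add1 > l" means the add
  // wrapped, so the high word must be bumped.
  static inline uint32_t add_with_carry(uint32_t low, uint32_t inc,
                                        uint32_t *high) {
    uint32_t l = low + inc;
    if ((int32_t)(inc ^ 0x80000000u) > (int32_t)(l ^ 0x80000000u))
      (*high)++;   // the vector code subtracts the all-ones compare mask
    return l;
  }
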
459 set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), in blake3_hash4_sse41()
467 for (size_t block = 0; block < blocks; block++) { in blake3_hash4_sse41()
476 __m128i v[16] = { in blake3_hash4_sse41() local
477 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash4_sse41()
479 set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), in blake3_hash4_sse41()
482 round_fn(v, msg_vecs, 0); in blake3_hash4_sse41()
483 round_fn(v, msg_vecs, 1); in blake3_hash4_sse41()
484 round_fn(v, msg_vecs, 2); in blake3_hash4_sse41()
485 round_fn(v, msg_vecs, 3); in blake3_hash4_sse41()
486 round_fn(v, msg_vecs, 4); in blake3_hash4_sse41()
487 round_fn(v, msg_vecs, 5); in blake3_hash4_sse41()
488 round_fn(v, msg_vecs, 6); in blake3_hash4_sse41()
489 h_vecs[0] = xorv(v[0], v[8]); in blake3_hash4_sse41()
490 h_vecs[1] = xorv(v[1], v[9]); in blake3_hash4_sse41()
491 h_vecs[2] = xorv(v[2], v[10]); in blake3_hash4_sse41()
492 h_vecs[3] = xorv(v[3], v[11]); in blake3_hash4_sse41()
493 h_vecs[4] = xorv(v[4], v[12]); in blake3_hash4_sse41()
494 h_vecs[5] = xorv(v[5], v[13]); in blake3_hash4_sse41()
495 h_vecs[6] = xorv(v[6], v[14]); in blake3_hash4_sse41()
496 h_vecs[7] = xorv(v[7], v[15]); in blake3_hash4_sse41()
501 transpose_vecs(&h_vecs[0]); in blake3_hash4_sse41()
505 storeu(h_vecs[0], &out[0 * sizeof(__m128i)]); in blake3_hash4_sse41()
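
The feed-forward h_vecs[i] = v[i] ^ v[8+i] is the whole next chaining value: unlike BLAKE2, BLAKE3 does not XOR the old CV back into this truncated output. The two transposes convert back from word-major to one-hash-per-vector, and the eight 16-byte stores (only the first matched) interleave the halves:

  // After transpose_vecs(&h_vecs[0]) and transpose_vecs(&h_vecs[4]):
  //   h_vecs[j]   = bytes  0..15 of output j
  //   h_vecs[4+j] = bytes 16..31 of output j
  // so output j lands at out[32*j .. 32*j + 31].
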
522 while (blocks > 0) { in hash_one_sse41()
550 while (num_inputs > 0) { in blake3_hash_many_sse41()
551 hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start, in blake3_hash_many_sse41()
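
blake3_hash_many_sse41 is the dispatch layer: a first loop (not in this match set) consumes inputs four at a time with blake3_hash4_sse41, and the matched while (num_inputs > 0) loop hashes the remainder serially through hash_one_sse41, whose while (blocks > 0) loop compresses in place, applying flags_start on a chunk's first block and flags_end on its last. Structural sketch, following upstream:

  // while (num_inputs >= 4) {                  // 4-way SIMD path
  //   blake3_hash4_sse41(inputs, blocks, key, counter, ...);
  //   inputs += 4; num_inputs -= 4; out += 4 * BLAKE3_OUT_LEN;
  // }
  // while (num_inputs > 0) { ... }             // serial tail, matched above
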