Lines matching +full:2 +full:v: code-search hits in the BLAKE3 SSE4.1 compression code (blake3_sse41.c in the reference C implementation). Each hit is shown as its source line number, the matched line, and the enclosing function.

32 x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); in rot16()
41 x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); in rot8()
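
These shuffle masks implement the 32-bit lane rotations rot16() and rot8() as pure byte permutations. As a reading aid, here is a scalar model of the same operation (rotr32 is a hypothetical helper name, not part of the file):

#include <stdint.h>

/* Rotate one 32-bit word right by n bits (0 < n < 32). rot16() and rot8()
 * above do this for four lanes at once; because n is a multiple of 8,
 * _mm_shuffle_epi8 can express the rotation as a single byte shuffle. */
static inline uint32_t rotr32(uint32_t w, unsigned n) {
  return (w >> n) | (w << (32 - n));
}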
72 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3)); in diagonalize()
73 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in diagonalize()
74 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); in diagonalize()
78 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); in undiagonalize()
79 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in undiagonalize()
80 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); in undiagonalize()
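
diagonalize() and undiagonalize() rotate rows 0, 2, and 3 of the 4x4 state so the same column-oriented G code can also mix the diagonals. Decoding the _MM_SHUFFLE masks above gives lane rotations of 1, 2, and 3 positions; a scalar sketch (rotate_lanes is an invented name):

#include <stdint.h>

/* Move each lane of a 4-word row from index i to index (i + by) & 3.
 * With the masks above: diagonalize() is by = 1 for row0, 2 for row3,
 * and 3 for row2; undiagonalize() applies the inverse rotations. */
static void rotate_lanes(uint32_t row[4], int by) {
  uint32_t tmp[4];
  for (int i = 0; i < 4; i++) tmp[(i + by) & 3] = row[i];
  for (int i = 0; i < 4; i++) row[i] = tmp[i];
}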
88 rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); in compress_pre()
94 __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); in compress_pre()
101 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 in compress_pre()
102 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
104 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
105 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
106 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 in compress_pre()
107 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 in compress_pre()
108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
110 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 in compress_pre()
111 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
112 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
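
_mm_shuffle_ps2, used for every message-word selection in these rounds, is not a hardware intrinsic. In the reference file it is a helper macro along these lines (reconstructed, so treat the exact spelling as an assumption):

#include <immintrin.h>

/* Pick two 32-bit lanes from a and two from b via the float shuffle;
 * the casts cost nothing, only the lane selection matters. */
#define _mm_shuffle_ps2(a, b, c)                                         \
  (_mm_castps_si128(                                                     \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))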
118 // Round 2. This round and all following rounds apply a fixed permutation in compress_pre()
120 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
121 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
122 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
123 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
126 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
127 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
130 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
131 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
134 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
135 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
136 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
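
The fixed permutation that the round-2 comment refers to is BLAKE3's message permutation from the spec. Each later round selects its words by applying it once more, which is why rounds 2 through 7 below use an identical shuffle sequence. In scalar form:

#include <stdint.h>

/* BLAKE3's message permutation (per the spec). Applying it to the previous
 * round's word order yields the next round's order. */
static const uint8_t MSG_PERMUTATION[16] = {2, 6,  3,  10, 7, 0,  4,  13,
                                            1, 11, 12, 5,  9, 14, 15, 8};

static void permute_words(uint8_t order[16]) {
  uint8_t next[16];
  for (int i = 0; i < 16; i++) next[i] = order[MSG_PERMUTATION[i]];
  for (int i = 0; i < 16; i++) order[i] = next[i];
}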
143 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
144 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
145 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
146 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
149 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
150 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
153 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
154 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
157 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
158 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
159 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
166 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
167 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
168 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
169 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
172 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
173 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
176 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
177 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
180 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
181 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
182 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
189 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
190 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
191 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
192 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
195 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
196 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
199 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
200 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
203 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
204 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
205 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
212 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
213 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
214 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
215 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
218 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
219 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
222 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
223 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
226 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
227 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
228 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
235 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
236 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
237 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
238 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
241 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
242 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
245 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
246 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
249 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
250 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
251 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
260 storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); in blake3_compress_in_place_sse41()
270 storeu(xorv(rows[0], rows[2]), &out[0]); in blake3_compress_xof_sse41()
272 storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); in blake3_compress_xof_sse41()
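
The stores above are the compression finalization in the two callers: blake3_compress_in_place_sse41() keeps only the new chaining value (the top half of the state XORed into the bottom half), while blake3_compress_xof_sse41() also folds the input CV into the top half for the second 32 bytes of output. A scalar model (function and parameter names are invented):

#include <stdint.h>

/* state is the 16-word post-round state, cv the 8-word input chaining
 * value, out the 16-word (64-byte) XOF output. */
static void finalize_xof(const uint32_t state[16], const uint32_t cv[8],
                         uint32_t out[16]) {
  for (int i = 0; i < 8; i++) out[i] = state[i] ^ state[i + 8];  /* new CV */
  for (int i = 0; i < 8; i++) out[i + 8] = state[i + 8] ^ cv[i]; /* XOF tail */
}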
276 INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { in round_fn()
277 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn()
278 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn()
279 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn()
280 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn()
281 v[0] = addv(v[0], v[4]); in round_fn()
282 v[1] = addv(v[1], v[5]); in round_fn()
283 v[2] = addv(v[2], v[6]); in round_fn()
284 v[3] = addv(v[3], v[7]); in round_fn()
285 v[12] = xorv(v[12], v[0]); in round_fn()
286 v[13] = xorv(v[13], v[1]); in round_fn()
287 v[14] = xorv(v[14], v[2]); in round_fn()
288 v[15] = xorv(v[15], v[3]); in round_fn()
289 v[12] = rot16(v[12]); in round_fn()
290 v[13] = rot16(v[13]); in round_fn()
291 v[14] = rot16(v[14]); in round_fn()
292 v[15] = rot16(v[15]); in round_fn()
293 v[8] = addv(v[8], v[12]); in round_fn()
294 v[9] = addv(v[9], v[13]); in round_fn()
295 v[10] = addv(v[10], v[14]); in round_fn()
296 v[11] = addv(v[11], v[15]); in round_fn()
297 v[4] = xorv(v[4], v[8]); in round_fn()
298 v[5] = xorv(v[5], v[9]); in round_fn()
299 v[6] = xorv(v[6], v[10]); in round_fn()
300 v[7] = xorv(v[7], v[11]); in round_fn()
301 v[4] = rot12(v[4]); in round_fn()
302 v[5] = rot12(v[5]); in round_fn()
303 v[6] = rot12(v[6]); in round_fn()
304 v[7] = rot12(v[7]); in round_fn()
305 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn()
306 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn()
307 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn()
308 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn()
309 v[0] = addv(v[0], v[4]); in round_fn()
310 v[1] = addv(v[1], v[5]); in round_fn()
311 v[2] = addv(v[2], v[6]); in round_fn()
312 v[3] = addv(v[3], v[7]); in round_fn()
313 v[12] = xorv(v[12], v[0]); in round_fn()
314 v[13] = xorv(v[13], v[1]); in round_fn()
315 v[14] = xorv(v[14], v[2]); in round_fn()
316 v[15] = xorv(v[15], v[3]); in round_fn()
317 v[12] = rot8(v[12]); in round_fn()
318 v[13] = rot8(v[13]); in round_fn()
319 v[14] = rot8(v[14]); in round_fn()
320 v[15] = rot8(v[15]); in round_fn()
321 v[8] = addv(v[8], v[12]); in round_fn()
322 v[9] = addv(v[9], v[13]); in round_fn()
323 v[10] = addv(v[10], v[14]); in round_fn()
324 v[11] = addv(v[11], v[15]); in round_fn()
325 v[4] = xorv(v[4], v[8]); in round_fn()
326 v[5] = xorv(v[5], v[9]); in round_fn()
327 v[6] = xorv(v[6], v[10]); in round_fn()
328 v[7] = xorv(v[7], v[11]); in round_fn()
329 v[4] = rot7(v[4]); in round_fn()
330 v[5] = rot7(v[5]); in round_fn()
331 v[6] = rot7(v[6]); in round_fn()
332 v[7] = rot7(v[7]); in round_fn()
334 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn()
335 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn()
336 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn()
337 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn()
338 v[0] = addv(v[0], v[5]); in round_fn()
339 v[1] = addv(v[1], v[6]); in round_fn()
340 v[2] = addv(v[2], v[7]); in round_fn()
341 v[3] = addv(v[3], v[4]); in round_fn()
342 v[15] = xorv(v[15], v[0]); in round_fn()
343 v[12] = xorv(v[12], v[1]); in round_fn()
344 v[13] = xorv(v[13], v[2]); in round_fn()
345 v[14] = xorv(v[14], v[3]); in round_fn()
346 v[15] = rot16(v[15]); in round_fn()
347 v[12] = rot16(v[12]); in round_fn()
348 v[13] = rot16(v[13]); in round_fn()
349 v[14] = rot16(v[14]); in round_fn()
350 v[10] = addv(v[10], v[15]); in round_fn()
351 v[11] = addv(v[11], v[12]); in round_fn()
352 v[8] = addv(v[8], v[13]); in round_fn()
353 v[9] = addv(v[9], v[14]); in round_fn()
354 v[5] = xorv(v[5], v[10]); in round_fn()
355 v[6] = xorv(v[6], v[11]); in round_fn()
356 v[7] = xorv(v[7], v[8]); in round_fn()
357 v[4] = xorv(v[4], v[9]); in round_fn()
358 v[5] = rot12(v[5]); in round_fn()
359 v[6] = rot12(v[6]); in round_fn()
360 v[7] = rot12(v[7]); in round_fn()
361 v[4] = rot12(v[4]); in round_fn()
362 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn()
363 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn()
364 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn()
365 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn()
366 v[0] = addv(v[0], v[5]); in round_fn()
367 v[1] = addv(v[1], v[6]); in round_fn()
368 v[2] = addv(v[2], v[7]); in round_fn()
369 v[3] = addv(v[3], v[4]); in round_fn()
370 v[15] = xorv(v[15], v[0]); in round_fn()
371 v[12] = xorv(v[12], v[1]); in round_fn()
372 v[13] = xorv(v[13], v[2]); in round_fn()
373 v[14] = xorv(v[14], v[3]); in round_fn()
374 v[15] = rot8(v[15]); in round_fn()
375 v[12] = rot8(v[12]); in round_fn()
376 v[13] = rot8(v[13]); in round_fn()
377 v[14] = rot8(v[14]); in round_fn()
378 v[10] = addv(v[10], v[15]); in round_fn()
379 v[11] = addv(v[11], v[12]); in round_fn()
380 v[8] = addv(v[8], v[13]); in round_fn()
381 v[9] = addv(v[9], v[14]); in round_fn()
382 v[5] = xorv(v[5], v[10]); in round_fn()
383 v[6] = xorv(v[6], v[11]); in round_fn()
384 v[7] = xorv(v[7], v[8]); in round_fn()
385 v[4] = xorv(v[4], v[9]); in round_fn()
386 v[5] = rot7(v[5]); in round_fn()
387 v[6] = rot7(v[6]); in round_fn()
388 v[7] = rot7(v[7]); in round_fn()
389 v[4] = rot7(v[4]); in round_fn()
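
round_fn() is the 4-way form of the BLAKE3 round: each v[i] holds state word i of four independent inputs, the first half of the function runs the quarter-round on the four columns, and the second half runs it on the four diagonals (hence v[0] pairing with v[5], v[10], v[15]). For reference, the scalar quarter-round it vectorizes (rotr32 as in the earlier sketch):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t w, unsigned n) {
  return (w >> n) | (w << (32 - n));
}

/* The BLAKE3 quarter-round G, per the spec. */
static void g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
              uint32_t mx, uint32_t my) {
  *a = *a + *b + mx;
  *d = rotr32(*d ^ *a, 16);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 12);
  *a = *a + *b + my;
  *d = rotr32(*d ^ *a, 8);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 7);
}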
398 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs()
399 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs()
409 vecs[2] = abcd_2; in transpose_vecs()
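
The unpack hits above are the first step of a standard in-place 4x4 transpose of 32-bit lanes; 64-bit unpacks then finish the job, and the writeback stores the finished rows. The whole function plausibly looks like this (reconstructed from the usual idiom and the fragments above):

#include <immintrin.h>

/* Transpose four rows of four 32-bit lanes in place. */
static void transpose_vecs(__m128i vecs[4]) {
  /* Interleave 32-bit lanes: ab_01 = a0 b0 a1 b1, ab_23 = a2 b2 a3 b3, ... */
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  /* Interleave 64-bit lanes: row i now holds lane i of every input row. */
  vecs[0] = _mm_unpacklo_epi64(ab_01, cd_01);
  vecs[1] = _mm_unpackhi_epi64(ab_01, cd_01);
  vecs[2] = _mm_unpacklo_epi64(ab_23, cd_23);
  vecs[3] = _mm_unpackhi_epi64(ab_23, cd_23);
}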
417 out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
421 out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); in transpose_msg_vecs()
423 out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
424 out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
425 out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
426 out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
429 out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs()
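
The loads above gather one 16-byte quarter of each input's 64-byte block; after four transpose_vecs() calls (one per group of four rows), out[w] holds message word w of all four inputs, one input per lane. A scalar model of the end result (gather_words is an invented name; like the SIMD loads, it assumes a little-endian target):

#include <stdint.h>
#include <string.h>

/* Word w of input i lands in lane i of message vector w. */
static void gather_words(const uint8_t *const inputs[4], size_t block_offset,
                         uint32_t out[16][4]) {
  for (int w = 0; w < 16; w++)
    for (int i = 0; i < 4; i++)
      memcpy(&out[w][i], &inputs[i][block_offset + 4 * (size_t)w], 4);
}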
443 const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); in load_counters()
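
The (3, 2, 1, 0) constant is the per-lane counter offset: when counters increment, input i hashes with counter + i, split into 32-bit low and high vectors. A scalar model of load_counters() (the _model suffix marks it as invented):

#include <stdbool.h>
#include <stdint.h>

/* Lane i of lo/hi holds the split 64-bit counter for input i; add0 above
 * supplies the +i offsets, masked to zero when increment_counter is false. */
static void load_counters_model(uint64_t counter, bool increment_counter,
                                uint32_t lo[4], uint32_t hi[4]) {
  for (int i = 0; i < 4; i++) {
    uint64_t t = counter + (increment_counter ? (uint64_t)i : 0);
    lo[i] = (uint32_t)t;
    hi[i] = (uint32_t)(t >> 32);
  }
}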
459 set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), in blake3_hash4_sse41()
476 __m128i v[16] = { in blake3_hash4_sse41() local
477 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash4_sse41()
479 set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), in blake3_hash4_sse41()
482 round_fn(v, msg_vecs, 0); in blake3_hash4_sse41()
483 round_fn(v, msg_vecs, 1); in blake3_hash4_sse41()
484 round_fn(v, msg_vecs, 2); in blake3_hash4_sse41()
485 round_fn(v, msg_vecs, 3); in blake3_hash4_sse41()
486 round_fn(v, msg_vecs, 4); in blake3_hash4_sse41()
487 round_fn(v, msg_vecs, 5); in blake3_hash4_sse41()
488 round_fn(v, msg_vecs, 6); in blake3_hash4_sse41()
489 h_vecs[0] = xorv(v[0], v[8]); in blake3_hash4_sse41()
490 h_vecs[1] = xorv(v[1], v[9]); in blake3_hash4_sse41()
491 h_vecs[2] = xorv(v[2], v[10]); in blake3_hash4_sse41()
492 h_vecs[3] = xorv(v[3], v[11]); in blake3_hash4_sse41()
493 h_vecs[4] = xorv(v[4], v[12]); in blake3_hash4_sse41()
494 h_vecs[5] = xorv(v[5], v[13]); in blake3_hash4_sse41()
495 h_vecs[6] = xorv(v[6], v[14]); in blake3_hash4_sse41()
496 h_vecs[7] = xorv(v[7], v[15]); in blake3_hash4_sse41()
507 storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); in blake3_hash4_sse41()
509 storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); in blake3_hash4_sse41()
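
The interleaved store order in the last two hits (h_vecs[1] at byte offset 32, h_vecs[2] at 64) is the output transpose: h_vecs[0..3] and h_vecs[4..7] are each transposed, after which h_vecs[i] and h_vecs[4 + i] hold words 0..3 and 4..7 of input i's chaining value, stored contiguously. A sketch of the function's tail (reconstructed, not the verbatim source, which unrolls the loop; storeu is the file's unaligned-store helper):

transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
for (int i = 0; i < 4; i++) {
  /* 32 bytes of output per input: words 0..3, then words 4..7. */
  storeu(h_vecs[i],     &out[(2 * i + 0) * sizeof(__m128i)]);
  storeu(h_vecs[4 + i], &out[(2 * i + 1) * sizeof(__m128i)]);
}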