Lines Matching +full:2 +full:v

70 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));  in diagonalize()
71 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in diagonalize()
72 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1)); in diagonalize()
76 *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1)); in undiagonalize()
77 *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2)); in undiagonalize()
78 *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3)); in undiagonalize()
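
Note: diagonalize() rotates rows 0, 2, and 3 of the 4x4 state, leaving row 1 in place, so the diagonal half of each round can reuse the same column-wise G code; undiagonalize() applies the inverse shuffles. With _MM_SHUFFLE(z, y, x, w), destination lanes 0..3 take source lanes w, x, y, z. A scalar model of that semantics (names illustrative, not from the source file):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_shuffle_epi32(src, _MM_SHUFFLE(z, y, x, w)). */
static void shuffle_epi32_model(uint32_t dst[4], const uint32_t src[4],
                                int z, int y, int x, int w) {
  uint32_t t[4] = {src[w], src[x], src[y], src[z]};
  for (int i = 0; i < 4; i++) dst[i] = t[i];
}

int main(void) {
  uint32_t row[4] = {10, 11, 12, 13}, out[4];
  shuffle_epi32_model(out, row, 2, 1, 0, 3); /* diagonalize()'s row0 constant */
  /* Prints 13 10 11 12: every lane moves up one position, and the
     _MM_SHUFFLE(0, 3, 2, 1) in undiagonalize() is its exact inverse. */
  printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
  return 0;
}
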
94 rows[2] = set4(IV[0], IV[1], IV[2], IV[3]); in compress_pre()
100 __m128i m2 = loadu(&block[sizeof(__m128i) * 2]); in compress_pre()
107 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0 in compress_pre()
108 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
110 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
111 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
112 t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8 in compress_pre()
113 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14 in compress_pre()
114 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
116 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15 in compress_pre()
117 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
118 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
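
Note: round 1's g1 call consumes message words 0, 2, 4, 6; the trailing comments list the gathered lanes from high to low. _mm_shuffle_ps2 is the file's two-input shuffle helper: the low two destination lanes are picked from the first argument, the high two from the second. A scalar model (names illustrative):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_shuffle_ps2(a, b, _MM_SHUFFLE(z, y, x, w)). */
static void shuffle_ps2_model(uint32_t dst[4], const uint32_t a[4],
                              const uint32_t b[4], int z, int y, int x, int w) {
  uint32_t t[4] = {a[w], a[x], b[y], b[z]};
  for (int i = 0; i < 4; i++) dst[i] = t[i];
}

int main(void) {
  uint32_t m0[4] = {0, 1, 2, 3}; /* message words 0..3 */
  uint32_t m1[4] = {4, 5, 6, 7}; /* message words 4..7 */
  uint32_t t0[4];
  shuffle_ps2_model(t0, m0, m1, 2, 0, 2, 0);
  /* Printed high to low, matching the source comment: 6 4 2 0. */
  printf("%u %u %u %u\n", t0[3], t0[2], t0[1], t0[0]);
  return 0;
}
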
124 // Round 2. This round and all following rounds apply a fixed permutation in compress_pre()
126 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
127 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
128 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
129 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
132 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
133 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
136 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
137 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
140 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
141 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
142 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
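
Note: rounds 3 through 7 below repeat round 2's shuffle constants verbatim. That is the point of the comment at line 124: every round after the first reads the message through one more application of the same fixed permutation, so the gather pattern relative to the permuted words never changes. A sketch of how the MSG_SCHEDULE table used by round_fn() further down can be derived; the permutation constant is from the BLAKE3 reference implementation:

#include <stdint.h>
#include <stdio.h>

/* BLAKE3's fixed message permutation: word i of the next round's
   message is word MSG_PERMUTATION[i] of the current round's. */
static const uint8_t MSG_PERMUTATION[16] = {2, 6,  3,  10, 7, 0,  4,  13,
                                            1, 11, 12, 5,  9, 14, 15, 8};

int main(void) {
  /* schedule[r][i]: which original message word round r uses in slot i. */
  uint8_t schedule[7][16];
  for (int i = 0; i < 16; i++) schedule[0][i] = (uint8_t)i;
  for (int r = 1; r < 7; r++)
    for (int i = 0; i < 16; i++)
      schedule[r][i] = MSG_PERMUTATION[schedule[r - 1][i]];
  for (int r = 0; r < 7; r++) {
    for (int i = 0; i < 16; i++) printf("%2d ", schedule[r][i]);
    printf("\n");
  }
  return 0;
}
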
149 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
150 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
151 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
152 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
155 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
156 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
159 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
160 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
163 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
164 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
165 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
172 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
173 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
174 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
175 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
178 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
179 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
182 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
183 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
186 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
187 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
188 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
195 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
196 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
197 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
198 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
201 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
202 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
205 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
206 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
209 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
210 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
211 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
218 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
219 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
220 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
221 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
224 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
225 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
228 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
229 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
232 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
233 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
234 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
241 t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2)); in compress_pre()
242 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1)); in compress_pre()
243 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0); in compress_pre()
244 t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2)); in compress_pre()
247 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1); in compress_pre()
248 diagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
251 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0)); in compress_pre()
252 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2); in compress_pre()
255 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2)); in compress_pre()
256 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3); in compress_pre()
257 undiagonalize(&rows[0], &rows[2], &rows[3]); in compress_pre()
266 storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]); in blake3_compress_in_place_sse2()
276 storeu(xorv(rows[0], rows[2]), &out[0]); in blake3_compress_xof_sse2()
278 storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]); in blake3_compress_xof_sse2()
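
Note: blake3_compress_in_place_sse2() keeps only the xor of the state's top and bottom halves, i.e. the new 32-byte chaining value (its rows[1]^rows[3] companion store doesn't match this search), while blake3_compress_xof_sse2() also folds the bottom half against the input cv to emit 64 bytes per compression. A scalar sketch of the full XOF finalization, assuming rows[] holds the post-round state as 16 words, under a hypothetical name:

#include <stdint.h>
#include <string.h>

/* Hypothetical xof_finalize(): the stores above plus the two elided
   ones, written word-wise. */
static void xof_finalize(const uint32_t rows[16], const uint32_t cv[8],
                         uint8_t out[64]) {
  uint32_t words[16];
  for (int i = 0; i < 4; i++) {
    words[i + 0]  = rows[i + 0]  ^ rows[i + 8];  /* out[0..15]  */
    words[i + 4]  = rows[i + 4]  ^ rows[i + 12]; /* out[16..31] */
    words[i + 8]  = rows[i + 8]  ^ cv[i + 0];    /* out[32..47] */
    words[i + 12] = rows[i + 12] ^ cv[i + 4];    /* out[48..63] */
  }
  memcpy(out, words, 64); /* assumes a little-endian host, like the SIMD code */
}
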
282 INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) { in round_fn()
283 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn()
284 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn()
285 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn()
286 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn()
287 v[0] = addv(v[0], v[4]); in round_fn()
288 v[1] = addv(v[1], v[5]); in round_fn()
289 v[2] = addv(v[2], v[6]); in round_fn()
290 v[3] = addv(v[3], v[7]); in round_fn()
291 v[12] = xorv(v[12], v[0]); in round_fn()
292 v[13] = xorv(v[13], v[1]); in round_fn()
293 v[14] = xorv(v[14], v[2]); in round_fn()
294 v[15] = xorv(v[15], v[3]); in round_fn()
295 v[12] = rot16(v[12]); in round_fn()
296 v[13] = rot16(v[13]); in round_fn()
297 v[14] = rot16(v[14]); in round_fn()
298 v[15] = rot16(v[15]); in round_fn()
299 v[8] = addv(v[8], v[12]); in round_fn()
300 v[9] = addv(v[9], v[13]); in round_fn()
301 v[10] = addv(v[10], v[14]); in round_fn()
302 v[11] = addv(v[11], v[15]); in round_fn()
303 v[4] = xorv(v[4], v[8]); in round_fn()
304 v[5] = xorv(v[5], v[9]); in round_fn()
305 v[6] = xorv(v[6], v[10]); in round_fn()
306 v[7] = xorv(v[7], v[11]); in round_fn()
307 v[4] = rot12(v[4]); in round_fn()
308 v[5] = rot12(v[5]); in round_fn()
309 v[6] = rot12(v[6]); in round_fn()
310 v[7] = rot12(v[7]); in round_fn()
311 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn()
312 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn()
313 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn()
314 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn()
315 v[0] = addv(v[0], v[4]); in round_fn()
316 v[1] = addv(v[1], v[5]); in round_fn()
317 v[2] = addv(v[2], v[6]); in round_fn()
318 v[3] = addv(v[3], v[7]); in round_fn()
319 v[12] = xorv(v[12], v[0]); in round_fn()
320 v[13] = xorv(v[13], v[1]); in round_fn()
321 v[14] = xorv(v[14], v[2]); in round_fn()
322 v[15] = xorv(v[15], v[3]); in round_fn()
323 v[12] = rot8(v[12]); in round_fn()
324 v[13] = rot8(v[13]); in round_fn()
325 v[14] = rot8(v[14]); in round_fn()
326 v[15] = rot8(v[15]); in round_fn()
327 v[8] = addv(v[8], v[12]); in round_fn()
328 v[9] = addv(v[9], v[13]); in round_fn()
329 v[10] = addv(v[10], v[14]); in round_fn()
330 v[11] = addv(v[11], v[15]); in round_fn()
331 v[4] = xorv(v[4], v[8]); in round_fn()
332 v[5] = xorv(v[5], v[9]); in round_fn()
333 v[6] = xorv(v[6], v[10]); in round_fn()
334 v[7] = xorv(v[7], v[11]); in round_fn()
335 v[4] = rot7(v[4]); in round_fn()
336 v[5] = rot7(v[5]); in round_fn()
337 v[6] = rot7(v[6]); in round_fn()
338 v[7] = rot7(v[7]); in round_fn()
340 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn()
341 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn()
342 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn()
343 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn()
344 v[0] = addv(v[0], v[5]); in round_fn()
345 v[1] = addv(v[1], v[6]); in round_fn()
346 v[2] = addv(v[2], v[7]); in round_fn()
347 v[3] = addv(v[3], v[4]); in round_fn()
348 v[15] = xorv(v[15], v[0]); in round_fn()
349 v[12] = xorv(v[12], v[1]); in round_fn()
350 v[13] = xorv(v[13], v[2]); in round_fn()
351 v[14] = xorv(v[14], v[3]); in round_fn()
352 v[15] = rot16(v[15]); in round_fn()
353 v[12] = rot16(v[12]); in round_fn()
354 v[13] = rot16(v[13]); in round_fn()
355 v[14] = rot16(v[14]); in round_fn()
356 v[10] = addv(v[10], v[15]); in round_fn()
357 v[11] = addv(v[11], v[12]); in round_fn()
358 v[8] = addv(v[8], v[13]); in round_fn()
359 v[9] = addv(v[9], v[14]); in round_fn()
360 v[5] = xorv(v[5], v[10]); in round_fn()
361 v[6] = xorv(v[6], v[11]); in round_fn()
362 v[7] = xorv(v[7], v[8]); in round_fn()
363 v[4] = xorv(v[4], v[9]); in round_fn()
364 v[5] = rot12(v[5]); in round_fn()
365 v[6] = rot12(v[6]); in round_fn()
366 v[7] = rot12(v[7]); in round_fn()
367 v[4] = rot12(v[4]); in round_fn()
368 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn()
369 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn()
370 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn()
371 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn()
372 v[0] = addv(v[0], v[5]); in round_fn()
373 v[1] = addv(v[1], v[6]); in round_fn()
374 v[2] = addv(v[2], v[7]); in round_fn()
375 v[3] = addv(v[3], v[4]); in round_fn()
376 v[15] = xorv(v[15], v[0]); in round_fn()
377 v[12] = xorv(v[12], v[1]); in round_fn()
378 v[13] = xorv(v[13], v[2]); in round_fn()
379 v[14] = xorv(v[14], v[3]); in round_fn()
380 v[15] = rot8(v[15]); in round_fn()
381 v[12] = rot8(v[12]); in round_fn()
382 v[13] = rot8(v[13]); in round_fn()
383 v[14] = rot8(v[14]); in round_fn()
384 v[10] = addv(v[10], v[15]); in round_fn()
385 v[11] = addv(v[11], v[12]); in round_fn()
386 v[8] = addv(v[8], v[13]); in round_fn()
387 v[9] = addv(v[9], v[14]); in round_fn()
388 v[5] = xorv(v[5], v[10]); in round_fn()
389 v[6] = xorv(v[6], v[11]); in round_fn()
390 v[7] = xorv(v[7], v[8]); in round_fn()
391 v[4] = xorv(v[4], v[9]); in round_fn()
392 v[5] = rot7(v[5]); in round_fn()
393 v[6] = rot7(v[6]); in round_fn()
394 v[7] = rot7(v[7]); in round_fn()
395 v[4] = rot7(v[4]); in round_fn()
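
Note: compress_pre() above keeps one state in four vectors, one row each; round_fn() instead keeps four independent states in sixteen vectors, one state word each, so every addv/xorv/rotN line applies the plain BLAKE3 quarter-round to four inputs at once, lane-wise. The lines through 338 are the column step, the rest the diagonal step. The scalar quarter-round they vectorize, per the BLAKE3 spec:

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n) {
  return (x >> n) | (x << (32 - n));
}

/* The BLAKE3 G function: mixes one column or diagonal (a, b, c, d)
   of the state with two message words. */
static void g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
              uint32_t mx, uint32_t my) {
  *a = *a + *b + mx;
  *d = rotr32(*d ^ *a, 16);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 12);
  *a = *a + *b + my;
  *d = rotr32(*d ^ *a, 8);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 7);
}
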
404 __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs()
405 __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs()
415 vecs[2] = abcd_2; in transpose_vecs()
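
Note: transpose_vecs() is a standard 4x4 transpose of 32-bit lanes: one level of 32-bit unpacks interleaves row pairs, then one level of 64-bit unpacks stitches the halves together. The same routine, self-contained, with illustrative lane comments:

#include <emmintrin.h>

static void transpose4(__m128i rows[4]) {
  __m128i ab_01 = _mm_unpacklo_epi32(rows[0], rows[1]); /* a0 b0 a1 b1 */
  __m128i ab_23 = _mm_unpackhi_epi32(rows[0], rows[1]); /* a2 b2 a3 b3 */
  __m128i cd_01 = _mm_unpacklo_epi32(rows[2], rows[3]); /* c0 d0 c1 d1 */
  __m128i cd_23 = _mm_unpackhi_epi32(rows[2], rows[3]); /* c2 d2 c3 d3 */
  rows[0] = _mm_unpacklo_epi64(ab_01, cd_01);           /* a0 b0 c0 d0 */
  rows[1] = _mm_unpackhi_epi64(ab_01, cd_01);           /* a1 b1 c1 d1 */
  rows[2] = _mm_unpacklo_epi64(ab_23, cd_23);           /* a2 b2 c2 d2 */
  rows[3] = _mm_unpackhi_epi64(ab_23, cd_23);           /* a3 b3 c3 d3 */
}
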
423 out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]); in transpose_msg_vecs()
427 out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]); in transpose_msg_vecs()
429 out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
430 out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
431 out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
432 out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]); in transpose_msg_vecs()
435 out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]); in transpose_msg_vecs()
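
Note: transpose_msg_vecs() issues sixteen unaligned loads, one per input per 16-byte group of the 64-byte block, then transposes each group of four (the transpose calls don't match this search) so that out[w] ends up holding message word w of all four inputs. The loads as the loop they unroll, under a hypothetical name:

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical gather_block(): vector out[4*g + i] takes 128-bit
   group g of input i's current block. */
static void gather_block(const uint8_t *const inputs[4],
                         size_t block_offset, __m128i out[16]) {
  for (size_t g = 0; g < 4; g++)
    for (size_t i = 0; i < 4; i++)
      out[4 * g + i] = _mm_loadu_si128(
          (const __m128i *)&inputs[i][block_offset + g * sizeof(__m128i)]);
}
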
449 const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); in load_counters()
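
Note: _mm_set_epi32() takes its arguments high lane first, so add0 holds {0, 1, 2, 3} in lanes 0 through 3; when the caller asks for incrementing counters, lane i then hashes chunk counter + i. The rest of load_counters() (not matched here) splits each 64-bit counter into the 32-bit low/high lanes that state words 12 and 13 need, deriving the high-word carry with lane-wise compares. A scalar model:

#include <stdbool.h>
#include <stdint.h>

/* Scalar model of load_counters(): per-lane counter, split in halves. */
static void load_counters_model(uint64_t counter, bool increment,
                                uint32_t lo[4], uint32_t hi[4]) {
  for (int i = 0; i < 4; i++) {
    uint64_t c = counter + (increment ? (uint64_t)i : 0);
    lo[i] = (uint32_t)c;
    hi[i] = (uint32_t)(c >> 32);
  }
}
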
465 set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), in blake3_hash4_sse2()
482 __m128i v[16] = { in blake3_hash4_sse2() local
483 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash4_sse2()
485 set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), in blake3_hash4_sse2()
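
Note: the initializer lines that don't match this search fill the rest of the state: words 0..7 are the input chaining value, 8..11 the IV constants, and 12..15 the counter halves, block length, and domain flags (here each "word" is a 4-lane vector, one lane per input). A scalar sketch of that layout per the BLAKE3 spec, under a hypothetical helper name:

#include <stdint.h>

/* Hypothetical init_state(): the 16-word BLAKE3 compression state. */
static void init_state(uint32_t v[16], const uint32_t cv[8],
                       const uint32_t iv[4], uint64_t counter,
                       uint32_t block_len, uint32_t flags) {
  for (int i = 0; i < 8; i++) v[i] = cv[i];     /* rows 0-1: chaining value */
  for (int i = 0; i < 4; i++) v[8 + i] = iv[i]; /* row 2: IV constants */
  v[12] = (uint32_t)counter;         /* row 3: counter low */
  v[13] = (uint32_t)(counter >> 32); /*        counter high */
  v[14] = block_len;                 /*        block length in bytes */
  v[15] = flags;                     /*        domain flags */
}
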
488 round_fn(v, msg_vecs, 0); in blake3_hash4_sse2()
489 round_fn(v, msg_vecs, 1); in blake3_hash4_sse2()
490 round_fn(v, msg_vecs, 2); in blake3_hash4_sse2()
491 round_fn(v, msg_vecs, 3); in blake3_hash4_sse2()
492 round_fn(v, msg_vecs, 4); in blake3_hash4_sse2()
493 round_fn(v, msg_vecs, 5); in blake3_hash4_sse2()
494 round_fn(v, msg_vecs, 6); in blake3_hash4_sse2()
495 h_vecs[0] = xorv(v[0], v[8]); in blake3_hash4_sse2()
496 h_vecs[1] = xorv(v[1], v[9]); in blake3_hash4_sse2()
497 h_vecs[2] = xorv(v[2], v[10]); in blake3_hash4_sse2()
498 h_vecs[3] = xorv(v[3], v[11]); in blake3_hash4_sse2()
499 h_vecs[4] = xorv(v[4], v[12]); in blake3_hash4_sse2()
500 h_vecs[5] = xorv(v[5], v[13]); in blake3_hash4_sse2()
501 h_vecs[6] = xorv(v[6], v[14]); in blake3_hash4_sse2()
502 h_vecs[7] = xorv(v[7], v[15]); in blake3_hash4_sse2()
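
Note: this is the feed-forward that ends every BLAKE3 compression: after the seven rounds, the new chaining value is the first half of the state xored with the second half (the second half is simply dropped here; the XOF path above also folds it against the input cv). A scalar view:

#include <stdint.h>

/* The eight xors above, word-wise. */
static void feed_forward(uint32_t h[8], const uint32_t v[16]) {
  for (int i = 0; i < 8; i++)
    h[i] = v[i] ^ v[i + 8];
}
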
513 storeu(h_vecs[1], &out[2 * sizeof(__m128i)]); in blake3_hash4_sse2()
515 storeu(h_vecs[2], &out[4 * sizeof(__m128i)]); in blake3_hash4_sse2()
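
Note: two transpose_vecs() calls (elided; they don't match the search) precede these stores, turning the word-sliced h_vecs back into byte order so that h_vecs[i] holds the first 16 bytes of output i and h_vecs[4 + i] the second 16; the interleaved store offsets then write each 32-byte result contiguously. As the loop they unroll, under a hypothetical name:

#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical store_outputs(): four contiguous 32-byte chaining values. */
static void store_outputs(const __m128i h_vecs[8], uint8_t *out) {
  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)&out[32 * i + 0], h_vecs[i]);
    _mm_storeu_si128((__m128i *)&out[32 * i + 16], h_vecs[4 + i]);
  }
}
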