Lines Matching +full:2 +full:v

24       x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,  in rot16()
25 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); in rot16()
34 x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, in rot8()
35 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)); in rot8()
42 INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { in round_fn()
43 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); in round_fn()
44 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); in round_fn()
45 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); in round_fn()
46 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); in round_fn()
47 v[0] = addv(v[0], v[4]); in round_fn()
48 v[1] = addv(v[1], v[5]); in round_fn()
49 v[2] = addv(v[2], v[6]); in round_fn()
50 v[3] = addv(v[3], v[7]); in round_fn()
51 v[12] = xorv(v[12], v[0]); in round_fn()
52 v[13] = xorv(v[13], v[1]); in round_fn()
53 v[14] = xorv(v[14], v[2]); in round_fn()
54 v[15] = xorv(v[15], v[3]); in round_fn()
55 v[12] = rot16(v[12]); in round_fn()
56 v[13] = rot16(v[13]); in round_fn()
57 v[14] = rot16(v[14]); in round_fn()
58 v[15] = rot16(v[15]); in round_fn()
59 v[8] = addv(v[8], v[12]); in round_fn()
60 v[9] = addv(v[9], v[13]); in round_fn()
61 v[10] = addv(v[10], v[14]); in round_fn()
62 v[11] = addv(v[11], v[15]); in round_fn()
63 v[4] = xorv(v[4], v[8]); in round_fn()
64 v[5] = xorv(v[5], v[9]); in round_fn()
65 v[6] = xorv(v[6], v[10]); in round_fn()
66 v[7] = xorv(v[7], v[11]); in round_fn()
67 v[4] = rot12(v[4]); in round_fn()
68 v[5] = rot12(v[5]); in round_fn()
69 v[6] = rot12(v[6]); in round_fn()
70 v[7] = rot12(v[7]); in round_fn()
71 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); in round_fn()
72 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); in round_fn()
73 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); in round_fn()
74 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); in round_fn()
75 v[0] = addv(v[0], v[4]); in round_fn()
76 v[1] = addv(v[1], v[5]); in round_fn()
77 v[2] = addv(v[2], v[6]); in round_fn()
78 v[3] = addv(v[3], v[7]); in round_fn()
79 v[12] = xorv(v[12], v[0]); in round_fn()
80 v[13] = xorv(v[13], v[1]); in round_fn()
81 v[14] = xorv(v[14], v[2]); in round_fn()
82 v[15] = xorv(v[15], v[3]); in round_fn()
83 v[12] = rot8(v[12]); in round_fn()
84 v[13] = rot8(v[13]); in round_fn()
85 v[14] = rot8(v[14]); in round_fn()
86 v[15] = rot8(v[15]); in round_fn()
87 v[8] = addv(v[8], v[12]); in round_fn()
88 v[9] = addv(v[9], v[13]); in round_fn()
89 v[10] = addv(v[10], v[14]); in round_fn()
90 v[11] = addv(v[11], v[15]); in round_fn()
91 v[4] = xorv(v[4], v[8]); in round_fn()
92 v[5] = xorv(v[5], v[9]); in round_fn()
93 v[6] = xorv(v[6], v[10]); in round_fn()
94 v[7] = xorv(v[7], v[11]); in round_fn()
95 v[4] = rot7(v[4]); in round_fn()
96 v[5] = rot7(v[5]); in round_fn()
97 v[6] = rot7(v[6]); in round_fn()
98 v[7] = rot7(v[7]); in round_fn()
100 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); in round_fn()
101 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); in round_fn()
102 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); in round_fn()
103 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); in round_fn()
104 v[0] = addv(v[0], v[5]); in round_fn()
105 v[1] = addv(v[1], v[6]); in round_fn()
106 v[2] = addv(v[2], v[7]); in round_fn()
107 v[3] = addv(v[3], v[4]); in round_fn()
108 v[15] = xorv(v[15], v[0]); in round_fn()
109 v[12] = xorv(v[12], v[1]); in round_fn()
110 v[13] = xorv(v[13], v[2]); in round_fn()
111 v[14] = xorv(v[14], v[3]); in round_fn()
112 v[15] = rot16(v[15]); in round_fn()
113 v[12] = rot16(v[12]); in round_fn()
114 v[13] = rot16(v[13]); in round_fn()
115 v[14] = rot16(v[14]); in round_fn()
116 v[10] = addv(v[10], v[15]); in round_fn()
117 v[11] = addv(v[11], v[12]); in round_fn()
118 v[8] = addv(v[8], v[13]); in round_fn()
119 v[9] = addv(v[9], v[14]); in round_fn()
120 v[5] = xorv(v[5], v[10]); in round_fn()
121 v[6] = xorv(v[6], v[11]); in round_fn()
122 v[7] = xorv(v[7], v[8]); in round_fn()
123 v[4] = xorv(v[4], v[9]); in round_fn()
124 v[5] = rot12(v[5]); in round_fn()
125 v[6] = rot12(v[6]); in round_fn()
126 v[7] = rot12(v[7]); in round_fn()
127 v[4] = rot12(v[4]); in round_fn()
128 v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); in round_fn()
129 v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); in round_fn()
130 v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); in round_fn()
131 v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); in round_fn()
132 v[0] = addv(v[0], v[5]); in round_fn()
133 v[1] = addv(v[1], v[6]); in round_fn()
134 v[2] = addv(v[2], v[7]); in round_fn()
135 v[3] = addv(v[3], v[4]); in round_fn()
136 v[15] = xorv(v[15], v[0]); in round_fn()
137 v[12] = xorv(v[12], v[1]); in round_fn()
138 v[13] = xorv(v[13], v[2]); in round_fn()
139 v[14] = xorv(v[14], v[3]); in round_fn()
140 v[15] = rot8(v[15]); in round_fn()
141 v[12] = rot8(v[12]); in round_fn()
142 v[13] = rot8(v[13]); in round_fn()
143 v[14] = rot8(v[14]); in round_fn()
144 v[10] = addv(v[10], v[15]); in round_fn()
145 v[11] = addv(v[11], v[12]); in round_fn()
146 v[8] = addv(v[8], v[13]); in round_fn()
147 v[9] = addv(v[9], v[14]); in round_fn()
148 v[5] = xorv(v[5], v[10]); in round_fn()
149 v[6] = xorv(v[6], v[11]); in round_fn()
150 v[7] = xorv(v[7], v[8]); in round_fn()
151 v[4] = xorv(v[4], v[9]); in round_fn()
152 v[5] = rot7(v[5]); in round_fn()
153 v[6] = rot7(v[6]); in round_fn()
154 v[7] = rot7(v[7]); in round_fn()
155 v[4] = rot7(v[4]); in round_fn()
163 __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); in transpose_vecs()
164 __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); in transpose_vecs()
184 vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20); in transpose_vecs()
196 out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]); in transpose_msg_vecs()
204 out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]); in transpose_msg_vecs()
220 const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); in load_counters()
236 set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), in blake3_hash8_avx2()
253 __m256i v[16] = { in blake3_hash8_avx2() local
254 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], in blake3_hash8_avx2()
256 set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), in blake3_hash8_avx2()
259 round_fn(v, msg_vecs, 0); in blake3_hash8_avx2()
260 round_fn(v, msg_vecs, 1); in blake3_hash8_avx2()
261 round_fn(v, msg_vecs, 2); in blake3_hash8_avx2()
262 round_fn(v, msg_vecs, 3); in blake3_hash8_avx2()
263 round_fn(v, msg_vecs, 4); in blake3_hash8_avx2()
264 round_fn(v, msg_vecs, 5); in blake3_hash8_avx2()
265 round_fn(v, msg_vecs, 6); in blake3_hash8_avx2()
266 h_vecs[0] = xorv(v[0], v[8]); in blake3_hash8_avx2()
267 h_vecs[1] = xorv(v[1], v[9]); in blake3_hash8_avx2()
268 h_vecs[2] = xorv(v[2], v[10]); in blake3_hash8_avx2()
269 h_vecs[3] = xorv(v[3], v[11]); in blake3_hash8_avx2()
270 h_vecs[4] = xorv(v[4], v[12]); in blake3_hash8_avx2()
271 h_vecs[5] = xorv(v[5], v[13]); in blake3_hash8_avx2()
272 h_vecs[6] = xorv(v[6], v[14]); in blake3_hash8_avx2()
273 h_vecs[7] = xorv(v[7], v[15]); in blake3_hash8_avx2()
281 storeu(h_vecs[2], &out[2 * sizeof(__m256i)]); in blake3_hash8_avx2()