INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
  // Column step: mix each column of the 4x4 state with G, feeding in
  // message words 0-7 of this round's schedule.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  // Diagonal step: mix each diagonal of the state with G, feeding in
  // message words 8-15 of this round's schedule.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}
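
// For orientation: round_fn4 runs the BLAKE3 quarter-round (G) on four
// hash inputs at once, one input per 32-bit lane of each vector. A scalar
// sketch of G follows; `g_scalar` and `rotr32` are illustrative names,
// not helpers defined in this file.
static inline uint32_t rotr32(uint32_t w, uint32_t c) {
  return (w >> c) | (w << (32 - c));
}
static inline void g_scalar(uint32_t *a, uint32_t *b, uint32_t *c,
                            uint32_t *d, uint32_t mx, uint32_t my) {
  *a = *a + *b + mx;        // add, feeding in the first message word
  *d = rotr32(*d ^ *a, 16);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 12);
  *a = *a + *b + my;        // add, feeding in the second message word
  *d = rotr32(*d ^ *a, 8);
  *c = *c + *d;
  *b = rotr32(*b ^ *c, 7);
}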

INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
  // Individually transpose the four 2x2 sub-matrices in each corner.
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
  // Swap the top-right and bottom-left 2x2s (which just got transposed).
  vecs[0] =
      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] =
      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] =
      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] =
      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}
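
// A quick, hypothetical sanity check of the 4x4 transpose (not part of
// this file): after transposing the rows {0..3}, {4..7}, {8..11},
// {12..15}, the first vector should hold the first column, {0, 4, 8, 12}.
static void transpose_demo(void) {
  uint32_t rows[4][4] = {
      {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  uint32x4_t vecs[4];
  for (int i = 0; i < 4; i++) {
    vecs[i] = vld1q_u32(rows[i]);
  }
  transpose_vecs_128(vecs);
  uint32_t col0[4];
  vst1q_u32(col0, vecs[0]);
  (void)col0; // col0 is now {0, 4, 8, 12}
}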

INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, uint32x4_t out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}
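
// The sixteen unrolled loads above follow one pattern: 16-byte word-group
// i of input j lands in out[4*i + j], and each group of four vectors is
// then transposed in place. A loop-based equivalent (hypothetical helper,
// not part of this file) makes the pattern explicit:
INLINE void transpose_msg_vecs4_loop(const uint8_t *const *inputs,
                                     size_t block_offset, uint32x4_t out[16]) {
  for (size_t i = 0; i < 4; i++) {
    for (size_t j = 0; j < 4; j++) {
      out[4 * i + j] =
          loadu_128(&inputs[j][block_offset + i * sizeof(uint32x4_t)]);
    }
  }
  for (size_t i = 0; i < 16; i += 4) {
    transpose_vecs_128(&out[i]);
  }
}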

INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           uint32x4_t *out_low, uint32x4_t *out_high) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  *out_low = set4(
      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
  *out_high = set4(
      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
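
// The mask trick above, in scalar form (illustrative helper, not part of
// this file): when increment_counter is false, mask is zero and every lane
// gets the same counter; when true, mask is all ones, so lane n gets
// counter + n.
static inline uint64_t lane_counter(uint64_t counter, bool increment_counter,
                                    uint64_t lane) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  return counter + (mask & lane);
}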

void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  uint32x4_t h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  uint32x4_t counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    uint32x4_t block_flags_vec = set1_128(block_flags);
    uint32x4_t msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    // Initialize the compression state: chaining values, IV constants,
    // counters, block length, and flags, one input per lane.
    uint32x4_t v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);
    block_flags = flags;
  }

  // Transpose back from one-lane-per-input to contiguous 32-byte outputs.
  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four contain the second half of each output.
  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}
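
// A minimal usage sketch, not from this file: hash four equal-length
// chunks in parallel, the way the dispatch layer does for chunk inputs,
// with the counter incrementing per input. `IV`, `CHUNK_START`, and
// `CHUNK_END` come from blake3_impl.h; `hash4_demo` and its argument
// layout are hypothetical.
static void hash4_demo(const uint8_t *chunks[4], size_t blocks,
                       uint8_t out[4 * 32]) {
  blake3_hash4_neon(chunks, blocks, IV, /*counter=*/0,
                    /*increment_counter=*/true, /*flags=*/0,
                    /*flags_start=*/CHUNK_START, /*flags_end=*/CHUNK_END, out);
}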