#include "blake3_impl.h"

#if BLAKE3_USE_NEON

#include <arm_neon.h>

#ifdef __ARM_BIG_ENDIAN
#error "This implementation only supports little-endian ARM."
// It might be that all we need for big-endian support here is to get the loads
// and stores right, but step zero would be finding a way to test it in CI.
#endif

INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
  // vld1q_u32 has alignment requirements. Don't use it.
  uint32x4_t x;
  memcpy(&x, src, 16);
  return x;
}

INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
  // vst1q_u32 has alignment requirements. Don't use it.
  memcpy(dest, &src, 16);
}

INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
  return veorq_u32(a, b);
}

INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }

INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  uint32_t array[4] = {a, b, c, d};
  return vld1q_u32(array);
}

INLINE uint32x4_t rot16_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
}

INLINE uint32x4_t rot12_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
}

INLINE uint32x4_t rot8_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
}

INLINE uint32x4_t rot7_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
}
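
// Note: the rotations above use the portable shift-or form (a rotate right by
// N bits in each 32-bit lane). As an alternative worth benchmarking rather
// than a claim about this port: rot16 can be a single vrev32q_u16 (which
// swaps the two 16-bit halves of every 32-bit lane), and on AArch64 rot8 can
// be a single vqtbl1q_u8 byte shuffle. A sketch of the rot16 variant:
//
//   INLINE uint32x4_t rot16_128_rev(uint32x4_t x) {
//     return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
//   }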

// TODO: compress_neon

// TODO: hash2_neon

/*
 * ----------------------------------------------------------------------------
 * hash4_neon
 * ----------------------------------------------------------------------------
 */

INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
  // The first half of the round mixes the four columns of the state:
  // (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15), with one column in each
  // lane position across the vectors.
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  // The second half mixes the diagonals: (0,5,10,15), (1,6,11,12),
  // (2,7,8,13), (3,4,9,14).
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}

INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
  // Individually transpose the four 2x2 sub-matrices in each corner.
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);

  // Swap the top-right and bottom-left 2x2s (which just got transposed).
  vecs[0] =
      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] =
      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] =
      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] =
      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}
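
// Worked example of the transpose above, writing each vector as
// {lane0, lane1, lane2, lane3}:
//
//   in:  vecs[0] = {a0, a1, a2, a3}      out: vecs[0] = {a0, b0, c0, d0}
//        vecs[1] = {b0, b1, b2, b3}           vecs[1] = {a1, b1, c1, d1}
//        vecs[2] = {c0, c1, c2, c3}           vecs[2] = {a2, b2, c2, d2}
//        vecs[3] = {d0, d1, d2, d3}           vecs[3] = {a3, b3, c3, d3}
//
// vtrnq_u32(vecs[0], vecs[1]) produces val[0] = {a0, b0, a2, b2} and
// val[1] = {a1, b1, a3, b3}; the vcombine/vget pairs then exchange the high
// halves between the rows01 and rows23 results to complete the 4x4 transpose.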

INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, uint32x4_t out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}

INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           uint32x4_t *out_low, uint32x4_t *out_high) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  *out_low = set4(
      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
  *out_high = set4(
      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
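
// For example, with counter = 5: increment_counter = true makes mask = ~0, so
// mask & i == i and the per-lane counters are {5, 6, 7, 8};
// increment_counter = false makes mask = 0, so all four lanes get 5. Each
// 64-bit counter is split into 32-bit low and high halves, which occupy
// state words 12 and 13 respectively.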

static
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  uint32x4_t h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  uint32x4_t counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    uint32x4_t block_flags_vec = set1_128(block_flags);
    uint32x4_t msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    uint32x4_t v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}

/*
 * ----------------------------------------------------------------------------
 * hash_many_neon
 * ----------------------------------------------------------------------------
 */

void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
                          uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    // TODO: Implement compress_neon. However note that according to
    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
    // compress_neon might not be any faster than compress_portable.
    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                      block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 4) {
    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    inputs += 4;
    num_inputs -= 4;
    out = &out[4 * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
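
// A minimal usage sketch, for illustration only. The inputs (chunk0..chunk3)
// and key_words are hypothetical placeholders; CHUNK_START and CHUNK_END are
// the block flags from blake3_impl.h. This hashes four single-block inputs
// with consecutive counters, writing 4 * BLAKE3_OUT_LEN bytes of output:
//
//   const uint8_t *inputs[4] = {chunk0, chunk1, chunk2, chunk3};
//   uint8_t out[4 * BLAKE3_OUT_LEN];
//   blake3_hash_many_neon(inputs, /*num_inputs=*/4, /*blocks=*/1, key_words,
//                         /*counter=*/0, /*increment_counter=*/true,
//                         /*flags=*/0, CHUNK_START, CHUNK_END, out);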

#endif // BLAKE3_USE_NEON