xref: /openbsd-src/gnu/llvm/llvm/lib/Support/BLAKE3/blake3_dispatch.c (revision d415bd752c734aee168c4ee86ff32e8cc249eb16)
1*d415bd75Srobert #include <stdbool.h>
2*d415bd75Srobert #include <stddef.h>
3*d415bd75Srobert #include <stdint.h>
4*d415bd75Srobert 
5*d415bd75Srobert #include "blake3_impl.h"
6*d415bd75Srobert 
7*d415bd75Srobert #if defined(IS_X86)
8*d415bd75Srobert #if defined(_MSC_VER)
9*d415bd75Srobert #include <intrin.h>
10*d415bd75Srobert #elif defined(__GNUC__)
11*d415bd75Srobert #include <immintrin.h>
12*d415bd75Srobert #else
13*d415bd75Srobert #error "Unimplemented!"
14*d415bd75Srobert #endif
15*d415bd75Srobert #endif
16*d415bd75Srobert 
/* Evaluates x and discards the result. Used on locals that are only read in
 * some preprocessor configurations, so they do not look unused in the rest. */
#define MAYBE_UNUSED(x) ((void)(x))
18*d415bd75Srobert 
19*d415bd75Srobert #if defined(IS_X86)
/* Executes XGETBV with register index 0 and returns the 64-bit result.
 *
 * get_cpu_features() only calls this after it has seen the OSXSAVE bit in
 * CPUID leaf 1. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  /* ECX carries the register index (0); the result comes back split across
   * EDX (high 32 bits) and EAX (low 32 bits). */
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
29*d415bd75Srobert 
/* Executes CPUID for leaf `id` and stores the resulting EAX, EBX, ECX and EDX
 * values in out[0], out[1], out[2] and out[3] respectively.
 *
 * The inline-asm paths do not load ECX, so no sub-leaf is selected; use
 * cpuidex() for leaves that take one. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /* EBX is not named as an output here: its old value is parked in operand
   * %1, CPUID runs, and the xchg puts CPUID's EBX into %1 while restoring the
   * original EBX. Presumably this is because EBX can be reserved (e.g. as the
   * PIC register) on 32-bit targets -- TODO confirm. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
45*d415bd75Srobert 
/* Executes CPUID for leaf `id`, sub-leaf `sid`, and stores the resulting EAX,
 * EBX, ECX and EDX values in out[0], out[1], out[2] and out[3] respectively. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  /* EBX is not named as an output here: its old value is parked in operand
   * %1, CPUID runs, and the xchg puts CPUID's EBX into %1 while restoring the
   * original EBX. Presumably this is because EBX can be reserved (e.g. as the
   * PIC register) on 32-bit targets -- TODO confirm. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
61*d415bd75Srobert 
62*d415bd75Srobert #endif
63*d415bd75Srobert 
/* Bit flags for the x86 instruction-set extensions that the dispatch code
 * distinguishes. Individual flags are OR-ed together into a single value. */
enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  /* ... */
  /* Sentinel: initial value of g_cpu_features, meaning "not detected yet". */
  UNDEFINED = 1 << 30
};
75*d415bd75Srobert 
76*d415bd75Srobert #if !defined(BLAKE3_TESTING)
77*d415bd75Srobert static /* Allow the variable to be controlled manually for testing */
78*d415bd75Srobert #endif
79*d415bd75Srobert     enum cpu_feature g_cpu_features = UNDEFINED;
80*d415bd75Srobert 
81*d415bd75Srobert LLVM_ATTRIBUTE_USED
82*d415bd75Srobert #if !defined(BLAKE3_TESTING)
83*d415bd75Srobert static
84*d415bd75Srobert #endif
85*d415bd75Srobert     enum cpu_feature
get_cpu_features(void)86*d415bd75Srobert     get_cpu_features(void) {
87*d415bd75Srobert 
88*d415bd75Srobert   if (g_cpu_features != UNDEFINED) {
89*d415bd75Srobert     return g_cpu_features;
90*d415bd75Srobert   } else {
91*d415bd75Srobert #if defined(IS_X86)
92*d415bd75Srobert     uint32_t regs[4] = {0};
93*d415bd75Srobert     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
94*d415bd75Srobert     (void)edx;
95*d415bd75Srobert     enum cpu_feature features = 0;
96*d415bd75Srobert     cpuid(regs, 0);
97*d415bd75Srobert     const int max_id = *eax;
98*d415bd75Srobert     cpuid(regs, 1);
99*d415bd75Srobert #if defined(__amd64__) || defined(_M_X64)
100*d415bd75Srobert     features |= SSE2;
101*d415bd75Srobert #else
102*d415bd75Srobert     if (*edx & (1UL << 26))
103*d415bd75Srobert       features |= SSE2;
104*d415bd75Srobert #endif
105*d415bd75Srobert     if (*ecx & (1UL << 0))
106*d415bd75Srobert       features |= SSSE3;
107*d415bd75Srobert     if (*ecx & (1UL << 19))
108*d415bd75Srobert       features |= SSE41;
109*d415bd75Srobert 
110*d415bd75Srobert     if (*ecx & (1UL << 27)) { // OSXSAVE
111*d415bd75Srobert       const uint64_t mask = xgetbv();
112*d415bd75Srobert       if ((mask & 6) == 6) { // SSE and AVX states
113*d415bd75Srobert         if (*ecx & (1UL << 28))
114*d415bd75Srobert           features |= AVX;
115*d415bd75Srobert         if (max_id >= 7) {
116*d415bd75Srobert           cpuidex(regs, 7, 0);
117*d415bd75Srobert           if (*ebx & (1UL << 5))
118*d415bd75Srobert             features |= AVX2;
119*d415bd75Srobert           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120*d415bd75Srobert             if (*ebx & (1UL << 31))
121*d415bd75Srobert               features |= AVX512VL;
122*d415bd75Srobert             if (*ebx & (1UL << 16))
123*d415bd75Srobert               features |= AVX512F;
124*d415bd75Srobert           }
125*d415bd75Srobert         }
126*d415bd75Srobert       }
127*d415bd75Srobert     }
128*d415bd75Srobert     g_cpu_features = features;
129*d415bd75Srobert     return features;
130*d415bd75Srobert #else
131*d415bd75Srobert     /* How to detect NEON? */
132*d415bd75Srobert     return 0;
133*d415bd75Srobert #endif
134*d415bd75Srobert   }
135*d415bd75Srobert }
136*d415bd75Srobert 
/* Forwards to one blake3_compress_in_place_* implementation, passing every
 * argument through unchanged.
 *
 * On x86 the first matching tier wins: AVX-512 (AVX512VL detected), then
 * SSE4.1, then SSE2. Each tier can be compiled out with its BLAKE3_NO_*
 * macro. If no tier matches, or the target is not x86, the portable
 * implementation is used. */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features); /* unread when every SIMD tier is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
165*d415bd75Srobert 
/* Forwards to one blake3_compress_xof_* implementation, passing every
 * argument (including the 64-byte `out` buffer) through unchanged.
 *
 * On x86 the first matching tier wins: AVX-512 (AVX512VL detected), then
 * SSE4.1, then SSE2. Each tier can be compiled out with its BLAKE3_NO_*
 * macro. If no tier matches, or the target is not x86, the portable
 * implementation is used. */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features); /* unread when every SIMD tier is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
194*d415bd75Srobert 
/* Forwards to one blake3_hash_many_* implementation, passing every argument
 * through unchanged.
 *
 * On x86 the first matching tier wins: AVX-512 (both AVX512F and AVX512VL
 * detected), then AVX2, then SSE4.1, then SSE2. Each tier can be compiled
 * out with its BLAKE3_NO_* macro. If no x86 tier was taken, the NEON
 * implementation is used when BLAKE3_USE_NEON == 1, otherwise the portable
 * one. */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features); /* unread when every SIMD tier is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

  /* Exactly one of the two fallbacks is compiled in, so neither is dead code
   * behind an unconditional return. */
#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246*d415bd75Srobert 
// The dynamically detected SIMD degree of the current platform.
//
// On x86: 16 when both AVX512F and AVX512VL are detected, 8 for AVX2, 4 for
// SSE4.1 or SSE2 (each tier can be compiled out with its BLAKE3_NO_* macro).
// If no x86 tier matched: 4 when BLAKE3_USE_NEON == 1, otherwise 1.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features); /* unread when every SIMD tier is compiled out */
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    return 4;
  }
#endif
#endif
  /* Exactly one fallback is compiled in, so there is no dead return. */
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278