1#include <stdbool.h>
2#include <stddef.h>
3#include <stdint.h>
4
5#include "blake3_impl.h"
6
7#if defined(IS_X86)
8#if defined(_MSC_VER)
9#include <intrin.h>
10#elif defined(__GNUC__)
11#include <immintrin.h>
12#else
13#error "Unimplemented!"
14#endif
15#endif
16
/*
 * Evaluate `x` and discard the result. Used to silence unused-variable
 * warnings for locals that are only read under some preprocessor
 * configurations. The whole expansion is parenthesized so it behaves as a
 * single expression wherever it is used.
 */
#define MAYBE_UNUSED(x) ((void)(x))
18
19#if defined(IS_X86)
/*
 * Execute XGETBV with ECX = 0 and return the resulting 64-bit value.
 * The caller (get_cpu_features) uses it as the mask of register states the
 * OS has enabled.
 */
static uint64_t xgetbv(void) {
#if !defined(_MSC_VER)
  /* The instruction returns the value split across EDX (high) and EAX (low). */
  uint32_t lo = 0;
  uint32_t hi = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#else
  return _xgetbv(0);
#endif
}
29
/*
 * Execute CPUID for leaf `id` with sub-leaf 0.
 *
 * out[0..3] receive EAX, EBX, ECX and EDX respectively.
 *
 * ECX is explicitly zeroed on the inline-asm paths. MSVC's __cpuid intrinsic
 * is documented to clear ECX before issuing the instruction; doing the same
 * here keeps all compilers consistent and makes sub-leaf-sensitive leaves
 * return a deterministic result instead of depending on whatever happened to
 * be in ECX.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /*
   * EBX is not named as an output here: it is stashed in a scratch register
   * before CPUID and swapped back afterwards, so the scratch register ends
   * up holding CPUID's EBX result (presumably because EBX may be reserved,
   * e.g. as the PIC register, on 32-bit targets).
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(0));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(0));
#endif
}
45
/*
 * Execute CPUID for leaf `id`, sub-leaf `sid`.
 *
 * out[0..3] receive EAX, EBX, ECX and EDX respectively.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif !defined(__i386__) && !defined(_M_IX86)
  /* EBX can be named directly as an output on this path. */
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  /*
   * 32-bit path: park the incoming EBX in a scratch register, run CPUID,
   * then exchange so EBX is restored and the scratch register carries
   * CPUID's EBX result into out[1].
   */
  __asm__ __volatile__("movl %%ebx, %[ebx_val]\n"
                       "cpuid\n"
                       "xchgl %[ebx_val], %%ebx\n"
                       : "=a"(out[0]), [ebx_val] "=r"(out[1]), "=c"(out[2]),
                         "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
61
62#endif
63
/*
 * Bit flags describing which SIMD instruction sets may be used. Values are
 * OR-ed together into a single mask; UNDEFINED is a sentinel meaning
 * "not detected yet".
 */
enum cpu_feature {
  SSE2 = 0x00000001,
  SSSE3 = 0x00000002,
  SSE41 = 0x00000004,
  AVX = 0x00000008,
  AVX2 = 0x00000010,
  AVX512F = 0x00000020,
  AVX512VL = 0x00000040,
  /* bits 7..29 are currently unassigned */
  UNDEFINED = 0x40000000
};
75
76#if !defined(BLAKE3_TESTING)
77static /* Allow the variable to be controlled manually for testing */
78#endif
79    enum cpu_feature g_cpu_features = UNDEFINED;
80
81LLVM_ATTRIBUTE_USED
82#if !defined(BLAKE3_TESTING)
83static
84#endif
85    enum cpu_feature
86    get_cpu_features(void) {
87
88  if (g_cpu_features != UNDEFINED) {
89    return g_cpu_features;
90  } else {
91#if defined(IS_X86)
92    uint32_t regs[4] = {0};
93    uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
94    (void)edx;
95    enum cpu_feature features = 0;
96    cpuid(regs, 0);
97    const int max_id = *eax;
98    cpuid(regs, 1);
99#if defined(__amd64__) || defined(_M_X64)
100    features |= SSE2;
101#else
102    if (*edx & (1UL << 26))
103      features |= SSE2;
104#endif
105    if (*ecx & (1UL << 0))
106      features |= SSSE3;
107    if (*ecx & (1UL << 19))
108      features |= SSE41;
109
110    if (*ecx & (1UL << 27)) { // OSXSAVE
111      const uint64_t mask = xgetbv();
112      if ((mask & 6) == 6) { // SSE and AVX states
113        if (*ecx & (1UL << 28))
114          features |= AVX;
115        if (max_id >= 7) {
116          cpuidex(regs, 7, 0);
117          if (*ebx & (1UL << 5))
118            features |= AVX2;
119          if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
120            if (*ebx & (1UL << 31))
121              features |= AVX512VL;
122            if (*ebx & (1UL << 16))
123              features |= AVX512F;
124          }
125        }
126      }
127    }
128    g_cpu_features = features;
129    return features;
130#else
131    /* How to detect NEON? */
132    return 0;
133#endif
134  }
135}
136
137void blake3_compress_in_place(uint32_t cv[8],
138                              const uint8_t block[BLAKE3_BLOCK_LEN],
139                              uint8_t block_len, uint64_t counter,
140                              uint8_t flags) {
141#if defined(IS_X86)
142  const enum cpu_feature features = get_cpu_features();
143  MAYBE_UNUSED(features);
144#if !defined(BLAKE3_NO_AVX512)
145  if (features & AVX512VL) {
146    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
147    return;
148  }
149#endif
150#if !defined(BLAKE3_NO_SSE41)
151  if (features & SSE41) {
152    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
153    return;
154  }
155#endif
156#if !defined(BLAKE3_NO_SSE2)
157  if (features & SSE2) {
158    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
159    return;
160  }
161#endif
162#endif
163  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
164}
165
166void blake3_compress_xof(const uint32_t cv[8],
167                         const uint8_t block[BLAKE3_BLOCK_LEN],
168                         uint8_t block_len, uint64_t counter, uint8_t flags,
169                         uint8_t out[64]) {
170#if defined(IS_X86)
171  const enum cpu_feature features = get_cpu_features();
172  MAYBE_UNUSED(features);
173#if !defined(BLAKE3_NO_AVX512)
174  if (features & AVX512VL) {
175    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
176    return;
177  }
178#endif
179#if !defined(BLAKE3_NO_SSE41)
180  if (features & SSE41) {
181    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
182    return;
183  }
184#endif
185#if !defined(BLAKE3_NO_SSE2)
186  if (features & SSE2) {
187    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
188    return;
189  }
190#endif
191#endif
192  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
193}
194
/*
 * Hash several inputs at once, forwarding every argument unchanged to the
 * first available implementation.
 *
 * On x86 the order is AVX-512 (requires both AVX512F and AVX512VL), AVX2,
 * SSE4.1, SSE2; each is considered only if not compiled out via its
 * BLAKE3_NO_* macro and reported by get_cpu_features(). If none is taken,
 * the NEON implementation is used when BLAKE3_USE_NEON == 1, otherwise the
 * portable one.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every branch below may be compiled out */

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
246
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when AVX512F and AVX512VL are both reported, 8 for AVX2, 4 for
// SSE4.1 or SSE2, 4 when built with BLAKE3_USE_NEON == 1, and 1 otherwise.
// x86 tiers compiled out via BLAKE3_NO_* are skipped.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu); /* every branch below may be compiled out */

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
278