1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
24 */
25
26#include <sys/simd.h>
27#include <sys/zfs_context.h>
28#include <sys/zfs_impl.h>
29#include <sys/blake3.h>
30
31#include "blake3_impl.h"
32
33#if !defined(OMIT_SIMD) && (defined(__aarch64__) ||  \
34	(defined(__x86_64) && defined(HAVE_SSE2)) || \
35    (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)))
36#define USE_SIMD
37#endif
38
39#ifdef USE_SIMD
40extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
41    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
42    uint64_t counter, uint8_t flags);
43
44extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
45    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
46    uint64_t counter, uint8_t flags, uint8_t out[64]);
47
48extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
49    size_t num_inputs, size_t blocks, const uint32_t key[8],
50    uint64_t counter, boolean_t increment_counter, uint8_t flags,
51    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
52
53static void blake3_compress_in_place_sse2(uint32_t cv[8],
54    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
55    uint64_t counter, uint8_t flags) {
56	kfpu_begin();
57	zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
58	    flags);
59	kfpu_end();
60}
61
62static void blake3_compress_xof_sse2(const uint32_t cv[8],
63    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
64    uint64_t counter, uint8_t flags, uint8_t out[64]) {
65	kfpu_begin();
66	zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
67	    out);
68	kfpu_end();
69}
70
71static void blake3_hash_many_sse2(const uint8_t * const *inputs,
72    size_t num_inputs, size_t blocks, const uint32_t key[8],
73    uint64_t counter, boolean_t increment_counter, uint8_t flags,
74    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
75	kfpu_begin();
76	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
77	    increment_counter, flags, flags_start, flags_end, out);
78	kfpu_end();
79}
80
81static boolean_t blake3_is_sse2_supported(void)
82{
83#if defined(__x86_64)
84	return (kfpu_allowed() && zfs_sse2_available());
85#elif defined(__PPC64__)
86	return (kfpu_allowed() && zfs_vsx_available());
87#else
88	return (kfpu_allowed());
89#endif
90}
91
92const blake3_ops_t blake3_sse2_impl = {
93	.compress_in_place = blake3_compress_in_place_sse2,
94	.compress_xof = blake3_compress_xof_sse2,
95	.hash_many = blake3_hash_many_sse2,
96	.is_supported = blake3_is_sse2_supported,
97	.degree = 4,
98	.name = "sse2"
99};
100#endif
101
102#ifdef USE_SIMD
103
104extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
105    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
106    uint64_t counter, uint8_t flags);
107
108extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
109    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
110    uint64_t counter, uint8_t flags, uint8_t out[64]);
111
112extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
113    size_t num_inputs, size_t blocks, const uint32_t key[8],
114    uint64_t counter, boolean_t increment_counter, uint8_t flags,
115    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
116
117static void blake3_compress_in_place_sse41(uint32_t cv[8],
118    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
119    uint64_t counter, uint8_t flags) {
120	kfpu_begin();
121	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
122	    flags);
123	kfpu_end();
124}
125
126static void blake3_compress_xof_sse41(const uint32_t cv[8],
127    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
128    uint64_t counter, uint8_t flags, uint8_t out[64]) {
129	kfpu_begin();
130	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
131	    out);
132	kfpu_end();
133}
134
135static void blake3_hash_many_sse41(const uint8_t * const *inputs,
136    size_t num_inputs, size_t blocks, const uint32_t key[8],
137    uint64_t counter, boolean_t increment_counter, uint8_t flags,
138    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
139	kfpu_begin();
140	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
141	    increment_counter, flags, flags_start, flags_end, out);
142	kfpu_end();
143}
144
145static boolean_t blake3_is_sse41_supported(void)
146{
147#if defined(__x86_64)
148	return (kfpu_allowed() && zfs_sse4_1_available());
149#elif defined(__PPC64__)
150	return (kfpu_allowed() && zfs_vsx_available());
151#else
152	return (kfpu_allowed());
153#endif
154}
155
156const blake3_ops_t blake3_sse41_impl = {
157	.compress_in_place = blake3_compress_in_place_sse41,
158	.compress_xof = blake3_compress_xof_sse41,
159	.hash_many = blake3_hash_many_sse41,
160	.is_supported = blake3_is_sse41_supported,
161	.degree = 4,
162	.name = "sse41"
163};
164#endif
165
/*
 * AVX2 backend (x86_64 only).
 *
 * This section is gated on USE_SIMD in addition to the ISA macros.  The ops
 * table below borrows the sse41 single-block wrappers, which are only
 * defined when USE_SIMD is set, and blake3_impls[] lists this backend only
 * inside its own USE_SIMD section.  Without the gate, a build that defines
 * OMIT_SIMD while HAVE_SSE4_1 and HAVE_AVX2 are set would reference
 * undeclared functions here.
 */
#if defined(USE_SIMD) && defined(__x86_64) && defined(HAVE_SSE4_1) && \
	defined(HAVE_AVX2)
extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
    size_t num_inputs, size_t blocks, const uint32_t key[8],
    uint64_t counter, boolean_t increment_counter, uint8_t flags,
    uint8_t flags_start, uint8_t flags_end, uint8_t *out);

/*
 * hash_many hook of the avx2 backend: forwards every argument unchanged to
 * zfs_blake3_hash_many_avx2(), with the call bracketed by
 * kfpu_begin()/kfpu_end().
 */
static void
blake3_hash_many_avx2(const uint8_t * const *inputs, size_t num_inputs,
    size_t blocks, const uint32_t key[8], uint64_t counter,
    boolean_t increment_counter, uint8_t flags, uint8_t flags_start,
    uint8_t flags_end, uint8_t *out)
{
	kfpu_begin();
	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
	    increment_counter, flags, flags_start, flags_end, out);
	kfpu_end();
}

/*
 * is_supported hook of the avx2 backend.  SSE4.1 is checked as well because
 * the single-block operations of this backend are the sse41 ones.
 */
static boolean_t
blake3_is_avx2_supported(void)
{
	return (kfpu_allowed() && zfs_sse4_1_available() &&
	    zfs_avx2_available());
}

/*
 * Operations table of the "avx2" backend.  Only hash_many is AVX2 specific;
 * compress_in_place and compress_xof reuse the sse41 wrappers.
 */
const blake3_ops_t
blake3_avx2_impl = {
	.compress_in_place = blake3_compress_in_place_sse41,
	.compress_xof = blake3_compress_xof_sse41,
	.hash_many = blake3_hash_many_avx2,
	.is_supported = blake3_is_avx2_supported,
	.degree = 8,
	.name = "avx2"
};
#endif
198
199#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
200extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
201    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
202    uint64_t counter, uint8_t flags);
203
204extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
205    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
206    uint64_t counter, uint8_t flags, uint8_t out[64]);
207
208extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
209    size_t num_inputs, size_t blocks, const uint32_t key[8],
210    uint64_t counter, boolean_t increment_counter, uint8_t flags,
211    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
212
213static void blake3_compress_in_place_avx512(uint32_t cv[8],
214    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
215    uint64_t counter, uint8_t flags) {
216	kfpu_begin();
217	zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
218	    flags);
219	kfpu_end();
220}
221
222static void blake3_compress_xof_avx512(const uint32_t cv[8],
223    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
224    uint64_t counter, uint8_t flags, uint8_t out[64]) {
225	kfpu_begin();
226	zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
227	    out);
228	kfpu_end();
229}
230
231static void blake3_hash_many_avx512(const uint8_t * const *inputs,
232    size_t num_inputs, size_t blocks, const uint32_t key[8],
233    uint64_t counter, boolean_t increment_counter, uint8_t flags,
234    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
235	kfpu_begin();
236	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
237	    increment_counter, flags, flags_start, flags_end, out);
238	kfpu_end();
239}
240
241static boolean_t blake3_is_avx512_supported(void)
242{
243	return (kfpu_allowed() && zfs_avx512f_available() &&
244	    zfs_avx512vl_available());
245}
246
247const blake3_ops_t blake3_avx512_impl = {
248	.compress_in_place = blake3_compress_in_place_avx512,
249	.compress_xof = blake3_compress_xof_avx512,
250	.hash_many = blake3_hash_many_avx512,
251	.is_supported = blake3_is_avx512_supported,
252	.degree = 16,
253	.name = "avx512"
254};
255#endif
256
257extern const blake3_ops_t blake3_generic_impl;
258
259static const blake3_ops_t *const blake3_impls[] = {
260	&blake3_generic_impl,
261#ifdef USE_SIMD
262#if defined(__aarch64__) || \
263	(defined(__x86_64) && defined(HAVE_SSE2)) || \
264	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
265	&blake3_sse2_impl,
266#endif
267#if defined(__aarch64__) || \
268	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
269	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
270	&blake3_sse41_impl,
271#endif
272#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
273	&blake3_avx2_impl,
274#endif
275#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
276	&blake3_avx512_impl,
277#endif
278#endif
279};
280
281/* use the generic implementation functions */
282#define	IMPL_NAME		"blake3"
283#define	IMPL_OPS_T		blake3_ops_t
284#define	IMPL_ARRAY		blake3_impls
285#define	IMPL_GET_OPS		blake3_get_ops
286#define	ZFS_IMPL_OPS		zfs_blake3_ops
287#include <generic_impl.c>
288
289#ifdef _KERNEL
290void **blake3_per_cpu_ctx;
291
292void
293blake3_per_cpu_ctx_init(void)
294{
295	/*
296	 * Create "The Godfather" ptr to hold all blake3 ctx
297	 */
298	blake3_per_cpu_ctx = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP);
299	for (int i = 0; i < max_ncpus; i++) {
300		blake3_per_cpu_ctx[i] = kmem_alloc(sizeof (BLAKE3_CTX),
301		    KM_SLEEP);
302	}
303}
304
305void
306blake3_per_cpu_ctx_fini(void)
307{
308	for (int i = 0; i < max_ncpus; i++) {
309		memset(blake3_per_cpu_ctx[i], 0, sizeof (BLAKE3_CTX));
310		kmem_free(blake3_per_cpu_ctx[i], sizeof (BLAKE3_CTX));
311	}
312	memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *));
313	kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *));
314}
315
316#define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")
317
318#if defined(__linux__)
319
320static int
321blake3_param_get(char *buffer, zfs_kernel_param_t *unused)
322{
323	const uint32_t impl = IMPL_READ(generic_impl_chosen);
324	char *fmt;
325	int cnt = 0;
326
327	/* cycling */
328	fmt = IMPL_FMT(impl, IMPL_CYCLE);
329	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle");
330
331	/* list fastest */
332	fmt = IMPL_FMT(impl, IMPL_FASTEST);
333	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");
334
335	/* list all supported implementations */
336	generic_impl_init();
337	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
338		fmt = IMPL_FMT(impl, i);
339		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
340		    blake3_impls[i]->name);
341	}
342
343	return (cnt);
344}
345
346static int
347blake3_param_set(const char *val, zfs_kernel_param_t *unused)
348{
349	(void) unused;
350	return (generic_impl_setname(val));
351}
352
353#elif defined(__FreeBSD__)
354
355#include <sys/sbuf.h>
356
357static int
358blake3_param(ZFS_MODULE_PARAM_ARGS)
359{
360	int err;
361
362	generic_impl_init();
363	if (req->newptr == NULL) {
364		const uint32_t impl = IMPL_READ(generic_impl_chosen);
365		const int init_buflen = 64;
366		const char *fmt;
367		struct sbuf *s;
368
369		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
370
371		/* cycling */
372		fmt = IMPL_FMT(impl, IMPL_CYCLE);
373		(void) sbuf_printf(s, fmt, "cycle");
374
375		/* list fastest */
376		fmt = IMPL_FMT(impl, IMPL_FASTEST);
377		(void) sbuf_printf(s, fmt, "fastest");
378
379		/* list all supported implementations */
380		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
381			fmt = IMPL_FMT(impl, i);
382			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
383		}
384
385		err = sbuf_finish(s);
386		sbuf_delete(s);
387
388		return (err);
389	}
390
391	char buf[16];
392
393	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
394	if (err) {
395		return (err);
396	}
397
398	return (-generic_impl_setname(buf));
399}
400#endif
401
402#undef IMPL_FMT
403
404ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl,
405    blake3_param_set, blake3_param_get, ZMOD_RW, \
406	"Select BLAKE3 implementation.");
407#endif
408