/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (C) 2016 Gvozden Nečković. All rights reserved.
 */
/*
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Fletcher Checksums
 * ------------------
 *
 * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
 * recurrence relations:
 *
 *	a  = a    + f
 *	 i    i-1    i-1
 *
 *	b  = b    + a
 *	 i    i-1    i
 *
 *	c  = c    + b		(fletcher-4 only)
 *	 i    i-1    i
 *
 *	d  = d    + c		(fletcher-4 only)
 *	 i    i-1    i
 *
 * Where
 *	a_0 = b_0 = c_0 = d_0 = 0
 * and
 *	f_0 .. f_(n-1) are the input data.
 *
 * Using standard techniques, these translate into the following series:
 *
 *	     __n_			     __n_
 *	     \   |			     \   |
 *	a  =  >     f			b  =  >     i * f
 *	 n   /___|   n - i		 n   /___|	 n - i
 *	     i = 1			     i = 1
 *
 *
 *	     __n_			     __n_
 *	     \   |  i*(i+1)		     \   |  i*(i+1)*(i+2)
 *	c  =  >     ------- f		d  =  >     ------------- f
 *	 n   /___|     2     n - i	 n   /___|	  6	   n - i
 *	     i = 1			     i = 1
 *
 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed.  For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative estimate of how big the buffer can get before we overflow
 * can be made using f_i = 0xffffffff for all i:
 *
 * % bc
 *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 *  quit
 * %
 *
 * So blocks of up to 2k will not overflow.  Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *  a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 *  quit
 * %
 *
 * So a and b cannot overflow.  To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are.  The number of 2s
 * in the factors determines the lowest set bit in the multiplier.  Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32-bits.
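 *
 * As an illustrative sketch only (no such fletcher4c exists here), the
 * periodic fold for one accumulator could use the identity
 * 2^32 == 1 (mod 2^32 - 1):
 *
 *	hi = a >> 32;			(carries out of the low 32 bits)
 *	a = (a & 0xffffffff) + hi;	(fold them back in, mod 2^32 - 1)
 *	carries += hi;			(tally to keep in the top 32 bits)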
 *
 * --------------------
 * Checksum Performance
 * --------------------
 *
 * There are two interesting components to checksum performance: cached and
 * uncached performance.  With cached data, fletcher-2 is about four times
 * faster than fletcher-4.  With uncached data, the performance difference is
 * negligible, since the cost of a cache fill dominates the processing time.
 * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 * efficient pass over the data.
 *
 * In normal operation, the data which is being checksummed is in a buffer
 * which has been filled either by:
 *
 *	1. a compression step, which will be mostly cached, or
 *	2. a memcpy() or copyin(), which will be uncached
 *	   (because the copy is cache-bypassing).
 *
 * For both cached and uncached data, both fletcher checksums are much faster
 * than sha-256, and slower than 'off', which doesn't touch the data at all.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/simd.h>
#include <sys/spa.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

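/*
 * Minimum number of bytes handled by the accelerated implementations;
 * buffers and tails smaller than this fall back to the scalar code.
 */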
#define	FLETCHER_MIN_SIMD_SIZE	64

static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
    const void *buf, uint64_t size);
static boolean_t fletcher_4_scalar_valid(void);

static const fletcher_4_ops_t fletcher_4_scalar_ops = {
	.init_native = fletcher_4_scalar_init,
	.fini_native = fletcher_4_scalar_fini,
	.compute_native = fletcher_4_scalar_native,
	.init_byteswap = fletcher_4_scalar_init,
	.fini_byteswap = fletcher_4_scalar_fini,
	.compute_byteswap = fletcher_4_scalar_byteswap,
	.valid = fletcher_4_scalar_valid,
	.uses_fpu = B_FALSE,
	.name = "scalar"
};

static fletcher_4_ops_t fletcher_4_fastest_impl = {
	.name = "fastest",
	.valid = fletcher_4_scalar_valid
};

static const fletcher_4_ops_t *fletcher_4_impls[] = {
	&fletcher_4_scalar_ops,
	&fletcher_4_superscalar_ops,
	&fletcher_4_superscalar4_ops,
#if defined(HAVE_SSE2)
	&fletcher_4_sse2_ops,
#endif
#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
	&fletcher_4_ssse3_ops,
#endif
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
	&fletcher_4_avx2_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)
	&fletcher_4_avx512f_ops,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)
	&fletcher_4_avx512bw_ops,
#endif
#if defined(__aarch64__) && !defined(__FreeBSD__)
	&fletcher_4_aarch64_neon_ops,
#endif
};

/* Hold all supported implementations */
static uint32_t fletcher_4_supp_impls_cnt = 0;
static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];

/* Select fletcher4 implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_SCALAR	(0)

static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;

#define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))

static struct fletcher_4_impl_selector {
	const char	*fis_name;
	uint32_t	fis_sel;
} fletcher_4_impl_selectors[] = {
	{ "cycle",	IMPL_CYCLE },
	{ "fastest",	IMPL_FASTEST },
	{ "scalar",	IMPL_SCALAR }
};

#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;

static struct fletcher_4_kstat {
	uint64_t native;
	uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
#endif

/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;

void
fletcher_init(zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

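	/*
	 * fletcher-2 keeps two independent (a, b) lanes, one over the even
	 * and one over the odd 64-bit words, so the dependent b += a update
	 * chains can run in parallel.
	 */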
	for (; ip < ipend; ip += 2) {
		a0 += ip[0];
		a1 += ip[1];
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

void
fletcher_2_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	(void) ctx_template;
	fletcher_init(zcp);
	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
}

int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;

	const uint64_t *ip = buf;
	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
	uint64_t a0, b0, a1, b1;

	a0 = zcp->zc_word[0];
	a1 = zcp->zc_word[1];
	b0 = zcp->zc_word[2];
	b1 = zcp->zc_word[3];

	for (; ip < ipend; ip += 2) {
		a0 += BSWAP_64(ip[0]);
		a1 += BSWAP_64(ip[1]);
		b0 += a0;
		b1 += a1;
	}

	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
	return (0);
}

void
fletcher_2_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	(void) ctx_template;
	fletcher_init(zcp);
	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}

static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
}

static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
}

static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a, b, c, d;

	a = ctx->scalar.zc_word[0];
	b = ctx->scalar.zc_word[1];
	c = ctx->scalar.zc_word[2];
	d = ctx->scalar.zc_word[3];

	for (; ip < ipend; ip++) {
		a += BSWAP_32(ip[0]);
		b += a;
		c += b;
		d += c;
	}

	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
}

static boolean_t
fletcher_4_scalar_valid(void)
{
	return (B_TRUE);
}

int
fletcher_4_impl_set(const char *val)
{
	int err = -EINVAL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
	size_t i, val_len;

	val_len = strlen(val);
	while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
		val_len--;

	/* check mandatory implementations */
	for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
		const char *name = fletcher_4_impl_selectors[i].fis_name;

		if (val_len == strlen(name) &&
		    strncmp(val, name, val_len) == 0) {
			impl = fletcher_4_impl_selectors[i].fis_sel;
			err = 0;
			break;
		}
	}

	if (err != 0 && fletcher_4_initialized) {
		/* check all supported implementations */
		for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
			const char *name = fletcher_4_supp_impls[i]->name;

			if (val_len == strlen(name) &&
			    strncmp(val, name, val_len) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}

	if (err == 0) {
		atomic_swap_32(&fletcher_4_impl_chosen, impl);
		membar_producer();
	}

	return (err);
}

/*
 * Return the Fletcher 4 operations to use for checksums.  When a SIMD
 * implementation is not allowed in the current context, fall back to
 * the fastest generic implementation.
 */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
	if (!kfpu_allowed())
		return (&fletcher_4_superscalar4_ops);

	const fletcher_4_ops_t *ops = NULL;
	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(fletcher_4_initialized);
		ops = &fletcher_4_fastest_impl;
		break;
	case IMPL_CYCLE:
		/* Cycle through supported implementations */
		ASSERT(fletcher_4_initialized);
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
		static uint32_t cycle_count = 0;
		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
		ops = fletcher_4_supp_impls[idx];
		break;
	default:
		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
		ops = fletcher_4_supp_impls[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

static inline void
fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	if (ops->uses_fpu == B_TRUE) {
		kfpu_begin();
	}
	ops->init_native(&ctx);
	ops->compute_native(&ctx, buf, size);
	ops->fini_native(&ctx, zcp);
	if (ops->uses_fpu == B_TRUE) {
		kfpu_end();
	}
}

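/*
 * Checksum the largest prefix that is a multiple of FLETCHER_MIN_SIMD_SIZE
 * using the selected (possibly SIMD) implementation, then finish any
 * remaining tail words with the scalar implementation.
 */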
void
fletcher_4_native(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	(void) ctx_template;
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0)
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    buf, size);
	} else {
		fletcher_4_native_impl(buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
	}
}

void
fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
}

static inline void
fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();

	if (ops->uses_fpu == B_TRUE) {
		kfpu_begin();
	}
	ops->init_byteswap(&ctx);
	ops->compute_byteswap(&ctx, buf, size);
	ops->fini_byteswap(&ctx, zcp);
	if (ops->uses_fpu == B_TRUE) {
		kfpu_end();
	}
}

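/* Same prefix/tail split as fletcher_4_native(), byteswapping each word. */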
void
fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	(void) ctx_template;
	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    buf, size);
	} else {
		fletcher_4_byteswap_impl(buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
	}
}

/* Incremental Fletcher 4 */

#define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)

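/*
 * Appending a chunk of c1 new words feeds each old accumulator into the
 * next one once per new word, so the running (a, b, c, d) advance by the
 * coefficients c1, c2 = c1*(c1+1)/2 and c3 = c1*(c1+1)*(c1+2)/6 before
 * the chunk's own checksum is added in.
 */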
static inline void
fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
    const zio_cksum_t *nzcp)
{
	const uint64_t c1 = size / sizeof (uint32_t);
	const uint64_t c2 = c1 * (c1 + 1) / 2;
	const uint64_t c3 = c2 * (c1 + 2) / 3;

	/*
	 * The value of 'c3' overflows on buffer sizes close to 16MiB. For
	 * that reason we split the incremental fletcher4 computation of
	 * large buffers into steps of ZFS_FLETCHER_4_INC_MAX_SIZE bytes.
	 */
	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);

	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
	    c2 * zcp->zc_word[0];
	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
	zcp->zc_word[0] += nzcp->zc_word[0];
}

static inline void
fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
    zio_cksum_t *zcp)
{
	while (size > 0) {
		zio_cksum_t nzc;
		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);

		if (native)
			fletcher_4_native(buf, len, NULL, &nzc);
		else
			fletcher_4_byteswap(buf, len, NULL, &nzc);

		fletcher_4_incremental_combine(zcp, len, &nzc);

		size -= len;
		buf = (const char *)buf + len;
	}
}

int
fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;
	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
	return (0);
}

int
fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
{
	zio_cksum_t *zcp = data;
	/* Use scalar impl to directly update cksum of small blocks */
	if (size < SPA_MINBLOCKSIZE)
		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
	else
		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
	return (0);
}
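
/*
 * Illustrative use of the incremental interface (a sketch; chunk sizes
 * are assumed to be multiples of sizeof (uint32_t)):
 *
 *	zio_cksum_t zc;
 *	fletcher_init(&zc);
 *	(void) fletcher_4_incremental_native(buf, len1, &zc);
 *	(void) fletcher_4_incremental_native((char *)buf + len1, len2, &zc);
 *
 * zc now equals the one-shot fletcher_4_native() of the whole buffer.
 */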

#if defined(_KERNEL)
/*
 * Fletcher 4 kstats
 */
static int
fletcher_4_kstat_headers(char *buf, size_t size)
{
	ssize_t off = 0;

	off += snprintf(buf + off, size, "%-17s", "implementation");
	off += snprintf(buf + off, size - off, "%-15s", "native");
	(void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");

	return (0);
}

static int
fletcher_4_kstat_data(char *buf, size_t size, void *data)
{
	struct fletcher_4_kstat *fastest_stat =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
	ssize_t off = 0;

	if (curr_stat == fastest_stat) {
		off += snprintf(buf + off, size - off, "%-17s", "fastest");
		off += snprintf(buf + off, size - off, "%-15s",
		    fletcher_4_supp_impls[fastest_stat->native]->name);
		(void) snprintf(buf + off, size - off, "%-15s\n",
		    fletcher_4_supp_impls[fastest_stat->byteswap]->name);
	} else {
		ptrdiff_t id = curr_stat - fletcher_4_stat_data;

		off += snprintf(buf + off, size - off, "%-17s",
		    fletcher_4_supp_impls[id]->name);
		off += snprintf(buf + off, size - off, "%-15llu",
		    (u_longlong_t)curr_stat->native);
		(void) snprintf(buf + off, size - off, "%-15llu\n",
		    (u_longlong_t)curr_stat->byteswap);
	}

	return (0);
}

static void *
fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
{
	if (n <= fletcher_4_supp_impls_cnt)
		ksp->ks_private = (void *) (fletcher_4_stat_data + n);
	else
		ksp->ks_private = NULL;

	return (ksp->ks_private);
}
#endif

#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
{									  \
	fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
	fletcher_4_fastest_impl.uses_fpu = src->uses_fpu;		  \
}

#define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(1))		/* 1ms */

typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
					zio_cksum_t *);

#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
	struct fletcher_4_kstat *fastest_stat =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	hrtime_t start;
	uint64_t run_bw, run_time_ns, best_run = 0;
	zio_cksum_t zc;
	uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);

	fletcher_checksum_func_t *fletcher_4_test = native ?
	    fletcher_4_native : fletcher_4_byteswap;

	for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
		struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
		uint64_t run_count = 0;

		/* temporarily set an implementation */
		fletcher_4_impl_chosen = i;

		kpreempt_disable();
		start = gethrtime();
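		/* Batch 32 runs per clock read to amortize gethrtime() cost. */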
		do {
			for (l = 0; l < 32; l++, run_count++)
				fletcher_4_test(data, data_size, NULL, &zc);

			run_time_ns = gethrtime() - start;
		} while (run_time_ns < FLETCHER_4_BENCH_NS);
		kpreempt_enable();

		run_bw = data_size * run_count * NANOSEC;
		run_bw /= run_time_ns;	/* B/s */

		if (native)
			stat->native = run_bw;
		else
			stat->byteswap = run_bw;

		if (run_bw > best_run) {
			best_run = run_bw;

			if (native) {
				fastest_stat->native = i;
				FLETCHER_4_FASTEST_FN_COPY(native,
				    fletcher_4_supp_impls[i]);
			} else {
				fastest_stat->byteswap = i;
				FLETCHER_4_FASTEST_FN_COPY(byteswap,
				    fletcher_4_supp_impls[i]);
			}
		}
	}

	/* restore original selection */
	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
#endif /* _KERNEL */

/*
 * Initialize and benchmark all supported implementations.
 */
static void
fletcher_4_benchmark(void)
{
	fletcher_4_ops_t *curr_impl;
	int i, c;

	/* Move supported implementations into fletcher_4_supp_impls */
	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];

		if (curr_impl->valid && curr_impl->valid())
			fletcher_4_supp_impls[c++] = curr_impl;
	}
	membar_producer();	/* complete fletcher_4_supp_impls[] init */
	fletcher_4_supp_impls_cnt = c;	/* number of supported impls */

#if defined(_KERNEL)
	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128KiB */
	char *databuf = vmem_alloc(data_size, KM_SLEEP);

	for (i = 0; i < data_size / sizeof (uint64_t); i++)
		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */

	fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);

	vmem_free(databuf, data_size);
#else
	/*
	 * Skip the benchmark in user space to avoid impacting libzpool
	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
	 * is assumed to be the fastest and used by default.
	 */
	memcpy(&fletcher_4_fastest_impl,
	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
	    sizeof (fletcher_4_fastest_impl));
	fletcher_4_fastest_impl.name = "fastest";
	membar_producer();
#endif /* _KERNEL */
}

void
fletcher_4_init(void)
{
	/* Determine the fastest available implementation. */
	fletcher_4_benchmark();

#if defined(_KERNEL)
	/* Install kstats for all implementations */
	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	if (fletcher_4_kstat != NULL) {
		fletcher_4_kstat->ks_data = NULL;
		fletcher_4_kstat->ks_ndata = UINT32_MAX;
		kstat_set_raw_ops(fletcher_4_kstat,
		    fletcher_4_kstat_headers,
		    fletcher_4_kstat_data,
		    fletcher_4_kstat_addr);
		kstat_install(fletcher_4_kstat);
	}
#endif

	/* Finish initialization */
	fletcher_4_initialized = B_TRUE;
}

void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat != NULL) {
		kstat_delete(fletcher_4_kstat);
		fletcher_4_kstat = NULL;
	}
#endif
}

/* ABD adapters */

static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
	cdp->acd_private = (void *) ops;

	if (ops->uses_fpu == B_TRUE) {
		kfpu_begin();
	}
	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
		ops->init_native(cdp->acd_ctx);
	else
		ops->init_byteswap(cdp->acd_ctx);
}

static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;

	ASSERT(ops);

	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
	else
		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);

	if (ops->uses_fpu == B_TRUE) {
		kfpu_end();
	}
}

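/*
 * Fold the in-flight SIMD context into the checksum, then continue on
 * zcp with the scalar incremental code for the sub-SIMD-size tail.
 */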
static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
    zio_abd_checksum_data_t *cdp)
{
	zio_cksum_t *zcp = cdp->acd_zcp;

	ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);

	abd_fletcher_4_fini(cdp);
	cdp->acd_private = (void *)&fletcher_4_scalar_ops;

	if (native)
		fletcher_4_incremental_native(data, size, zcp);
	else
		fletcher_4_incremental_byteswap(data, size, zcp);
}

static int
abd_fletcher_4_iter(void *data, size_t size, void *private)
{
	zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (asize > 0) {
		if (native)
			ops->compute_native(ctx, data, asize);
		else
			ops->compute_byteswap(ctx, data, asize);

		size -= asize;
		data = (char *)data + asize;
	}

	if (size > 0) {
		ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
		/* At this point we have to switch to the scalar impl */
		abd_fletcher_4_simd2scalar(native, data, size, cdp);
	}

	return (0);
}

zio_abd_checksum_func_t fletcher_4_abd_ops = {
	.acf_init = abd_fletcher_4_init,
	.acf_fini = abd_fletcher_4_fini,
	.acf_iter = abd_fletcher_4_iter
};

#if defined(_KERNEL)

#define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")

#if defined(__linux__)

static int
fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
{
	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
	char *fmt;
	int cnt = 0;

	/* list fastest */
	fmt = IMPL_FMT(impl, IMPL_FASTEST);
	cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest");

	/* list all supported implementations */
	for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
		fmt = IMPL_FMT(impl, i);
		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
		    fletcher_4_supp_impls[i]->name);
	}

	return (cnt);
}

static int
fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
{
	return (fletcher_4_impl_set(val));
}

#else

#include <sys/sbuf.h>

static int
fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
{
	int err;

	if (req->newptr == NULL) {
		const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
		const int init_buflen = 64;
		const char *fmt;
		struct sbuf *s;

		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);

		/* list fastest */
		fmt = IMPL_FMT(impl, IMPL_FASTEST);
		(void) sbuf_printf(s, fmt, "fastest");

		/* list all supported implementations */
		for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
			fmt = IMPL_FMT(impl, i);
			(void) sbuf_printf(s, fmt,
			    fletcher_4_supp_impls[i]->name);
		}

		err = sbuf_finish(s);
		sbuf_delete(s);

		return (err);
	}

	char buf[16];

	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
	if (err)
		return (err);
	return (-fletcher_4_impl_set(buf));
}

#endif

#undef IMPL_FMT

/*
 * Choose a fletcher 4 implementation in ZFS.
 * Users can choose "cycle" to exercise all implementations, but this is
 * for testing purposes and therefore can only be set from user space.
 */
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
    fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
	"Select fletcher 4 implementation.");
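
/*
 * For example, on Linux (an illustrative session; the list of available
 * implementations depends on the CPU):
 *
 *	# cat /sys/module/zfs/parameters/zfs_fletcher_4_impl
 *	[fastest] scalar superscalar superscalar4 sse2 ssse3 avx2
 *	# echo scalar > /sys/module/zfs/parameters/zfs_fletcher_4_impl
 */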

EXPORT_SYMBOL(fletcher_init);
EXPORT_SYMBOL(fletcher_2_incremental_native);
EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_init);
EXPORT_SYMBOL(fletcher_4_fini);
EXPORT_SYMBOL(fletcher_2_native);
EXPORT_SYMBOL(fletcher_2_byteswap);
EXPORT_SYMBOL(fletcher_4_native);
EXPORT_SYMBOL(fletcher_4_native_varsize);
EXPORT_SYMBOL(fletcher_4_byteswap);
EXPORT_SYMBOL(fletcher_4_incremental_native);
EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_abd_ops);
#endif