1219089Spjd/*
2219089Spjd * CDDL HEADER START
3219089Spjd *
4219089Spjd * The contents of this file are subject to the terms of the
5219089Spjd * Common Development and Distribution License (the "License").
6219089Spjd * You may not use this file except in compliance with the License.
7219089Spjd *
8219089Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9219089Spjd * or http://www.opensolaris.org/os/licensing.
10219089Spjd * See the License for the specific language governing permissions
11219089Spjd * and limitations under the License.
12219089Spjd *
13219089Spjd * When distributing Covered Code, include this CDDL HEADER in each
14219089Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15219089Spjd * If applicable, add the following below this CDDL HEADER, with the
16219089Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17219089Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18219089Spjd *
19219089Spjd * CDDL HEADER END
20219089Spjd */
21219089Spjd/*
22219089Spjd * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23219089Spjd * Use is subject to license terms.
24219089Spjd */
25219089Spjd
26219089Spjd/*
27219089Spjd * Fletcher Checksums
28219089Spjd * ------------------
29219089Spjd *
30219089Spjd * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
31219089Spjd * recurrence relations:
32219089Spjd *
33219089Spjd *	a  = a    + f
34219089Spjd *	 i    i-1    i-1
35219089Spjd *
36219089Spjd *	b  = b    + a
37219089Spjd *	 i    i-1    i
38219089Spjd *
39219089Spjd *	c  = c    + b		(fletcher-4 only)
40219089Spjd *	 i    i-1    i
41219089Spjd *
42219089Spjd *	d  = d    + c		(fletcher-4 only)
43219089Spjd *	 i    i-1    i
44219089Spjd *
45219089Spjd * Where
46219089Spjd *	a_0 = b_0 = c_0 = d_0 = 0
47219089Spjd * and
48219089Spjd *	f_0 .. f_(n-1) are the input data.
49219089Spjd *
50219089Spjd * Using standard techniques, these translate into the following series:
51219089Spjd *
52219089Spjd *	     __n_			     __n_
53219089Spjd *	     \   |			     \   |
54219089Spjd *	a  =  >     f			b  =  >     i * f
55219089Spjd *	 n   /___|   n - i		 n   /___|	 n - i
56219089Spjd *	     i = 1			     i = 1
57219089Spjd *
58219089Spjd *
59219089Spjd *	     __n_			     __n_
60219089Spjd *	     \   |  i*(i+1)		     \   |  i*(i+1)*(i+2)
61219089Spjd *	c  =  >     ------- f		d  =  >     ------------- f
62219089Spjd *	 n   /___|     2     n - i	 n   /___|	  6	   n - i
63219089Spjd *	     i = 1			     i = 1
64219089Spjd *
65219089Spjd * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
66219089Spjd * Since the additions are done mod (2^64), errors in the high bits may not
67219089Spjd * be noticed.  For this reason, fletcher-2 is deprecated.
68219089Spjd *
69219089Spjd * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
70219089Spjd * A conservative estimate of how big the buffer can get before we overflow
71219089Spjd * can be estimated using f_i = 0xffffffff for all i:
72219089Spjd *
73219089Spjd * % bc
74219089Spjd *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
75219089Spjd * 2264
76219089Spjd *  quit
77219089Spjd * %
78219089Spjd *
79219089Spjd * So blocks of up to 2k will not overflow.  Our largest block size is
80219089Spjd * 128k, which has 32k 4-byte words, so we can compute the largest possible
81219089Spjd * accumulators, then divide by 2^64 to figure the max amount of overflow:
82219089Spjd *
83219089Spjd * % bc
84219089Spjd *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
85219089Spjd *  a/2^64;b/2^64;c/2^64;d/2^64
86219089Spjd * 0
87219089Spjd * 0
88219089Spjd * 1365
89219089Spjd * 11186858
90219089Spjd *  quit
91219089Spjd * %
92219089Spjd *
93219089Spjd * So a and b cannot overflow.  To make sure each bit of input has some
94219089Spjd * effect on the contents of c and d, we can look at what the factors of
95219089Spjd * the coefficients in the equations for c_n and d_n are.  The number of 2s
96219089Spjd * in the factors determines the lowest set bit in the multiplier.  Running
97219089Spjd * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
98219089Spjd * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
99219089Spjd * the 64-bit accumulators, every bit of every f_i effects every accumulator,
100219089Spjd * even for 128k blocks.
101219089Spjd *
102219089Spjd * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
103219089Spjd * we could do our calculations mod (2^32 - 1) by adding in the carries
104219089Spjd * periodically, and store the number of carries in the top 32-bits.
105219089Spjd *
106219089Spjd * --------------------
107219089Spjd * Checksum Performance
108219089Spjd * --------------------
109219089Spjd *
110219089Spjd * There are two interesting components to checksum performance: cached and
111219089Spjd * uncached performance.  With cached data, fletcher-2 is about four times
112219089Spjd * faster than fletcher-4.  With uncached data, the performance difference is
113219089Spjd * negligible, since the cost of a cache fill dominates the processing time.
114219089Spjd * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
115219089Spjd * efficient pass over the data.
116219089Spjd *
117219089Spjd * In normal operation, the data which is being checksummed is in a buffer
118219089Spjd * which has been filled either by:
119219089Spjd *
120219089Spjd *	1. a compression step, which will be mostly cached, or
121219089Spjd *	2. a bcopy() or copyin(), which will be uncached (because the
122219089Spjd *	   copy is cache-bypassing).
123219089Spjd *
124219089Spjd * For both cached and uncached data, both fletcher checksums are much faster
125219089Spjd * than sha-256, and slower than 'off', which doesn't touch the data at all.
126219089Spjd */
127219089Spjd
128219089Spjd#include <sys/types.h>
129219089Spjd#include <sys/sysmacros.h>
130219089Spjd#include <sys/byteorder.h>
131219089Spjd#include <sys/zio.h>
132219089Spjd#include <sys/spa.h>
133219089Spjd
134219089Spjdvoid
135219089Spjdfletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
136219089Spjd{
137219089Spjd	const uint64_t *ip = buf;
138219089Spjd	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
139219089Spjd	uint64_t a0, b0, a1, b1;
140219089Spjd
141219089Spjd	for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
142219089Spjd		a0 += ip[0];
143219089Spjd		a1 += ip[1];
144219089Spjd		b0 += a0;
145219089Spjd		b1 += a1;
146219089Spjd	}
147219089Spjd
148219089Spjd	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
149219089Spjd}
150219089Spjd
151219089Spjdvoid
152219089Spjdfletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
153219089Spjd{
154219089Spjd	const uint64_t *ip = buf;
155219089Spjd	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
156219089Spjd	uint64_t a0, b0, a1, b1;
157219089Spjd
158219089Spjd	for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
159219089Spjd		a0 += BSWAP_64(ip[0]);
160219089Spjd		a1 += BSWAP_64(ip[1]);
161219089Spjd		b0 += a0;
162219089Spjd		b1 += a1;
163219089Spjd	}
164219089Spjd
165219089Spjd	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
166219089Spjd}
167219089Spjd
168219089Spjdvoid
169219089Spjdfletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
170219089Spjd{
171219089Spjd	const uint32_t *ip = buf;
172219089Spjd	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
173219089Spjd	uint64_t a, b, c, d;
174219089Spjd
175219089Spjd	for (a = b = c = d = 0; ip < ipend; ip++) {
176219089Spjd		a += ip[0];
177219089Spjd		b += a;
178219089Spjd		c += b;
179219089Spjd		d += c;
180219089Spjd	}
181219089Spjd
182219089Spjd	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
183219089Spjd}
184219089Spjd
185219089Spjdvoid
186219089Spjdfletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
187219089Spjd{
188219089Spjd	const uint32_t *ip = buf;
189219089Spjd	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
190219089Spjd	uint64_t a, b, c, d;
191219089Spjd
192219089Spjd	for (a = b = c = d = 0; ip < ipend; ip++) {
193219089Spjd		a += BSWAP_32(ip[0]);
194219089Spjd		b += a;
195219089Spjd		c += b;
196219089Spjd		d += c;
197219089Spjd	}
198219089Spjd
199219089Spjd	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
200219089Spjd}
201219089Spjd
202219089Spjdvoid
203219089Spjdfletcher_4_incremental_native(const void *buf, uint64_t size,
204219089Spjd    zio_cksum_t *zcp)
205219089Spjd{
206219089Spjd	const uint32_t *ip = buf;
207219089Spjd	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
208219089Spjd	uint64_t a, b, c, d;
209219089Spjd
210219089Spjd	a = zcp->zc_word[0];
211219089Spjd	b = zcp->zc_word[1];
212219089Spjd	c = zcp->zc_word[2];
213219089Spjd	d = zcp->zc_word[3];
214219089Spjd
215219089Spjd	for (; ip < ipend; ip++) {
216219089Spjd		a += ip[0];
217219089Spjd		b += a;
218219089Spjd		c += b;
219219089Spjd		d += c;
220219089Spjd	}
221219089Spjd
222219089Spjd	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
223219089Spjd}
224219089Spjd
225219089Spjdvoid
226219089Spjdfletcher_4_incremental_byteswap(const void *buf, uint64_t size,
227219089Spjd    zio_cksum_t *zcp)
228219089Spjd{
229219089Spjd	const uint32_t *ip = buf;
230219089Spjd	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
231219089Spjd	uint64_t a, b, c, d;
232219089Spjd
233219089Spjd	a = zcp->zc_word[0];
234219089Spjd	b = zcp->zc_word[1];
235219089Spjd	c = zcp->zc_word[2];
236219089Spjd	d = zcp->zc_word[3];
237219089Spjd
238219089Spjd	for (; ip < ipend; ip++) {
239219089Spjd		a += BSWAP_32(ip[0]);
240219089Spjd		b += a;
241219089Spjd		c += b;
242219089Spjd		d += c;
243219089Spjd	}
244219089Spjd
245219089Spjd	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
246219089Spjd}
247