/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/cddl/boot/zfs/zfssubr.c 199241 2009-11-13 02:50:50Z ps $");

static uint64_t zfs_crc64_table[256];

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
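
/*
 * Each zfs_crc64_table[b] entry is the CRC-64 remainder of the single
 * byte b under the bit-reflected polynomial ZFS_CRC64_POLY, so
 * zap_hash() below can fold one input byte at a time with a single
 * table lookup:
 *
 *	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ byte) & 0xFF];
 */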

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_zbt;		/* uses zio block tail?	*/
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0,	"inherit"},
	{{NULL,			NULL},			0, 0,	"on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"label"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"gang_header"},
	{{fletcher_2_native,	NULL},			0, 1,	"zilog"},
	{{fletcher_2_native,	NULL},			0, 0,	"fletcher2"},
	{{fletcher_4_native,	NULL},			1, 0,	"fletcher4"},
	{{zio_checksum_SHA256,	NULL},			1, 0,	"SHA256"},
};
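
/*
 * The table is indexed by the ZIO_CHECKSUM_* value stored in a block
 * pointer (BP_GET_CHECKSUM), so the row order above must match the
 * on-disk enum.  Only the native-endian function (ci_func[0]) is used
 * by this boot code; zio_checksum_error() returns EINVAL for entries
 * with no function.
 */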

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
};

static int
zio_checksum_error(const blkptr_t *bp, void *data)
{
	zio_cksum_t zc = bp->blk_cksum;
	unsigned int checksum = BP_GET_CHECKSUM(bp);
	uint64_t size = BP_GET_PSIZE(bp);
	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
	zio_cksum_t actual_cksum, expected_cksum;

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
		return (EINVAL);

	if (ci->ci_zbt) {
		expected_cksum = zbt->zbt_cksum;
		zbt->zbt_cksum = zc;
		ci->ci_func[0](data, size, &actual_cksum);
		zbt->zbt_cksum = expected_cksum;
		zc = expected_cksum;
	} else {
		/* ASSERT(!BP_IS_GANG(bp)); */
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
	void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	/* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	/*ASSERT(crc != 0);*/
	/*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
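
/*
 * A minimal, hypothetical caller sketch (the actual lookup code lives
 * elsewhere in the loader and its names may differ): the hash is
 * seeded with the salt stored in the ZAP object itself, and the high
 * bits select the hash-table bucket.
 *
 *	uint64_t h = zap_hash(zap_phys->zap_salt, "bootfs");
 *	bucket = h >> (64 - prefix_len);
 */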

static char *zfs_alloc_temp(size_t sz);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

static void
vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int c;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	//ASSERT(xcount > 0);

	src = cols[VDEV_RAIDZ_P].rc_data;
	dst = cols[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		if (c == x)
			continue;

		ccount = cols[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}
}
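
/*
 * P is a plain XOR across the data columns, so a single missing
 * column x is recovered by XORing P with every surviving data column:
 *
 *	D_x = P ^ D_0 ^ ... ^ D_x-1 ^ D_x+1 ^ ... ^ D_n-1
 *
 * which is exactly what the loops above compute, 64 bits at a time.
 */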

/*
 * These two tables represent powers and logs of 2 in the Galois field
 * GF(2^8) used by RAID-Z (generator polynomial x^8 + x^4 + x^3 + x^2 + 1).
 * The values were computed by repeatedly multiplying by 2.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
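
/*
 * Together the tables implement multiplication in GF(2^8): for
 * nonzero a and b,
 *
 *	a * b = vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *	    vdev_raidz_log2[b]) % 255];
 *
 * vdev_raidz_exp2() below is the special case b = 2^exp, which is all
 * the parity math here needs.
 */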

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	//ASSERT(exp >= 0);
	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
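
/*
 * E.g. vdev_raidz_exp2(a, 1) multiplies a by 2, and
 * vdev_raidz_exp2(a, 255 - n) divides it by 2^n, since the
 * multiplicative group of GF(2^8) has order 255.
 */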

static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte
			 * individually, we are able to handle 8 at
			 * once by generating a mask based on the high
			 * bit in each byte and using that to
			 * conditionally XOR in 0x1d.
			 */
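			/*
			 * Per byte, this is the usual GF(2^8)
			 * multiply-by-2 step:
			 *
			 *	b = (b << 1) ^ ((b & 0x80) ? 0x1d : 0);
			 *
			 * The mask below applies it to all 8 lanes of
			 * the 64-bit word in parallel.
			 */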
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}
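
/*
 * The loop above evaluates Q by Horner's rule, so after processing
 * columns nparity .. acols-1 the Q parity is
 *
 *	Q = 2^(acols-1-nparity) * D_nparity ^ ... ^ 2 * D_acols-2
 *	    ^ D_acols-1
 *
 * with all arithmetic in GF(2^8).  vdev_raidz_reconstruct_q() relies
 * on this per-column weighting to isolate a single missing column.
 */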

static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}


static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
	cols[VDEV_RAIDZ_Q].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */
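
	/*
	 * The code below evaluates A and B in log form.  Note that in
	 * GF(2^8) addition is XOR, so (2^(x - y) + 1) is computed as
	 * (a ^ 1), and tmp = 255 - log2(a ^ 1) is the exponent of its
	 * multiplicative inverse.
	 */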

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
	 */
	cols[VDEV_RAIDZ_P].rc_data = pdata;
	cols[VDEV_RAIDZ_Q].rc_data = qdata;
}

static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	size_t psize = BP_GET_PSIZE(bp);
	vdev_t *kid;
	int unit_shift = vdev->v_ashift;
	int dcols = vdev->v_nchildren;
	int nparity = vdev->v_nparity;
	int missingdata, missingparity;
	int parity_errors, data_errors, unexpected_errors, total_errors;
	int parity_untried;
	uint64_t b = offset >> unit_shift;
	uint64_t s = psize >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, coff;
	int c, c1, bc, col, acols, devidx, asize, n;
	static raidz_col_t cols[16];
	raidz_col_t *rc, *rc1;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);

	acols = (q == 0 ? bc : dcols);
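
	/*
	 * Worked example (hypothetical numbers): on a 5-disk raidz1
	 * (dcols = 5, nparity = 1), a 10-sector block gives
	 * q = 10 / 4 = 2 full rows and r = 2 leftover sectors, so
	 * bc = 3 and acols = 5: the first bc columns carry q + 1
	 * sectors each and the remaining columns carry q.
	 */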
	asize = 0;

	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		cols[c].rc_devidx = col;
		cols[c].rc_offset = coff;
		cols[c].rc_size = (q + (c < bc)) << unit_shift;
		cols[c].rc_data = NULL;
		cols[c].rc_error = 0;
		cols[c].rc_tried = 0;
		cols[c].rc_skipped = 0;
		asize += cols[c].rc_size;
	}

	asize = roundup(asize, (nparity + 1) << unit_shift);

	for (c = 0; c < nparity; c++) {
		cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
	}

	cols[c].rc_data = buf;

	for (c = c + 1; c < acols; c++)
		cols[c].rc_data = (char *)cols[c - 1].rc_data +
		    cols[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that
	 * parity will always be on the same device and, since parity
	 * isn't read during normal operation, that that device's I/O
	 * bandwidth won't be used effectively. We therefore switch
	 * the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a
	 * practical matter unless we juggle the parity between all
	 * devices evenly, we won't see any benefit. Further,
	 * occasional writes that aren't a multiple of the LCM of the
	 * number of children and the minimum stripe width are
	 * sufficient to avoid pessimal behavior.  Unfortunately, this
	 * decision created an implicit on-disk format requirement
	 * that we need to support for all eternity, but only for
	 * single-parity RAID-Z.
	 */
	//ASSERT(acols >= 2);
	//ASSERT(cols[0].rc_size == cols[1].rc_size);

	if (nparity == 1 && (offset & (1ULL << 20))) {
		devidx = cols[0].rc_devidx;
		o = cols[0].rc_offset;
		cols[0].rc_devidx = cols[1].rc_devidx;
		cols[0].rc_offset = cols[1].rc_offset;
		cols[1].rc_devidx = devidx;
		cols[1].rc_offset = o;
	}

	/*
	 * Iterate over the columns in reverse order so that we hit
	 * the parity last -- any errors along the way will force us
	 * to read the parity data.
	 */
	missingdata = 0;
	missingparity = 0;
	for (c = acols - 1; c >= 0; c--) {
		rc = &cols[c];
		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			if (c >= nparity)
				missingdata++;
			else
				missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0
		/*
		 * Too hard for the bootcode
		 */
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			if (c >= nparity)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= nparity || missingdata > 0) {
			if (rc->rc_data)
				rc->rc_error = kid->v_read(kid, NULL,
				    rc->rc_data, rc->rc_offset, rc->rc_size);
			else
				rc->rc_error = ENXIO;
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	parity_errors = 0;
	data_errors = 0;
	unexpected_errors = 0;
	total_errors = 0;
	parity_untried = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];

		if (rc->rc_error) {
			if (c < nparity)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < nparity && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less
	 * likely to occur. If we encounter more errors than we can
	 * repair or all phases fail, we have no choice but to return
	 * an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than
	 * or equal to the number of parity disks read -- attempt to
	 * produce data that has a valid checksum. Naturally, this
	 * case applies in the absence of any errors.
	 */
	if (total_errors <= nparity - parity_untried) {
		switch (data_errors) {
		case 0:
			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 1:
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			//ASSERT(parity_untried == 0);
			//ASSERT(parity_errors < nparity);

			/*
			 * Find the column that reported the error.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			if (cols[VDEV_RAIDZ_P].rc_error == 0) {
				vdev_raidz_reconstruct_p(cols, nparity,
				    acols, c);
			} else {
				//ASSERT(nparity > 1);
				vdev_raidz_reconstruct_q(cols, nparity,
				    acols, c);
			}

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 2:
			/*
			 * Two data column errors require double parity.
			 */
			//ASSERT(nparity == 2);

			/*
			 * Find the two columns that reported errors.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			for (c1 = c++; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			vdev_raidz_reconstruct_pq(cols, nparity, acols,
			    c1, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		default:
			//ASSERT(nparity <= 2);
			//ASSERT(0);
			break;
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	n = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_tried)
			continue;

		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (rc->rc_data)
			rc->rc_error = kid->v_read(kid, NULL,
			    rc->rc_data, rc->rc_offset, rc->rc_size);
		else
			rc->rc_error = ENXIO;
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. Before we attempt combinatorial reconstruction make
	 * sure we have a chance of coming up with the right answer.
	 */
	if (total_errors >= nparity) {
		return (EIO);
	}

	asize = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_size > asize)
			asize = rc->rc_size;
	}
	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity P.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_p(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity Q.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_q(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 &&
	    cols[VDEV_RAIDZ_P].rc_error == 0 &&
	    cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from both P and Q.
		 */
		void *orig, *orig1;
		orig = zfs_alloc_temp(asize);
		orig1 = zfs_alloc_temp(asize);
		for (c = nparity; c < acols - 1; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);

			for (c1 = c + 1; c1 < acols; c1++) {
				rc1 = &cols[c1];

				memcpy(orig1, rc1->rc_data, rc1->rc_size);

				vdev_raidz_reconstruct_pq(cols, nparity,
				    acols, c, c1);

				if (zio_checksum_error(bp, buf) == 0)
					return (0);

				memcpy(rc1->rc_data, orig1, rc1->rc_size);
			}

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	return (EIO);
}