1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD$");
28
29#include <lz4.h>
30
/* CRC64 lookup table for zap_hash(); filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Assertions are compiled out to no-ops in this file. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* Fatal error: print the message and spin forever (no recovery path here). */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
43
44static void
45zfs_init_crc(void)
46{
47	int i, j;
48	uint64_t *ct;
49
50	/*
51	 * Calculate the crc64 table (used for the zap hash
52	 * function).
53	 */
54	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56		for (i = 0; i < 256; i++)
57			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59	}
60}
61
/*
 * Checksum vector for the "off" algorithm: stores an all-zero checksum,
 * ignoring the data, size, and context template.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68
69/*
70 * Signature for checksum functions.
71 */
72typedef void zio_checksum_t(const void *data, uint64_t size,
73    const void *ctx_template, zio_cksum_t *zcp);
74typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
75typedef void zio_checksum_tmpl_free_t(void *ctx_template);
76
77typedef enum zio_checksum_flags {
78	/* Strong enough for metadata? */
79	ZCHECKSUM_FLAG_METADATA = (1 << 1),
80	/* ZIO embedded checksum */
81	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
82	/* Strong enough for dedup (without verification)? */
83	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
84	/* Uses salt value */
85	ZCHECKSUM_FLAG_SALTED = (1 << 4),
86	/* Strong enough for nopwrite? */
87	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
88} zio_checksum_flags_t;
89
90/*
91 * Information about each checksum function.
92 */
93typedef struct zio_checksum_info {
94	/* checksum function for each byteorder */
95	zio_checksum_t			*ci_func[2];
96	zio_checksum_tmpl_init_t	*ci_tmpl_init;
97	zio_checksum_tmpl_free_t	*ci_tmpl_free;
98	zio_checksum_flags_t		ci_flags;
99	const char			*ci_name;	/* descriptive name */
100} zio_checksum_info_t;
101
102#include "blkptr.c"
103
104#include "fletcher.c"
105#include "sha256.c"
106#include "skein_zfs.c"
107
#ifdef HAS_ZSTD_ZFS
/* zstd decompression is provided by a separate translation unit. */
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif
112
/*
 * Checksum vectors, indexed by checksum type (see zio_checksum_verify()).
 * Entry order is significant.  ci_func[0] is the native-endian function,
 * ci_func[1] the byteswapped one.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
145
146/*
147 * Common signature for all zio compress/decompress functions.
148 */
149typedef size_t zio_compress_func_t(void *src, void *dst,
150    size_t s_len, size_t d_len, int);
151typedef int zio_decompress_func_t(void *src, void *dst,
152    size_t s_len, size_t d_len, int);
153
154/*
155 * Information about each compression function.
156 */
157typedef struct zio_compress_info {
158	zio_compress_func_t	*ci_compress;	/* compression function */
159	zio_decompress_func_t	*ci_decompress;	/* decompression function */
160	int			ci_level;	/* level parameter */
161	const char		*ci_name;	/* algorithm name */
162} zio_compress_info_t;
163
164#include "lzjb.c"
165#include "zle.c"
166
167/*
168 * Compression vectors.
169 */
/*
 * Indexed by compression type (see zio_decompress_data()); entry order is
 * significant.  Only decompression vectors are provided here, so entries
 * with a NULL ci_decompress are rejected at read time.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};
191
/*
 * Byte-swap every 64-bit word of a buffer in place.  size is in bytes and
 * must be a multiple of 8; the ASSERT is compiled out here, so the
 * requirement rests on the caller.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t, not int: avoid signed/unsigned mismatch */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
204
205/*
206 * Set the external verifier for a gang block based on <vdev, offset, txg>,
207 * a tuple which is guaranteed to be unique for the life of the pool.
208 */
209static void
210zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
211{
212	const dva_t *dva = BP_IDENTITY(bp);
213	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
214
215	ASSERT(BP_IS_GANG(bp));
216
217	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
218}
219
220/*
221 * Set the external verifier for a label block based on its offset.
222 * The vdev is implicit, and the txg is unknowable at pool open time --
223 * hence the logic in vdev_uberblock_load() to find the most recent copy.
224 */
225static void
226zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
227{
228	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
229}
230
231/*
232 * Calls the template init function of a checksum which supports context
233 * templates and installs the template into the spa_t.
234 */
235static void
236zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
237{
238	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
239
240	if (ci->ci_tmpl_init == NULL)
241		return;
242
243	if (spa->spa_cksum_tmpls[checksum] != NULL)
244		return;
245
246	if (spa->spa_cksum_tmpls[checksum] == NULL) {
247		spa->spa_cksum_tmpls[checksum] =
248		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
249	}
250}
251
252/*
253 * Called by a spa_t that's about to be deallocated. This steps through
254 * all of the checksum context templates and deallocates any that were
255 * initialized using the algorithm-specific template init function.
256 */
257static void __unused
258zio_checksum_templates_free(spa_t *spa)
259{
260	for (enum zio_checksum checksum = 0;
261	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
262		if (spa->spa_cksum_tmpls[checksum] != NULL) {
263			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
264
265			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
266			spa->spa_cksum_tmpls[checksum] = NULL;
267		}
268	}
269}
270
/*
 * Verify a block's checksum against the data just read.  Returns 0 on
 * match, EINVAL for an unknown/unusable checksum algorithm, and EIO on
 * mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	/* Install/fetch the pool's context template, if the algorithm has one. */
	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * The checksum is stored inside the data buffer itself, in a
		 * trailing zio_eck_t, and is computed with an external
		 * "verifier" value in its place.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* The embedded magic reveals whether the writer byteswapped. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored checksum
		 * (as the writer did), checksum the buffer, then restore it.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Non-embedded: the expected checksum lives in the blkptr. */
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
337
338static int
339zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
340	void *dest, uint64_t destsize)
341{
342	zio_compress_info_t *ci;
343
344	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
345		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
346		return (EIO);
347	}
348
349	ci = &zio_compress_table[cpfunc];
350	if (!ci->ci_decompress) {
351		printf("ZFS: unsupported compression algorithm %s\n",
352		    ci->ci_name);
353		return (EIO);
354	}
355
356	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
357}
358
359static uint64_t
360zap_hash(uint64_t salt, const char *name)
361{
362	const uint8_t *cp;
363	uint8_t c;
364	uint64_t crc = salt;
365
366	ASSERT(crc != 0);
367	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
368	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
369		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
370
371	/*
372	 * Only use 28 bits, since we need 4 bits in the cookie for the
373	 * collision differentiator.  We MUST use the high bits, since
374	 * those are the onces that we first pay attention to when
375	 * chosing the bucket.
376	 */
377	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
378
379	return (crc);
380}
381
/* One per-device column of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry and state of one RAID-Z I/O, decomposed into columns. */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
407
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one byte by 2 (and, by composition, by 4) in the Galois field:
 * shift left and conditionally XOR in 0x1d when the high bit was set.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
434
435/*
436 * These two tables represent powers and logs of 2 in the Galois field defined
437 * above. These values were computed by repeatedly multiplying by 2 as above.
438 */
/* vdev_raidz_pow2[i] == 2^i in the field (antilog table). */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* vdev_raidz_log2[a] == log base 2 of a in the field (log table). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
507
508/*
509 * Multiply a given number by 2 raised to the given power.
510 */
511static uint8_t
512vdev_raidz_exp2(uint8_t a, int exp)
513{
514	if (a == 0)
515		return (0);
516
517	ASSERT(exp >= 0);
518	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
519
520	exp += vdev_raidz_log2[a];
521	if (exp > 255)
522		exp -= 255;
523
524	return (vdev_raidz_pow2[exp]);
525}
526
527static void
528vdev_raidz_generate_parity_p(raidz_map_t *rm)
529{
530	uint64_t *p, *src, pcount, ccount, i;
531	int c;
532
533	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
534
535	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
536		src = rm->rm_col[c].rc_data;
537		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
538		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
539
540		if (c == rm->rm_firstdatacol) {
541			ASSERT(ccount == pcount);
542			for (i = 0; i < ccount; i++, src++, p++) {
543				*p = *src;
544			}
545		} else {
546			ASSERT(ccount <= pcount);
547			for (i = 0; i < ccount; i++, src++, p++) {
548				*p ^= *src;
549			}
550		}
551	}
552}
553
/*
 * Generate P (plain XOR) and Q (field-multiplied XOR) parity over all data
 * columns.  Short trailing columns are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes P and Q (zero-padded). */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
605
/*
 * Generate P, Q, and R parity over all data columns.  Q and R use the
 * field multipliers 2 and 4 respectively; short trailing columns are
 * treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes all three parities. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
666
667/*
668 * Generate RAID parity in the first virtual columns according to the number of
669 * parity columns available.
670 */
671static void
672vdev_raidz_generate_parity(raidz_map_t *rm)
673{
674	switch (rm->rm_firstdatacol) {
675	case 1:
676		vdev_raidz_generate_parity_p(rm);
677		break;
678	case 2:
679		vdev_raidz_generate_parity_pq(rm);
680		break;
681	case 3:
682		vdev_raidz_generate_parity_pqr(rm);
683		break;
684	default:
685		panic("invalid RAID-Z configuration");
686	}
687}
688
689/* BEGIN CSTYLED */
690/*
691 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
693 * the contents of the data and parity disks. This can be expressed with
694 * vectors for the original data (D) and the actual data (d) and parity (p)
695 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
696 *
697 *            __   __                     __     __
698 *            |     |         __     __   |  p_0  |
699 *            |  V  |         |  D_0  |   | p_m-1 |
700 *            |     |    x    |   :   | = |  d_0  |
701 *            |  I  |         | D_n-1 |   |   :   |
702 *            |     |         ~~     ~~   | d_n-1 |
703 *            ~~   ~~                     ~~     ~~
704 *
705 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
707 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
708 * computation as well as linear separability.
709 *
710 *      __               __               __     __
711 *      |   1   ..  1 1 1 |               |  p_0  |
712 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
713 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
714 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
715 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
716 *      |   :       : : : |   |   :   |   |  d_2  |
717 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
718 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
719 *      |   0   ..  0 0 1 |               | d_n-1 |
720 *      ~~               ~~               ~~     ~~
721 *
722 * Note that I, V, d, and p are known. To compute D, we must invert the
723 * matrix and use the known data and parity values to reconstruct the unknown
724 * data values. We begin by removing the rows in V|I and d|p that correspond
725 * to failed or missing columns; we then make V|I square (n x n) and d|p
726 * sized n by removing rows corresponding to unused parity from the bottom up
727 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
728 * using Gauss-Jordan elimination. In the example below we use m=3 parity
729 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
730 *           __                               __
731 *           |  1   1   1   1   1   1   1   1  |
732 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
733 *           |  19 205 116  29  64  16  4   1  |      / /
734 *           |  1   0   0   0   0   0   0   0  |     / /
735 *           |  0   1   0   0   0   0   0   0  | <--' /
736 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
737 *           |  0   0   0   1   0   0   0   0  |
738 *           |  0   0   0   0   1   0   0   0  |
739 *           |  0   0   0   0   0   1   0   0  |
740 *           |  0   0   0   0   0   0   1   0  |
741 *           |  0   0   0   0   0   0   0   1  |
742 *           ~~                               ~~
743 *           __                               __
744 *           |  1   1   1   1   1   1   1   1  |
745 *           | 128  64  32  16  8   4   2   1  |
746 *           |  19 205 116  29  64  16  4   1  |
747 *           |  1   0   0   0   0   0   0   0  |
748 *           |  0   1   0   0   0   0   0   0  |
749 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
750 *           |  0   0   0   1   0   0   0   0  |
751 *           |  0   0   0   0   1   0   0   0  |
752 *           |  0   0   0   0   0   1   0   0  |
753 *           |  0   0   0   0   0   0   1   0  |
754 *           |  0   0   0   0   0   0   0   1  |
755 *           ~~                               ~~
756 *
757 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
758 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
759 * matrix is not singular.
760 * __                                                                 __
761 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
762 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
763 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
764 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
765 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
766 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
767 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
768 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
769 * ~~                                                                 ~~
770 * __                                                                 __
771 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
772 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
773 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
774 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
775 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
776 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
777 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
778 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
779 * ~~                                                                 ~~
780 * __                                                                 __
781 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
782 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
783 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
784 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
785 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
786 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
787 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
788 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
789 * ~~                                                                 ~~
790 * __                                                                 __
791 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
792 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
793 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
794 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
795 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
796 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
797 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
798 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
799 * ~~                                                                 ~~
800 * __                                                                 __
801 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
802 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
803 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
804 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
805 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
806 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
807 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
808 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
809 * ~~                                                                 ~~
810 * __                                                                 __
811 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
812 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
813 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
814 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
815 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
816 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
817 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
818 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
819 * ~~                                                                 ~~
820 *                   __                               __
821 *                   |  0   0   1   0   0   0   0   0  |
822 *                   | 167 100  5   41 159 169 217 208 |
823 *                   | 166 100  4   40 158 168 216 209 |
824 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
825 *                   |  0   0   0   0   1   0   0   0  |
826 *                   |  0   0   0   0   0   1   0   0  |
827 *                   |  0   0   0   0   0   0   1   0  |
828 *                   |  0   0   0   0   0   0   0   1  |
829 *                   ~~                               ~~
830 *
831 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
832 * of the missing data.
833 *
834 * As is apparent from the example above, the only non-trivial rows in the
835 * inverse matrix correspond to the data disks that we're trying to
836 * reconstruct. Indeed, those are the only rows we need as the others would
837 * only be useful for reconstructing data known or assumed to be valid. For
838 * that reason, we only build the coefficients in the rows that correspond to
839 * targeted columns.
840 */
841/* END CSTYLED */
842
843static void
844vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
845    uint8_t **rows)
846{
847	int i, j;
848	int pow;
849
850	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
851
852	/*
853	 * Fill in the missing rows of interest.
854	 */
855	for (i = 0; i < nmap; i++) {
856		ASSERT3S(0, <=, map[i]);
857		ASSERT3S(map[i], <=, 2);
858
859		pow = map[i] * n;
860		if (pow > 255)
861			pow -= 255;
862		ASSERT(pow <= 255);
863
864		for (j = 0; j < n; j++) {
865			pow -= map[i];
866			if (pow < 0)
867				pow += 255;
868			rows[i][j] = vdev_raidz_pow2[pow];
869		}
870	}
871}
872
/*
 * Perform Gauss-Jordan elimination over GF(2^8) on the nmissing x n
 * matrix `rows', applying the same row operations to `invrows' (which
 * starts out as the identity) so that on return `invrows' holds the
 * rows of the inverse needed to recompute the missing data columns.
 * `used' maps matrix columns back to raidz-map columns; its first
 * nmissing entries must name parity columns, the rest data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  In GF(2^8) the
		 * multiplicative inverse of x has log equal to 255 - log(x).
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this row's pivot column from every other row of
		 * interest; note addition and subtraction are both XOR in
		 * GF(2^8).
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
966
/*
 * Multiply the inverse-matrix rows (invrows) by the vector of surviving
 * columns (indexed by `used') to regenerate the missing data columns in
 * place.  All arithmetic is in GF(2^8), carried out in the log domain:
 * invlog caches log2 of each inverse coefficient so each byte costs one
 * table lookup plus an addition mod 255.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	/* Carve the single allocation into nmissing rows of n entries. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the log of every inverse-matrix coefficient. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * For each surviving column, scale it by the matching coefficient
	 * and accumulate it into every destination (missing) column.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log(0) is undefined; a zero byte contributes 0. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns stop early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes the
				 * destination; later ones XOR-accumulate.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1046
/*
 * General reconstruction: rebuild the (sorted) target columns in `tgts'
 * by inverting the relevant portion of the parity-generation matrix.
 * Returns a bitmask (`code') of the parity columns that were used,
 * identifying which reconstruction variant was applied.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation backs both matrices plus the used-column map:
	 * (rows + invrows) * nmissing_rows * n bytes, then n map entries.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows entries of used[] are parity columns. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Fill the remainder of used[] with the surviving data columns;
	 * note that i carries over from the loop above (== nmissing_rows).
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1156
1157static int
1158vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1159{
1160	int tgts[VDEV_RAIDZ_MAXPARITY];
1161	int ntgts;
1162	int i, c;
1163	int code;
1164	int nbadparity, nbaddata;
1165
1166	/*
1167	 * The tgts list must already be sorted.
1168	 */
1169	for (i = 1; i < nt; i++) {
1170		ASSERT(t[i] > t[i - 1]);
1171	}
1172
1173	nbadparity = rm->rm_firstdatacol;
1174	nbaddata = rm->rm_cols - nbadparity;
1175	ntgts = 0;
1176	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1177		if (i < nt && c == t[i]) {
1178			tgts[ntgts++] = c;
1179			i++;
1180		} else if (rm->rm_col[c].rc_error != 0) {
1181			tgts[ntgts++] = c;
1182		} else if (c >= rm->rm_firstdatacol) {
1183			nbaddata--;
1184		} else {
1185			nbadparity--;
1186		}
1187	}
1188
1189	ASSERT(ntgts >= nt);
1190	ASSERT(nbaddata >= 0);
1191	ASSERT(nbaddata + nbadparity == ntgts);
1192
1193	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1194	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1195	ASSERT(code > 0);
1196	return (code);
1197}
1198
/*
 * Map a logical I/O (offset/size against the top-level raidz vdev) onto
 * per-child columns: which child device, at what device offset, and how
 * many bytes of data or parity each column holds.  Returns NULL on
 * allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": the number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);
	/*
	 * "Remainder": the number of partial-stripe data sectors in this
	 * I/O.  This adds a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);
	/* The number of "big columns" -- those with remainder data. */
	bc = (r == 0 ? 0 : r + nparity);
	/*
	 * The total number of data and parity sectors for this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: the columns that will be accessed.
	 * scols: the columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* The I/O doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap around to the next stripe row. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/* Big columns get one extra sector of remainder data. */
		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns own their buffers; allocate them now. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			/*
			 * Unwind all parity buffers allocated so far
			 * (free(NULL) is safe for the one that failed).
			 */
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* Data columns alias consecutive slices of the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1326
1327static void
1328vdev_raidz_map_free(raidz_map_t *rm)
1329{
1330	int c;
1331
1332	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1333		free(rm->rm_col[c].rc_data);
1334
1335	free(rm);
1336}
1337
1338static vdev_t *
1339vdev_child(vdev_t *pvd, uint64_t devidx)
1340{
1341	vdev_t *cvd;
1342
1343	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1344		if (cvd->v_id == devidx)
1345			break;
1346	}
1347
1348	return (cvd);
1349}
1350
1351/*
1352 * We keep track of whether or not there were any injected errors, so that
1353 * any ereports we generate can note it.
1354 */
1355static int
1356raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1357    uint64_t size)
1358{
1359	return (zio_checksum_verify(spa, bp, data));
1360}
1361
1362/*
1363 * Generate the parity from the data columns. If we tried and were able to
1364 * read the parity without error, verify that the generated parity matches the
1365 * data we read. If it doesn't, we fire off a checksum error. Return the
1366 * number such failures.
1367 */
1368static int
1369raidz_parity_verify(raidz_map_t *rm)
1370{
1371	void *orig[VDEV_RAIDZ_MAXPARITY];
1372	int c, ret = 0;
1373	raidz_col_t *rc;
1374
1375	for (c = 0; c < rm->rm_firstdatacol; c++) {
1376		rc = &rm->rm_col[c];
1377		if (!rc->rc_tried || rc->rc_error != 0)
1378			continue;
1379		orig[c] = malloc(rc->rc_size);
1380		if (orig[c] != NULL) {
1381			bcopy(rc->rc_data, orig[c], rc->rc_size);
1382		} else {
1383			printf("Out of memory\n");
1384		}
1385	}
1386
1387	vdev_raidz_generate_parity(rm);
1388
1389	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1390		rc = &rm->rm_col[c];
1391		if (!rc->rc_tried || rc->rc_error != 0)
1392			continue;
1393		if (orig[c] == NULL ||
1394		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1395			rc->rc_error = ECKSUM;
1396			ret++;
1397		}
1398		free(orig[c]);
1399	}
1400
1401	return (ret);
1402}
1403
1404/*
1405 * Iterate over all combinations of bad data and attempt a reconstruction.
1406 * Note that the algorithm below is non-optimal because it doesn't take into
1407 * account how reconstruction is actually performed. For example, with
1408 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1409 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1410 * cases we'd only use parity information in column 0.
1411 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore gives tgts one sentinel slot on each side (see below). */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * n is the number of simultaneous silent failures we posit this
	 * round; it grows until all remaining parity would be consumed.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * Column 0 is never smaller than any other column, so its
		 * size bounds the data we may need to save for any target.
		 */
		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/*
		 * Odometer-style enumeration: advance the lowest target
		 * that can move; when it cannot, reset it and carry into
		 * the next one.  current == n means all combinations of
		 * n targets have been tried.
		 */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The outer loop bumped n one past the last value used; undo that
	 * so the cleanup below frees exactly the buffers we allocated.
	 */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1558
/*
 * Read `bytes' at `offset' through a raidz vdev.  The read proceeds in
 * up to three phases (detailed in the comment below): use the columns
 * already read, re-read everything and retry, and finally attempt
 * combinatorial reconstruction.  Returns 0 on success or an errno on
 * unrecoverable failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Data columns are always read; because we walk in reverse,
		 * a missing data column noted above forces the (lower
		 * numbered) parity columns to be read as well.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Classify read outcomes; recomputed on every retry. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/* n counts the columns newly read on this pass. */
	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1823