zfssubr.c revision 329175
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD: stable/11/sys/cddl/boot/zfs/zfssubr.c 329175 2018-02-12 17:44:35Z kevans $");
28
/* crc64 table used by the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the checksum-error errno used by the ZFS code. */
#define	ECKSUM	666

/* Assertions are compiled out in the boot environment. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* Fatal error: report the message and spin forever (nothing to unwind to). */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map kernel allocator calls onto the loader's zfs_alloc()/zfs_free(). */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
46
47static void
48zfs_init_crc(void)
49{
50	int i, j;
51	uint64_t *ct;
52
53	/*
54	 * Calculate the crc64 table (used for the zap hash
55	 * function).
56	 */
57	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59		for (i = 0; i < 256; i++)
60			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
62	}
63}
64
/*
 * "Checksum" used when checksumming is disabled: always all zeros.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
71
72/*
73 * Signature for checksum functions.
74 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocate/free a per-pool context template derived from the pool salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;	/* NULL if no template */
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
104
105#include "blkptr.c"
106
107#include "fletcher.c"
108#include "sha256.c"
109#include "skein_zfs.c"
110
/*
 * Checksum vectors, indexed by enum zio_checksum; entry order must match
 * the on-disk ZIO_CHECKSUM_* values.  Each entry supplies the native and
 * byteswapped implementations plus optional context-template hooks (only
 * skein uses those here).  NULL ci_func entries mark algorithms this
 * boot code cannot verify.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
143
144/*
145 * Common signature for all zio compress/decompress functions.
146 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.  The boot code only ever
 * decompresses, so ci_compress is NULL throughout the table below.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
161
162#include "lzjb.c"
163#include "zle.c"
164#include "lz4.c"
165
166/*
167 * Compression vectors.
168 */
/*
 * Indexed by enum zio_compress; order must match the on-disk
 * ZIO_COMPRESS_* values.  A NULL ci_decompress marks an algorithm the
 * loader does not support (e.g. the gzip levels).
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
187
/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* was int: sign/width mismatch against count */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
200
201/*
202 * Set the external verifier for a gang block based on <vdev, offset, txg>,
203 * a tuple which is guaranteed to be unique for the life of the pool.
204 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	/* First DVA and physical birth txg uniquely identify the block. */
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
215
216/*
217 * Set the external verifier for a label block based on its offset.
218 * The vdev is implicit, and the txg is unknowable at pool open time --
219 * hence the logic in vdev_uberblock_load() to find the most recent copy.
220 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	/* Only the device offset is known at pool-open time. */
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
226
227/*
228 * Calls the template init function of a checksum which supports context
229 * templates and installs the template into the spa_t.
230 */
231static void
232zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
233{
234	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
235
236	if (ci->ci_tmpl_init == NULL)
237		return;
238
239	if (spa->spa_cksum_tmpls[checksum] != NULL)
240		return;
241
242	if (spa->spa_cksum_tmpls[checksum] == NULL) {
243		spa->spa_cksum_tmpls[checksum] =
244		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
245	}
246}
247
248/*
249 * Called by a spa_t that's about to be deallocated. This steps through
250 * all of the checksum context templates and deallocates any that were
251 * initialized using the algorithm-specific template init function.
252 */
253static void __unused
254zio_checksum_templates_free(spa_t *spa)
255{
256	for (enum zio_checksum checksum = 0;
257	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
258		if (spa->spa_cksum_tmpls[checksum] != NULL) {
259			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
260
261			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
262			spa->spa_cksum_tmpls[checksum] = NULL;
263		}
264	}
265}
266
/*
 * Verify the checksum of the data pointed to by bp.  Returns 0 on a
 * match, EINVAL for an unknown or unimplemented checksum algorithm,
 * and EIO on mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	/* NULL ci_func marks an algorithm this boot code can't verify. */
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Salted algorithms (e.g. skein) need the pool's template. */
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum trailer sits at the end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* Detect the writer's endianness from the trailer magic. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum while recomputing, then restore the original
		 * so the buffer is left unmodified.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Ordinary blocks carry their checksum in the block pointer. */
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
332
333static int
334zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335	void *dest, uint64_t destsize)
336{
337	zio_compress_info_t *ci;
338
339	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341		return (EIO);
342	}
343
344	ci = &zio_compress_table[cpfunc];
345	if (!ci->ci_decompress) {
346		printf("ZFS: unsupported compression algorithm %s\n",
347		    ci->ci_name);
348		return (EIO);
349	}
350
351	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352}
353
354static uint64_t
355zap_hash(uint64_t salt, const char *name)
356{
357	const uint8_t *cp;
358	uint8_t c;
359	uint64_t crc = salt;
360
361	ASSERT(crc != 0);
362	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
365
366	/*
367	 * Only use 28 bits, since we need 4 bits in the cookie for the
368	 * collision differentiator.  We MUST use the high bits, since
369	 * those are the onces that we first pay attention to when
370	 * chosing the bucket.
371	 */
372	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
373
374	return (crc);
375}
376
377static void *zfs_alloc(size_t size);
378static void zfs_free(void *ptr, size_t size);
379
/*
 * One physical column (one child device's slice) of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry and state of one logical RAID-Z I/O mapped onto columns.
 * The first rm_firstdatacol entries of rm_col[] are parity columns;
 * data columns follow.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
405
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (resp. 4) in GF(2^8); 0x1d is the
 * reduction term of the generator polynomial x^8+x^4+x^3+x^2+1.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
432
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 * They are inverses of each other: vdev_raidz_pow2[vdev_raidz_log2[x]] == x
 * for all nonzero x (log of 0 is undefined and its slot holds 0).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
505
506/*
507 * Multiply a given number by 2 raised to the given power.
508 */
509static uint8_t
510vdev_raidz_exp2(uint8_t a, int exp)
511{
512	if (a == 0)
513		return (0);
514
515	ASSERT(exp >= 0);
516	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
517
518	exp += vdev_raidz_log2[a];
519	if (exp > 255)
520		exp -= 255;
521
522	return (vdev_raidz_pow2[exp]);
523}
524
525static void
526vdev_raidz_generate_parity_p(raidz_map_t *rm)
527{
528	uint64_t *p, *src, pcount, ccount, i;
529	int c;
530
531	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
532
533	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
534		src = rm->rm_col[c].rc_data;
535		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
536		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
537
538		if (c == rm->rm_firstdatacol) {
539			ASSERT(ccount == pcount);
540			for (i = 0; i < ccount; i++, src++, p++) {
541				*p = *src;
542			}
543		} else {
544			ASSERT(ccount <= pcount);
545			for (i = 0; i < ccount; i++, src++, p++) {
546				*p ^= *src;
547			}
548		}
549	}
550}
551
/*
 * Compute P (plain XOR) and Q (GF(2^8) multiply-by-2 accumulate) parity
 * for the map.  The 64MUL macros modify *q in place and clobber mask.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column initializes both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill the rest if this column is short. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
603
/*
 * Compute P (XOR), Q (multiply-by-2 accumulate) and R (multiply-by-4
 * accumulate) parity.  The 64MUL macros modify the parity word in place
 * and clobber mask.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column initializes all parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill the rest if this column is short. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
664
665/*
666 * Generate RAID parity in the first virtual columns according to the number of
667 * parity columns available.
668 */
669static void
670vdev_raidz_generate_parity(raidz_map_t *rm)
671{
672	switch (rm->rm_firstdatacol) {
673	case 1:
674		vdev_raidz_generate_parity_p(rm);
675		break;
676	case 2:
677		vdev_raidz_generate_parity_pq(rm);
678		break;
679	case 3:
680		vdev_raidz_generate_parity_pqr(rm);
681		break;
682	default:
683		panic("invalid RAID-Z configuration");
684	}
685}
686
687/* BEGIN CSTYLED */
688/*
689 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
691 * the contents of the data and parity disks. This can be expressed with
692 * vectors for the original data (D) and the actual data (d) and parity (p)
693 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
694 *
695 *            __   __                     __     __
696 *            |     |         __     __   |  p_0  |
697 *            |  V  |         |  D_0  |   | p_m-1 |
698 *            |     |    x    |   :   | = |  d_0  |
699 *            |  I  |         | D_n-1 |   |   :   |
700 *            |     |         ~~     ~~   | d_n-1 |
701 *            ~~   ~~                     ~~     ~~
702 *
703 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
705 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
706 * computation as well as linear separability.
707 *
708 *      __               __               __     __
709 *      |   1   ..  1 1 1 |               |  p_0  |
710 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
711 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
712 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
713 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
714 *      |   :       : : : |   |   :   |   |  d_2  |
715 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
716 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
717 *      |   0   ..  0 0 1 |               | d_n-1 |
718 *      ~~               ~~               ~~     ~~
719 *
720 * Note that I, V, d, and p are known. To compute D, we must invert the
721 * matrix and use the known data and parity values to reconstruct the unknown
722 * data values. We begin by removing the rows in V|I and d|p that correspond
723 * to failed or missing columns; we then make V|I square (n x n) and d|p
724 * sized n by removing rows corresponding to unused parity from the bottom up
725 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
726 * using Gauss-Jordan elimination. In the example below we use m=3 parity
727 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
728 *           __                               __
729 *           |  1   1   1   1   1   1   1   1  |
730 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
731 *           |  19 205 116  29  64  16  4   1  |      / /
732 *           |  1   0   0   0   0   0   0   0  |     / /
733 *           |  0   1   0   0   0   0   0   0  | <--' /
734 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
735 *           |  0   0   0   1   0   0   0   0  |
736 *           |  0   0   0   0   1   0   0   0  |
737 *           |  0   0   0   0   0   1   0   0  |
738 *           |  0   0   0   0   0   0   1   0  |
739 *           |  0   0   0   0   0   0   0   1  |
740 *           ~~                               ~~
741 *           __                               __
742 *           |  1   1   1   1   1   1   1   1  |
743 *           | 128  64  32  16  8   4   2   1  |
744 *           |  19 205 116  29  64  16  4   1  |
745 *           |  1   0   0   0   0   0   0   0  |
746 *           |  0   1   0   0   0   0   0   0  |
747 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
748 *           |  0   0   0   1   0   0   0   0  |
749 *           |  0   0   0   0   1   0   0   0  |
750 *           |  0   0   0   0   0   1   0   0  |
751 *           |  0   0   0   0   0   0   1   0  |
752 *           |  0   0   0   0   0   0   0   1  |
753 *           ~~                               ~~
754 *
755 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
756 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
757 * matrix is not singular.
758 * __                                                                 __
759 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
760 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
761 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
762 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
763 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
764 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
765 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
766 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
767 * ~~                                                                 ~~
768 * __                                                                 __
769 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
770 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
771 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
772 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
773 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
774 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
775 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
776 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
777 * ~~                                                                 ~~
778 * __                                                                 __
779 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
780 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
781 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
782 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
783 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
784 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
785 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
786 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
787 * ~~                                                                 ~~
788 * __                                                                 __
789 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
790 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
791 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
792 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
793 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
794 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
795 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
796 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
797 * ~~                                                                 ~~
798 * __                                                                 __
799 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
800 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
801 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
802 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
803 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
804 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
805 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
806 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
807 * ~~                                                                 ~~
808 * __                                                                 __
809 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
810 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
811 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
812 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
813 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
814 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
815 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
816 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
817 * ~~                                                                 ~~
818 *                   __                               __
819 *                   |  0   0   1   0   0   0   0   0  |
820 *                   | 167 100  5   41 159 169 217 208 |
821 *                   | 166 100  4   40 158 168 216 209 |
822 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
823 *                   |  0   0   0   0   1   0   0   0  |
824 *                   |  0   0   0   0   0   1   0   0  |
825 *                   |  0   0   0   0   0   0   1   0  |
826 *                   |  0   0   0   0   0   0   0   1  |
827 *                   ~~                               ~~
828 *
829 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
830 * of the missing data.
831 *
832 * As is apparent from the example above, the only non-trivial rows in the
833 * inverse matrix correspond to the data disks that we're trying to
834 * reconstruct. Indeed, those are the only rows we need as the others would
835 * only be useful for reconstructing data known or assumed to be valid. For
836 * that reason, we only build the coefficients in the rows that correspond to
837 * targeted columns.
838 */
839/* END CSTYLED */
840
/*
 * Populate the reconstruction-matrix rows for the parity columns listed
 * in map[] (0 = P, 1 = Q, 2 = R).  Row i, column j receives
 * 2^(map[i] * (n - 1 - j)) in GF(2^8) -- the Vandermonde coefficients
 * used when that parity was generated.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start at map[i] * n (mod 255), stepping down per column. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
870
/*
 * Invert the nmissing rows of interest via Gauss-Jordan elimination over
 * GF(2^8): reduce rows[][] toward the identity while applying the same
 * row operations to invrows[][], which starts as the identity.  On return
 * invrows holds the rows of the inverse needed to recover the missing
 * data columns.  Addition/subtraction in GF(2^8) is XOR; multiplication
 * and inversion go through the log/exp tables.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 * Start from the identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.  The
	 * trivial rows (for intact data columns) have a single 1, so
	 * "subtracting" one just moves a coefficient across and zeroes it.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  255 - log2(x) is
		 * log2(1/x) because the generator's powers cycle mod 255.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
964
/*
 * Regenerate the missing data columns by computing D = inv x (d|p)':
 * each surviving column used[i] contributes invrows[cc][i] * src_byte
 * to every missing column cc, byte by byte, with GF(2^8) multiplication
 * done via the log/exp tables and accumulation via XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/*
	 * Precompute the logarithms of the inverse-matrix coefficients so
	 * the per-byte inner loop needs only an addition (mod 255) and a
	 * power-table lookup per multiplication.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into the missing columns. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns end early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* 0 times anything is 0. */
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; later ones accumulate (XOR).
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
1040
/*
 * General-case reconstruction driver: determine which data columns are
 * missing, choose enough intact parity columns to cover them, build and
 * invert the corresponding rows of the (V|I) matrix, and regenerate the
 * missing data.  Returns a bitmask ("code") of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  We need one intact parity column per missing data
	 * column.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation carries the rows, the inverse rows, and the used[]
	 * column map, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the columns we read from: first the chosen parity
	 * columns, then every intact data column.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1146
1147static int
1148vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1149{
1150	int tgts[VDEV_RAIDZ_MAXPARITY];
1151	int ntgts;
1152	int i, c;
1153	int code;
1154	int nbadparity, nbaddata;
1155
1156	/*
1157	 * The tgts list must already be sorted.
1158	 */
1159	for (i = 1; i < nt; i++) {
1160		ASSERT(t[i] > t[i - 1]);
1161	}
1162
1163	nbadparity = rm->rm_firstdatacol;
1164	nbaddata = rm->rm_cols - nbadparity;
1165	ntgts = 0;
1166	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1167		if (i < nt && c == t[i]) {
1168			tgts[ntgts++] = c;
1169			i++;
1170		} else if (rm->rm_col[c].rc_error != 0) {
1171			tgts[ntgts++] = c;
1172		} else if (c >= rm->rm_firstdatacol) {
1173			nbaddata--;
1174		} else {
1175			nbadparity--;
1176		}
1177	}
1178
1179	ASSERT(ntgts >= nt);
1180	ASSERT(nbaddata >= 0);
1181	ASSERT(nbaddata + nbadparity == ntgts);
1182
1183	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1184	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1185	ASSERT(code > 0);
1186	return (code);
1187}
1188
/*
 * Map a logical I/O (byte offset/size on the raidz vdev) onto the
 * columns of the group: allocate and fill in a raidz_map_t giving, for
 * each child device, the device index, offset, and size of its part of
 * the stripe.  Parity columns (the first rm_firstdatacol) get freshly
 * allocated buffers; data columns point into the caller's buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting sector of the block on the raidz vdev. */
	uint64_t b = offset >> unit_shift;
	/* The I/O size in sectors. */
	uint64_t s = size >> unit_shift;
	/* The first column touched by this I/O. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child device. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * q is the count of full rows; r is the count of data sectors in
	 * the final, partial row; bc is the number of columns that get an
	 * extra sector from that row (r data columns plus the parity
	 * columns); tot is the total sector count including parity.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols is the number of columns that actually carry data or
	 * parity for this I/O; scols is the number of column slots
	 * allocated in the map.
	 */
	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Lay out each column: columns past the stripe's first column
	 * wrap to the next row and so sit one sector further along on
	 * their device.  "Big" columns (c < bc) carry q + 1 sectors.
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get their own buffers ... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* ... while data columns share the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1306
1307static void
1308vdev_raidz_map_free(raidz_map_t *rm)
1309{
1310	int c;
1311
1312	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1313		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1314
1315	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1316}
1317
1318static vdev_t *
1319vdev_child(vdev_t *pvd, uint64_t devidx)
1320{
1321	vdev_t *cvd;
1322
1323	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1324		if (cvd->v_id == devidx)
1325			break;
1326	}
1327
1328	return (cvd);
1329}
1330
1331/*
1332 * We keep track of whether or not there were any injected errors, so that
1333 * any ereports we generate can note it.
1334 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	/*
	 * The boot code has no error-injection or ereport machinery, so
	 * this reduces to a plain checksum check.  The size argument is
	 * unused here; presumably the signature mirrors the kernel
	 * implementation -- confirm against the kernel vdev_raidz.c.
	 */
	return (zio_checksum_verify(spa, bp, data));
}
1341
1342/*
1343 * Generate the parity from the data columns. If we tried and were able to
1344 * read the parity without error, verify that the generated parity matches the
1345 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1347 */
1348static int
1349raidz_parity_verify(raidz_map_t *rm)
1350{
1351	void *orig[VDEV_RAIDZ_MAXPARITY];
1352	int c, ret = 0;
1353	raidz_col_t *rc;
1354
1355	for (c = 0; c < rm->rm_firstdatacol; c++) {
1356		rc = &rm->rm_col[c];
1357		if (!rc->rc_tried || rc->rc_error != 0)
1358			continue;
1359		orig[c] = zfs_alloc(rc->rc_size);
1360		bcopy(rc->rc_data, orig[c], rc->rc_size);
1361	}
1362
1363	vdev_raidz_generate_parity(rm);
1364
1365	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1366		rc = &rm->rm_col[c];
1367		if (!rc->rc_tried || rc->rc_error != 0)
1368			continue;
1369		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1370			rc->rc_error = ECKSUM;
1371			ret++;
1372		}
1373		zfs_free(orig[c], rc->rc_size);
1374	}
1375
1376	return (ret);
1377}
1378
1379/*
1380 * Iterate over all combinations of bad data and attempt a reconstruction.
1381 * Note that the algorithm below is non-optimal because it doesn't take into
1382 * account how reconstruction is actually performed. For example, with
1383 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1384 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1385 * cases we'd only use parity information in column 0.
1386 */
/*
 * Combinatorial reconstruction: for each count n of assumed-bad columns
 * (from 1 up to the spare parity available), enumerate every combination
 * of n error-free columns as reconstruction targets, attempt the
 * reconstruction, and checksum-verify the result.  Returns the nonzero
 * parity-usage code on success, 0 if no combination produced data with a
 * valid checksum.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore leaves room for sentinels at tgts[-1] and tgts[n]. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Odometer-style iteration over all n-column combinations. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	/* Free the saved-data buffers allocated above. */
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1529
/*
 * Read a block from a raidz vdev and, if necessary, reconstruct it.
 * Proceeds through up to three phases of increasing cost: (1) produce
 * valid data from the columns already read, (2) read every remaining
 * column and retry, (3) combinatorial reconstruction over all possible
 * bad-column sets.  Returns 0 on success, or EIO/ECKSUM when the data
 * cannot be recovered.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Read every data column; read parity only if data is
		 * already known to be missing.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally errors per column class for the phase decisions below. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1788