1185029Spjd/*
2185029Spjd * CDDL HEADER START
3185029Spjd *
4185029Spjd * The contents of this file are subject to the terms of the
5185029Spjd * Common Development and Distribution License (the "License").
6185029Spjd * You may not use this file except in compliance with the License.
7185029Spjd *
8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9185029Spjd * or http://www.opensolaris.org/os/licensing.
10185029Spjd * See the License for the specific language governing permissions
11185029Spjd * and limitations under the License.
12185029Spjd *
13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each
14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15185029Spjd * If applicable, add the following below this CDDL HEADER, with the
16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18185029Spjd *
19185029Spjd * CDDL HEADER END
20185029Spjd */
21185029Spjd/*
22185029Spjd * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23185029Spjd * Use is subject to license terms.
24185029Spjd */
25185029Spjd
26185029Spjd#include <sys/cdefs.h>
27192640Sdes__FBSDID("$FreeBSD: releng/10.2/sys/cddl/boot/zfs/zfssubr.c 268649 2014-07-15 04:53:34Z delphij $");
28185029Spjd
29185029Spjdstatic uint64_t zfs_crc64_table[256];
30185029Spjd
31219089Spjd#define	ECKSUM	666
32219089Spjd
33268649Sdelphij#define	ASSERT3S(x, y, z)	((void)0)
34268649Sdelphij#define	ASSERT3U(x, y, z)	((void)0)
35268649Sdelphij#define	ASSERT3P(x, y, z)	((void)0)
36268649Sdelphij#define	ASSERT0(x)		((void)0)
37268649Sdelphij#define	ASSERT(x)		((void)0)
38219089Spjd
39219089Spjd#define	panic(...)	do {						\
40219089Spjd	printf(__VA_ARGS__);						\
41219089Spjd	for (;;) ;							\
42219089Spjd} while (0)
43219089Spjd
44219089Spjd#define	kmem_alloc(size, flag)	zfs_alloc((size))
45219089Spjd#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
46219089Spjd
47185029Spjdstatic void
48185029Spjdzfs_init_crc(void)
49185029Spjd{
50185029Spjd	int i, j;
51185029Spjd	uint64_t *ct;
52185029Spjd
53185029Spjd	/*
54185029Spjd	 * Calculate the crc64 table (used for the zap hash
55185029Spjd	 * function).
56185029Spjd	 */
57185029Spjd	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58185029Spjd		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59185029Spjd		for (i = 0; i < 256; i++)
60185029Spjd			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61185029Spjd				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
62185029Spjd	}
63185029Spjd}
64185029Spjd
65185029Spjdstatic void
66185029Spjdzio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
67185029Spjd{
68185029Spjd	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
69185029Spjd}
70185029Spjd
71185029Spjd/*
72185029Spjd * Signature for checksum functions.
73185029Spjd */
74185029Spjdtypedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
75185029Spjd
76185029Spjd/*
77185029Spjd * Information about each checksum function.
78185029Spjd */
79185029Spjdtypedef struct zio_checksum_info {
80185029Spjd	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
81185029Spjd	int		ci_correctable;	/* number of correctable bits	*/
82219089Spjd	int		ci_eck;		/* uses zio embedded checksum? */
83219089Spjd	int		ci_dedup;	/* strong enough for dedup? */
84185029Spjd	const char	*ci_name;	/* descriptive name */
85185029Spjd} zio_checksum_info_t;
86185029Spjd
87268649Sdelphij#include "blkptr.c"
88268649Sdelphij
89185029Spjd#include "fletcher.c"
90185029Spjd#include "sha256.c"
91185029Spjd
92185029Spjdstatic zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
93219089Spjd	{{NULL,			NULL},			0, 0, 0, "inherit"},
94219089Spjd	{{NULL,			NULL},			0, 0, 0, "on"},
95219089Spjd	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
96219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
97219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
98219089Spjd	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
99219089Spjd	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
100219089Spjd	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
101219089Spjd	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
102219089Spjd	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
103185029Spjd};
104185029Spjd
105219089Spjd
106185029Spjd/*
107185029Spjd * Common signature for all zio compress/decompress functions.
108185029Spjd */
109185029Spjdtypedef size_t zio_compress_func_t(void *src, void *dst,
110185029Spjd    size_t s_len, size_t d_len, int);
111185029Spjdtypedef int zio_decompress_func_t(void *src, void *dst,
112185029Spjd    size_t s_len, size_t d_len, int);
113185029Spjd
114185029Spjd/*
115185029Spjd * Information about each compression function.
116185029Spjd */
117185029Spjdtypedef struct zio_compress_info {
118185029Spjd	zio_compress_func_t	*ci_compress;	/* compression function */
119185029Spjd	zio_decompress_func_t	*ci_decompress;	/* decompression function */
120185029Spjd	int			ci_level;	/* level parameter */
121185029Spjd	const char		*ci_name;	/* algorithm name */
122185029Spjd} zio_compress_info_t;
123185029Spjd
124185029Spjd#include "lzjb.c"
125219089Spjd#include "zle.c"
126246586Sdelphij#include "lz4.c"
127185029Spjd
128185029Spjd/*
129185029Spjd * Compression vectors.
130185029Spjd */
131185029Spjdstatic zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
132185029Spjd	{NULL,			NULL,			0,	"inherit"},
133185029Spjd	{NULL,			NULL,			0,	"on"},
134185029Spjd	{NULL,			NULL,			0,	"uncompressed"},
135185029Spjd	{NULL,			lzjb_decompress,	0,	"lzjb"},
136185029Spjd	{NULL,			NULL,			0,	"empty"},
137185029Spjd	{NULL,			NULL,			1,	"gzip-1"},
138185029Spjd	{NULL,			NULL,			2,	"gzip-2"},
139185029Spjd	{NULL,			NULL,			3,	"gzip-3"},
140185029Spjd	{NULL,			NULL,			4,	"gzip-4"},
141185029Spjd	{NULL,			NULL,			5,	"gzip-5"},
142185029Spjd	{NULL,			NULL,			6,	"gzip-6"},
143185029Spjd	{NULL,			NULL,			7,	"gzip-7"},
144185029Spjd	{NULL,			NULL,			8,	"gzip-8"},
145185029Spjd	{NULL,			NULL,			9,	"gzip-9"},
146219089Spjd	{NULL,			zle_decompress,		64,	"zle"},
147246586Sdelphij	{NULL,			lz4_decompress,		0,	"lz4"},
148185029Spjd};
149185029Spjd
150219089Spjdstatic void
151219089Spjdbyteswap_uint64_array(void *vbuf, size_t size)
152219089Spjd{
153219089Spjd	uint64_t *buf = vbuf;
154219089Spjd	size_t count = size >> 3;
155219089Spjd	int i;
156219089Spjd
157219089Spjd	ASSERT((size & 7) == 0);
158219089Spjd
159219089Spjd	for (i = 0; i < count; i++)
160219089Spjd		buf[i] = BSWAP_64(buf[i]);
161219089Spjd}
162219089Spjd
163219089Spjd/*
164219089Spjd * Set the external verifier for a gang block based on <vdev, offset, txg>,
165219089Spjd * a tuple which is guaranteed to be unique for the life of the pool.
166219089Spjd */
167219089Spjdstatic void
168219089Spjdzio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
169219089Spjd{
170219089Spjd	const dva_t *dva = BP_IDENTITY(bp);
171219089Spjd	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
172219089Spjd
173219089Spjd	ASSERT(BP_IS_GANG(bp));
174219089Spjd
175219089Spjd	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
176219089Spjd}
177219089Spjd
178219089Spjd/*
179219089Spjd * Set the external verifier for a label block based on its offset.
180219089Spjd * The vdev is implicit, and the txg is unknowable at pool open time --
181219089Spjd * hence the logic in vdev_uberblock_load() to find the most recent copy.
182219089Spjd */
183219089Spjdstatic void
184219089Spjdzio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
185219089Spjd{
186219089Spjd	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
187219089Spjd}
188219089Spjd
189185029Spjdstatic int
190226568Spjdzio_checksum_verify(const blkptr_t *bp, void *data)
191185029Spjd{
192226568Spjd	uint64_t size;
193226568Spjd	unsigned int checksum;
194219089Spjd	zio_checksum_info_t *ci;
195219089Spjd	zio_cksum_t actual_cksum, expected_cksum, verifier;
196219089Spjd	int byteswap;
197185029Spjd
198226568Spjd	checksum = BP_GET_CHECKSUM(bp);
199226568Spjd	size = BP_GET_PSIZE(bp);
200226568Spjd
201219089Spjd	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
202185029Spjd		return (EINVAL);
203219089Spjd	ci = &zio_checksum_table[checksum];
204219089Spjd	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
205219089Spjd		return (EINVAL);
206185029Spjd
207219089Spjd	if (ci->ci_eck) {
208219089Spjd		zio_eck_t *eck;
209219089Spjd
210219089Spjd		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
211219089Spjd		    checksum == ZIO_CHECKSUM_LABEL);
212219089Spjd
213219089Spjd		eck = (zio_eck_t *)((char *)data + size) - 1;
214219089Spjd
215219089Spjd		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
216219089Spjd			zio_checksum_gang_verifier(&verifier, bp);
217219089Spjd		else if (checksum == ZIO_CHECKSUM_LABEL)
218226568Spjd			zio_checksum_label_verifier(&verifier,
219226568Spjd			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
220219089Spjd		else
221219089Spjd			verifier = bp->blk_cksum;
222219089Spjd
223219089Spjd		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
224219089Spjd
225219089Spjd		if (byteswap)
226219089Spjd			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
227219089Spjd
228219089Spjd		expected_cksum = eck->zec_cksum;
229219089Spjd		eck->zec_cksum = verifier;
230219089Spjd		ci->ci_func[byteswap](data, size, &actual_cksum);
231219089Spjd		eck->zec_cksum = expected_cksum;
232219089Spjd
233219089Spjd		if (byteswap)
234219089Spjd			byteswap_uint64_array(&expected_cksum,
235219089Spjd			    sizeof (zio_cksum_t));
236185029Spjd	} else {
237219089Spjd		expected_cksum = bp->blk_cksum;
238185029Spjd		ci->ci_func[0](data, size, &actual_cksum);
239185029Spjd	}
240185029Spjd
241219089Spjd	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
242185029Spjd		/*printf("ZFS: read checksum failed\n");*/
243185029Spjd		return (EIO);
244185029Spjd	}
245185029Spjd
246185029Spjd	return (0);
247185029Spjd}
248185029Spjd
249185029Spjdstatic int
250185029Spjdzio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
251185029Spjd	void *dest, uint64_t destsize)
252185029Spjd{
253219089Spjd	zio_compress_info_t *ci;
254185029Spjd
255219089Spjd	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
256185097Sdfr		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
257185029Spjd		return (EIO);
258185029Spjd	}
259185029Spjd
260219089Spjd	ci = &zio_compress_table[cpfunc];
261219089Spjd	if (!ci->ci_decompress) {
262219089Spjd		printf("ZFS: unsupported compression algorithm %s\n",
263219089Spjd		    ci->ci_name);
264219089Spjd		return (EIO);
265219089Spjd	}
266219089Spjd
267185029Spjd	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
268185029Spjd}
269185029Spjd
270185029Spjdstatic uint64_t
271185029Spjdzap_hash(uint64_t salt, const char *name)
272185029Spjd{
273185029Spjd	const uint8_t *cp;
274185029Spjd	uint8_t c;
275185029Spjd	uint64_t crc = salt;
276185029Spjd
277219089Spjd	ASSERT(crc != 0);
278219089Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
279185029Spjd	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
280185029Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
281185029Spjd
282185029Spjd	/*
283185029Spjd	 * Only use 28 bits, since we need 4 bits in the cookie for the
284185029Spjd	 * collision differentiator.  We MUST use the high bits, since
285185029Spjd	 * those are the onces that we first pay attention to when
286185029Spjd	 * chosing the bucket.
287185029Spjd	 */
288185029Spjd	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
289185029Spjd
290185029Spjd	return (crc);
291185029Spjd}
292192194Sdfr
293219089Spjdstatic void *zfs_alloc(size_t size);
294219089Spjdstatic void zfs_free(void *ptr, size_t size);
295192194Sdfr
296192194Sdfrtypedef struct raidz_col {
297192194Sdfr	uint64_t rc_devidx;		/* child device index for I/O */
298192194Sdfr	uint64_t rc_offset;		/* device offset */
299192194Sdfr	uint64_t rc_size;		/* I/O size */
300192194Sdfr	void *rc_data;			/* I/O data */
301192194Sdfr	int rc_error;			/* I/O error for this device */
302192194Sdfr	uint8_t rc_tried;		/* Did we attempt this I/O column? */
303192194Sdfr	uint8_t rc_skipped;		/* Did we skip this I/O column? */
304192194Sdfr} raidz_col_t;
305192194Sdfr
306219089Spjdtypedef struct raidz_map {
307219089Spjd	uint64_t rm_cols;		/* Regular column count */
308219089Spjd	uint64_t rm_scols;		/* Count including skipped columns */
309219089Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
310219089Spjd	uint64_t rm_asize;		/* Actual total I/O size */
311219089Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
312219089Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
313219089Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
314219089Spjd	uint64_t rm_nskip;		/* Skipped sectors for padding */
315219089Spjd	uint64_t rm_skipstart;		/* Column index of padding start */
316219089Spjd	uintptr_t rm_reports;		/* # of referencing checksum reports */
317219089Spjd	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
318219089Spjd	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
319219089Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
320219089Spjd} raidz_map_t;
321219089Spjd
322192194Sdfr#define	VDEV_RAIDZ_P		0
323192194Sdfr#define	VDEV_RAIDZ_Q		1
324219089Spjd#define	VDEV_RAIDZ_R		2
325192194Sdfr
326219089Spjd#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
327219089Spjd#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
328192194Sdfr
329219089Spjd/*
330219089Spjd * We provide a mechanism to perform the field multiplication operation on a
331219089Spjd * 64-bit value all at once rather than a byte at a time. This works by
332219089Spjd * creating a mask from the top bit in each byte and using that to
333219089Spjd * conditionally apply the XOR of 0x1d.
334219089Spjd */
335219089Spjd#define	VDEV_RAIDZ_64MUL_2(x, mask) \
336219089Spjd{ \
337219089Spjd	(mask) = (x) & 0x8080808080808080ULL; \
338219089Spjd	(mask) = ((mask) << 1) - ((mask) >> 7); \
339219089Spjd	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
340225531Savg	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
341219089Spjd}
342192194Sdfr
343219089Spjd#define	VDEV_RAIDZ_64MUL_4(x, mask) \
344219089Spjd{ \
345219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
346219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
347192194Sdfr}
348192194Sdfr
349192194Sdfr/*
350192194Sdfr * These two tables represent powers and logs of 2 in the Galois field defined
351192194Sdfr * above. These values were computed by repeatedly multiplying by 2 as above.
352192194Sdfr */
353192194Sdfrstatic const uint8_t vdev_raidz_pow2[256] = {
354192194Sdfr	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
355192194Sdfr	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
356192194Sdfr	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
357192194Sdfr	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
358192194Sdfr	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
359192194Sdfr	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
360192194Sdfr	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
361192194Sdfr	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
362192194Sdfr	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
363192194Sdfr	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
364192194Sdfr	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
365192194Sdfr	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
366192194Sdfr	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
367192194Sdfr	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
368192194Sdfr	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
369192194Sdfr	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
370192194Sdfr	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
371192194Sdfr	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
372192194Sdfr	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
373192194Sdfr	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
374192194Sdfr	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
375192194Sdfr	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
376192194Sdfr	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
377192194Sdfr	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
378192194Sdfr	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
379192194Sdfr	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
380192194Sdfr	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
381192194Sdfr	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
382192194Sdfr	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
383192194Sdfr	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
384192194Sdfr	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
385192194Sdfr	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
386192194Sdfr};
387192194Sdfrstatic const uint8_t vdev_raidz_log2[256] = {
388192194Sdfr	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
389192194Sdfr	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
390192194Sdfr	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
391192194Sdfr	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
392192194Sdfr	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
393192194Sdfr	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
394192194Sdfr	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
395192194Sdfr	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
396192194Sdfr	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
397192194Sdfr	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
398192194Sdfr	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
399192194Sdfr	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
400192194Sdfr	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
401192194Sdfr	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
402192194Sdfr	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
403192194Sdfr	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
404192194Sdfr	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
405192194Sdfr	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
406192194Sdfr	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
407192194Sdfr	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
408192194Sdfr	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
409192194Sdfr	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
410192194Sdfr	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
411192194Sdfr	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
412192194Sdfr	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
413192194Sdfr	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
414192194Sdfr	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
415192194Sdfr	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
416192194Sdfr	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
417192194Sdfr	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
418192194Sdfr	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
419192194Sdfr	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
420192194Sdfr};
421192194Sdfr
422192194Sdfr/*
423192194Sdfr * Multiply a given number by 2 raised to the given power.
424192194Sdfr */
425192194Sdfrstatic uint8_t
426192194Sdfrvdev_raidz_exp2(uint8_t a, int exp)
427192194Sdfr{
428192194Sdfr	if (a == 0)
429192194Sdfr		return (0);
430192194Sdfr
431219089Spjd	ASSERT(exp >= 0);
432219089Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
433192194Sdfr
434192194Sdfr	exp += vdev_raidz_log2[a];
435192194Sdfr	if (exp > 255)
436192194Sdfr		exp -= 255;
437192194Sdfr
438192194Sdfr	return (vdev_raidz_pow2[exp]);
439192194Sdfr}
440192194Sdfr
441192194Sdfrstatic void
442219089Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
443192194Sdfr{
444219089Spjd	uint64_t *p, *src, pcount, ccount, i;
445192194Sdfr	int c;
446192194Sdfr
447219089Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
448192194Sdfr
449219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
450219089Spjd		src = rm->rm_col[c].rc_data;
451219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
452219089Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
453192194Sdfr
454219089Spjd		if (c == rm->rm_firstdatacol) {
455219089Spjd			ASSERT(ccount == pcount);
456219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
457192194Sdfr				*p = *src;
458192194Sdfr			}
459219089Spjd		} else {
460219089Spjd			ASSERT(ccount <= pcount);
461219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
462219089Spjd				*p ^= *src;
463219089Spjd			}
464219089Spjd		}
465219089Spjd	}
466219089Spjd}
467219089Spjd
468219089Spjdstatic void
469219089Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
470219089Spjd{
471219089Spjd	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
472219089Spjd	int c;
473219089Spjd
474219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
475219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
476219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
477219089Spjd
478219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
479219089Spjd		src = rm->rm_col[c].rc_data;
480219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
481219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
482219089Spjd
483219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
484219089Spjd
485219089Spjd		if (c == rm->rm_firstdatacol) {
486219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
487219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
488219089Spjd				*p = *src;
489219089Spjd				*q = *src;
490219089Spjd			}
491219089Spjd			for (; i < pcnt; i++, src++, p++, q++) {
492219089Spjd				*p = 0;
493192194Sdfr				*q = 0;
494192194Sdfr			}
495192194Sdfr		} else {
496219089Spjd			ASSERT(ccnt <= pcnt);
497192194Sdfr
498192194Sdfr			/*
499219089Spjd			 * Apply the algorithm described above by multiplying
500219089Spjd			 * the previous result and adding in the new value.
501192194Sdfr			 */
502219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
503219089Spjd				*p ^= *src;
504219089Spjd
505219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
506192194Sdfr				*q ^= *src;
507192194Sdfr			}
508192194Sdfr
509192194Sdfr			/*
510192194Sdfr			 * Treat short columns as though they are full of 0s.
511219089Spjd			 * Note that there's therefore nothing needed for P.
512192194Sdfr			 */
513219089Spjd			for (; i < pcnt; i++, q++) {
514219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
515192194Sdfr			}
516192194Sdfr		}
517192194Sdfr	}
518192194Sdfr}
519192194Sdfr
520192194Sdfrstatic void
521219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm)
522192194Sdfr{
523219089Spjd	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
524219089Spjd	int c;
525192194Sdfr
526219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
527219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
528219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
529219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
530219089Spjd	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
531192194Sdfr
532219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
533219089Spjd		src = rm->rm_col[c].rc_data;
534219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
535219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
536219089Spjd		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
537192194Sdfr
538219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
539192194Sdfr
540219089Spjd		if (c == rm->rm_firstdatacol) {
541219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
542219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
543219089Spjd				*p = *src;
544219089Spjd				*q = *src;
545219089Spjd				*r = *src;
546192194Sdfr			}
547219089Spjd			for (; i < pcnt; i++, src++, p++, q++, r++) {
548219089Spjd				*p = 0;
549219089Spjd				*q = 0;
550219089Spjd				*r = 0;
551192194Sdfr			}
552219089Spjd		} else {
553219089Spjd			ASSERT(ccnt <= pcnt);
554192194Sdfr
555192194Sdfr			/*
556219089Spjd			 * Apply the algorithm described above by multiplying
557219089Spjd			 * the previous result and adding in the new value.
558192194Sdfr			 */
559219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
560219089Spjd				*p ^= *src;
561219089Spjd
562219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
563219089Spjd				*q ^= *src;
564219089Spjd
565219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
566219089Spjd				*r ^= *src;
567192194Sdfr			}
568192194Sdfr
569219089Spjd			/*
570219089Spjd			 * Treat short columns as though they are full of 0s.
571219089Spjd			 * Note that there's therefore nothing needed for P.
572219089Spjd			 */
573219089Spjd			for (; i < pcnt; i++, q++, r++) {
574219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
575219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
576192194Sdfr			}
577192194Sdfr		}
578192194Sdfr	}
579219089Spjd}
580192194Sdfr
581219089Spjd/*
582219089Spjd * Generate RAID parity in the first virtual columns according to the number of
583219089Spjd * parity columns available.
584219089Spjd */
585219089Spjdstatic void
586219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm)
587219089Spjd{
588219089Spjd	switch (rm->rm_firstdatacol) {
589219089Spjd	case 1:
590219089Spjd		vdev_raidz_generate_parity_p(rm);
591219089Spjd		break;
592219089Spjd	case 2:
593219089Spjd		vdev_raidz_generate_parity_pq(rm);
594219089Spjd		break;
595219089Spjd	case 3:
596219089Spjd		vdev_raidz_generate_parity_pqr(rm);
597219089Spjd		break;
598219089Spjd	default:
599219089Spjd		panic("invalid RAID-Z configuration");
600219089Spjd	}
601219089Spjd}
602192194Sdfr
603219089Spjd/* BEGIN CSTYLED */
604219089Spjd/*
605219089Spjd * In the general case of reconstruction, we must solve the system of linear
606219089Spjd * equations defined by the coeffecients used to generate parity as well as
607219089Spjd * the contents of the data and parity disks. This can be expressed with
608219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p)
609219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
610219089Spjd *
611219089Spjd *            __   __                     __     __
612219089Spjd *            |     |         __     __   |  p_0  |
613219089Spjd *            |  V  |         |  D_0  |   | p_m-1 |
614219089Spjd *            |     |    x    |   :   | = |  d_0  |
615219089Spjd *            |  I  |         | D_n-1 |   |   :   |
616219089Spjd *            |     |         ~~     ~~   | d_n-1 |
617219089Spjd *            ~~   ~~                     ~~     ~~
618219089Spjd *
619219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde
620219089Spjd * matrix defined by the coeffecients we chose for the various parity columns
621219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
622219089Spjd * computation as well as linear separability.
623219089Spjd *
624219089Spjd *      __               __               __     __
625219089Spjd *      |   1   ..  1 1 1 |               |  p_0  |
626219089Spjd *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
627219089Spjd *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
628219089Spjd *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
629219089Spjd *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
630219089Spjd *      |   :       : : : |   |   :   |   |  d_2  |
631219089Spjd *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
632219089Spjd *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
633219089Spjd *      |   0   ..  0 0 1 |               | d_n-1 |
634219089Spjd *      ~~               ~~               ~~     ~~
635219089Spjd *
636219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the
637219089Spjd * matrix and use the known data and parity values to reconstruct the unknown
638219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond
639219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p
640219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up
641219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
642219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity
643219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
644219089Spjd *           __                               __
645219089Spjd *           |  1   1   1   1   1   1   1   1  |
646219089Spjd *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
647219089Spjd *           |  19 205 116  29  64  16  4   1  |      / /
648219089Spjd *           |  1   0   0   0   0   0   0   0  |     / /
649219089Spjd *           |  0   1   0   0   0   0   0   0  | <--' /
650219089Spjd *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
651219089Spjd *           |  0   0   0   1   0   0   0   0  |
652219089Spjd *           |  0   0   0   0   1   0   0   0  |
653219089Spjd *           |  0   0   0   0   0   1   0   0  |
654219089Spjd *           |  0   0   0   0   0   0   1   0  |
655219089Spjd *           |  0   0   0   0   0   0   0   1  |
656219089Spjd *           ~~                               ~~
657219089Spjd *           __                               __
658219089Spjd *           |  1   1   1   1   1   1   1   1  |
659219089Spjd *           | 128  64  32  16  8   4   2   1  |
660219089Spjd *           |  19 205 116  29  64  16  4   1  |
661219089Spjd *           |  1   0   0   0   0   0   0   0  |
662219089Spjd *           |  0   1   0   0   0   0   0   0  |
663219089Spjd *  (V|I)' = |  0   0   1   0   0   0   0   0  |
664219089Spjd *           |  0   0   0   1   0   0   0   0  |
665219089Spjd *           |  0   0   0   0   1   0   0   0  |
666219089Spjd *           |  0   0   0   0   0   1   0   0  |
667219089Spjd *           |  0   0   0   0   0   0   1   0  |
668219089Spjd *           |  0   0   0   0   0   0   0   1  |
669219089Spjd *           ~~                               ~~
670219089Spjd *
671219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
672219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this
673219089Spjd * matrix is not singular.
674219089Spjd * __                                                                 __
675219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
676219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
677219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
678219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
679219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
680219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
681219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
682219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
683219089Spjd * ~~                                                                 ~~
684219089Spjd * __                                                                 __
685219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
686219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
687219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
688219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
689219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
690219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
691219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
692219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
693219089Spjd * ~~                                                                 ~~
694219089Spjd * __                                                                 __
695219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
696219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
697219089Spjd * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
698219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
699219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
700219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
701219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
702219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
703219089Spjd * ~~                                                                 ~~
704219089Spjd * __                                                                 __
705219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
706219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
707219089Spjd * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
708219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
709219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
710219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
711219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
712219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
713219089Spjd * ~~                                                                 ~~
714219089Spjd * __                                                                 __
715219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
716219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
717219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
718219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
719219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
720219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
721219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
722219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
723219089Spjd * ~~                                                                 ~~
724219089Spjd * __                                                                 __
725219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
726219089Spjd * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
727219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
728219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
729219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
730219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
731219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
732219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
733219089Spjd * ~~                                                                 ~~
734219089Spjd *                   __                               __
735219089Spjd *                   |  0   0   1   0   0   0   0   0  |
736219089Spjd *                   | 167 100  5   41 159 169 217 208 |
737219089Spjd *                   | 166 100  4   40 158 168 216 209 |
738219089Spjd *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
739219089Spjd *                   |  0   0   0   0   1   0   0   0  |
740219089Spjd *                   |  0   0   0   0   0   1   0   0  |
741219089Spjd *                   |  0   0   0   0   0   0   1   0  |
742219089Spjd *                   |  0   0   0   0   0   0   0   1  |
743219089Spjd *                   ~~                               ~~
744219089Spjd *
745219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
746219089Spjd * of the missing data.
747219089Spjd *
748219089Spjd * As is apparent from the example above, the only non-trivial rows in the
749219089Spjd * inverse matrix correspond to the data disks that we're trying to
750219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would
751219089Spjd * only be useful for reconstructing data known or assumed to be valid. For
752219089Spjd * that reason, we only build the coefficients in the rows that correspond to
753219089Spjd * targeted columns.
754219089Spjd */
755219089Spjd/* END CSTYLED */
756219089Spjd
757219089Spjdstatic void
758219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
759219089Spjd    uint8_t **rows)
760219089Spjd{
761219089Spjd	int i, j;
762219089Spjd	int pow;
763219089Spjd
764219089Spjd	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
765219089Spjd
766219089Spjd	/*
767219089Spjd	 * Fill in the missing rows of interest.
768219089Spjd	 */
769219089Spjd	for (i = 0; i < nmap; i++) {
770219089Spjd		ASSERT3S(0, <=, map[i]);
771219089Spjd		ASSERT3S(map[i], <=, 2);
772219089Spjd
773219089Spjd		pow = map[i] * n;
774219089Spjd		if (pow > 255)
775219089Spjd			pow -= 255;
776219089Spjd		ASSERT(pow <= 255);
777219089Spjd
778219089Spjd		for (j = 0; j < n; j++) {
779219089Spjd			pow -= map[i];
780219089Spjd			if (pow < 0)
781219089Spjd				pow += 255;
782219089Spjd			rows[i][j] = vdev_raidz_pow2[pow];
783192194Sdfr		}
784192194Sdfr	}
785192194Sdfr}
786192194Sdfr
787192194Sdfrstatic void
788219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
789219089Spjd    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
790192194Sdfr{
791219089Spjd	int i, j, ii, jj;
792219089Spjd	uint8_t log;
793192194Sdfr
794219089Spjd	/*
795219089Spjd	 * Assert that the first nmissing entries from the array of used
796219089Spjd	 * columns correspond to parity columns and that subsequent entries
797219089Spjd	 * correspond to data columns.
798219089Spjd	 */
799219089Spjd	for (i = 0; i < nmissing; i++) {
800219089Spjd		ASSERT3S(used[i], <, rm->rm_firstdatacol);
801219089Spjd	}
802219089Spjd	for (; i < n; i++) {
803219089Spjd		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
804219089Spjd	}
805192194Sdfr
806219089Spjd	/*
807219089Spjd	 * First initialize the storage where we'll compute the inverse rows.
808219089Spjd	 */
809219089Spjd	for (i = 0; i < nmissing; i++) {
810219089Spjd		for (j = 0; j < n; j++) {
811219089Spjd			invrows[i][j] = (i == j) ? 1 : 0;
812219089Spjd		}
813219089Spjd	}
814192194Sdfr
815192194Sdfr	/*
816219089Spjd	 * Subtract all trivial rows from the rows of consequence.
817192194Sdfr	 */
818219089Spjd	for (i = 0; i < nmissing; i++) {
819219089Spjd		for (j = nmissing; j < n; j++) {
820219089Spjd			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
821219089Spjd			jj = used[j] - rm->rm_firstdatacol;
822219089Spjd			ASSERT3S(jj, <, n);
823219089Spjd			invrows[i][j] = rows[i][jj];
824219089Spjd			rows[i][jj] = 0;
825219089Spjd		}
826219089Spjd	}
827192194Sdfr
828219089Spjd	/*
829219089Spjd	 * For each of the rows of interest, we must normalize it and subtract
830219089Spjd	 * a multiple of it from the other rows.
831219089Spjd	 */
832219089Spjd	for (i = 0; i < nmissing; i++) {
833219089Spjd		for (j = 0; j < missing[i]; j++) {
834219089Spjd			ASSERT3U(rows[i][j], ==, 0);
835219089Spjd		}
836219089Spjd		ASSERT3U(rows[i][missing[i]], !=, 0);
837192194Sdfr
838219089Spjd		/*
839219089Spjd		 * Compute the inverse of the first element and multiply each
840219089Spjd		 * element in the row by that value.
841219089Spjd		 */
842219089Spjd		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
843192194Sdfr
844219089Spjd		for (j = 0; j < n; j++) {
845219089Spjd			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
846219089Spjd			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
847219089Spjd		}
848192194Sdfr
849219089Spjd		for (ii = 0; ii < nmissing; ii++) {
850219089Spjd			if (i == ii)
851219089Spjd				continue;
852192194Sdfr
853219089Spjd			ASSERT3U(rows[ii][missing[i]], !=, 0);
854219089Spjd
855219089Spjd			log = vdev_raidz_log2[rows[ii][missing[i]]];
856219089Spjd
857219089Spjd			for (j = 0; j < n; j++) {
858219089Spjd				rows[ii][j] ^=
859219089Spjd				    vdev_raidz_exp2(rows[i][j], log);
860219089Spjd				invrows[ii][j] ^=
861219089Spjd				    vdev_raidz_exp2(invrows[i][j], log);
862219089Spjd			}
863219089Spjd		}
864219089Spjd	}
865219089Spjd
866192194Sdfr	/*
867219089Spjd	 * Verify that the data that is left in the rows are properly part of
868219089Spjd	 * an identity matrix.
869192194Sdfr	 */
870219089Spjd	for (i = 0; i < nmissing; i++) {
871219089Spjd		for (j = 0; j < n; j++) {
872219089Spjd			if (j == missing[i]) {
873219089Spjd				ASSERT3U(rows[i][j], ==, 1);
874219089Spjd			} else {
875219089Spjd				ASSERT3U(rows[i][j], ==, 0);
876219089Spjd			}
877219089Spjd		}
878219089Spjd	}
879219089Spjd}
880192194Sdfr
881219089Spjdstatic void
882219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
883219089Spjd    int *missing, uint8_t **invrows, const uint8_t *used)
884219089Spjd{
885219089Spjd	int i, j, x, cc, c;
886219089Spjd	uint8_t *src;
887219089Spjd	uint64_t ccount;
888219089Spjd	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
889219089Spjd	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
890219089Spjd	uint8_t log, val;
891219089Spjd	int ll;
892219089Spjd	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
893219089Spjd	uint8_t *p, *pp;
894219089Spjd	size_t psize;
895192194Sdfr
896219089Spjd	log = 0;	/* gcc */
897219089Spjd	psize = sizeof (invlog[0][0]) * n * nmissing;
898219089Spjd	p = zfs_alloc(psize);
899192194Sdfr
900219089Spjd	for (pp = p, i = 0; i < nmissing; i++) {
901219089Spjd		invlog[i] = pp;
902219089Spjd		pp += n;
903219089Spjd	}
904192194Sdfr
905219089Spjd	for (i = 0; i < nmissing; i++) {
906219089Spjd		for (j = 0; j < n; j++) {
907219089Spjd			ASSERT3U(invrows[i][j], !=, 0);
908219089Spjd			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
909219089Spjd		}
910192194Sdfr	}
911192194Sdfr
912219089Spjd	for (i = 0; i < n; i++) {
913219089Spjd		c = used[i];
914219089Spjd		ASSERT3U(c, <, rm->rm_cols);
915219089Spjd
916219089Spjd		src = rm->rm_col[c].rc_data;
917219089Spjd		ccount = rm->rm_col[c].rc_size;
918219089Spjd		for (j = 0; j < nmissing; j++) {
919219089Spjd			cc = missing[j] + rm->rm_firstdatacol;
920219089Spjd			ASSERT3U(cc, >=, rm->rm_firstdatacol);
921219089Spjd			ASSERT3U(cc, <, rm->rm_cols);
922219089Spjd			ASSERT3U(cc, !=, c);
923219089Spjd
924219089Spjd			dst[j] = rm->rm_col[cc].rc_data;
925219089Spjd			dcount[j] = rm->rm_col[cc].rc_size;
926219089Spjd		}
927219089Spjd
928219089Spjd		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
929219089Spjd
930219089Spjd		for (x = 0; x < ccount; x++, src++) {
931219089Spjd			if (*src != 0)
932219089Spjd				log = vdev_raidz_log2[*src];
933219089Spjd
934219089Spjd			for (cc = 0; cc < nmissing; cc++) {
935219089Spjd				if (x >= dcount[cc])
936219089Spjd					continue;
937219089Spjd
938219089Spjd				if (*src == 0) {
939219089Spjd					val = 0;
940219089Spjd				} else {
941219089Spjd					if ((ll = log + invlog[cc][i]) >= 255)
942219089Spjd						ll -= 255;
943219089Spjd					val = vdev_raidz_pow2[ll];
944219089Spjd				}
945219089Spjd
946219089Spjd				if (i == 0)
947219089Spjd					dst[cc][x] = val;
948219089Spjd				else
949219089Spjd					dst[cc][x] ^= val;
950219089Spjd			}
951219089Spjd		}
952219089Spjd	}
953219089Spjd
954219089Spjd	zfs_free(p, psize);
955219089Spjd}
956219089Spjd
957219089Spjdstatic int
958219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
959219089Spjd{
960219089Spjd	int n, i, c, t, tt;
961219089Spjd	int nmissing_rows;
962219089Spjd	int missing_rows[VDEV_RAIDZ_MAXPARITY];
963219089Spjd	int parity_map[VDEV_RAIDZ_MAXPARITY];
964219089Spjd
965219089Spjd	uint8_t *p, *pp;
966219089Spjd	size_t psize;
967219089Spjd
968219089Spjd	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
969219089Spjd	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
970219089Spjd	uint8_t *used;
971219089Spjd
972219089Spjd	int code = 0;
973219089Spjd
974219089Spjd
975219089Spjd	n = rm->rm_cols - rm->rm_firstdatacol;
976219089Spjd
977192194Sdfr	/*
978219089Spjd	 * Figure out which data columns are missing.
979192194Sdfr	 */
980219089Spjd	nmissing_rows = 0;
981219089Spjd	for (t = 0; t < ntgts; t++) {
982219089Spjd		if (tgts[t] >= rm->rm_firstdatacol) {
983219089Spjd			missing_rows[nmissing_rows++] =
984219089Spjd			    tgts[t] - rm->rm_firstdatacol;
985219089Spjd		}
986219089Spjd	}
987219089Spjd
988219089Spjd	/*
989219089Spjd	 * Figure out which parity columns to use to help generate the missing
990219089Spjd	 * data columns.
991219089Spjd	 */
992219089Spjd	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
993219089Spjd		ASSERT(tt < ntgts);
994219089Spjd		ASSERT(c < rm->rm_firstdatacol);
995219089Spjd
996219089Spjd		/*
997219089Spjd		 * Skip any targeted parity columns.
998219089Spjd		 */
999219089Spjd		if (c == tgts[tt]) {
1000219089Spjd			tt++;
1001219089Spjd			continue;
1002219089Spjd		}
1003219089Spjd
1004219089Spjd		code |= 1 << c;
1005219089Spjd
1006219089Spjd		parity_map[i] = c;
1007219089Spjd		i++;
1008219089Spjd	}
1009219089Spjd
1010219089Spjd	ASSERT(code != 0);
1011219089Spjd	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1012219089Spjd
1013219089Spjd	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1014219089Spjd	    nmissing_rows * n + sizeof (used[0]) * n;
1015219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1016219089Spjd
1017219089Spjd	for (pp = p, i = 0; i < nmissing_rows; i++) {
1018219089Spjd		rows[i] = pp;
1019219089Spjd		pp += n;
1020219089Spjd		invrows[i] = pp;
1021219089Spjd		pp += n;
1022219089Spjd	}
1023219089Spjd	used = pp;
1024219089Spjd
1025219089Spjd	for (i = 0; i < nmissing_rows; i++) {
1026219089Spjd		used[i] = parity_map[i];
1027219089Spjd	}
1028219089Spjd
1029219089Spjd	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1030219089Spjd		if (tt < nmissing_rows &&
1031219089Spjd		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1032219089Spjd			tt++;
1033219089Spjd			continue;
1034219089Spjd		}
1035219089Spjd
1036219089Spjd		ASSERT3S(i, <, n);
1037219089Spjd		used[i] = c;
1038219089Spjd		i++;
1039219089Spjd	}
1040219089Spjd
1041219089Spjd	/*
1042219089Spjd	 * Initialize the interesting rows of the matrix.
1043219089Spjd	 */
1044219089Spjd	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1045219089Spjd
1046219089Spjd	/*
1047219089Spjd	 * Invert the matrix.
1048219089Spjd	 */
1049219089Spjd	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1050219089Spjd	    invrows, used);
1051219089Spjd
1052219089Spjd	/*
1053219089Spjd	 * Reconstruct the missing data using the generated matrix.
1054219089Spjd	 */
1055219089Spjd	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1056219089Spjd	    invrows, used);
1057219089Spjd
1058219089Spjd	kmem_free(p, psize);
1059219089Spjd
1060219089Spjd	return (code);
1061192194Sdfr}
1062192194Sdfr
1063192194Sdfrstatic int
1064219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1065192194Sdfr{
1066219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
1067219089Spjd	int ntgts;
1068219089Spjd	int i, c;
1069219089Spjd	int code;
1070219089Spjd	int nbadparity, nbaddata;
1071219089Spjd
1072219089Spjd	/*
1073219089Spjd	 * The tgts list must already be sorted.
1074219089Spjd	 */
1075219089Spjd	for (i = 1; i < nt; i++) {
1076219089Spjd		ASSERT(t[i] > t[i - 1]);
1077219089Spjd	}
1078219089Spjd
1079219089Spjd	nbadparity = rm->rm_firstdatacol;
1080219089Spjd	nbaddata = rm->rm_cols - nbadparity;
1081219089Spjd	ntgts = 0;
1082219089Spjd	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1083219089Spjd		if (i < nt && c == t[i]) {
1084219089Spjd			tgts[ntgts++] = c;
1085219089Spjd			i++;
1086219089Spjd		} else if (rm->rm_col[c].rc_error != 0) {
1087219089Spjd			tgts[ntgts++] = c;
1088219089Spjd		} else if (c >= rm->rm_firstdatacol) {
1089219089Spjd			nbaddata--;
1090219089Spjd		} else {
1091219089Spjd			nbadparity--;
1092219089Spjd		}
1093219089Spjd	}
1094219089Spjd
1095219089Spjd	ASSERT(ntgts >= nt);
1096219089Spjd	ASSERT(nbaddata >= 0);
1097219089Spjd	ASSERT(nbaddata + nbadparity == ntgts);
1098219089Spjd
1099219089Spjd	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1100219089Spjd	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1101219089Spjd	ASSERT(code > 0);
1102219089Spjd	return (code);
1103219089Spjd}
1104219089Spjd
1105219089Spjdstatic raidz_map_t *
1106219089Spjdvdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1107219089Spjd    uint64_t dcols, uint64_t nparity)
1108219089Spjd{
1109219089Spjd	raidz_map_t *rm;
1110192194Sdfr	uint64_t b = offset >> unit_shift;
1111219089Spjd	uint64_t s = size >> unit_shift;
1112192194Sdfr	uint64_t f = b % dcols;
1113192194Sdfr	uint64_t o = (b / dcols) << unit_shift;
1114219089Spjd	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1115192194Sdfr
1116192194Sdfr	q = s / (dcols - nparity);
1117192194Sdfr	r = s - q * (dcols - nparity);
1118192194Sdfr	bc = (r == 0 ? 0 : r + nparity);
1119219089Spjd	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1120192194Sdfr
1121219089Spjd	if (q == 0) {
1122219089Spjd		acols = bc;
1123219089Spjd		scols = MIN(dcols, roundup(bc, nparity + 1));
1124219089Spjd	} else {
1125219089Spjd		acols = dcols;
1126219089Spjd		scols = dcols;
1127219089Spjd	}
1128219089Spjd
1129219089Spjd	ASSERT3U(acols, <=, scols);
1130219089Spjd
1131219089Spjd	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1132219089Spjd
1133219089Spjd	rm->rm_cols = acols;
1134219089Spjd	rm->rm_scols = scols;
1135219089Spjd	rm->rm_bigcols = bc;
1136219089Spjd	rm->rm_skipstart = bc;
1137219089Spjd	rm->rm_missingdata = 0;
1138219089Spjd	rm->rm_missingparity = 0;
1139219089Spjd	rm->rm_firstdatacol = nparity;
1140219089Spjd	rm->rm_reports = 0;
1141219089Spjd	rm->rm_freed = 0;
1142219089Spjd	rm->rm_ecksuminjected = 0;
1143219089Spjd
1144192194Sdfr	asize = 0;
1145219089Spjd
1146219089Spjd	for (c = 0; c < scols; c++) {
1147192194Sdfr		col = f + c;
1148192194Sdfr		coff = o;
1149192194Sdfr		if (col >= dcols) {
1150192194Sdfr			col -= dcols;
1151192194Sdfr			coff += 1ULL << unit_shift;
1152192194Sdfr		}
1153219089Spjd		rm->rm_col[c].rc_devidx = col;
1154219089Spjd		rm->rm_col[c].rc_offset = coff;
1155219089Spjd		rm->rm_col[c].rc_data = NULL;
1156219089Spjd		rm->rm_col[c].rc_error = 0;
1157219089Spjd		rm->rm_col[c].rc_tried = 0;
1158219089Spjd		rm->rm_col[c].rc_skipped = 0;
1159192194Sdfr
1160219089Spjd		if (c >= acols)
1161219089Spjd			rm->rm_col[c].rc_size = 0;
1162219089Spjd		else if (c < bc)
1163219089Spjd			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1164219089Spjd		else
1165219089Spjd			rm->rm_col[c].rc_size = q << unit_shift;
1166192194Sdfr
1167219089Spjd		asize += rm->rm_col[c].rc_size;
1168192194Sdfr	}
1169192194Sdfr
1170219089Spjd	ASSERT3U(asize, ==, tot << unit_shift);
1171219089Spjd	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1172219089Spjd	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1173219089Spjd	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1174219089Spjd	ASSERT3U(rm->rm_nskip, <=, nparity);
1175192194Sdfr
1176219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++)
1177219089Spjd		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1178219089Spjd
1179219089Spjd	rm->rm_col[c].rc_data = data;
1180219089Spjd
1181192194Sdfr	for (c = c + 1; c < acols; c++)
1182219089Spjd		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1183219089Spjd		    rm->rm_col[c - 1].rc_size;
1184192194Sdfr
1185192194Sdfr	/*
1186219089Spjd	 * If all data stored spans all columns, there's a danger that parity
1187219089Spjd	 * will always be on the same device and, since parity isn't read
1188219089Spjd	 * during normal operation, that that device's I/O bandwidth won't be
1189219089Spjd	 * used effectively. We therefore switch the parity every 1MB.
1190192194Sdfr	 *
1191219089Spjd	 * ... at least that was, ostensibly, the theory. As a practical
1192219089Spjd	 * matter unless we juggle the parity between all devices evenly, we
1193219089Spjd	 * won't see any benefit. Further, occasional writes that aren't a
1194219089Spjd	 * multiple of the LCM of the number of children and the minimum
1195219089Spjd	 * stripe width are sufficient to avoid pessimal behavior.
1196219089Spjd	 * Unfortunately, this decision created an implicit on-disk format
1197219089Spjd	 * requirement that we need to support for all eternity, but only
1198219089Spjd	 * for single-parity RAID-Z.
1199219089Spjd	 *
1200219089Spjd	 * If we intend to skip a sector in the zeroth column for padding
1201219089Spjd	 * we must make sure to note this swap. We will never intend to
1202219089Spjd	 * skip the first column since at least one data and one parity
1203219089Spjd	 * column must appear in each row.
1204192194Sdfr	 */
1205219089Spjd	ASSERT(rm->rm_cols >= 2);
1206219089Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1207192194Sdfr
1208219089Spjd	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1209219089Spjd		devidx = rm->rm_col[0].rc_devidx;
1210219089Spjd		o = rm->rm_col[0].rc_offset;
1211219089Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1212219089Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1213219089Spjd		rm->rm_col[1].rc_devidx = devidx;
1214219089Spjd		rm->rm_col[1].rc_offset = o;
1215219089Spjd
1216219089Spjd		if (rm->rm_skipstart == 0)
1217219089Spjd			rm->rm_skipstart = 1;
1218192194Sdfr	}
1219192194Sdfr
1220219089Spjd	return (rm);
1221219089Spjd}
1222219089Spjd
1223219089Spjdstatic void
1224219089Spjdvdev_raidz_map_free(raidz_map_t *rm)
1225219089Spjd{
1226219089Spjd	int c;
1227219089Spjd
1228219089Spjd	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1229219089Spjd		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1230219089Spjd
1231219089Spjd	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1232219089Spjd}
1233219089Spjd
1234219089Spjdstatic vdev_t *
1235219089Spjdvdev_child(vdev_t *pvd, uint64_t devidx)
1236219089Spjd{
1237219089Spjd	vdev_t *cvd;
1238219089Spjd
1239219089Spjd	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1240219089Spjd		if (cvd->v_id == devidx)
1241219089Spjd			break;
1242219089Spjd	}
1243219089Spjd
1244219089Spjd	return (cvd);
1245219089Spjd}
1246219089Spjd
1247219089Spjd/*
1248219089Spjd * We keep track of whether or not there were any injected errors, so that
1249219089Spjd * any ereports we generate can note it.
1250219089Spjd */
1251219089Spjdstatic int
1252226553Spjdraidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1253219089Spjd{
1254219089Spjd
1255226568Spjd	return (zio_checksum_verify(bp, data));
1256219089Spjd}
1257219089Spjd
1258219089Spjd/*
1259219089Spjd * Generate the parity from the data columns. If we tried and were able to
1260219089Spjd * read the parity without error, verify that the generated parity matches the
1261219089Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
1262219089Spjd * number such failures.
1263219089Spjd */
1264219089Spjdstatic int
1265219089Spjdraidz_parity_verify(raidz_map_t *rm)
1266219089Spjd{
1267219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1268219089Spjd	int c, ret = 0;
1269219089Spjd	raidz_col_t *rc;
1270219089Spjd
1271219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1272219089Spjd		rc = &rm->rm_col[c];
1273219089Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1274219089Spjd			continue;
1275219089Spjd		orig[c] = zfs_alloc(rc->rc_size);
1276219089Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
1277219089Spjd	}
1278219089Spjd
1279219089Spjd	vdev_raidz_generate_parity(rm);
1280219089Spjd
1281219089Spjd	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1282219089Spjd		rc = &rm->rm_col[c];
1283219089Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1284219089Spjd			continue;
1285219089Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1286219089Spjd			rc->rc_error = ECKSUM;
1287219089Spjd			ret++;
1288219089Spjd		}
1289219089Spjd		zfs_free(orig[c], rc->rc_size);
1290219089Spjd	}
1291219089Spjd
1292219089Spjd	return (ret);
1293219089Spjd}
1294219089Spjd
1295219089Spjd/*
1296219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction.
1297219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into
1298219089Spjd * account how reconstruction is actually performed. For example, with
1299219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1300219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both
1301219089Spjd * cases we'd only use parity information in column 0.
1302219089Spjd */
1303219089Spjdstatic int
1304219089Spjdvdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1305226553Spjd    off_t offset, uint64_t bytes, int total_errors, int data_errors)
1306219089Spjd{
1307219089Spjd	raidz_col_t *rc;
1308219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1309219089Spjd	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1310219089Spjd	int *tgts = &tstore[1];
1311219089Spjd	int current, next, i, c, n;
1312219089Spjd	int code, ret = 0;
1313219089Spjd
1314219089Spjd	ASSERT(total_errors < rm->rm_firstdatacol);
1315219089Spjd
1316192194Sdfr	/*
1317219089Spjd	 * This simplifies one edge condition.
1318192194Sdfr	 */
1319219089Spjd	tgts[-1] = -1;
1320219089Spjd
1321219089Spjd	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1322219089Spjd		/*
1323219089Spjd		 * Initialize the targets array by finding the first n columns
1324219089Spjd		 * that contain no error.
1325219089Spjd		 *
1326219089Spjd		 * If there were no data errors, we need to ensure that we're
1327219089Spjd		 * always explicitly attempting to reconstruct at least one
1328219089Spjd		 * data column. To do this, we simply push the highest target
1329219089Spjd		 * up into the data columns.
1330219089Spjd		 */
1331219089Spjd		for (c = 0, i = 0; i < n; i++) {
1332219089Spjd			if (i == n - 1 && data_errors == 0 &&
1333219089Spjd			    c < rm->rm_firstdatacol) {
1334219089Spjd				c = rm->rm_firstdatacol;
1335219089Spjd			}
1336219089Spjd
1337219089Spjd			while (rm->rm_col[c].rc_error != 0) {
1338219089Spjd				c++;
1339219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1340219089Spjd			}
1341219089Spjd
1342219089Spjd			tgts[i] = c++;
1343219089Spjd		}
1344219089Spjd
1345219089Spjd		/*
1346219089Spjd		 * Setting tgts[n] simplifies the other edge condition.
1347219089Spjd		 */
1348219089Spjd		tgts[n] = rm->rm_cols;
1349219089Spjd
1350219089Spjd		/*
1351219089Spjd		 * These buffers were allocated in previous iterations.
1352219089Spjd		 */
1353219089Spjd		for (i = 0; i < n - 1; i++) {
1354219089Spjd			ASSERT(orig[i] != NULL);
1355219089Spjd		}
1356219089Spjd
1357219089Spjd		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1358219089Spjd
1359219089Spjd		current = 0;
1360219089Spjd		next = tgts[current];
1361219089Spjd
1362219089Spjd		while (current != n) {
1363219089Spjd			tgts[current] = next;
1364219089Spjd			current = 0;
1365219089Spjd
1366219089Spjd			/*
1367219089Spjd			 * Save off the original data that we're going to
1368219089Spjd			 * attempt to reconstruct.
1369219089Spjd			 */
1370219089Spjd			for (i = 0; i < n; i++) {
1371219089Spjd				ASSERT(orig[i] != NULL);
1372219089Spjd				c = tgts[i];
1373219089Spjd				ASSERT3S(c, >=, 0);
1374219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1375219089Spjd				rc = &rm->rm_col[c];
1376219089Spjd				bcopy(rc->rc_data, orig[i], rc->rc_size);
1377219089Spjd			}
1378219089Spjd
1379219089Spjd			/*
1380219089Spjd			 * Attempt a reconstruction and exit the outer loop on
1381219089Spjd			 * success.
1382219089Spjd			 */
1383219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
1384226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1385219089Spjd				for (i = 0; i < n; i++) {
1386219089Spjd					c = tgts[i];
1387219089Spjd					rc = &rm->rm_col[c];
1388219089Spjd					ASSERT(rc->rc_error == 0);
1389219089Spjd					rc->rc_error = ECKSUM;
1390219089Spjd				}
1391219089Spjd
1392219089Spjd				ret = code;
1393219089Spjd				goto done;
1394219089Spjd			}
1395219089Spjd
1396219089Spjd			/*
1397219089Spjd			 * Restore the original data.
1398219089Spjd			 */
1399219089Spjd			for (i = 0; i < n; i++) {
1400219089Spjd				c = tgts[i];
1401219089Spjd				rc = &rm->rm_col[c];
1402219089Spjd				bcopy(orig[i], rc->rc_data, rc->rc_size);
1403219089Spjd			}
1404219089Spjd
1405219089Spjd			do {
1406219089Spjd				/*
1407219089Spjd				 * Find the next valid column after the current
1408219089Spjd				 * position..
1409219089Spjd				 */
1410219089Spjd				for (next = tgts[current] + 1;
1411219089Spjd				    next < rm->rm_cols &&
1412219089Spjd				    rm->rm_col[next].rc_error != 0; next++)
1413219089Spjd					continue;
1414219089Spjd
1415219089Spjd				ASSERT(next <= tgts[current + 1]);
1416219089Spjd
1417219089Spjd				/*
1418219089Spjd				 * If that spot is available, we're done here.
1419219089Spjd				 */
1420219089Spjd				if (next != tgts[current + 1])
1421219089Spjd					break;
1422219089Spjd
1423219089Spjd				/*
1424219089Spjd				 * Otherwise, find the next valid column after
1425219089Spjd				 * the previous position.
1426219089Spjd				 */
1427219089Spjd				for (c = tgts[current - 1] + 1;
1428219089Spjd				    rm->rm_col[c].rc_error != 0; c++)
1429219089Spjd					continue;
1430219089Spjd
1431219089Spjd				tgts[current] = c;
1432219089Spjd				current++;
1433219089Spjd
1434219089Spjd			} while (current != n);
1435219089Spjd		}
1436219089Spjd	}
1437219089Spjd	n--;
1438219089Spjddone:
1439219089Spjd	for (i = n - 1; i >= 0; i--) {
1440219089Spjd		zfs_free(orig[i], rm->rm_col[0].rc_size);
1441219089Spjd	}
1442219089Spjd
1443219089Spjd	return (ret);
1444219089Spjd}
1445219089Spjd
1446219089Spjdstatic int
1447219089Spjdvdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1448219089Spjd    off_t offset, size_t bytes)
1449219089Spjd{
1450219089Spjd	vdev_t *tvd = vd->v_top;
1451219089Spjd	vdev_t *cvd;
1452219089Spjd	raidz_map_t *rm;
1453219089Spjd	raidz_col_t *rc;
1454219089Spjd	int c, error;
1455219089Spjd	int unexpected_errors;
1456219089Spjd	int parity_errors;
1457219089Spjd	int parity_untried;
1458219089Spjd	int data_errors;
1459219089Spjd	int total_errors;
1460219089Spjd	int n;
1461219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
1462219089Spjd	int code;
1463219089Spjd
1464219089Spjd	rc = NULL;	/* gcc */
1465219089Spjd	error = 0;
1466219089Spjd
1467219089Spjd	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1468219089Spjd	    vd->v_nchildren, vd->v_nparity);
1469219089Spjd
1470219089Spjd	/*
1471219089Spjd	 * Iterate over the columns in reverse order so that we hit the parity
1472219089Spjd	 * last -- any errors along the way will force us to read the parity.
1473219089Spjd	 */
1474219089Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
1475219089Spjd		rc = &rm->rm_col[c];
1476219089Spjd		cvd = vdev_child(vd, rc->rc_devidx);
1477219089Spjd		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1478219089Spjd			if (c >= rm->rm_firstdatacol)
1479219089Spjd				rm->rm_missingdata++;
1480192194Sdfr			else
1481219089Spjd				rm->rm_missingparity++;
1482192194Sdfr			rc->rc_error = ENXIO;
1483192194Sdfr			rc->rc_tried = 1;	/* don't even try */
1484192194Sdfr			rc->rc_skipped = 1;
1485192194Sdfr			continue;
1486192194Sdfr		}
1487219089Spjd#if 0		/* XXX: Too hard for the boot code. */
1488219089Spjd		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1489219089Spjd			if (c >= rm->rm_firstdatacol)
1490192194Sdfr				rm->rm_missingdata++;
1491192194Sdfr			else
1492192194Sdfr				rm->rm_missingparity++;
1493192194Sdfr			rc->rc_error = ESTALE;
1494192194Sdfr			rc->rc_skipped = 1;
1495192194Sdfr			continue;
1496192194Sdfr		}
1497192194Sdfr#endif
1498219089Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1499219089Spjd			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1500219089Spjd			    rc->rc_offset, rc->rc_size);
1501192194Sdfr			rc->rc_tried = 1;
1502192194Sdfr			rc->rc_skipped = 0;
1503192194Sdfr		}
1504192194Sdfr	}
1505192194Sdfr
1506192194Sdfrreconstruct:
1507219089Spjd	unexpected_errors = 0;
1508192194Sdfr	parity_errors = 0;
1509219089Spjd	parity_untried = 0;
1510192194Sdfr	data_errors = 0;
1511192194Sdfr	total_errors = 0;
1512192194Sdfr
1513219089Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1514219089Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1515219089Spjd
1516219089Spjd	for (c = 0; c < rm->rm_cols; c++) {
1517219089Spjd		rc = &rm->rm_col[c];
1518219089Spjd
1519192194Sdfr		if (rc->rc_error) {
1520219089Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1521219089Spjd
1522219089Spjd			if (c < rm->rm_firstdatacol)
1523192194Sdfr				parity_errors++;
1524192194Sdfr			else
1525192194Sdfr				data_errors++;
1526192194Sdfr
1527192194Sdfr			if (!rc->rc_skipped)
1528192194Sdfr				unexpected_errors++;
1529192194Sdfr
1530192194Sdfr			total_errors++;
1531219089Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1532192194Sdfr			parity_untried++;
1533192194Sdfr		}
1534192194Sdfr	}
1535192194Sdfr
1536192194Sdfr	/*
1537192194Sdfr	 * There are three potential phases for a read:
1538192194Sdfr	 *	1. produce valid data from the columns read
1539192194Sdfr	 *	2. read all disks and try again
1540192194Sdfr	 *	3. perform combinatorial reconstruction
1541192194Sdfr	 *
1542219089Spjd	 * Each phase is progressively both more expensive and less likely to
1543219089Spjd	 * occur. If we encounter more errors than we can repair or all phases
1544219089Spjd	 * fail, we have no choice but to return an error.
1545192194Sdfr	 */
1546192194Sdfr
1547192194Sdfr	/*
1548219089Spjd	 * If the number of errors we saw was correctable -- less than or equal
1549219089Spjd	 * to the number of parity disks read -- attempt to produce data that
1550219089Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
1551219089Spjd	 * any errors.
1552192194Sdfr	 */
1553219089Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1554219089Spjd		if (data_errors == 0) {
1555226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1556219089Spjd				/*
1557219089Spjd				 * If we read parity information (unnecessarily
1558219089Spjd				 * as it happens since no reconstruction was
1559219089Spjd				 * needed) regenerate and verify the parity.
1560219089Spjd				 * We also regenerate parity when resilvering
1561219089Spjd				 * so we can write it out to the failed device
1562219089Spjd				 * later.
1563219089Spjd				 */
1564219089Spjd				if (parity_errors + parity_untried <
1565219089Spjd				    rm->rm_firstdatacol) {
1566219089Spjd					n = raidz_parity_verify(rm);
1567219089Spjd					unexpected_errors += n;
1568219089Spjd					ASSERT(parity_errors + n <=
1569219089Spjd					    rm->rm_firstdatacol);
1570219089Spjd				}
1571219089Spjd				goto done;
1572219089Spjd			}
1573219089Spjd		} else {
1574192194Sdfr			/*
1575192194Sdfr			 * We either attempt to read all the parity columns or
1576192194Sdfr			 * none of them. If we didn't try to read parity, we
1577192194Sdfr			 * wouldn't be here in the correctable case. There must
1578192194Sdfr			 * also have been fewer parity errors than parity
1579192194Sdfr			 * columns or, again, we wouldn't be in this code path.
1580192194Sdfr			 */
1581219089Spjd			ASSERT(parity_untried == 0);
1582219089Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
1583192194Sdfr
1584192194Sdfr			/*
1585219089Spjd			 * Identify the data columns that reported an error.
1586192194Sdfr			 */
1587219089Spjd			n = 0;
1588219089Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1589219089Spjd				rc = &rm->rm_col[c];
1590219089Spjd				if (rc->rc_error != 0) {
1591219089Spjd					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1592219089Spjd					tgts[n++] = c;
1593219089Spjd				}
1594192194Sdfr			}
1595192194Sdfr
1596219089Spjd			ASSERT(rm->rm_firstdatacol >= n);
1597192194Sdfr
1598219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
1599192194Sdfr
1600226553Spjd			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1601219089Spjd				/*
1602219089Spjd				 * If we read more parity disks than were used
1603219089Spjd				 * for reconstruction, confirm that the other
1604219089Spjd				 * parity disks produced correct data. This
1605219089Spjd				 * routine is suboptimal in that it regenerates
1606219089Spjd				 * the parity that we already used in addition
1607219089Spjd				 * to the parity that we're attempting to
1608219089Spjd				 * verify, but this should be a relatively
1609219089Spjd				 * uncommon case, and can be optimized if it
1610219089Spjd				 * becomes a problem. Note that we regenerate
1611219089Spjd				 * parity when resilvering so we can write it
1612219089Spjd				 * out to failed devices later.
1613219089Spjd				 */
1614219089Spjd				if (parity_errors < rm->rm_firstdatacol - n) {
1615219089Spjd					n = raidz_parity_verify(rm);
1616219089Spjd					unexpected_errors += n;
1617219089Spjd					ASSERT(parity_errors + n <=
1618219089Spjd					    rm->rm_firstdatacol);
1619219089Spjd				}
1620192194Sdfr
1621219089Spjd				goto done;
1622192194Sdfr			}
1623192194Sdfr		}
1624192194Sdfr	}
1625192194Sdfr
1626192194Sdfr	/*
1627192194Sdfr	 * This isn't a typical situation -- either we got a read
1628192194Sdfr	 * error or a child silently returned bad data. Read every
1629192194Sdfr	 * block so we can try again with as much data and parity as
1630192194Sdfr	 * we can track down. If we've already been through once
1631192194Sdfr	 * before, all children will be marked as tried so we'll
1632192194Sdfr	 * proceed to combinatorial reconstruction.
1633192194Sdfr	 */
1634219089Spjd	unexpected_errors = 1;
1635219089Spjd	rm->rm_missingdata = 0;
1636219089Spjd	rm->rm_missingparity = 0;
1637219089Spjd
1638192194Sdfr	n = 0;
1639219089Spjd	for (c = 0; c < rm->rm_cols; c++) {
1640226550Spjd		rc = &rm->rm_col[c];
1641226550Spjd
1642226550Spjd		if (rc->rc_tried)
1643192194Sdfr			continue;
1644192194Sdfr
1645219089Spjd		cvd = vdev_child(vd, rc->rc_devidx);
1646219089Spjd		ASSERT(cvd != NULL);
1647219089Spjd		rc->rc_error = cvd->v_read(cvd, NULL,
1648219089Spjd		    rc->rc_data, rc->rc_offset, rc->rc_size);
1649192194Sdfr		if (rc->rc_error == 0)
1650192194Sdfr			n++;
1651192194Sdfr		rc->rc_tried = 1;
1652192194Sdfr		rc->rc_skipped = 0;
1653192194Sdfr	}
1654192194Sdfr	/*
1655192194Sdfr	 * If we managed to read anything more, retry the
1656192194Sdfr	 * reconstruction.
1657192194Sdfr	 */
1658219089Spjd	if (n > 0)
1659192194Sdfr		goto reconstruct;
1660192194Sdfr
1661192194Sdfr	/*
1662192194Sdfr	 * At this point we've attempted to reconstruct the data given the
1663192194Sdfr	 * errors we detected, and we've attempted to read all columns. There
1664192194Sdfr	 * must, therefore, be one or more additional problems -- silent errors
1665192194Sdfr	 * resulting in invalid data rather than explicit I/O errors resulting
1666219089Spjd	 * in absent data. We check if there is enough additional data to
1667219089Spjd	 * possibly reconstruct the data and then perform combinatorial
1668219089Spjd	 * reconstruction over all possible combinations. If that fails,
1669219089Spjd	 * we're cooked.
1670192194Sdfr	 */
1671219089Spjd	if (total_errors > rm->rm_firstdatacol) {
1672219089Spjd		error = EIO;
1673219089Spjd	} else if (total_errors < rm->rm_firstdatacol &&
1674226553Spjd	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1675226553Spjd	     total_errors, data_errors)) != 0) {
1676192194Sdfr		/*
1677219089Spjd		 * If we didn't use all the available parity for the
1678219089Spjd		 * combinatorial reconstruction, verify that the remaining
1679219089Spjd		 * parity is correct.
1680192194Sdfr		 */
1681219089Spjd		if (code != (1 << rm->rm_firstdatacol) - 1)
1682219089Spjd			(void) raidz_parity_verify(rm);
1683219089Spjd	} else {
1684192194Sdfr		/*
1685219089Spjd		 * We're here because either:
1686219089Spjd		 *
1687219089Spjd		 *	total_errors == rm_first_datacol, or
1688219089Spjd		 *	vdev_raidz_combrec() failed
1689219089Spjd		 *
1690219089Spjd		 * In either case, there is enough bad data to prevent
1691219089Spjd		 * reconstruction.
1692219089Spjd		 *
1693219089Spjd		 * Start checksum ereports for all children which haven't
1694219089Spjd		 * failed, and the IO wasn't speculative.
1695192194Sdfr		 */
1696219089Spjd		error = ECKSUM;
1697192194Sdfr	}
1698192194Sdfr
1699219089Spjddone:
1700219089Spjd	vdev_raidz_map_free(rm);
1701192194Sdfr
1702219089Spjd	return (error);
1703192194Sdfr}
1704