zfssubr.c revision 219089
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27__FBSDID("$FreeBSD: head/sys/cddl/boot/zfs/zfssubr.c 219089 2011-02-27 19:41:40Z pjd $");
28
/* CRC64 table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the kernel's ECKSUM errno (checksum failure). */
#define	ECKSUM	666

/* The boot environment compiles out all assertions. */
#define	ASSERT(...)	do { } while (0)
#define	ASSERT3U(...)	do { } while (0)
#define	ASSERT3S(...)	do { } while (0)

/*
 * Fatal error: print the message and spin forever -- there is nothing
 * to unwind to in the boot loader.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map kernel allocator calls onto the loader's zfs_alloc()/zfs_free(). */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44
45static void
46zfs_init_crc(void)
47{
48	int i, j;
49	uint64_t *ct;
50
51	/*
52	 * Calculate the crc64 table (used for the zap hash
53	 * function).
54	 */
55	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57		for (i = 0; i < 256; i++)
58			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60	}
61}
62
/*
 * "off" checksum implementation: record an all-zero checksum, meaning
 * no verification is possible for this block.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "fletcher.c"
#include "sha256.c"

/*
 * Checksum implementations, indexed by ZIO_CHECKSUM_* value.  Entries
 * with NULL function pointers ("inherit", "on") must be resolved to a
 * concrete algorithm before I/O and are rejected by zio_checksum_error().
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0, 0, "inherit"},
	{{NULL,			NULL},			0, 0, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
	/*
	 * NOTE(review): "zillog2" spelling is as found in this revision;
	 * illumos spells it "zilog2" -- confirm before changing (name is
	 * descriptive only, not parsed).
	 */
	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
};
100
101
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors, indexed by ZIO_COMPRESS_* value.  The boot code
 * only reads pools, so ci_compress is never needed and gzip support is
 * absent entirely; entries without a decompressor are rejected by
 * zio_decompress_data().
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
};
143
/*
 * Byte-swap an array of 64-bit words in place.  'size' is in bytes and
 * must be a multiple of 8 (asserted; assertions compile out in the
 * loader).
 *
 * Fix: the loop index was a signed 'int' compared against a size_t
 * count -- a signed/unsigned comparison that would also truncate for
 * buffers of more than INT_MAX words.  Use size_t throughout.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
156
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	/* First DVA identifies the gang header's location. */
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
171
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
182
/*
 * Verify the checksum of a block read from disk against what the block
 * pointer (or embedded trailer) records.
 *
 * bp     - block pointer describing checksum algorithm and physical size
 * data   - buffer holding the block contents (BP_GET_PSIZE(bp) bytes)
 * offset - device offset; used only to form the label verifier
 *
 * Returns 0 on match, EINVAL for an unknown or unresolved checksum
 * algorithm, EIO on mismatch.
 */
static int
zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
{
	/* Gang headers are always verified with the gang checksum. */
	unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
	uint64_t size = BP_GET_PSIZE(bp);
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* Embedded checksum trailer sits at the end of the block. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier, offset);
		else
			verifier = bp->blk_cksum;

		/* Swapped magic means the block was written other-endian. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum so the buffer hashes exactly as it did at
		 * write time, then restore the stored value.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Plain blocks carry their checksum in the block pointer. */
		ASSERT(!BP_IS_GANG(bp));
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
239
240static int
241zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
242	void *dest, uint64_t destsize)
243{
244	zio_compress_info_t *ci;
245
246	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
247		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
248		return (EIO);
249	}
250
251	ci = &zio_compress_table[cpfunc];
252	if (!ci->ci_decompress) {
253		printf("ZFS: unsupported compression algorithm %s\n",
254		    ci->ci_name);
255		return (EIO);
256	}
257
258	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
259}
260
261static uint64_t
262zap_hash(uint64_t salt, const char *name)
263{
264	const uint8_t *cp;
265	uint8_t c;
266	uint64_t crc = salt;
267
268	ASSERT(crc != 0);
269	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
270	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
271		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
272
273	/*
274	 * Only use 28 bits, since we need 4 bits in the cookie for the
275	 * collision differentiator.  We MUST use the high bits, since
276	 * those are the onces that we first pay attention to when
277	 * chosing the bucket.
278	 */
279	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
280
281	return (crc);
282}
283
/* Loader-provided allocator; defined elsewhere in the boot code. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

/*
 * One column (one child device's slice) of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Layout of one logical RAID-Z I/O across the child devices.  The
 * first rm_firstdatacol columns hold parity (P, then Q, then R); the
 * remainder hold data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
312
/* Indices of the parity columns within a raidz map. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one byte by 2 (or 4) in GF(2^8): shift left and, if the high
 * bit was set, reduce by XORing in 0x1d (the low bits of the generator
 * polynomial 0x11d).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
339
/*
 * Powers of 2 in GF(2^8) under the RAID-Z generator polynomial
 * (repeated VDEV_RAIDZ_MUL_2), and the matching discrete-log table.
 * For all nonzero a, vdev_raidz_pow2[vdev_raidz_log2[a]] == a; the
 * multiplicative group has order 255, so pow2[255] wraps back to 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply 'a' by 2^exp in GF(2^8) via the log/exp tables: zero stays
 * zero, otherwise the logarithms are added modulo 255.  'exp' is
 * expected to be in [0, 255].
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	int pos;

	if (a == 0)
		return (0);

	pos = exp + vdev_raidz_log2[a];
	if (pos > 255)
		pos -= 255;

	return (vdev_raidz_pow2[pos]);
}
431
432static void
433vdev_raidz_generate_parity_p(raidz_map_t *rm)
434{
435	uint64_t *p, *src, pcount, ccount, i;
436	int c;
437
438	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
439
440	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
441		src = rm->rm_col[c].rc_data;
442		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
443		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
444
445		if (c == rm->rm_firstdatacol) {
446			ASSERT(ccount == pcount);
447			for (i = 0; i < ccount; i++, src++, p++) {
448				*p = *src;
449			}
450		} else {
451			ASSERT(ccount <= pcount);
452			for (i = 0; i < ccount; i++, src++, p++) {
453				*p ^= *src;
454			}
455		}
456	}
457}
458
/*
 * Generate P (plain XOR) and Q parity.  Q accumulates via Horner's
 * scheme: for each successive data column the running Q value is
 * multiplied by 2 in GF(2^8) and the new column is XORed in, giving
 * coefficients 2^(k-1) .. 2^0 across the k data columns.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
510
/*
 * Generate P, Q, and R parity.  Like the PQ case, but R accumulates
 * with a multiply-by-4 per column, giving coefficients 4^(k-1) .. 4^0
 * linearly independent of Q's powers of 2.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
570}
571
572/*
573 * Generate RAID parity in the first virtual columns according to the number of
574 * parity columns available.
575 */
576static void
577vdev_raidz_generate_parity(raidz_map_t *rm)
578{
579	switch (rm->rm_firstdatacol) {
580	case 1:
581		vdev_raidz_generate_parity_p(rm);
582		break;
583	case 2:
584		vdev_raidz_generate_parity_pq(rm);
585		break;
586	case 3:
587		vdev_raidz_generate_parity_pqr(rm);
588		break;
589	default:
590		panic("invalid RAID-Z configuration");
591	}
592}
593
594/* BEGIN CSTYLED */
595/*
596 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
598 * the contents of the data and parity disks. This can be expressed with
599 * vectors for the original data (D) and the actual data (d) and parity (p)
600 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
601 *
602 *            __   __                     __     __
603 *            |     |         __     __   |  p_0  |
604 *            |  V  |         |  D_0  |   | p_m-1 |
605 *            |     |    x    |   :   | = |  d_0  |
606 *            |  I  |         | D_n-1 |   |   :   |
607 *            |     |         ~~     ~~   | d_n-1 |
608 *            ~~   ~~                     ~~     ~~
609 *
610 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
612 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
613 * computation as well as linear separability.
614 *
615 *      __               __               __     __
616 *      |   1   ..  1 1 1 |               |  p_0  |
617 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
618 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
619 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
620 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
621 *      |   :       : : : |   |   :   |   |  d_2  |
622 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
623 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
624 *      |   0   ..  0 0 1 |               | d_n-1 |
625 *      ~~               ~~               ~~     ~~
626 *
627 * Note that I, V, d, and p are known. To compute D, we must invert the
628 * matrix and use the known data and parity values to reconstruct the unknown
629 * data values. We begin by removing the rows in V|I and d|p that correspond
630 * to failed or missing columns; we then make V|I square (n x n) and d|p
631 * sized n by removing rows corresponding to unused parity from the bottom up
632 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
633 * using Gauss-Jordan elimination. In the example below we use m=3 parity
634 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
635 *           __                               __
636 *           |  1   1   1   1   1   1   1   1  |
637 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
638 *           |  19 205 116  29  64  16  4   1  |      / /
639 *           |  1   0   0   0   0   0   0   0  |     / /
640 *           |  0   1   0   0   0   0   0   0  | <--' /
641 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
642 *           |  0   0   0   1   0   0   0   0  |
643 *           |  0   0   0   0   1   0   0   0  |
644 *           |  0   0   0   0   0   1   0   0  |
645 *           |  0   0   0   0   0   0   1   0  |
646 *           |  0   0   0   0   0   0   0   1  |
647 *           ~~                               ~~
648 *           __                               __
649 *           |  1   1   1   1   1   1   1   1  |
650 *           | 128  64  32  16  8   4   2   1  |
651 *           |  19 205 116  29  64  16  4   1  |
652 *           |  1   0   0   0   0   0   0   0  |
653 *           |  0   1   0   0   0   0   0   0  |
654 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
655 *           |  0   0   0   1   0   0   0   0  |
656 *           |  0   0   0   0   1   0   0   0  |
657 *           |  0   0   0   0   0   1   0   0  |
658 *           |  0   0   0   0   0   0   1   0  |
659 *           |  0   0   0   0   0   0   0   1  |
660 *           ~~                               ~~
661 *
662 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
663 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
664 * matrix is not singular.
665 * __                                                                 __
666 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
667 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
668 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
669 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
670 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
671 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
672 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
673 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
674 * ~~                                                                 ~~
675 * __                                                                 __
676 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
677 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
678 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
679 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
680 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
681 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
682 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
683 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
684 * ~~                                                                 ~~
685 * __                                                                 __
686 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
687 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
688 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
689 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
690 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
691 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
692 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
693 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
694 * ~~                                                                 ~~
695 * __                                                                 __
696 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
697 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
698 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
699 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
700 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
701 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
702 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
703 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
704 * ~~                                                                 ~~
705 * __                                                                 __
706 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
707 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
708 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
709 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
710 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
711 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
712 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
713 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
714 * ~~                                                                 ~~
715 * __                                                                 __
716 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
717 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
718 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
719 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
720 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
721 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
722 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
723 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
724 * ~~                                                                 ~~
725 *                   __                               __
726 *                   |  0   0   1   0   0   0   0   0  |
727 *                   | 167 100  5   41 159 169 217 208 |
728 *                   | 166 100  4   40 158 168 216 209 |
729 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
730 *                   |  0   0   0   0   1   0   0   0  |
731 *                   |  0   0   0   0   0   1   0   0  |
732 *                   |  0   0   0   0   0   0   1   0  |
733 *                   |  0   0   0   0   0   0   0   1  |
734 *                   ~~                               ~~
735 *
736 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
737 * of the missing data.
738 *
739 * As is apparent from the example above, the only non-trivial rows in the
740 * inverse matrix correspond to the data disks that we're trying to
741 * reconstruct. Indeed, those are the only rows we need as the others would
742 * only be useful for reconstructing data known or assumed to be valid. For
743 * that reason, we only build the coefficients in the rows that correspond to
744 * targeted columns.
745 */
746/* END CSTYLED */
747
/*
 * Build the rows of the dispersal matrix that correspond to the parity
 * flavors being used for reconstruction.
 *
 * n    - number of data columns in the map (asserted)
 * nmap - number of rows to build
 * map  - parity flavor per row: 0 (P), 1 (Q), or 2 (R)
 * rows - output; rows[i][j] = (2^map[i])^(n-1-j) in GF(2^8)
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start the exponent at map[i] * n, reduced mod 255 (one
		 * subtraction suffices because map[i] <= 2).
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
777
/*
 * Compute, via Gauss-Jordan elimination in GF(2^8), the rows of the
 * inverse of the reconstruction matrix that correspond to the missing
 * data columns.
 *
 * n        - matrix dimension (number of data columns)
 * nmissing - number of missing data columns
 * missing  - data-relative indices of the missing columns
 * rows     - in: matrix rows for the parity columns in use; clobbered
 *            during elimination (reduced toward identity)
 * invrows  - out: the corresponding rows of the inverse matrix
 * used     - the n columns actually read: parity entries first, then data
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* In GF(2^8), x^-1 == 2^(255 - log2(x)). */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/* Eliminate column missing[i] from every other row. */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
871
/*
 * Reconstruct the missing data columns as a GF(2^8) matrix-vector
 * product: for every byte position, each missing column is the sum over
 * the n used columns of invrows[j][i] * used_column_i, computed with
 * the log/exp tables one source column at a time.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* Scratch area holding the logs of the inverse-matrix entries. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute log2 of each inverse-matrix coefficient. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each used column's contribution into the targets. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * NOTE(review): missing[0] is not offset by rm_firstdatacol
		 * in this (compiled-out) assertion; mirrors upstream --
		 * confirm before relying on it.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = invrows[cc][i] * *src. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes, rest accumulate. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
947
/*
 * General matrix-based reconstruction: recover an arbitrary combination of
 * failed columns (tgts[0..ntgts-1], sorted) using whichever surviving
 * parity columns are needed.  Returns a bitmask ("code") identifying the
 * parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve a single allocation into the rows/invrows matrices
	 * (nmissing_rows x n each) and the n-entry used[] index array.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first entries of used[] are the chosen parity columns... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * ...followed by every surviving data column.  Note that i carries
	 * over from the loop above and keeps indexing used[].
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1053
1054static int
1055vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1056{
1057	int tgts[VDEV_RAIDZ_MAXPARITY];
1058	int ntgts;
1059	int i, c;
1060	int code;
1061	int nbadparity, nbaddata;
1062
1063	/*
1064	 * The tgts list must already be sorted.
1065	 */
1066	for (i = 1; i < nt; i++) {
1067		ASSERT(t[i] > t[i - 1]);
1068	}
1069
1070	nbadparity = rm->rm_firstdatacol;
1071	nbaddata = rm->rm_cols - nbadparity;
1072	ntgts = 0;
1073	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1074		if (i < nt && c == t[i]) {
1075			tgts[ntgts++] = c;
1076			i++;
1077		} else if (rm->rm_col[c].rc_error != 0) {
1078			tgts[ntgts++] = c;
1079		} else if (c >= rm->rm_firstdatacol) {
1080			nbaddata--;
1081		} else {
1082			nbadparity--;
1083		}
1084	}
1085
1086	ASSERT(ntgts >= nt);
1087	ASSERT(nbaddata >= 0);
1088	ASSERT(nbaddata + nbadparity == ntgts);
1089
1090	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1091	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1092	ASSERT(code > 0);
1093	return (code);
1094}
1095
/*
 * Map a logical I/O (byte offset/size within the RAID-Z vdev's logical
 * space) onto the individual columns: the child device index, device
 * offset, and size of each column.  Parity columns get freshly allocated
 * buffers; data columns alias successive pieces of the caller's "data"
 * buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": the number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);
	/*
	 * "Remainder": the number of partial-stripe data sectors in this I/O.
	 */
	r = s - q * (dcols - nparity);
	/* The number of "big columns" - those with an extra (remainder) sector. */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total data and parity sectors associated with this I/O. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: columns that will be accessed.
	 * scols: columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* The I/O doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap around to the next row of the stripe. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			/* Big columns carry one extra sector. */
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate scratch buffers for the parity columns. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* The first data column starts at the caller's buffer... */
	rm->rm_col[c].rc_data = data;

	/* ...and subsequent data columns follow it contiguously. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1213
1214static void
1215vdev_raidz_map_free(raidz_map_t *rm)
1216{
1217	int c;
1218	size_t size;
1219
1220	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1221		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1222
1223	size = 0;
1224	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
1225		size += rm->rm_col[c].rc_size;
1226
1227	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1228}
1229
1230static vdev_t *
1231vdev_child(vdev_t *pvd, uint64_t devidx)
1232{
1233	vdev_t *cvd;
1234
1235	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236		if (cvd->v_id == devidx)
1237			break;
1238	}
1239
1240	return (cvd);
1241}
1242
1243/*
1244 * We keep track of whether or not there were any injected errors, so that
1245 * any ereports we generate can note it.
1246 */
1247static int
1248raidz_checksum_verify(const blkptr_t *bp, void *data)
1249{
1250
1251	return (zio_checksum_error(bp, data, 0));
1252}
1253
1254/*
1255 * Generate the parity from the data columns. If we tried and were able to
1256 * read the parity without error, verify that the generated parity matches the
1257 * data we read. If it doesn't, we fire off a checksum error. Return the
1258 * number such failures.
1259 */
1260static int
1261raidz_parity_verify(raidz_map_t *rm)
1262{
1263	void *orig[VDEV_RAIDZ_MAXPARITY];
1264	int c, ret = 0;
1265	raidz_col_t *rc;
1266
1267	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268		rc = &rm->rm_col[c];
1269		if (!rc->rc_tried || rc->rc_error != 0)
1270			continue;
1271		orig[c] = zfs_alloc(rc->rc_size);
1272		bcopy(rc->rc_data, orig[c], rc->rc_size);
1273	}
1274
1275	vdev_raidz_generate_parity(rm);
1276
1277	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278		rc = &rm->rm_col[c];
1279		if (!rc->rc_tried || rc->rc_error != 0)
1280			continue;
1281		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282			rc->rc_error = ECKSUM;
1283			ret++;
1284		}
1285		zfs_free(orig[c], rc->rc_size);
1286	}
1287
1288	return (ret);
1289}
1290
1291/*
1292 * Iterate over all combinations of bad data and attempt a reconstruction.
1293 * Note that the algorithm below is non-optimal because it doesn't take into
1294 * account how reconstruction is actually performed. For example, with
1295 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297 * cases we'd only use parity information in column 0.
1298 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];	/* saved copies of target columns */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];	/* allows tgts[-1] and tgts[n] sentinels */
	int current, next, i, c, n;
	int code, ret = 0;	/* ret: parity-usage code, or 0 on failure */

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try ever-larger sets of n simultaneously-targeted columns. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			/* Skip columns that already have known errors. */
			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Column 0 is at least as large as any other column. */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every combination of n error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/*
			 * Advance tgts[] to the lexicographically next
			 * combination of error-free columns.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Loop exit left n one past the number of allocated buffers. */
	n--;
done:
	/* Free the saved-column buffers allocated above. */
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1441
1442static int
1443vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444    off_t offset, size_t bytes)
1445{
1446	vdev_t *tvd = vd->v_top;
1447	vdev_t *cvd;
1448	raidz_map_t *rm;
1449	raidz_col_t *rc;
1450	int c, error;
1451	int unexpected_errors;
1452	int parity_errors;
1453	int parity_untried;
1454	int data_errors;
1455	int total_errors;
1456	int n;
1457	int tgts[VDEV_RAIDZ_MAXPARITY];
1458	int code;
1459
1460	rc = NULL;	/* gcc */
1461	error = 0;
1462
1463	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464	    vd->v_nchildren, vd->v_nparity);
1465
1466	/*
1467	 * Iterate over the columns in reverse order so that we hit the parity
1468	 * last -- any errors along the way will force us to read the parity.
1469	 */
1470	for (c = rm->rm_cols - 1; c >= 0; c--) {
1471		rc = &rm->rm_col[c];
1472		cvd = vdev_child(vd, rc->rc_devidx);
1473		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474			if (c >= rm->rm_firstdatacol)
1475				rm->rm_missingdata++;
1476			else
1477				rm->rm_missingparity++;
1478			rc->rc_error = ENXIO;
1479			rc->rc_tried = 1;	/* don't even try */
1480			rc->rc_skipped = 1;
1481			continue;
1482		}
1483#if 0		/* XXX: Too hard for the boot code. */
1484		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485			if (c >= rm->rm_firstdatacol)
1486				rm->rm_missingdata++;
1487			else
1488				rm->rm_missingparity++;
1489			rc->rc_error = ESTALE;
1490			rc->rc_skipped = 1;
1491			continue;
1492		}
1493#endif
1494		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496			    rc->rc_offset, rc->rc_size);
1497			rc->rc_tried = 1;
1498			rc->rc_skipped = 0;
1499		}
1500	}
1501
1502reconstruct:
1503	unexpected_errors = 0;
1504	parity_errors = 0;
1505	parity_untried = 0;
1506	data_errors = 0;
1507	total_errors = 0;
1508
1509	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1511
1512	for (c = 0; c < rm->rm_cols; c++) {
1513		rc = &rm->rm_col[c];
1514
1515		if (rc->rc_error) {
1516			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1517
1518			if (c < rm->rm_firstdatacol)
1519				parity_errors++;
1520			else
1521				data_errors++;
1522
1523			if (!rc->rc_skipped)
1524				unexpected_errors++;
1525
1526			total_errors++;
1527		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1528			parity_untried++;
1529		}
1530	}
1531
1532	/*
1533	 * There are three potential phases for a read:
1534	 *	1. produce valid data from the columns read
1535	 *	2. read all disks and try again
1536	 *	3. perform combinatorial reconstruction
1537	 *
1538	 * Each phase is progressively both more expensive and less likely to
1539	 * occur. If we encounter more errors than we can repair or all phases
1540	 * fail, we have no choice but to return an error.
1541	 */
1542
1543	/*
1544	 * If the number of errors we saw was correctable -- less than or equal
1545	 * to the number of parity disks read -- attempt to produce data that
1546	 * has a valid checksum. Naturally, this case applies in the absence of
1547	 * any errors.
1548	 */
1549	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550		if (data_errors == 0) {
1551			if (raidz_checksum_verify(bp, data) == 0) {
1552				/*
1553				 * If we read parity information (unnecessarily
1554				 * as it happens since no reconstruction was
1555				 * needed) regenerate and verify the parity.
1556				 * We also regenerate parity when resilvering
1557				 * so we can write it out to the failed device
1558				 * later.
1559				 */
1560				if (parity_errors + parity_untried <
1561				    rm->rm_firstdatacol) {
1562					n = raidz_parity_verify(rm);
1563					unexpected_errors += n;
1564					ASSERT(parity_errors + n <=
1565					    rm->rm_firstdatacol);
1566				}
1567				goto done;
1568			}
1569		} else {
1570			/*
1571			 * We either attempt to read all the parity columns or
1572			 * none of them. If we didn't try to read parity, we
1573			 * wouldn't be here in the correctable case. There must
1574			 * also have been fewer parity errors than parity
1575			 * columns or, again, we wouldn't be in this code path.
1576			 */
1577			ASSERT(parity_untried == 0);
1578			ASSERT(parity_errors < rm->rm_firstdatacol);
1579
1580			/*
1581			 * Identify the data columns that reported an error.
1582			 */
1583			n = 0;
1584			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585				rc = &rm->rm_col[c];
1586				if (rc->rc_error != 0) {
1587					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1588					tgts[n++] = c;
1589				}
1590			}
1591
1592			ASSERT(rm->rm_firstdatacol >= n);
1593
1594			code = vdev_raidz_reconstruct(rm, tgts, n);
1595
1596			if (raidz_checksum_verify(bp, data) == 0) {
1597				/*
1598				 * If we read more parity disks than were used
1599				 * for reconstruction, confirm that the other
1600				 * parity disks produced correct data. This
1601				 * routine is suboptimal in that it regenerates
1602				 * the parity that we already used in addition
1603				 * to the parity that we're attempting to
1604				 * verify, but this should be a relatively
1605				 * uncommon case, and can be optimized if it
1606				 * becomes a problem. Note that we regenerate
1607				 * parity when resilvering so we can write it
1608				 * out to failed devices later.
1609				 */
1610				if (parity_errors < rm->rm_firstdatacol - n) {
1611					n = raidz_parity_verify(rm);
1612					unexpected_errors += n;
1613					ASSERT(parity_errors + n <=
1614					    rm->rm_firstdatacol);
1615				}
1616
1617				goto done;
1618			}
1619		}
1620	}
1621
1622	/*
1623	 * This isn't a typical situation -- either we got a read
1624	 * error or a child silently returned bad data. Read every
1625	 * block so we can try again with as much data and parity as
1626	 * we can track down. If we've already been through once
1627	 * before, all children will be marked as tried so we'll
1628	 * proceed to combinatorial reconstruction.
1629	 */
1630	unexpected_errors = 1;
1631	rm->rm_missingdata = 0;
1632	rm->rm_missingparity = 0;
1633
1634	n = 0;
1635	for (c = 0; c < rm->rm_cols; c++) {
1636		if (rm->rm_col[c].rc_tried)
1637			continue;
1638
1639		cvd = vdev_child(vd, rc->rc_devidx);
1640		ASSERT(cvd != NULL);
1641		rc->rc_error = cvd->v_read(cvd, NULL,
1642		    rc->rc_data, rc->rc_offset, rc->rc_size);
1643		if (rc->rc_error == 0)
1644			n++;
1645		rc->rc_tried = 1;
1646		rc->rc_skipped = 0;
1647	}
1648	/*
1649	 * If we managed to read anything more, retry the
1650	 * reconstruction.
1651	 */
1652	if (n > 0)
1653		goto reconstruct;
1654
1655	/*
1656	 * At this point we've attempted to reconstruct the data given the
1657	 * errors we detected, and we've attempted to read all columns. There
1658	 * must, therefore, be one or more additional problems -- silent errors
1659	 * resulting in invalid data rather than explicit I/O errors resulting
1660	 * in absent data. We check if there is enough additional data to
1661	 * possibly reconstruct the data and then perform combinatorial
1662	 * reconstruction over all possible combinations. If that fails,
1663	 * we're cooked.
1664	 */
1665	if (total_errors > rm->rm_firstdatacol) {
1666		error = EIO;
1667	} else if (total_errors < rm->rm_firstdatacol &&
1668	    (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
1669	     data_errors)) != 0) {
1670		/*
1671		 * If we didn't use all the available parity for the
1672		 * combinatorial reconstruction, verify that the remaining
1673		 * parity is correct.
1674		 */
1675		if (code != (1 << rm->rm_firstdatacol) - 1)
1676			(void) raidz_parity_verify(rm);
1677	} else {
1678		/*
1679		 * We're here because either:
1680		 *
1681		 *	total_errors == rm_first_datacol, or
1682		 *	vdev_raidz_combrec() failed
1683		 *
1684		 * In either case, there is enough bad data to prevent
1685		 * reconstruction.
1686		 *
1687		 * Start checksum ereports for all children which haven't
1688		 * failed, and the IO wasn't speculative.
1689		 */
1690		error = ECKSUM;
1691	}
1692
1693done:
1694	vdev_raidz_map_free(rm);
1695
1696	return (error);
1697}
1698