vdev_raidz.c revision 1.3
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/vdev_impl.h>
32#ifndef __FreeBSD__
33#include <sys/vdev_disk.h>
34#endif
35#include <sys/vdev_file.h>
36#include <sys/vdev_raidz.h>
37#include <sys/zio.h>
38#include <sys/zio_checksum.h>
39#include <sys/fs/zfs.h>
40#include <sys/fm/fs/zfs.h>
41#ifdef __FreeBSD__
42#include <sys/bio.h>
43#endif
44
45/*
46 * Virtual device vector for RAID-Z.
47 *
48 * This vdev supports single, double, and triple parity. For single parity,
49 * we use a simple XOR of all the data columns. For double or triple parity,
50 * we use a special case of Reed-Solomon coding. This extends the
51 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54 * former is also based. The latter is designed to provide higher performance
55 * for writes.
56 *
57 * Note that the Plank paper claimed to support arbitrary N+M, but was then
58 * amended six years later identifying a critical flaw that invalidates its
59 * claims. Nevertheless, the technique can be adapted to work for up to
60 * triple parity. For additional parity, the amendment "Note: Correction to
61 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62 * is viable, but the additional complexity means that write performance will
63 * suffer.
64 *
65 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
67 * can be expressed with a single byte. Briefly, the operations on the
68 * field are defined as follows:
69 *
70 *   o addition (+) is represented by a bitwise XOR
71 *   o subtraction (-) is therefore identical to addition: A + B = A - B
72 *   o multiplication of A by 2 is defined by the following bitwise expression:
73 *
74 *	(A * 2)_7 = A_6
75 *	(A * 2)_6 = A_5
76 *	(A * 2)_5 = A_4
77 *	(A * 2)_4 = A_3 + A_7
78 *	(A * 2)_3 = A_2 + A_7
79 *	(A * 2)_2 = A_1 + A_7
80 *	(A * 2)_1 = A_0
81 *	(A * 2)_0 = A_7
82 *
83 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84 * As an aside, this multiplication is derived from the error correcting
85 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86 *
87 * Observe that any number in the field (except for 0) can be expressed as a
88 * power of 2 -- a generator for the field. We store a table of the powers of
89 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
90 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
91 * than field addition). The inverse of a field element A (A^-1) is therefore
92 * A ^ (255 - 1) = A^254.
93 *
94 * The up-to-three parity columns, P, Q, R over several data columns,
95 * D_0, ... D_n-1, can be expressed by field operations:
96 *
97 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
98 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
104 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105 * independent coefficients. (There are no additional coefficients that have
106 * this property which is why the uncorrected Plank method breaks down.)
107 *
 * See the reconstruction code below for how P, Q and R can be used individually
109 * or in concert to recover missing data columns.
110 */
111
/*
 * Per-column state for a single RAID-Z I/O: one entry per child vdev
 * (parity columns first, then data columns) in a raidz_map_t.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
122
/*
 * Geometry and state of one logical I/O spread across the RAID-Z children.
 * Built by vdev_raidz_map_alloc() and freed via vdev_raidz_map_free() once
 * both the owning ZIO and any referencing checksum reports are done with it.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
139
/* Indexes of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte by 2 (or 4) in GF(2^8); the 0x1d reduction comes
 * from the primitive polynomial described in the block comment above.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/* Translate offset (x) past the leading vdev label area. */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
168
/*
 * Force reconstruction to use the general purpose method.
 *
 * NOTE(review): the consumers of this flag are outside this chunk;
 * presumably it is checked when selecting a reconstruction routine --
 * confirm against the reconstruction dispatch code.
 */
int vdev_raidz_default_to_general;
173
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] = 2^i.
 * Note the table wraps: entry 255 equals entry 0 (the element order is 255).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above.  Entry 0 is a placeholder
 * (log of 0 is undefined); vdev_raidz_exp2() never consults it for a == 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
244
245static void vdev_raidz_generate_parity(raidz_map_t *rm);
246
247/*
248 * Multiply a given number by 2 raised to the given power.
249 */
250static uint8_t
251vdev_raidz_exp2(uint_t a, int exp)
252{
253	if (a == 0)
254		return (0);
255
256	ASSERT(exp >= 0);
257	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258
259	exp += vdev_raidz_log2[a];
260	if (exp > 255)
261		exp -= 255;
262
263	return (vdev_raidz_pow2[exp]);
264}
265
266static void
267vdev_raidz_map_free(raidz_map_t *rm)
268{
269	int c;
270	size_t size;
271
272	for (c = 0; c < rm->rm_firstdatacol; c++) {
273		if (rm->rm_col[c].rc_data != NULL)
274			zio_buf_free(rm->rm_col[c].rc_data,
275			    rm->rm_col[c].rc_size);
276
277		if (rm->rm_col[c].rc_gdata != NULL)
278			zio_buf_free(rm->rm_col[c].rc_gdata,
279			    rm->rm_col[c].rc_size);
280	}
281
282	size = 0;
283	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
284		size += rm->rm_col[c].rc_size;
285
286	if (rm->rm_datacopy != NULL)
287		zio_buf_free(rm->rm_datacopy, size);
288
289	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290}
291
292static void
293vdev_raidz_map_free_vsd(zio_t *zio)
294{
295	raidz_map_t *rm = zio->io_vsd;
296
297	ASSERT0(rm->rm_freed);
298	rm->rm_freed = 1;
299
300	if (rm->rm_reports == 0)
301		vdev_raidz_map_free(rm);
302}
303
304/*ARGSUSED*/
305static void
306vdev_raidz_cksum_free(void *arg, size_t ignored)
307{
308	raidz_map_t *rm = arg;
309
310	ASSERT3U(rm->rm_reports, >, 0);
311
312	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
313		vdev_raidz_map_free(rm);
314}
315
/*
 * Complete a checksum ereport for column c of the map: locate (or, for a
 * parity column, regenerate) the "good" version of that column and pass
 * both the good and bad buffers to zfs_ereport_finish_checksum().
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column index saved at report time */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good data to compare against; finish the report without it. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Repoint the data columns at rm_datacopy, the copy
			 * made of the as-read data (see
			 * vdev_raidz_cksum_report()).
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
388
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column being reported */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only one copy is needed per map, even with multiple reports. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column aside and repoint rc_data at the copy. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
441
/*
 * Vdev-specific-data callbacks attached to RAID-Z zios.
 * NOTE(review): positional initializer -- the order (free callback, then
 * checksum-report callback) must match the zio_vsd_ops_t declaration;
 * confirm against sys/zio.h.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
446
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * data/size/offset describe the logical I/O; unit_shift is the log2 of
 * the vdev's minimum sector size; dcols is the column (child) count and
 * nparity the number of parity columns.  When dofree is set, all rc_data
 * pointers are left NULL -- no buffers are allocated or assigned.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column's child device, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around to the next row of the stripe. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		/* Parity columns get fresh buffers ... */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/* ... data columns carve up the caller's buffer in order. */
		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
599
600static void
601vdev_raidz_generate_parity_p(raidz_map_t *rm)
602{
603	uint64_t *p, *src, pcount, ccount, i;
604	int c;
605
606	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607
608	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
609		src = rm->rm_col[c].rc_data;
610		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
611		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
612
613		if (c == rm->rm_firstdatacol) {
614			ASSERT(ccount == pcount);
615			for (i = 0; i < ccount; i++, src++, p++) {
616				*p = *src;
617			}
618		} else {
619			ASSERT(ccount <= pcount);
620			for (i = 0; i < ccount; i++, src++, p++) {
621				*p ^= *src;
622			}
623		}
624	}
625}
626
/*
 * Generate the P (XOR) and Q (GF(2^8) Reed-Solomon) parity columns,
 * operating a 64-bit word at a time via VDEV_RAIDZ_64MUL_2.  See the
 * block comment at the top of the file for the underlying math.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parities. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
678
/*
 * Generate the P (XOR), Q (coefficient 2), and R (coefficient 4) parity
 * columns, a 64-bit word at a time via the VDEV_RAIDZ_64MUL_* macros.
 * See the block comment at the top of the file for the underlying math.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parities. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
739
740/*
741 * Generate RAID parity in the first virtual columns according to the number of
742 * parity columns available.
743 */
744static void
745vdev_raidz_generate_parity(raidz_map_t *rm)
746{
747	switch (rm->rm_firstdatacol) {
748	case 1:
749		vdev_raidz_generate_parity_p(rm);
750		break;
751	case 2:
752		vdev_raidz_generate_parity_pq(rm);
753		break;
754	case 3:
755		vdev_raidz_generate_parity_pqr(rm);
756		break;
757	default:
758		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
759	}
760}
761
762static int
763vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
764{
765	uint64_t *dst, *src, xcount, ccount, count, i;
766	int x = tgts[0];
767	int c;
768
769	ASSERT(ntgts == 1);
770	ASSERT(x >= rm->rm_firstdatacol);
771	ASSERT(x < rm->rm_cols);
772
773	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
774	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
775	ASSERT(xcount > 0);
776
777	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
778	dst = rm->rm_col[x].rc_data;
779	for (i = 0; i < xcount; i++, dst++, src++) {
780		*dst = *src;
781	}
782
783	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
784		src = rm->rm_col[c].rc_data;
785		dst = rm->rm_col[x].rc_data;
786
787		if (c == x)
788			continue;
789
790		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
791		count = MIN(ccount, xcount);
792
793		for (i = 0; i < count; i++, dst++, src++) {
794			*dst ^= *src;
795		}
796	}
797
798	return (1 << VDEV_RAIDZ_P);
799}
800
/*
 * Reconstruct a single missing data column (tgts[0]) using only the Q
 * parity: recompute Q over the surviving data columns (treating the
 * missing one as zeros), fold in the stored Q, then divide out the
 * missing column's coefficient.  Returns a bitmask identifying the
 * parity used (Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate the partial Q over all surviving data columns. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* The missing column contributes zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* Horner's rule: multiply by 2, add the column in. */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in the stored Q, leaving coeff * D_x; then multiply by the
	 * coefficient's inverse (2^(255 - (cols - 1 - x))) byte by byte.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		/* j walks the 8 bytes of the 64-bit word. */
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
858
/*
 * Reconstruct two missing data columns x and y (x < y) using the P and Q
 * parity together; the linear algebra is derived in the comments below.
 * Returns a bitmask identifying the parities used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true column sizes before solving. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the log of (2^(x-y) + 1)^-1, the shared inverse factor. */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be a shorter column than x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
949
950/* BEGIN CSTYLED */
951/*
952 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
954 * the contents of the data and parity disks. This can be expressed with
955 * vectors for the original data (D) and the actual data (d) and parity (p)
956 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957 *
958 *            __   __                     __     __
959 *            |     |         __     __   |  p_0  |
960 *            |  V  |         |  D_0  |   | p_m-1 |
961 *            |     |    x    |   :   | = |  d_0  |
962 *            |  I  |         | D_n-1 |   |   :   |
963 *            |     |         ~~     ~~   | d_n-1 |
964 *            ~~   ~~                     ~~     ~~
965 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
968 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
969 * computation as well as linear separability.
970 *
971 *      __               __               __     __
972 *      |   1   ..  1 1 1 |               |  p_0  |
973 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
974 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
975 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
976 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
977 *      |   :       : : : |   |   :   |   |  d_2  |
978 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
979 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
980 *      |   0   ..  0 0 1 |               | d_n-1 |
981 *      ~~               ~~               ~~     ~~
982 *
983 * Note that I, V, d, and p are known. To compute D, we must invert the
984 * matrix and use the known data and parity values to reconstruct the unknown
985 * data values. We begin by removing the rows in V|I and d|p that correspond
986 * to failed or missing columns; we then make V|I square (n x n) and d|p
987 * sized n by removing rows corresponding to unused parity from the bottom up
988 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
989 * using Gauss-Jordan elimination. In the example below we use m=3 parity
990 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
991 *           __                               __
992 *           |  1   1   1   1   1   1   1   1  |
993 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
994 *           |  19 205 116  29  64  16  4   1  |      / /
995 *           |  1   0   0   0   0   0   0   0  |     / /
996 *           |  0   1   0   0   0   0   0   0  | <--' /
997 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
998 *           |  0   0   0   1   0   0   0   0  |
999 *           |  0   0   0   0   1   0   0   0  |
1000 *           |  0   0   0   0   0   1   0   0  |
1001 *           |  0   0   0   0   0   0   1   0  |
1002 *           |  0   0   0   0   0   0   0   1  |
1003 *           ~~                               ~~
1004 *           __                               __
1005 *           |  1   1   1   1   1   1   1   1  |
1006 *           |  19 205 116  29  64  16  4   1  |
1007 *           |  1   0   0   0   0   0   0   0  |
1008 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1009 *           |  0   0   0   0   1   0   0   0  |
1010 *           |  0   0   0   0   0   1   0   0  |
1011 *           |  0   0   0   0   0   0   1   0  |
1012 *           |  0   0   0   0   0   0   0   1  |
1013 *           ~~                               ~~
1014 *
1015 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1016 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1017 * matrix is not singular.
1018 * __                                                                 __
1019 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1020 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1021 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1022 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1023 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1024 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1025 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1026 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1027 * ~~                                                                 ~~
1028 * __                                                                 __
1029 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1030 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1031 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1032 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1033 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1034 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1035 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1036 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1037 * ~~                                                                 ~~
1038 * __                                                                 __
1039 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1040 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1041 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1042 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1043 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1044 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1045 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1046 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1047 * ~~                                                                 ~~
1048 * __                                                                 __
1049 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1050 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1051 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1052 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1053 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1054 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1055 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1056 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1057 * ~~                                                                 ~~
1058 * __                                                                 __
1059 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1060 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1061 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1062 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1063 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1064 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1065 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1066 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1067 * ~~                                                                 ~~
1068 * __                                                                 __
1069 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1070 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1071 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1072 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1073 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1074 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1075 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1076 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1077 * ~~                                                                 ~~
1078 *                   __                               __
1079 *                   |  0   0   1   0   0   0   0   0  |
1080 *                   | 167 100  5   41 159 169 217 208 |
1081 *                   | 166 100  4   40 158 168 216 209 |
1082 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1083 *                   |  0   0   0   0   1   0   0   0  |
1084 *                   |  0   0   0   0   0   1   0   0  |
1085 *                   |  0   0   0   0   0   0   1   0  |
1086 *                   |  0   0   0   0   0   0   0   1  |
1087 *                   ~~                               ~~
1088 *
1089 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1090 * of the missing data.
1091 *
1092 * As is apparent from the example above, the only non-trivial rows in the
1093 * inverse matrix correspond to the data disks that we're trying to
1094 * reconstruct. Indeed, those are the only rows we need as the others would
1095 * only be useful for reconstructing data known or assumed to be valid. For
1096 * that reason, we only build the coefficients in the rows that correspond to
1097 * targeted columns.
1098 */
1099/* END CSTYLED */
1100
1101static void
1102vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1103    uint8_t **rows)
1104{
1105	int i, j;
1106	int pow;
1107
1108	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109
1110	/*
1111	 * Fill in the missing rows of interest.
1112	 */
1113	for (i = 0; i < nmap; i++) {
1114		ASSERT3S(0, <=, map[i]);
1115		ASSERT3S(map[i], <=, 2);
1116
1117		pow = map[i] * n;
1118		if (pow > 255)
1119			pow -= 255;
1120		ASSERT(pow <= 255);
1121
1122		for (j = 0; j < n; j++) {
1123			pow -= map[i];
1124			if (pow < 0)
1125				pow += 255;
1126			rows[i][j] = vdev_raidz_pow2[pow];
1127		}
1128	}
1129}
1130
/*
 * Invert the submatrix of interest using Gauss-Jordan elimination over
 * GF(2^8).  On entry, rows[i] is the full n-wide dispersal-matrix row for
 * the i'th missing data column (missing[i]); on exit, invrows[i] holds the
 * corresponding row of the inverse and rows[] has been reduced to rows of
 * an identity matrix.  used[] maps matrix columns back to raidz map
 * columns: parity columns first, then the surviving data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.  Each
	 * surviving data column contributes an identity row, so its
	 * coefficient moves directly to the inverse side (addition and
	 * subtraction are both XOR in GF(2^8), so a move suffices).
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		/* The pivot entry must be the first non-zero in its row. */
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate the pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1224
/*
 * Regenerate the nmissing missing data columns (missing[]) by multiplying
 * the corresponding rows of the inverted matrix (invrows) with the
 * surviving columns listed in used[].  GF(2^8) multiplication is done via
 * the log/exp tables; contributions from successive surviving columns are
 * XORed together.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the logarithms of the inverse-matrix coefficients so
	 * each per-byte multiply below is one table lookup plus an add.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Accumulate the contribution of each surviving column into every
	 * missing column's buffer.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		/* Locate the destination buffer for each missing column. */
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log is only consulted below when *src != 0. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns end early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes; rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1300
/*
 * Reconstruct an arbitrary set of damaged columns (tgts[], sorted) using
 * the matrix-inversion method described in the block comment above.
 * Returns a bitmask identifying which parity columns were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  One non-targeted parity column is selected for each
	 * missing data column.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve rows[], invrows[], and used[] out of a single allocation.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] lists the selected parity columns first ... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * ... followed by the surviving data columns.  Note that i carries
	 * over from the loop above and continues indexing used[].
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1406
/*
 * Reconstruct the columns listed in t[] (sorted, nt entries) plus any
 * other columns with recorded errors.  Optimized P/Q/PQ routines are used
 * when the required parity is intact; everything else falls through to
 * the general matrix-inversion routine.  Returns the non-zero code
 * reported by the reconstruction routine used (see
 * vdev_raidz_reconstruct_general()).
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Classify every column: requested targets and columns with
	 * recorded errors become reconstruction targets; the remainder
	 * reduce the bad-data or bad-parity counts.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Parity targets sort first, so the data targets begin here. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	/* Fall back to the general matrix-inversion reconstruction. */
	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1485
1486static int
1487vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1488    uint64_t *logical_ashift, uint64_t *physical_ashift)
1489{
1490	vdev_t *cvd;
1491	uint64_t nparity = vd->vdev_nparity;
1492	int c;
1493	int lasterror = 0;
1494	int numerrors = 0;
1495
1496	ASSERT(nparity > 0);
1497
1498	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1499	    vd->vdev_children < nparity + 1) {
1500		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1501		return (SET_ERROR(EINVAL));
1502	}
1503
1504	vdev_open_children(vd);
1505
1506	for (c = 0; c < vd->vdev_children; c++) {
1507		cvd = vd->vdev_child[c];
1508
1509		if (cvd->vdev_open_error != 0) {
1510			lasterror = cvd->vdev_open_error;
1511			numerrors++;
1512			continue;
1513		}
1514
1515		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1516		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1517		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1518		*physical_ashift = MAX(*physical_ashift,
1519		    cvd->vdev_physical_ashift);
1520	}
1521
1522	*asize *= vd->vdev_children;
1523	*max_asize *= vd->vdev_children;
1524
1525	if (numerrors > nparity) {
1526		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1527		return (lasterror);
1528	}
1529
1530	return (0);
1531}
1532
1533static void
1534vdev_raidz_close(vdev_t *vd)
1535{
1536	int c;
1537
1538	for (c = 0; c < vd->vdev_children; c++)
1539		vdev_close(vd->vdev_child[c]);
1540}
1541
1542#ifdef illumos
1543/*
1544 * Handle a read or write I/O to a RAID-Z dump device.
1545 *
1546 * The dump device is in a unique situation compared to other ZFS datasets:
1547 * writing to this device should be as simple and fast as possible.  In
1548 * addition, durability matters much less since the dump will be extracted
1549 * once the machine reboots.  For that reason, this function eschews parity for
1550 * performance and simplicity.  The dump device uses the checksum setting
1551 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552 * dataset.
1553 *
1554 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1555 * 128 KB will not fill an entire block; in addition, they may not be properly
1556 * aligned.  In that case, this function uses the preallocated 128 KB block and
1557 * omits reading or writing any "empty" portions of that block, as opposed to
1558 * allocating a fresh appropriately-sized block.
1559 *
1560 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1561 *
1562 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1563 *
1564 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1565 * allocated which spans all five child vdevs.  8 KB of data would be written to
1566 * each of four vdevs, with the fifth containing the parity bits.
1567 *
1568 *       parity    data     data     data     data
1569 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1570 *         ^        ^        ^        ^        ^
1571 *         |        |        |        |        |
1572 *   8 KB parity    ------8 KB data blocks------
1573 *
1574 * However, when writing to the dump device, the behavior is different:
1575 *
1576 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1577 *
1578 * Unlike the normal RAID-Z case in which the block is allocated based on the
1579 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1580 * I/O size is less than 128 KB, only the actual portions of data are written.
1581 * In this example the data is written to the third data vdev since that vdev
1582 * contains the offset [64 KB, 96 KB).
1583 *
1584 *       parity    data     data     data     data
1585 *     |        |        |        |   XX   |        |
1586 *                                    ^
1587 *                                    |
1588 *                             32 KB data block
1589 *
1590 * As a result, an individual I/O may not span all child vdevs; moreover, a
1591 * small I/O may only operate on a single child vdev.
1592 *
1593 * Note that since there are no parity bits calculated or written, this format
1594 * remains the same no matter how many parity bits are used in a normal RAID-Z
1595 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1596 * would look like:
1597 *
1598 *       parity   parity   parity    data     data     data     data
1599 *     |        |        |        |        |        |   XX   |        |
1600 *                                                      ^
1601 *                                                      |
1602 *                                               32 KB data block
1603 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk only the data columns; parity is neither computed nor
	 * written for the dump device (see the block comment above).
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1688#endif
1689
1690static uint64_t
1691vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692{
1693	uint64_t asize;
1694	uint64_t ashift = vd->vdev_top->vdev_ashift;
1695	uint64_t cols = vd->vdev_children;
1696	uint64_t nparity = vd->vdev_nparity;
1697
1698	asize = ((psize - 1) >> ashift) + 1;
1699	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1700	asize = roundup(asize, nparity + 1) << ashift;
1701
1702	return (asize);
1703}
1704
1705static void
1706vdev_raidz_child_done(zio_t *zio)
1707{
1708	raidz_col_t *rc = zio->io_private;
1709
1710	rc->rc_error = zio->io_error;
1711	rc->rc_tried = 1;
1712	rc->rc_skipped = 0;
1713}
1714
1715/*
1716 * Start an IO operation on a RAIDZ VDev
1717 *
1718 * Outline:
1719 * - For write operations:
1720 *   1. Generate the parity data
1721 *   2. Create child zio write operations to each column's vdev, for both
1722 *      data and parity.
1723 *   3. If the column skips any sectors for padding, create optional dummy
1724 *      write zio children for those areas to improve aggregation continuity.
1725 * - For read operations:
1726 *   1. Create child zio read operations to each data column's vdev to read
1727 *      the range of data required for zio.
1728 *   2. If this is a scrub or resilver operation, or if any of the data
1729 *      vdevs have had errors, then create zio read operations to the parity
1730 *      columns' VDevs as well.
1731 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Map the logical zio onto the columns of the raidz group. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	/* The map travels with the zio through io_vsd. */
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Pass the free through to every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Write both data and the freshly computed parity. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The column index wraps back to 0
		 * once it reaches rm_scols.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Unreadable child: record the gap, skip the I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child's copy of this txg is known to be stale. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read data columns; read parity as well only when
		 * data is already missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1838
1839
1840/*
1841 * Report a checksum error for a child of a RAID-Z device.
1842 */
1843static void
1844raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1845{
1846	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1847
1848	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1849		zio_bad_cksum_t zbc;
1850		raidz_map_t *rm = zio->io_vsd;
1851
1852		mutex_enter(&vd->vdev_stat_lock);
1853		vd->vdev_stat.vs_checksum_errors++;
1854		mutex_exit(&vd->vdev_stat_lock);
1855
1856		zbc.zbc_has_cksum = 0;
1857		zbc.zbc_injected = rm->rm_ecksuminjected;
1858
1859		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1860		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1861		    &zbc);
1862	}
1863}
1864
1865/*
1866 * We keep track of whether or not there were any injected errors, so that
1867 * any ereports we generate can note it.
1868 */
1869static int
1870raidz_checksum_verify(zio_t *zio)
1871{
1872	zio_bad_cksum_t zbc;
1873	raidz_map_t *rm = zio->io_vsd;
1874
1875	int ret = zio_checksum_error(zio, &zbc);
1876	if (ret != 0 && zbc.zbc_injected != 0)
1877		rm->rm_ecksuminjected = 1;
1878
1879	return (ret);
1880}
1881
1882/*
1883 * Generate the parity from the data columns. If we tried and were able to
1884 * read the parity without error, verify that the generated parity matches the
1885 * data we read. If it doesn't, we fire off a checksum error. Return the
1886 * number such failures.
1887 */
1888static int
1889raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1890{
1891	void *orig[VDEV_RAIDZ_MAXPARITY];
1892	int c, ret = 0;
1893	raidz_col_t *rc;
1894
1895	blkptr_t *bp = zio->io_bp;
1896	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1897	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1898
1899	if (checksum == ZIO_CHECKSUM_NOPARITY)
1900		return (ret);
1901
1902	for (c = 0; c < rm->rm_firstdatacol; c++) {
1903		rc = &rm->rm_col[c];
1904		if (!rc->rc_tried || rc->rc_error != 0)
1905			continue;
1906		orig[c] = zio_buf_alloc(rc->rc_size);
1907		bcopy(rc->rc_data, orig[c], rc->rc_size);
1908	}
1909
1910	vdev_raidz_generate_parity(rm);
1911
1912	for (c = 0; c < rm->rm_firstdatacol; c++) {
1913		rc = &rm->rm_col[c];
1914		if (!rc->rc_tried || rc->rc_error != 0)
1915			continue;
1916		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1917			raidz_checksum_error(zio, rc, orig[c]);
1918			rc->rc_error = SET_ERROR(ECKSUM);
1919			ret++;
1920		}
1921		zio_buf_free(orig[c], rc->rc_size);
1922	}
1923
1924	return (ret);
1925}
1926
/*
 * Keep statistics on all the ways that we used parity to correct data:
 * one counter per reconstruction code returned by vdev_raidz_reconstruct().
 * Bumped with atomic_inc_64() whenever a reconstruction attempt yields
 * data with a valid checksum.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1931
1932static int
1933vdev_raidz_worst_error(raidz_map_t *rm)
1934{
1935	int error = 0;
1936
1937	for (int c = 0; c < rm->rm_cols; c++)
1938		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1939
1940	return (error);
1941}
1942
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the (non-zero) reconstruction code of the first combination that
 * yields data with a valid checksum, or 0 if no combination succeeds.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* One extra slot on each side of tgts for the sentinel values. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try targeting n = 1, 2, ... simultaneously-bad columns. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			/* Skip columns with a known error. */
			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Each larger n needs exactly one additional save buffer. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					/*
					 * A column that read "successfully"
					 * but had to be reconstructed was
					 * silently bad: report it.
					 */
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination, odometer-style. */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop left n one past the number of allocated buffers;
	 * step it back so the cleanup below frees orig[0 .. n-1].
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2098
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;	/* parity columns never issued a read */
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the result of each child I/O by column type. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		/*
		 * Found at least one untried column: reissue reads for it
		 * and every later untried column, then wait for this
		 * vdev_raidz_io_done() to be called again.
		 */
		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					/*
					 * The column index is passed as the
					 * ereport's private payload pointer.
					 */
					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2385
2386static void
2387vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2388{
2389	if (faulted > vd->vdev_nparity)
2390		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2391		    VDEV_AUX_NO_REPLICAS);
2392	else if (degraded + faulted != 0)
2393		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2394	else
2395		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2396}
2397
/*
 * Operations vector for RAID-Z virtual devices; slots are positional per
 * vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* NOTE(review): likely hold/rele slots -- confirm against vdev_ops_t */
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2410