vdev_raidz.c revision 168738
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/vdev_impl.h>
32#include <sys/zio.h>
33#include <sys/zio_checksum.h>
34#include <sys/fs/zfs.h>
35#include <sys/fm/fs/zfs.h>
36
37/*
38 * Virtual device vector for RAID-Z.
39 *
40 * This vdev supports both single and double parity. For single parity, we
41 * use a simple XOR of all the data columns. For double parity, we use both
42 * the simple XOR as well as a technique described in "The mathematics of
43 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44 * over the integers expressable in a single byte. Briefly, the operations on
45 * the field are defined as follows:
46 *
47 *   o addition (+) is represented by a bitwise XOR
48 *   o subtraction (-) is therefore identical to addition: A + B = A - B
49 *   o multiplication of A by 2 is defined by the following bitwise expression:
50 *	(A * 2)_7 = A_6
51 *	(A * 2)_6 = A_5
52 *	(A * 2)_5 = A_4
53 *	(A * 2)_4 = A_3 + A_7
54 *	(A * 2)_3 = A_2 + A_7
55 *	(A * 2)_2 = A_1 + A_7
56 *	(A * 2)_1 = A_0
57 *	(A * 2)_0 = A_7
58 *
59 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60 *
61 * Observe that any number in the field (except for 0) can be expressed as a
62 * power of 2 -- a generator for the field. We store a table of the powers of
63 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65 * than field addition). The inverse of a field element A (A^-1) is A^254.
66 *
67 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68 * can be expressed by field operations:
69 *
70 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
71 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73 *
74 * See the reconstruction code below for how P and Q can used individually or
75 * in concert to recover missing data columns.
76 */
77
/*
 * A raidz_col_t describes one column of a RAID-Z map: the single-child
 * I/O (device, offset, size, buffer) plus its outcome flags.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
87
/*
 * A raidz_map_t describes how one zio is spread across the children.
 * Columns [0, rm_firstdatacol) are parity; the remainder are data.  The
 * structure is allocated with rm_cols trailing raidz_col_t entries (see
 * the offsetof() arithmetic in vdev_raidz_map_alloc()/_free()).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Column count */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
97
#define	VDEV_RAIDZ_P		0	/* index of the P (XOR) parity column */
#define	VDEV_RAIDZ_Q		1	/* index of the Q (GF) parity column */

#define	VDEV_RAIDZ_MAXPARITY	2	/* largest supported parity count */

/* Multiply a GF(2^8) element by 2 -- see the field description above. */
#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104
105/*
106 * These two tables represent powers and logs of 2 in the Galois field defined
107 * above. These values were computed by repeatedly multiplying by 2 as above.
108 */
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); note pow2[255] == pow2[0] == 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[i] == log base 2 of i in GF(2^8).  Entry 0 is a
 * placeholder -- the log of 0 is undefined; callers must not look it up
 * (see the ASSERT in vdev_raidz_exp2()).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
177
178/*
179 * Multiply a given number by 2 raised to the given power.
180 */
181static uint8_t
182vdev_raidz_exp2(uint_t a, int exp)
183{
184	if (a == 0)
185		return (0);
186
187	ASSERT(exp >= 0);
188	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189
190	exp += vdev_raidz_log2[a];
191	if (exp > 255)
192		exp -= 255;
193
194	return (vdev_raidz_pow2[exp]);
195}
196
/*
 * Divide the zio into unit_shift-sized columns spread across the dcols
 * children: the first nparity columns are parity (buffers allocated here),
 * the rest are data columns pointing into the zio's own buffer.  The
 * resulting map is stashed in zio->io_vsd and returned.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;	/* starting sector */
	uint64_t s = zio->io_size >> unit_shift;	/* size in sectors */
	uint64_t f = b % dcols;		/* first column of this stripe */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	uint64_t q, r, c, bc, col, acols, coff, devidx;

	q = s / (dcols - nparity);	/* number of complete rows */
	r = s - q * (dcols - nparity);	/* data sectors in the partial row */
	bc = (r == 0 ? 0 : r + nparity);	/* "big" (one-extra-sector) columns */

	/* With no complete rows, only the first bc columns exist at all. */
	acols = (q == 0 ? bc : dcols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_bigcols = bc;
	rm->rm_asize = 0;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;

	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around: next row on the earlier devices. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		/* Big columns get one extra sector. */
		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;
		rm->rm_asize += rm->rm_col[c].rc_size;
	}

	/* Round the allocated size to a multiple of (nparity + 1) units. */
	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);

	/* Parity columns get their own buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ...while the data columns carve up the zio's buffer in order. */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	/*
	 * Swap the parity column (0) with the first data column on odd
	 * 1MB boundaries, single-parity maps only -- see above.
	 */
	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;
	}

	zio->io_vsd = rm;
	return (rm);
}
281
282static void
283vdev_raidz_map_free(zio_t *zio)
284{
285	raidz_map_t *rm = zio->io_vsd;
286	int c;
287
288	for (c = 0; c < rm->rm_firstdatacol; c++)
289		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290
291	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292	zio->io_vsd = NULL;
293}
294
295static void
296vdev_raidz_generate_parity_p(raidz_map_t *rm)
297{
298	uint64_t *p, *src, pcount, ccount, i;
299	int c;
300
301	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302
303	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304		src = rm->rm_col[c].rc_data;
305		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307
308		if (c == rm->rm_firstdatacol) {
309			ASSERT(ccount == pcount);
310			for (i = 0; i < ccount; i++, p++, src++) {
311				*p = *src;
312			}
313		} else {
314			ASSERT(ccount <= pcount);
315			for (i = 0; i < ccount; i++, p++, src++) {
316				*p ^= *src;
317			}
318		}
319	}
320}
321
/*
 * Compute both P (plain XOR) and Q (GF(2^8)) parity over the data columns.
 * Q is accumulated Horner-style: for each new column, multiply the running
 * Q by 2 in the field and XOR the column in (see the file comment).  The
 * P and Q columns must be the same size.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both P and Q directly. */
			ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			/*
			 * Zero-fill the remainder.  NOTE(review): src is
			 * also advanced here, walking it past its buffer;
			 * the pointee is never read, but the increment
			 * looks unnecessary -- confirm before relying on it.
			 */
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte individually (as
			 * described above), we are able to handle 8 at once
			 * by generating a mask based on the high bit in each
			 * byte and using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				/* Q = Q * 2 (bytewise, in the field)... */
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				/* ...then fold the data column in. */
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}
378
379static void
380vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381{
382	uint64_t *dst, *src, xcount, ccount, count, i;
383	int c;
384
385	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387	ASSERT(xcount > 0);
388
389	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390	dst = rm->rm_col[x].rc_data;
391	for (i = 0; i < xcount; i++, dst++, src++) {
392		*dst = *src;
393	}
394
395	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396		src = rm->rm_col[c].rc_data;
397		dst = rm->rm_col[x].rc_data;
398
399		if (c == x)
400			continue;
401
402		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403		count = MIN(ccount, xcount);
404
405		for (i = 0; i < count; i++, dst++, src++) {
406			*dst ^= *src;
407		}
408	}
409}
410
/*
 * Reconstruct the data in column x from the Q parity alone.  We recompute
 * the Q polynomial over the data columns with column x treated as zero;
 * XORing that against the stored Q leaves 2^(ndevs - 1 - x) * D_x, which
 * is then divided out by multiplying by the complementary power of 2.
 */
static void
vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate the partial Q (sans column x) into column x itself. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* Column x contributes nothing -- treat it as length 0. */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			/* Seed the accumulator; zero-fill any remainder. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			/* Short columns contribute only the doubling step. */
			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/*
	 * XOR against the stored Q to isolate the scaled D_x, then multiply
	 * each byte by 2^(255 - (ndevs - 1 - x)) -- the inverse of the power
	 * parity generation applied to this column.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}
473
/*
 * Reconstruct data columns x and y (x < y, both data columns) using both
 * the P and Q parity.  We regenerate parity with x and y forced to zero
 * length (Pxy, Qxy), then solve the resulting pair of field equations for
 * D_x and D_y -- see the derivation in the comment below.
 */
static void
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true column sizes before reconstructing into them. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/*
	 * a = 2^(x - y), b = 2^(ndevs - 1 - x); tmp is the exponent of
	 * (a + 1)^-1, so aexp and bexp below are the logs of the A and B
	 * coefficients above.
	 */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	/* Column y may be shorter than x (ysize <= xsize per the ASSERT). */
	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
}
559
560
/*
 * Open the RAID-Z vdev: validate the parity configuration, open every
 * child, and derive the aggregate size (smallest child times child count)
 * and the largest child ashift.  Fails only if more children failed to
 * open than parity can cover, or the configuration itself is invalid.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c, error;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* Need at least nparity + 1 children for any data to survive. */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if ((error = vdev_open(cvd)) != 0) {
			lasterror = error;
			numerrors++;
			continue;
		}

		/* Usable size is bounded by the smallest child. */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	/* More failed children than parity disks means no replicas left. */
	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
600
601static void
602vdev_raidz_close(vdev_t *vd)
603{
604	int c;
605
606	for (c = 0; c < vd->vdev_children; c++)
607		vdev_close(vd->vdev_child[c]);
608}
609
610static uint64_t
611vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612{
613	uint64_t asize;
614	uint64_t ashift = vd->vdev_top->vdev_ashift;
615	uint64_t cols = vd->vdev_children;
616	uint64_t nparity = vd->vdev_nparity;
617
618	asize = ((psize - 1) >> ashift) + 1;
619	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620	asize = roundup(asize, nparity + 1) << ashift;
621
622	return (asize);
623}
624
625static void
626vdev_raidz_child_done(zio_t *zio)
627{
628	raidz_col_t *rc = zio->io_private;
629
630	rc->rc_error = zio->io_error;
631	rc->rc_tried = 1;
632	rc->rc_skipped = 0;
633}
634
635static void
636vdev_raidz_repair_done(zio_t *zio)
637{
638	ASSERT(zio->io_private == zio->io_parent);
639	vdev_raidz_map_free(zio->io_private);
640}
641
/*
 * Issue the child I/Os for a zio.  Writes generate parity and write every
 * column.  Reads normally fetch only the data columns, falling back to
 * parity when children are dead or stale, and read everything on a scrub.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * Generate RAID parity in the first virtual columns.
		 */
		if (rm->rm_firstdatacol == 1)
			vdev_raidz_generate_parity_p(rm);
		else
			vdev_raidz_generate_parity_pq(rm);

		/* Write every column -- parity and data alike. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity
	 * data.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (vdev_is_dead(cvd)) {
			/* Dead child: count it missing; don't issue I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/* Child's copy of this block is stale per its DTL. */
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read data columns; read parity only if data is
		 * already known missing or this is a scrub.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_wait_children_done(zio);
}
719
720/*
721 * Report a checksum error for a child of a RAID-Z device.
722 */
723static void
724raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725{
726	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728	    vdev_description(vd));
729
730	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731		mutex_enter(&vd->vdev_stat_lock);
732		vd->vdev_stat.vs_checksum_errors++;
733		mutex_exit(&vd->vdev_stat_lock);
734	}
735
736	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739}
740
741/*
742 * Generate the parity from the data columns. If we tried and were able to
743 * read the parity without error, verify that the generated parity matches the
744 * data we read. If it doesn't, we fire off a checksum error. Return the
745 * number such failures.
746 */
747static int
748raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749{
750	void *orig[VDEV_RAIDZ_MAXPARITY];
751	int c, ret = 0;
752	raidz_col_t *rc;
753
754	for (c = 0; c < rm->rm_firstdatacol; c++) {
755		rc = &rm->rm_col[c];
756		if (!rc->rc_tried || rc->rc_error != 0)
757			continue;
758		orig[c] = zio_buf_alloc(rc->rc_size);
759		bcopy(rc->rc_data, orig[c], rc->rc_size);
760	}
761
762	if (rm->rm_firstdatacol == 1)
763		vdev_raidz_generate_parity_p(rm);
764	else
765		vdev_raidz_generate_parity_pq(rm);
766
767	for (c = 0; c < rm->rm_firstdatacol; c++) {
768		rc = &rm->rm_col[c];
769		if (!rc->rc_tried || rc->rc_error != 0)
770			continue;
771		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772			raidz_checksum_error(zio, rc);
773			rc->rc_error = ECKSUM;
774			ret++;
775		}
776		zio_buf_free(orig[c], rc->rc_size);
777	}
778
779	return (ret);
780}
781
/*
 * Counters of successful reconstructions, bumped in vdev_raidz_io_done().
 * NOTE(review): presumably observed via a kernel debugger; nothing in
 * this file reads them.
 */
static uint64_t raidz_corrected_p;	/* data rebuilt from P parity */
static uint64_t raidz_corrected_q;	/* data rebuilt from Q parity */
static uint64_t raidz_corrected_pq;	/* data rebuilt from P and Q */
785
786static void
787vdev_raidz_io_done(zio_t *zio)
788{
789	vdev_t *vd = zio->io_vd;
790	vdev_t *cvd;
791	raidz_map_t *rm = zio->io_vsd;
792	raidz_col_t *rc, *rc1;
793	int unexpected_errors = 0;
794	int parity_errors = 0;
795	int parity_untried = 0;
796	int data_errors = 0;
797	int n, c, c1;
798
799	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
800
801	zio->io_error = 0;
802	zio->io_numerrors = 0;
803
804	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806
807	for (c = 0; c < rm->rm_cols; c++) {
808		rc = &rm->rm_col[c];
809
810		/*
811		 * We preserve any EIOs because those may be worth retrying;
812		 * whereas ECKSUM and ENXIO are more likely to be persistent.
813		 */
814		if (rc->rc_error) {
815			if (zio->io_error != EIO)
816				zio->io_error = rc->rc_error;
817
818			if (c < rm->rm_firstdatacol)
819				parity_errors++;
820			else
821				data_errors++;
822
823			if (!rc->rc_skipped)
824				unexpected_errors++;
825
826			zio->io_numerrors++;
827		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828			parity_untried++;
829		}
830	}
831
832	if (zio->io_type == ZIO_TYPE_WRITE) {
833		/*
834		 * If this is not a failfast write, and we were able to
835		 * write enough columns to reconstruct the data, good enough.
836		 */
837		/* XXPOLICY */
838		if (zio->io_numerrors <= rm->rm_firstdatacol &&
839		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
840			zio->io_error = 0;
841
842		vdev_raidz_map_free(zio);
843		zio_next_stage(zio);
844		return;
845	}
846
847	ASSERT(zio->io_type == ZIO_TYPE_READ);
848	/*
849	 * There are three potential phases for a read:
850	 *	1. produce valid data from the columns read
851	 *	2. read all disks and try again
852	 *	3. perform combinatorial reconstruction
853	 *
854	 * Each phase is progressively both more expensive and less likely to
855	 * occur. If we encounter more errors than we can repair or all phases
856	 * fail, we have no choice but to return an error.
857	 */
858
859	/*
860	 * If the number of errors we saw was correctable -- less than or equal
861	 * to the number of parity disks read -- attempt to produce data that
862	 * has a valid checksum. Naturally, this case applies in the absence of
863	 * any errors.
864	 */
865	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866		switch (data_errors) {
867		case 0:
868			if (zio_checksum_error(zio) == 0) {
869				zio->io_error = 0;
870
871				/*
872				 * If we read parity information (unnecessarily
873				 * as it happens since no reconstruction was
874				 * needed) regenerate and verify the parity.
875				 * We also regenerate parity when resilvering
876				 * so we can write it out to the failed device
877				 * later.
878				 */
879				if (parity_errors + parity_untried <
880				    rm->rm_firstdatacol ||
881				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
882					n = raidz_parity_verify(zio, rm);
883					unexpected_errors += n;
884					ASSERT(parity_errors + n <=
885					    rm->rm_firstdatacol);
886				}
887				goto done;
888			}
889			break;
890
891		case 1:
892			/*
893			 * We either attempt to read all the parity columns or
894			 * none of them. If we didn't try to read parity, we
895			 * wouldn't be here in the correctable case. There must
896			 * also have been fewer parity errors than parity
897			 * columns or, again, we wouldn't be in this code path.
898			 */
899			ASSERT(parity_untried == 0);
900			ASSERT(parity_errors < rm->rm_firstdatacol);
901
902			/*
903			 * Find the column that reported the error.
904			 */
905			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
906				rc = &rm->rm_col[c];
907				if (rc->rc_error != 0)
908					break;
909			}
910			ASSERT(c != rm->rm_cols);
911			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
912			    rc->rc_error == ESTALE);
913
914			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
915				vdev_raidz_reconstruct_p(rm, c);
916			} else {
917				ASSERT(rm->rm_firstdatacol > 1);
918				vdev_raidz_reconstruct_q(rm, c);
919			}
920
921			if (zio_checksum_error(zio) == 0) {
922				zio->io_error = 0;
923				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
924					atomic_inc_64(&raidz_corrected_p);
925				else
926					atomic_inc_64(&raidz_corrected_q);
927
928				/*
929				 * If there's more than one parity disk that
930				 * was successfully read, confirm that the
931				 * other parity disk produced the correct data.
932				 * This routine is suboptimal in that it
933				 * regenerates both the parity we wish to test
934				 * as well as the parity we just used to
935				 * perform the reconstruction, but this should
936				 * be a relatively uncommon case, and can be
937				 * optimized if it becomes a problem.
938				 * We also regenerate parity when resilvering
939				 * so we can write it out to the failed device
940				 * later.
941				 */
942				if (parity_errors < rm->rm_firstdatacol - 1 ||
943				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
944					n = raidz_parity_verify(zio, rm);
945					unexpected_errors += n;
946					ASSERT(parity_errors + n <=
947					    rm->rm_firstdatacol);
948				}
949
950				goto done;
951			}
952			break;
953
954		case 2:
955			/*
956			 * Two data column errors require double parity.
957			 */
958			ASSERT(rm->rm_firstdatacol == 2);
959
960			/*
961			 * Find the two columns that reported errors.
962			 */
963			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
964				rc = &rm->rm_col[c];
965				if (rc->rc_error != 0)
966					break;
967			}
968			ASSERT(c != rm->rm_cols);
969			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
970			    rc->rc_error == ESTALE);
971
972			for (c1 = c++; c < rm->rm_cols; c++) {
973				rc = &rm->rm_col[c];
974				if (rc->rc_error != 0)
975					break;
976			}
977			ASSERT(c != rm->rm_cols);
978			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
979			    rc->rc_error == ESTALE);
980
981			vdev_raidz_reconstruct_pq(rm, c1, c);
982
983			if (zio_checksum_error(zio) == 0) {
984				zio->io_error = 0;
985				atomic_inc_64(&raidz_corrected_pq);
986
987				goto done;
988			}
989			break;
990
991		default:
992			ASSERT(rm->rm_firstdatacol <= 2);
993			ASSERT(0);
994		}
995	}
996
997	/*
998	 * This isn't a typical situation -- either we got a read error or
999	 * a child silently returned bad data. Read every block so we can
1000	 * try again with as much data and parity as we can track down. If
1001	 * we've already been through once before, all children will be marked
1002	 * as tried so we'll proceed to combinatorial reconstruction.
1003	 */
1004	unexpected_errors = 1;
1005	rm->rm_missingdata = 0;
1006	rm->rm_missingparity = 0;
1007
1008	for (c = 0; c < rm->rm_cols; c++) {
1009		if (rm->rm_col[c].rc_tried)
1010			continue;
1011
1012		zio->io_error = 0;
1013		zio_vdev_io_redone(zio);
1014		do {
1015			rc = &rm->rm_col[c];
1016			if (rc->rc_tried)
1017				continue;
1018			zio_nowait(zio_vdev_child_io(zio, NULL,
1019			    vd->vdev_child[rc->rc_devidx],
1020			    rc->rc_offset, rc->rc_data, rc->rc_size,
1021			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022			    vdev_raidz_child_done, rc));
1023		} while (++c < rm->rm_cols);
1024		dprintf("rereading\n");
1025		zio_wait_children_done(zio);
1026		return;
1027	}
1028
1029	/*
1030	 * At this point we've attempted to reconstruct the data given the
1031	 * errors we detected, and we've attempted to read all columns. There
1032	 * must, therefore, be one or more additional problems -- silent errors
1033	 * resulting in invalid data rather than explicit I/O errors resulting
1034	 * in absent data. Before we attempt combinatorial reconstruction make
1035	 * sure we have a chance of coming up with the right answer.
1036	 */
1037	if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038		ASSERT(zio->io_error != 0);
1039		goto done;
1040	}
1041
1042	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1043		/*
1044		 * Attempt to reconstruct the data from parity P.
1045		 */
1046		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1047			void *orig;
1048			rc = &rm->rm_col[c];
1049
1050			orig = zio_buf_alloc(rc->rc_size);
1051			bcopy(rc->rc_data, orig, rc->rc_size);
1052			vdev_raidz_reconstruct_p(rm, c);
1053
1054			if (zio_checksum_error(zio) == 0) {
1055				zio_buf_free(orig, rc->rc_size);
1056				zio->io_error = 0;
1057				atomic_inc_64(&raidz_corrected_p);
1058
1059				/*
1060				 * If this child didn't know that it returned
1061				 * bad data, inform it.
1062				 */
1063				if (rc->rc_tried && rc->rc_error == 0)
1064					raidz_checksum_error(zio, rc);
1065				rc->rc_error = ECKSUM;
1066				goto done;
1067			}
1068
1069			bcopy(orig, rc->rc_data, rc->rc_size);
1070			zio_buf_free(orig, rc->rc_size);
1071		}
1072	}
1073
1074	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1075		/*
1076		 * Attempt to reconstruct the data from parity Q.
1077		 */
1078		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1079			void *orig;
1080			rc = &rm->rm_col[c];
1081
1082			orig = zio_buf_alloc(rc->rc_size);
1083			bcopy(rc->rc_data, orig, rc->rc_size);
1084			vdev_raidz_reconstruct_q(rm, c);
1085
1086			if (zio_checksum_error(zio) == 0) {
1087				zio_buf_free(orig, rc->rc_size);
1088				zio->io_error = 0;
1089				atomic_inc_64(&raidz_corrected_q);
1090
1091				/*
1092				 * If this child didn't know that it returned
1093				 * bad data, inform it.
1094				 */
1095				if (rc->rc_tried && rc->rc_error == 0)
1096					raidz_checksum_error(zio, rc);
1097				rc->rc_error = ECKSUM;
1098				goto done;
1099			}
1100
1101			bcopy(orig, rc->rc_data, rc->rc_size);
1102			zio_buf_free(orig, rc->rc_size);
1103		}
1104	}
1105
1106	if (rm->rm_firstdatacol > 1 &&
1107	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1108	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1109		/*
1110		 * Attempt to reconstruct the data from both P and Q.
1111		 */
1112		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1113			void *orig, *orig1;
1114			rc = &rm->rm_col[c];
1115
1116			orig = zio_buf_alloc(rc->rc_size);
1117			bcopy(rc->rc_data, orig, rc->rc_size);
1118
1119			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1120				rc1 = &rm->rm_col[c1];
1121
1122				orig1 = zio_buf_alloc(rc1->rc_size);
1123				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1124
1125				vdev_raidz_reconstruct_pq(rm, c, c1);
1126
1127				if (zio_checksum_error(zio) == 0) {
1128					zio_buf_free(orig, rc->rc_size);
1129					zio_buf_free(orig1, rc1->rc_size);
1130					zio->io_error = 0;
1131					atomic_inc_64(&raidz_corrected_pq);
1132
1133					/*
1134					 * If these children didn't know they
1135					 * returned bad data, inform them.
1136					 */
1137					if (rc->rc_tried && rc->rc_error == 0)
1138						raidz_checksum_error(zio, rc);
1139					if (rc1->rc_tried && rc1->rc_error == 0)
1140						raidz_checksum_error(zio, rc1);
1141
1142					rc->rc_error = ECKSUM;
1143					rc1->rc_error = ECKSUM;
1144
1145					goto done;
1146				}
1147
1148				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1149				zio_buf_free(orig1, rc1->rc_size);
1150			}
1151
1152			bcopy(orig, rc->rc_data, rc->rc_size);
1153			zio_buf_free(orig, rc->rc_size);
1154		}
1155	}
1156
1157	/*
1158	 * All combinations failed to checksum. Generate checksum ereports for
1159	 * all children.
1160	 */
1161	zio->io_error = ECKSUM;
1162	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1163		for (c = 0; c < rm->rm_cols; c++) {
1164			rc = &rm->rm_col[c];
1165			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1166			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1167			    rc->rc_offset, rc->rc_size);
1168		}
1169	}
1170
1171done:
1172	zio_checksum_verified(zio);
1173
1174	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1176		zio_t *rio;
1177
1178		/*
1179		 * Use the good data we have in hand to repair damaged children.
1180		 *
1181		 * We issue all repair I/Os as children of 'rio' to arrange
1182		 * that vdev_raidz_map_free(zio) will be invoked after all
1183		 * repairs complete, but before we advance to the next stage.
1184		 */
1185		rio = zio_null(zio, zio->io_spa,
1186		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1187
1188		for (c = 0; c < rm->rm_cols; c++) {
1189			rc = &rm->rm_col[c];
1190			cvd = vd->vdev_child[rc->rc_devidx];
1191
1192			if (rc->rc_error == 0)
1193				continue;
1194
1195			dprintf("%s resilvered %s @ 0x%llx error %d\n",
1196			    vdev_description(vd),
1197			    vdev_description(cvd),
1198			    zio->io_offset, rc->rc_error);
1199
1200			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1201			    rc->rc_offset, rc->rc_data, rc->rc_size,
1202			    ZIO_TYPE_WRITE, zio->io_priority,
1203			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1204			    ZIO_FLAG_CANFAIL, NULL, NULL));
1205		}
1206
1207		zio_nowait(rio);
1208		zio_wait_children_done(zio);
1209		return;
1210	}
1211
1212	vdev_raidz_map_free(zio);
1213	zio_next_stage(zio);
1214}
1215
1216static void
1217vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1218{
1219	if (faulted > vd->vdev_nparity)
1220		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1221		    VDEV_AUX_NO_REPLICAS);
1222	else if (degraded + faulted != 0)
1223		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1224	else
1225		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1226}
1227
/*
 * Operations vector that plugs the RAID-Z implementation into the
 * generic vdev framework.  The entries are the RAID-Z functions
 * defined above, in the positional order required by vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
1238