vdev_raidz.c revision 185029
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23185029Spjd * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24168404Spjd * Use is subject to license terms.
25168404Spjd */
26168404Spjd
27168404Spjd#include <sys/zfs_context.h>
28168404Spjd#include <sys/spa.h>
29168404Spjd#include <sys/vdev_impl.h>
30168404Spjd#include <sys/zio.h>
31168404Spjd#include <sys/zio_checksum.h>
32168404Spjd#include <sys/fs/zfs.h>
33168404Spjd#include <sys/fm/fs/zfs.h>
34168404Spjd
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports both single and double parity. For single parity, we
 * use a simple XOR of all the data columns. For double parity, we use both
 * the simple XOR as well as a technique described in "The mathematics of
 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
 */
75168404Spjd
/*
 * Description of one column of a RAID-Z I/O: which child device it targets,
 * where on that device, how much data, and what happened when it was issued.
 */
typedef struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
} raidz_col_t;
85168404Spjd
86168404Spjdtypedef struct raidz_map {
87168404Spjd	uint64_t rm_cols;		/* Column count */
88168404Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
89168404Spjd	uint64_t rm_asize;		/* Actual total I/O size */
90168404Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
91168404Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
92168404Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
93168404Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
94168404Spjd} raidz_map_t;
95168404Spjd
/* Indices of the P and Q parity columns within raidz_map_t.rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count accepted by this vdev type (P and Q). */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply one field element (a single byte, 0..255) by 2 in GF(2^8).
 *
 * The shift is carried out after integer promotion, so bit 7 of the input
 * would otherwise survive as bit 8 of the result; the final mask keeps the
 * value inside the field even when the caller does not store it in a
 * uint8_t. Note that the argument is evaluated twice, so it must not have
 * side effects.
 */
#define	VDEV_RAIDZ_MUL_2(a)	\
	((((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) & 0xff)
102168404Spjd
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] holds
 * 2^i, computed by repeatedly multiplying by 2. The companion table
 * vdev_raidz_log2[] holds the inverse mapping. Entry 255 repeats entry 0
 * because the powers of 2 cycle with period 255.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,	/* 2^0   */
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,	/* 2^8   */
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,	/* 2^16  */
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,	/* 2^24  */
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,	/* 2^32  */
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,	/* 2^40  */
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,	/* 2^48  */
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,	/* 2^56  */
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,	/* 2^64  */
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,	/* 2^72  */
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,	/* 2^80  */
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,	/* 2^88  */
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,	/* 2^96  */
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,	/* 2^104 */
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,	/* 2^112 */
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,	/* 2^120 */
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,	/* 2^128 */
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,	/* 2^136 */
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,	/* 2^144 */
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,	/* 2^152 */
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,	/* 2^160 */
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,	/* 2^168 */
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,	/* 2^176 */
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,	/* 2^184 */
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,	/* 2^192 */
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,	/* 2^200 */
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,	/* 2^208 */
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,	/* 2^216 */
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,	/* 2^224 */
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,	/* 2^232 */
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,	/* 2^240 */
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01	/* 2^248 */
};
/*
 * Logs base 2 in the Galois field defined above: for a non-zero element v,
 * vdev_raidz_log2[v] is the exponent e for which 2^e == v. Zero has no
 * logarithm; its slot holds 0 as a placeholder and must not be relied on.
 * The trailing comment on each row gives the index of its first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,	/* 0x00 */
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,	/* 0x08 */
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,	/* 0x10 */
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,	/* 0x18 */
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,	/* 0x20 */
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,	/* 0x28 */
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,	/* 0x30 */
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,	/* 0x38 */
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,	/* 0x40 */
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,	/* 0x48 */
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,	/* 0x50 */
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,	/* 0x58 */
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,	/* 0x60 */
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,	/* 0x68 */
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,	/* 0x70 */
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,	/* 0x78 */
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,	/* 0x80 */
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,	/* 0x88 */
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,	/* 0x90 */
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,	/* 0x98 */
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,	/* 0xa0 */
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,	/* 0xa8 */
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,	/* 0xb0 */
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,	/* 0xb8 */
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,	/* 0xc0 */
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,	/* 0xc8 */
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,	/* 0xd0 */
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,	/* 0xd8 */
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,	/* 0xe0 */
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,	/* 0xe8 */
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,	/* 0xf0 */
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,	/* 0xf8 */
};
175168404Spjd
176168404Spjd/*
177168404Spjd * Multiply a given number by 2 raised to the given power.
178168404Spjd */
179168404Spjdstatic uint8_t
180168404Spjdvdev_raidz_exp2(uint_t a, int exp)
181168404Spjd{
182168404Spjd	if (a == 0)
183168404Spjd		return (0);
184168404Spjd
185168404Spjd	ASSERT(exp >= 0);
186168404Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
187168404Spjd
188168404Spjd	exp += vdev_raidz_log2[a];
189168404Spjd	if (exp > 255)
190168404Spjd		exp -= 255;
191168404Spjd
192168404Spjd	return (vdev_raidz_pow2[exp]);
193168404Spjd}
194168404Spjd
195185029Spjdstatic void
196185029Spjdvdev_raidz_map_free(zio_t *zio)
197185029Spjd{
198185029Spjd	raidz_map_t *rm = zio->io_vsd;
199185029Spjd	int c;
200185029Spjd
201185029Spjd	for (c = 0; c < rm->rm_firstdatacol; c++)
202185029Spjd		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
203185029Spjd
204185029Spjd	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
205185029Spjd}
206185029Spjd
207168404Spjdstatic raidz_map_t *
208168404Spjdvdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
209168404Spjd    uint64_t nparity)
210168404Spjd{
211168404Spjd	raidz_map_t *rm;
212168404Spjd	uint64_t b = zio->io_offset >> unit_shift;
213168404Spjd	uint64_t s = zio->io_size >> unit_shift;
214168404Spjd	uint64_t f = b % dcols;
215168404Spjd	uint64_t o = (b / dcols) << unit_shift;
216168404Spjd	uint64_t q, r, c, bc, col, acols, coff, devidx;
217168404Spjd
218168404Spjd	q = s / (dcols - nparity);
219168404Spjd	r = s - q * (dcols - nparity);
220168404Spjd	bc = (r == 0 ? 0 : r + nparity);
221168404Spjd
222168404Spjd	acols = (q == 0 ? bc : dcols);
223168404Spjd
224168404Spjd	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
225168404Spjd
226168404Spjd	rm->rm_cols = acols;
227168404Spjd	rm->rm_bigcols = bc;
228168404Spjd	rm->rm_asize = 0;
229168404Spjd	rm->rm_missingdata = 0;
230168404Spjd	rm->rm_missingparity = 0;
231168404Spjd	rm->rm_firstdatacol = nparity;
232168404Spjd
233168404Spjd	for (c = 0; c < acols; c++) {
234168404Spjd		col = f + c;
235168404Spjd		coff = o;
236168404Spjd		if (col >= dcols) {
237168404Spjd			col -= dcols;
238168404Spjd			coff += 1ULL << unit_shift;
239168404Spjd		}
240168404Spjd		rm->rm_col[c].rc_devidx = col;
241168404Spjd		rm->rm_col[c].rc_offset = coff;
242168404Spjd		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
243168404Spjd		rm->rm_col[c].rc_data = NULL;
244168404Spjd		rm->rm_col[c].rc_error = 0;
245168404Spjd		rm->rm_col[c].rc_tried = 0;
246168404Spjd		rm->rm_col[c].rc_skipped = 0;
247168404Spjd		rm->rm_asize += rm->rm_col[c].rc_size;
248168404Spjd	}
249168404Spjd
250168404Spjd	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
251168404Spjd
252168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++)
253168404Spjd		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
254168404Spjd
255168404Spjd	rm->rm_col[c].rc_data = zio->io_data;
256168404Spjd
257168404Spjd	for (c = c + 1; c < acols; c++)
258168404Spjd		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
259168404Spjd		    rm->rm_col[c - 1].rc_size;
260168404Spjd
261168404Spjd	/*
262168404Spjd	 * If all data stored spans all columns, there's a danger that parity
263168404Spjd	 * will always be on the same device and, since parity isn't read
264168404Spjd	 * during normal operation, that that device's I/O bandwidth won't be
265168404Spjd	 * used effectively. We therefore switch the parity every 1MB.
266168404Spjd	 *
267168404Spjd	 * ... at least that was, ostensibly, the theory. As a practical
268168404Spjd	 * matter unless we juggle the parity between all devices evenly, we
269168404Spjd	 * won't see any benefit. Further, occasional writes that aren't a
270168404Spjd	 * multiple of the LCM of the number of children and the minimum
271168404Spjd	 * stripe width are sufficient to avoid pessimal behavior.
272168404Spjd	 * Unfortunately, this decision created an implicit on-disk format
273168404Spjd	 * requirement that we need to support for all eternity, but only
274168404Spjd	 * for single-parity RAID-Z.
275168404Spjd	 */
276168404Spjd	ASSERT(rm->rm_cols >= 2);
277168404Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
278168404Spjd
279168404Spjd	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
280168404Spjd		devidx = rm->rm_col[0].rc_devidx;
281168404Spjd		o = rm->rm_col[0].rc_offset;
282168404Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
283168404Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
284168404Spjd		rm->rm_col[1].rc_devidx = devidx;
285168404Spjd		rm->rm_col[1].rc_offset = o;
286168404Spjd	}
287168404Spjd
288168404Spjd	zio->io_vsd = rm;
289185029Spjd	zio->io_vsd_free = vdev_raidz_map_free;
290168404Spjd	return (rm);
291168404Spjd}
292168404Spjd
293168404Spjdstatic void
294168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
295168404Spjd{
296168404Spjd	uint64_t *p, *src, pcount, ccount, i;
297168404Spjd	int c;
298168404Spjd
299168404Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
300168404Spjd
301168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
302168404Spjd		src = rm->rm_col[c].rc_data;
303168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
304168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
305168404Spjd
306168404Spjd		if (c == rm->rm_firstdatacol) {
307168404Spjd			ASSERT(ccount == pcount);
308168404Spjd			for (i = 0; i < ccount; i++, p++, src++) {
309168404Spjd				*p = *src;
310168404Spjd			}
311168404Spjd		} else {
312168404Spjd			ASSERT(ccount <= pcount);
313168404Spjd			for (i = 0; i < ccount; i++, p++, src++) {
314168404Spjd				*p ^= *src;
315168404Spjd			}
316168404Spjd		}
317168404Spjd	}
318168404Spjd}
319168404Spjd
320168404Spjdstatic void
321168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
322168404Spjd{
323168404Spjd	uint64_t *q, *p, *src, pcount, ccount, mask, i;
324168404Spjd	int c;
325168404Spjd
326168404Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
327168404Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
328168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
329168404Spjd
330168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
331168404Spjd		src = rm->rm_col[c].rc_data;
332168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
333168404Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
334168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
335168404Spjd
336168404Spjd		if (c == rm->rm_firstdatacol) {
337168404Spjd			ASSERT(ccount == pcount || ccount == 0);
338168404Spjd			for (i = 0; i < ccount; i++, p++, q++, src++) {
339168404Spjd				*q = *src;
340168404Spjd				*p = *src;
341168404Spjd			}
342168404Spjd			for (; i < pcount; i++, p++, q++, src++) {
343168404Spjd				*q = 0;
344168404Spjd				*p = 0;
345168404Spjd			}
346168404Spjd		} else {
347168404Spjd			ASSERT(ccount <= pcount);
348168404Spjd
349168404Spjd			/*
350168404Spjd			 * Rather than multiplying each byte individually (as
351168404Spjd			 * described above), we are able to handle 8 at once
352168404Spjd			 * by generating a mask based on the high bit in each
353168404Spjd			 * byte and using that to conditionally XOR in 0x1d.
354168404Spjd			 */
355168404Spjd			for (i = 0; i < ccount; i++, p++, q++, src++) {
356168404Spjd				mask = *q & 0x8080808080808080ULL;
357168404Spjd				mask = (mask << 1) - (mask >> 7);
358168404Spjd				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
359168404Spjd				    (mask & 0x1d1d1d1d1d1d1d1dULL);
360168404Spjd				*q ^= *src;
361168404Spjd				*p ^= *src;
362168404Spjd			}
363168404Spjd
364168404Spjd			/*
365168404Spjd			 * Treat short columns as though they are full of 0s.
366168404Spjd			 */
367168404Spjd			for (; i < pcount; i++, q++) {
368168404Spjd				mask = *q & 0x8080808080808080ULL;
369168404Spjd				mask = (mask << 1) - (mask >> 7);
370168404Spjd				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
371168404Spjd				    (mask & 0x1d1d1d1d1d1d1d1dULL);
372168404Spjd			}
373168404Spjd		}
374168404Spjd	}
375168404Spjd}
376168404Spjd
377168404Spjdstatic void
378168404Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
379168404Spjd{
380168404Spjd	uint64_t *dst, *src, xcount, ccount, count, i;
381168404Spjd	int c;
382168404Spjd
383168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
384168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
385168404Spjd	ASSERT(xcount > 0);
386168404Spjd
387168404Spjd	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
388168404Spjd	dst = rm->rm_col[x].rc_data;
389168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
390168404Spjd		*dst = *src;
391168404Spjd	}
392168404Spjd
393168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
394168404Spjd		src = rm->rm_col[c].rc_data;
395168404Spjd		dst = rm->rm_col[x].rc_data;
396168404Spjd
397168404Spjd		if (c == x)
398168404Spjd			continue;
399168404Spjd
400168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
401168404Spjd		count = MIN(ccount, xcount);
402168404Spjd
403168404Spjd		for (i = 0; i < count; i++, dst++, src++) {
404168404Spjd			*dst ^= *src;
405168404Spjd		}
406168404Spjd	}
407168404Spjd}
408168404Spjd
409168404Spjdstatic void
410168404Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
411168404Spjd{
412168404Spjd	uint64_t *dst, *src, xcount, ccount, count, mask, i;
413168404Spjd	uint8_t *b;
414168404Spjd	int c, j, exp;
415168404Spjd
416168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
417168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
418168404Spjd
419168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
420168404Spjd		src = rm->rm_col[c].rc_data;
421168404Spjd		dst = rm->rm_col[x].rc_data;
422168404Spjd
423168404Spjd		if (c == x)
424168404Spjd			ccount = 0;
425168404Spjd		else
426168404Spjd			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
427168404Spjd
428168404Spjd		count = MIN(ccount, xcount);
429168404Spjd
430168404Spjd		if (c == rm->rm_firstdatacol) {
431168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
432168404Spjd				*dst = *src;
433168404Spjd			}
434168404Spjd			for (; i < xcount; i++, dst++) {
435168404Spjd				*dst = 0;
436168404Spjd			}
437168404Spjd
438168404Spjd		} else {
439168404Spjd			/*
440168404Spjd			 * For an explanation of this, see the comment in
441168404Spjd			 * vdev_raidz_generate_parity_pq() above.
442168404Spjd			 */
443168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
444168404Spjd				mask = *dst & 0x8080808080808080ULL;
445168404Spjd				mask = (mask << 1) - (mask >> 7);
446168404Spjd				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
447168404Spjd				    (mask & 0x1d1d1d1d1d1d1d1dULL);
448168404Spjd				*dst ^= *src;
449168404Spjd			}
450168404Spjd
451168404Spjd			for (; i < xcount; i++, dst++) {
452168404Spjd				mask = *dst & 0x8080808080808080ULL;
453168404Spjd				mask = (mask << 1) - (mask >> 7);
454168404Spjd				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
455168404Spjd				    (mask & 0x1d1d1d1d1d1d1d1dULL);
456168404Spjd			}
457168404Spjd		}
458168404Spjd	}
459168404Spjd
460168404Spjd	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
461168404Spjd	dst = rm->rm_col[x].rc_data;
462168404Spjd	exp = 255 - (rm->rm_cols - 1 - x);
463168404Spjd
464168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
465168404Spjd		*dst ^= *src;
466168404Spjd		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
467168404Spjd			*b = vdev_raidz_exp2(*b, exp);
468168404Spjd		}
469168404Spjd	}
470168404Spjd}
471168404Spjd
472168404Spjdstatic void
473168404Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
474168404Spjd{
475168404Spjd	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
476168404Spjd	void *pdata, *qdata;
477168404Spjd	uint64_t xsize, ysize, i;
478168404Spjd
479168404Spjd	ASSERT(x < y);
480168404Spjd	ASSERT(x >= rm->rm_firstdatacol);
481168404Spjd	ASSERT(y < rm->rm_cols);
482168404Spjd
483168404Spjd	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
484168404Spjd
485168404Spjd	/*
486168404Spjd	 * Move the parity data aside -- we're going to compute parity as
487168404Spjd	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
488168404Spjd	 * reuse the parity generation mechanism without trashing the actual
489168404Spjd	 * parity so we make those columns appear to be full of zeros by
490168404Spjd	 * setting their lengths to zero.
491168404Spjd	 */
492168404Spjd	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
493168404Spjd	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
494168404Spjd	xsize = rm->rm_col[x].rc_size;
495168404Spjd	ysize = rm->rm_col[y].rc_size;
496168404Spjd
497168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data =
498168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
499168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
500168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
501168404Spjd	rm->rm_col[x].rc_size = 0;
502168404Spjd	rm->rm_col[y].rc_size = 0;
503168404Spjd
504168404Spjd	vdev_raidz_generate_parity_pq(rm);
505168404Spjd
506168404Spjd	rm->rm_col[x].rc_size = xsize;
507168404Spjd	rm->rm_col[y].rc_size = ysize;
508168404Spjd
509168404Spjd	p = pdata;
510168404Spjd	q = qdata;
511168404Spjd	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
512168404Spjd	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
513168404Spjd	xd = rm->rm_col[x].rc_data;
514168404Spjd	yd = rm->rm_col[y].rc_data;
515168404Spjd
516168404Spjd	/*
517168404Spjd	 * We now have:
518168404Spjd	 *	Pxy = P + D_x + D_y
519168404Spjd	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
520168404Spjd	 *
521168404Spjd	 * We can then solve for D_x:
522168404Spjd	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
523168404Spjd	 * where
524168404Spjd	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
525168404Spjd	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
526168404Spjd	 *
527168404Spjd	 * With D_x in hand, we can easily solve for D_y:
528168404Spjd	 *	D_y = P + Pxy + D_x
529168404Spjd	 */
530168404Spjd
531168404Spjd	a = vdev_raidz_pow2[255 + x - y];
532168404Spjd	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
533168404Spjd	tmp = 255 - vdev_raidz_log2[a ^ 1];
534168404Spjd
535168404Spjd	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
536168404Spjd	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
537168404Spjd
538168404Spjd	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
539168404Spjd		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
540168404Spjd		    vdev_raidz_exp2(*q ^ *qxy, bexp);
541168404Spjd
542168404Spjd		if (i < ysize)
543168404Spjd			*yd = *p ^ *pxy ^ *xd;
544168404Spjd	}
545168404Spjd
546168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
547168404Spjd	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
548168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
549168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
550168404Spjd
551168404Spjd	/*
552168404Spjd	 * Restore the saved parity data.
553168404Spjd	 */
554168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
555168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
556168404Spjd}
557168404Spjd
558168404Spjd
559168404Spjdstatic int
560168404Spjdvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
561168404Spjd{
562168404Spjd	vdev_t *cvd;
563168404Spjd	uint64_t nparity = vd->vdev_nparity;
564168404Spjd	int c, error;
565168404Spjd	int lasterror = 0;
566168404Spjd	int numerrors = 0;
567168404Spjd
568168404Spjd	ASSERT(nparity > 0);
569168404Spjd
570168404Spjd	if (nparity > VDEV_RAIDZ_MAXPARITY ||
571168404Spjd	    vd->vdev_children < nparity + 1) {
572168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
573168404Spjd		return (EINVAL);
574168404Spjd	}
575168404Spjd
576168404Spjd	for (c = 0; c < vd->vdev_children; c++) {
577168404Spjd		cvd = vd->vdev_child[c];
578168404Spjd
579168404Spjd		if ((error = vdev_open(cvd)) != 0) {
580168404Spjd			lasterror = error;
581168404Spjd			numerrors++;
582168404Spjd			continue;
583168404Spjd		}
584168404Spjd
585168404Spjd		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
586168404Spjd		*ashift = MAX(*ashift, cvd->vdev_ashift);
587168404Spjd	}
588168404Spjd
589168404Spjd	*asize *= vd->vdev_children;
590168404Spjd
591168404Spjd	if (numerrors > nparity) {
592168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
593168404Spjd		return (lasterror);
594168404Spjd	}
595168404Spjd
596168404Spjd	return (0);
597168404Spjd}
598168404Spjd
599168404Spjdstatic void
600168404Spjdvdev_raidz_close(vdev_t *vd)
601168404Spjd{
602168404Spjd	int c;
603168404Spjd
604168404Spjd	for (c = 0; c < vd->vdev_children; c++)
605168404Spjd		vdev_close(vd->vdev_child[c]);
606168404Spjd}
607168404Spjd
608168404Spjdstatic uint64_t
609168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize)
610168404Spjd{
611168404Spjd	uint64_t asize;
612168404Spjd	uint64_t ashift = vd->vdev_top->vdev_ashift;
613168404Spjd	uint64_t cols = vd->vdev_children;
614168404Spjd	uint64_t nparity = vd->vdev_nparity;
615168404Spjd
616168404Spjd	asize = ((psize - 1) >> ashift) + 1;
617168404Spjd	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
618168404Spjd	asize = roundup(asize, nparity + 1) << ashift;
619168404Spjd
620168404Spjd	return (asize);
621168404Spjd}
622168404Spjd
623168404Spjdstatic void
624168404Spjdvdev_raidz_child_done(zio_t *zio)
625168404Spjd{
626168404Spjd	raidz_col_t *rc = zio->io_private;
627168404Spjd
628168404Spjd	rc->rc_error = zio->io_error;
629168404Spjd	rc->rc_tried = 1;
630168404Spjd	rc->rc_skipped = 0;
631168404Spjd}
632168404Spjd
633185029Spjdstatic int
634168404Spjdvdev_raidz_io_start(zio_t *zio)
635168404Spjd{
636168404Spjd	vdev_t *vd = zio->io_vd;
637168404Spjd	vdev_t *tvd = vd->vdev_top;
638168404Spjd	vdev_t *cvd;
639168404Spjd	blkptr_t *bp = zio->io_bp;
640168404Spjd	raidz_map_t *rm;
641168404Spjd	raidz_col_t *rc;
642168404Spjd	int c;
643168404Spjd
644168404Spjd	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
645168404Spjd	    vd->vdev_nparity);
646168404Spjd
647168404Spjd	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
648168404Spjd
649168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
650168404Spjd		/*
651168404Spjd		 * Generate RAID parity in the first virtual columns.
652168404Spjd		 */
653168404Spjd		if (rm->rm_firstdatacol == 1)
654168404Spjd			vdev_raidz_generate_parity_p(rm);
655168404Spjd		else
656168404Spjd			vdev_raidz_generate_parity_pq(rm);
657168404Spjd
658168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
659168404Spjd			rc = &rm->rm_col[c];
660168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
661168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
662168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
663185029Spjd			    zio->io_type, zio->io_priority, 0,
664168404Spjd			    vdev_raidz_child_done, rc));
665168404Spjd		}
666185029Spjd
667185029Spjd		return (ZIO_PIPELINE_CONTINUE);
668168404Spjd	}
669168404Spjd
670168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
671168404Spjd
672168404Spjd	/*
673168404Spjd	 * Iterate over the columns in reverse order so that we hit the parity
674168404Spjd	 * last -- any errors along the way will force us to read the parity
675168404Spjd	 * data.
676168404Spjd	 */
677168404Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
678168404Spjd		rc = &rm->rm_col[c];
679168404Spjd		cvd = vd->vdev_child[rc->rc_devidx];
680185029Spjd		if (!vdev_readable(cvd)) {
681168404Spjd			if (c >= rm->rm_firstdatacol)
682168404Spjd				rm->rm_missingdata++;
683168404Spjd			else
684168404Spjd				rm->rm_missingparity++;
685168404Spjd			rc->rc_error = ENXIO;
686168404Spjd			rc->rc_tried = 1;	/* don't even try */
687168404Spjd			rc->rc_skipped = 1;
688168404Spjd			continue;
689168404Spjd		}
690168404Spjd		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
691168404Spjd			if (c >= rm->rm_firstdatacol)
692168404Spjd				rm->rm_missingdata++;
693168404Spjd			else
694168404Spjd				rm->rm_missingparity++;
695168404Spjd			rc->rc_error = ESTALE;
696168404Spjd			rc->rc_skipped = 1;
697168404Spjd			continue;
698168404Spjd		}
699168404Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
700168404Spjd		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
701168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
702168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
703185029Spjd			    zio->io_type, zio->io_priority, 0,
704168404Spjd			    vdev_raidz_child_done, rc));
705168404Spjd		}
706168404Spjd	}
707168404Spjd
708185029Spjd	return (ZIO_PIPELINE_CONTINUE);
709168404Spjd}
710168404Spjd
711168404Spjd/*
712168404Spjd * Report a checksum error for a child of a RAID-Z device.
713168404Spjd */
714168404Spjdstatic void
715168404Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc)
716168404Spjd{
717168404Spjd	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
718168404Spjd
719168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
720168404Spjd		mutex_enter(&vd->vdev_stat_lock);
721168404Spjd		vd->vdev_stat.vs_checksum_errors++;
722168404Spjd		mutex_exit(&vd->vdev_stat_lock);
723168404Spjd	}
724168404Spjd
725168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
726168404Spjd		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
727168404Spjd		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
728168404Spjd}
729168404Spjd
730168404Spjd/*
731168404Spjd * Generate the parity from the data columns. If we tried and were able to
732168404Spjd * read the parity without error, verify that the generated parity matches the
733168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
734168404Spjd * number such failures.
735168404Spjd */
736168404Spjdstatic int
737168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm)
738168404Spjd{
739168404Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
740168404Spjd	int c, ret = 0;
741168404Spjd	raidz_col_t *rc;
742168404Spjd
743168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
744168404Spjd		rc = &rm->rm_col[c];
745168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
746168404Spjd			continue;
747168404Spjd		orig[c] = zio_buf_alloc(rc->rc_size);
748168404Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
749168404Spjd	}
750168404Spjd
751168404Spjd	if (rm->rm_firstdatacol == 1)
752168404Spjd		vdev_raidz_generate_parity_p(rm);
753168404Spjd	else
754168404Spjd		vdev_raidz_generate_parity_pq(rm);
755168404Spjd
756168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
757168404Spjd		rc = &rm->rm_col[c];
758168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
759168404Spjd			continue;
760168404Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
761168404Spjd			raidz_checksum_error(zio, rc);
762168404Spjd			rc->rc_error = ECKSUM;
763168404Spjd			ret++;
764168404Spjd		}
765168404Spjd		zio_buf_free(orig[c], rc->rc_size);
766168404Spjd	}
767168404Spjd
768168404Spjd	return (ret);
769168404Spjd}
770168404Spjd
/*
 * Counts of reads whose data passed zio_checksum_error() only after
 * reconstruction from P alone, from Q alone, or from both P and Q.
 * vdev_raidz_io_done() bumps them with atomic_inc_64(); nothing in the
 * code visible here reads them back.
 */
771168404Spjdstatic uint64_t raidz_corrected_p;
772168404Spjdstatic uint64_t raidz_corrected_q;
773168404Spjdstatic uint64_t raidz_corrected_pq;
774168404Spjd
775185029Spjdstatic int
776185029Spjdvdev_raidz_worst_error(raidz_map_t *rm)
777185029Spjd{
778185029Spjd	int error = 0;
779185029Spjd
780185029Spjd	for (int c = 0; c < rm->rm_cols; c++)
781185029Spjd		error = zio_worst_error(error, rm->rm_col[c].rc_error);
782185029Spjd
783185029Spjd	return (error);
784185029Spjd}
785185029Spjd
/*
 * I/O completion handler for a RAID-Z vdev (listed in vdev_raidz_ops).
 *
 * The per-column results recorded in the raidz map (zio->io_vsd) are tallied
 * first.  For a write, the zio is failed only when more columns reported an
 * error than there are parity columns (rm_firstdatacol).  For a read, the
 * handler tries, in order:
 *	- reconstructing the columns that reported errors, if there is enough
 *	  parity that was actually read;
 *	- issuing child reads for every column not yet tried and returning;
 *	- reconstructing each data column (then each pair) in turn, keeping
 *	  the first result for which zio_checksum_error() returns 0.
 * If nothing checksums, io_error is set and, unless the zio is speculative,
 * a checksum ereport is posted for every column.
 *
 * At "done", when the read succeeded, the pool is writable (spa_mode &
 * FWRITE) and either unexpected errors were seen or this is a resilver,
 * repair writes are issued to every column with a non-zero rc_error.
 */
786168404Spjdstatic void
787168404Spjdvdev_raidz_io_done(zio_t *zio)
788168404Spjd{
789168404Spjd	vdev_t *vd = zio->io_vd;
790168404Spjd	vdev_t *cvd;
791168404Spjd	raidz_map_t *rm = zio->io_vsd;
792168404Spjd	raidz_col_t *rc, *rc1;
793168404Spjd	int unexpected_errors = 0;
794168404Spjd	int parity_errors = 0;
795168404Spjd	int parity_untried = 0;
796168404Spjd	int data_errors = 0;
797185029Spjd	int total_errors = 0;
798168404Spjd	int n, c, c1;
799168404Spjd
800168404Spjd	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
801168404Spjd
802168404Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
803168404Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
804168404Spjd
	/*
	 * Tally the column results.  Columns below rm_firstdatacol are
	 * parity, the rest are data.  An error on a column that was not
	 * skipped counts as unexpected.  Parity columns with no error that
	 * were never read are counted separately, since they cannot help
	 * with reconstruction yet.
	 */
805168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
806168404Spjd		rc = &rm->rm_col[c];
807168404Spjd
808168404Spjd		if (rc->rc_error) {
809185029Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
810168404Spjd
811168404Spjd			if (c < rm->rm_firstdatacol)
812168404Spjd				parity_errors++;
813168404Spjd			else
814168404Spjd				data_errors++;
815168404Spjd
816168404Spjd			if (!rc->rc_skipped)
817168404Spjd				unexpected_errors++;
818168404Spjd
819185029Spjd			total_errors++;
820168404Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
821168404Spjd			parity_untried++;
822168404Spjd		}
823168404Spjd	}
824168404Spjd
825168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
826168404Spjd		/*
827185029Spjd		 * XXX -- for now, treat partial writes as a success.
828185029Spjd		 * (If we couldn't write enough columns to reconstruct
829185029Spjd		 * the data, the I/O failed.  Otherwise, good enough.)
830185029Spjd		 *
831185029Spjd		 * Now that we support write reallocation, it would be better
832185029Spjd		 * to treat partial failure as real failure unless there are
833185029Spjd		 * no non-degraded top-level vdevs left, and not update DTLs
834185029Spjd		 * if we intend to reallocate.
835168404Spjd		 */
836168404Spjd		/* XXPOLICY */
837185029Spjd		if (total_errors > rm->rm_firstdatacol)
838185029Spjd			zio->io_error = vdev_raidz_worst_error(rm);
839168404Spjd
840168404Spjd		return;
841168404Spjd	}
842168404Spjd
843168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
844168404Spjd	/*
845168404Spjd	 * There are three potential phases for a read:
846168404Spjd	 *	1. produce valid data from the columns read
847168404Spjd	 *	2. read all disks and try again
848168404Spjd	 *	3. perform combinatorial reconstruction
849168404Spjd	 *
850168404Spjd	 * Each phase is progressively both more expensive and less likely to
851168404Spjd	 * occur. If we encounter more errors than we can repair or all phases
852168404Spjd	 * fail, we have no choice but to return an error.
853168404Spjd	 */
854168404Spjd
855168404Spjd	/*
856168404Spjd	 * If the number of errors we saw was correctable -- less than or equal
857168404Spjd	 * to the number of parity disks read -- attempt to produce data that
858168404Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
859168404Spjd	 * any errors.
860168404Spjd	 */
861185029Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
862168404Spjd		switch (data_errors) {
863168404Spjd		case 0:
864168404Spjd			if (zio_checksum_error(zio) == 0) {
865168738Spjd				/*
866168738Spjd				 * If we read parity information (unnecessarily
867168738Spjd				 * as it happens since no reconstruction was
868168738Spjd				 * needed) regenerate and verify the parity.
869168738Spjd				 * We also regenerate parity when resilvering
870168738Spjd				 * so we can write it out to the failed device
871168738Spjd				 * later.
872168738Spjd				 */
873168404Spjd				if (parity_errors + parity_untried <
874168738Spjd				    rm->rm_firstdatacol ||
875168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
876168404Spjd					n = raidz_parity_verify(zio, rm);
877168404Spjd					unexpected_errors += n;
878168404Spjd					ASSERT(parity_errors + n <=
879168404Spjd					    rm->rm_firstdatacol);
880168404Spjd				}
881168404Spjd				goto done;
882168404Spjd			}
883168404Spjd			break;
884168404Spjd
885168404Spjd		case 1:
886168404Spjd			/*
887168404Spjd			 * We either attempt to read all the parity columns or
888168404Spjd			 * none of them. If we didn't try to read parity, we
889168404Spjd			 * wouldn't be here in the correctable case. There must
890168404Spjd			 * also have been fewer parity errors than parity
891168404Spjd			 * columns or, again, we wouldn't be in this code path.
892168404Spjd			 */
893168404Spjd			ASSERT(parity_untried == 0);
894168404Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
895168404Spjd
896168404Spjd			/*
897168404Spjd			 * Find the column that reported the error.
898168404Spjd			 */
899168404Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
900168404Spjd				rc = &rm->rm_col[c];
901168404Spjd				if (rc->rc_error != 0)
902168404Spjd					break;
903168404Spjd			}
904168404Spjd			ASSERT(c != rm->rm_cols);
905168404Spjd			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
906168404Spjd			    rc->rc_error == ESTALE);
907168404Spjd
			/* Prefer P when it was read cleanly; else use Q. */
908168404Spjd			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
909168404Spjd				vdev_raidz_reconstruct_p(rm, c);
910168404Spjd			} else {
911168404Spjd				ASSERT(rm->rm_firstdatacol > 1);
912168404Spjd				vdev_raidz_reconstruct_q(rm, c);
913168404Spjd			}
914168404Spjd
915168404Spjd			if (zio_checksum_error(zio) == 0) {
916168404Spjd				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
917168404Spjd					atomic_inc_64(&raidz_corrected_p);
918168404Spjd				else
919168404Spjd					atomic_inc_64(&raidz_corrected_q);
920168404Spjd
921168404Spjd				/*
922168404Spjd				 * If there's more than one parity disk that
923168404Spjd				 * was successfully read, confirm that the
924168404Spjd				 * other parity disk produced the correct data.
925168404Spjd				 * This routine is suboptimal in that it
926168404Spjd				 * regenerates both the parity we wish to test
927168404Spjd				 * as well as the parity we just used to
928168404Spjd				 * perform the reconstruction, but this should
929168404Spjd				 * be a relatively uncommon case, and can be
930168404Spjd				 * optimized if it becomes a problem.
931168738Spjd				 * We also regenerate parity when resilvering
932168738Spjd				 * so we can write it out to the failed device
933168738Spjd				 * later.
934168404Spjd				 */
935168738Spjd				if (parity_errors < rm->rm_firstdatacol - 1 ||
936168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
937168404Spjd					n = raidz_parity_verify(zio, rm);
938168404Spjd					unexpected_errors += n;
939168404Spjd					ASSERT(parity_errors + n <=
940168404Spjd					    rm->rm_firstdatacol);
941168404Spjd				}
942168404Spjd
943168404Spjd				goto done;
944168404Spjd			}
945168404Spjd			break;
946168404Spjd
947168404Spjd		case 2:
948168404Spjd			/*
949168404Spjd			 * Two data column errors require double parity.
950168404Spjd			 */
951168404Spjd			ASSERT(rm->rm_firstdatacol == 2);
952168404Spjd
953168404Spjd			/*
954168404Spjd			 * Find the two columns that reported errors.
955168404Spjd			 */
956168404Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
957168404Spjd				rc = &rm->rm_col[c];
958168404Spjd				if (rc->rc_error != 0)
959168404Spjd					break;
960168404Spjd			}
961168404Spjd			ASSERT(c != rm->rm_cols);
962168404Spjd			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
963168404Spjd			    rc->rc_error == ESTALE);
964168404Spjd
			/*
			 * c1 keeps the index of the first bad column; the
			 * scan for the second resumes at the column after it.
			 */
965168404Spjd			for (c1 = c++; c < rm->rm_cols; c++) {
966168404Spjd				rc = &rm->rm_col[c];
967168404Spjd				if (rc->rc_error != 0)
968168404Spjd					break;
969168404Spjd			}
970168404Spjd			ASSERT(c != rm->rm_cols);
971168404Spjd			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
972168404Spjd			    rc->rc_error == ESTALE);
973168404Spjd
974168404Spjd			vdev_raidz_reconstruct_pq(rm, c1, c);
975168404Spjd
976168404Spjd			if (zio_checksum_error(zio) == 0) {
977168404Spjd				atomic_inc_64(&raidz_corrected_pq);
978168404Spjd				goto done;
979168404Spjd			}
980168404Spjd			break;
981168404Spjd
982168404Spjd		default:
983168404Spjd			ASSERT(rm->rm_firstdatacol <= 2);
984168404Spjd			ASSERT(0);
985168404Spjd		}
986168404Spjd	}
987168404Spjd
988168404Spjd	/*
989168404Spjd	 * This isn't a typical situation -- either we got a read error or
990168404Spjd	 * a child silently returned bad data. Read every block so we can
991168404Spjd	 * try again with as much data and parity as we can track down. If
992168404Spjd	 * we've already been through once before, all children will be marked
993168404Spjd	 * as tried so we'll proceed to combinatorial reconstruction.
994168404Spjd	 */
995168404Spjd	unexpected_errors = 1;
996168404Spjd	rm->rm_missingdata = 0;
997168404Spjd	rm->rm_missingparity = 0;
998168404Spjd
	/*
	 * The outer loop only locates the first untried column.  From there
	 * the inner do-while issues a child read for it and for every later
	 * untried column, then the function returns.  Presumably this
	 * handler runs again once those reads complete -- see the comment
	 * above.
	 */
999168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
1000168404Spjd		if (rm->rm_col[c].rc_tried)
1001168404Spjd			continue;
1002168404Spjd
1003168404Spjd		zio_vdev_io_redone(zio);
1004168404Spjd		do {
1005168404Spjd			rc = &rm->rm_col[c];
			/*
			 * "continue" in a do-while jumps to the loop
			 * condition, so c still advances past this column.
			 */
1006168404Spjd			if (rc->rc_tried)
1007168404Spjd				continue;
1008168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL,
1009168404Spjd			    vd->vdev_child[rc->rc_devidx],
1010168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1011185029Spjd			    zio->io_type, zio->io_priority, 0,
1012168404Spjd			    vdev_raidz_child_done, rc));
1013168404Spjd		} while (++c < rm->rm_cols);
1014185029Spjd
1015168404Spjd		return;
1016168404Spjd	}
1017168404Spjd
1018168404Spjd	/*
1019168404Spjd	 * At this point we've attempted to reconstruct the data given the
1020168404Spjd	 * errors we detected, and we've attempted to read all columns. There
1021168404Spjd	 * must, therefore, be one or more additional problems -- silent errors
1022168404Spjd	 * resulting in invalid data rather than explicit I/O errors resulting
1023168404Spjd	 * in absent data. Before we attempt combinatorial reconstruction make
1024168404Spjd	 * sure we have a chance of coming up with the right answer.
1025168404Spjd	 */
1026185029Spjd	if (total_errors >= rm->rm_firstdatacol) {
1027185029Spjd		zio->io_error = vdev_raidz_worst_error(rm);
1028185029Spjd		/*
1029185029Spjd		 * If there were exactly as many device errors as parity
1030185029Spjd		 * columns, yet we couldn't reconstruct the data, then at
1031185029Spjd		 * least one device must have returned bad data silently.
1032185029Spjd		 */
1033185029Spjd		if (total_errors == rm->rm_firstdatacol)
1034185029Spjd			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1035168404Spjd		goto done;
1036168404Spjd	}
1037168404Spjd
	/*
	 * Combinatorial reconstruction.  Each attempt saves the candidate
	 * column(s), reconstructs in place, and tests the block checksum.
	 * On failure the saved contents are copied back before trying the
	 * next candidate.
	 */
1038168404Spjd	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1039168404Spjd		/*
1040168404Spjd		 * Attempt to reconstruct the data from parity P.
1041168404Spjd		 */
1042168404Spjd		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1043168404Spjd			void *orig;
1044168404Spjd			rc = &rm->rm_col[c];
1045168404Spjd
1046168404Spjd			orig = zio_buf_alloc(rc->rc_size);
1047168404Spjd			bcopy(rc->rc_data, orig, rc->rc_size);
1048168404Spjd			vdev_raidz_reconstruct_p(rm, c);
1049168404Spjd
1050168404Spjd			if (zio_checksum_error(zio) == 0) {
1051168404Spjd				zio_buf_free(orig, rc->rc_size);
1052168404Spjd				atomic_inc_64(&raidz_corrected_p);
1053168404Spjd
1054168404Spjd				/*
1055168404Spjd				 * If this child didn't know that it returned
1056168404Spjd				 * bad data, inform it.
1057168404Spjd				 */
1058168404Spjd				if (rc->rc_tried && rc->rc_error == 0)
1059168404Spjd					raidz_checksum_error(zio, rc);
1060168404Spjd				rc->rc_error = ECKSUM;
1061168404Spjd				goto done;
1062168404Spjd			}
1063168404Spjd
1064168404Spjd			bcopy(orig, rc->rc_data, rc->rc_size);
1065168404Spjd			zio_buf_free(orig, rc->rc_size);
1066168404Spjd		}
1067168404Spjd	}
1068168404Spjd
1069168404Spjd	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1070168404Spjd		/*
1071168404Spjd		 * Attempt to reconstruct the data from parity Q.
1072168404Spjd		 */
1073168404Spjd		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1074168404Spjd			void *orig;
1075168404Spjd			rc = &rm->rm_col[c];
1076168404Spjd
1077168404Spjd			orig = zio_buf_alloc(rc->rc_size);
1078168404Spjd			bcopy(rc->rc_data, orig, rc->rc_size);
1079168404Spjd			vdev_raidz_reconstruct_q(rm, c);
1080168404Spjd
1081168404Spjd			if (zio_checksum_error(zio) == 0) {
1082168404Spjd				zio_buf_free(orig, rc->rc_size);
1083168404Spjd				atomic_inc_64(&raidz_corrected_q);
1084168404Spjd
1085168404Spjd				/*
1086168404Spjd				 * If this child didn't know that it returned
1087168404Spjd				 * bad data, inform it.
1088168404Spjd				 */
1089168404Spjd				if (rc->rc_tried && rc->rc_error == 0)
1090168404Spjd					raidz_checksum_error(zio, rc);
1091168404Spjd				rc->rc_error = ECKSUM;
1092168404Spjd				goto done;
1093168404Spjd			}
1094168404Spjd
1095168404Spjd			bcopy(orig, rc->rc_data, rc->rc_size);
1096168404Spjd			zio_buf_free(orig, rc->rc_size);
1097168404Spjd		}
1098168404Spjd	}
1099168404Spjd
1100168404Spjd	if (rm->rm_firstdatacol > 1 &&
1101168404Spjd	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1102168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1103168404Spjd		/*
1104168404Spjd		 * Attempt to reconstruct the data from both P and Q.
1105168404Spjd		 */
		/*
		 * Try every pair (c, c1) of data columns with c < c1; the
		 * outer bound is rm_cols - 1 so that c1 always has at least
		 * one column to cover.
		 */
1106168404Spjd		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1107168404Spjd			void *orig, *orig1;
1108168404Spjd			rc = &rm->rm_col[c];
1109168404Spjd
1110168404Spjd			orig = zio_buf_alloc(rc->rc_size);
1111168404Spjd			bcopy(rc->rc_data, orig, rc->rc_size);
1112168404Spjd
1113168404Spjd			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1114168404Spjd				rc1 = &rm->rm_col[c1];
1115168404Spjd
1116168404Spjd				orig1 = zio_buf_alloc(rc1->rc_size);
1117168404Spjd				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1118168404Spjd
1119168404Spjd				vdev_raidz_reconstruct_pq(rm, c, c1);
1120168404Spjd
1121168404Spjd				if (zio_checksum_error(zio) == 0) {
1122168404Spjd					zio_buf_free(orig, rc->rc_size);
1123168404Spjd					zio_buf_free(orig1, rc1->rc_size);
1124168404Spjd					atomic_inc_64(&raidz_corrected_pq);
1125168404Spjd
1126168404Spjd					/*
1127168404Spjd					 * If these children didn't know they
1128168404Spjd					 * returned bad data, inform them.
1129168404Spjd					 */
1130168404Spjd					if (rc->rc_tried && rc->rc_error == 0)
1131168404Spjd						raidz_checksum_error(zio, rc);
1132168404Spjd					if (rc1->rc_tried && rc1->rc_error == 0)
1133168404Spjd						raidz_checksum_error(zio, rc1);
1134168404Spjd
1135168404Spjd					rc->rc_error = ECKSUM;
1136168404Spjd					rc1->rc_error = ECKSUM;
1137168404Spjd
1138168404Spjd					goto done;
1139168404Spjd				}
1140168404Spjd
1141168404Spjd				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1142168404Spjd				zio_buf_free(orig1, rc1->rc_size);
1143168404Spjd			}
1144168404Spjd
1145168404Spjd			bcopy(orig, rc->rc_data, rc->rc_size);
1146168404Spjd			zio_buf_free(orig, rc->rc_size);
1147168404Spjd		}
1148168404Spjd	}
1149168404Spjd
1150168404Spjd	/*
1151168404Spjd	 * All combinations failed to checksum. Generate checksum ereports for
1152168404Spjd	 * all children.
1153168404Spjd	 */
1154168404Spjd	zio->io_error = ECKSUM;
1155185029Spjd
1156168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1157168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
1158168404Spjd			rc = &rm->rm_col[c];
1159168404Spjd			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1160168404Spjd			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1161168404Spjd			    rc->rc_offset, rc->rc_size);
1162168404Spjd		}
1163168404Spjd	}
1164168404Spjd
1165168404Spjddone:
1166168404Spjd	zio_checksum_verified(zio);
1167168404Spjd
	/*
	 * Repair only when the read succeeded, the pool is open for writing,
	 * and there is something to fix: an unexpected error was seen or the
	 * read is part of a resilver.
	 */
1168168404Spjd	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1169168404Spjd	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1170168404Spjd		/*
1171168404Spjd		 * Use the good data we have in hand to repair damaged children.
1172168404Spjd		 */
1173168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
1174168404Spjd			rc = &rm->rm_col[c];
1175168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1176168404Spjd
			/* Columns without a recorded error need no rewrite. */
1177168404Spjd			if (rc->rc_error == 0)
1178168404Spjd				continue;
1179168404Spjd
1180185029Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1181168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1182168404Spjd			    ZIO_TYPE_WRITE, zio->io_priority,
1183185029Spjd			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
1184168404Spjd		}
1185168404Spjd	}
1186168404Spjd}
1187168404Spjd
1188168404Spjdstatic void
1189168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1190168404Spjd{
1191168404Spjd	if (faulted > vd->vdev_nparity)
1192168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1193168404Spjd		    VDEV_AUX_NO_REPLICAS);
1194168404Spjd	else if (degraded + faulted != 0)
1195168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1196168404Spjd	else
1197168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1198168404Spjd}
1199168404Spjd
/*
 * Operations vector for the RAID-Z vdev type.  The initializer is
 * positional: the six vdev_raidz_* handlers (open, close, asize, io_start,
 * io_done, state_change) followed by the type name and the leaf flag.
 * The entry order must match the member order of vdev_ops_t, which is
 * declared outside the code visible here.
 */
1200168404Spjdvdev_ops_t vdev_raidz_ops = {
1201168404Spjd	vdev_raidz_open,
1202168404Spjd	vdev_raidz_close,
1203168404Spjd	vdev_raidz_asize,
1204168404Spjd	vdev_raidz_io_start,
1205168404Spjd	vdev_raidz_io_done,
1206168404Spjd	vdev_raidz_state_change,
1207168404Spjd	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1208168404Spjd	B_FALSE			/* not a leaf vdev */
1209168404Spjd};
1210