/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>

#ifdef ZFS_DEBUG
#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
#endif
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

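/*
 * A quick worked example of the arithmetic above (editorial note, not part
 * of the original comment): for A = 0x80, (A << 1) = 0x00 after truncation
 * to a byte, and since the high bit of A was set we XOR in 0x1d, so
 * A * 2 = 0x1d -- precisely the reduction by the primitive polynomial.
 * The inverse rule follows because the multiplicative group of the field
 * has 255 elements: A^255 = 1, hence A * A^254 = 1 and A^-1 = A^254.
 */
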
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
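
/*
 * Editorial note on the mask trick above, one byte lane at a time: for a
 * lane whose top bit is set, (mask) holds 0x80, and
 * ((0x80 << 1) - (0x80 >> 7)) = 0x100 - 0x01 = 0xff; the borrow from the
 * subtraction consumes exactly the bit that the left shift deposited into
 * the next lane. A lane with a clear top bit yields 0x00. The mask thus
 * selects, per byte, whether 0x1d is XORed in -- matching
 * VDEV_RAIDZ_MUL_2() applied to each of the eight bytes individually.
 * A minimal self-check sketch (editorial, deliberately not compiled):
 */
#if 0
static void
vdev_raidz_64mul_selftest(void)
{
	uint64_t x = 0x0123456789abcdefULL, y = x, mask;

	VDEV_RAIDZ_64MUL_2(x, mask);

	/* Each byte of x must equal the byte-wise multiply of y. */
	for (int i = 0; i < 8; i++) {
		uint8_t b = (y >> (i * 8)) & 0xff;
		ASSERT3U((x >> (i * 8)) & 0xff, ==,
		    VDEV_RAIDZ_MUL_2(b) & 0xff);
	}
}
#endif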

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
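
/*
 * Editorial sketch (not part of the driver): the two tables above can be
 * regenerated by repeatedly multiplying by the generator 2 under the 0x1d
 * reduction. vdev_raidz_pow2[255] wraps back to 0x01 because the
 * multiplicative group has order 255.
 */
#if 0
static void
vdev_raidz_gen_tables(uint8_t p2[256], uint8_t l2[256])
{
	uint8_t x = 1;

	l2[0] = 0;		/* log of 0 is undefined; 0 by convention */
	for (int i = 0; i < 256; i++) {
		p2[i] = x;
		if (i < 255)	/* don't let the wrap clobber l2[1] */
			l2[x] = i;
		x = VDEV_RAIDZ_MUL_2(x);
	}
}
#endif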

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
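
/*
 * Editorial usage note (not in the original): vdev_raidz_exp2(a, 1)
 * multiplies a by 2, and vdev_raidz_exp2(a, 255 - vdev_raidz_log2[b])
 * divides a by b, since adding 255 - log_2(b) to the exponent is
 * subtraction mod 255. The reconstruction routines below use exactly
 * this pattern to divide out parity coefficients.
 */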

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		if (rm->rm_col[c].rc_abd != NULL)
			abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_abd != NULL)
			abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the I/O evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    boolean_t dofree, uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
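
	/*
	 * Editorial worked example (not in the original): with 512-byte
	 * sectors (unit_shift = 9), dcols = 5 and nparity = 1, a 16 KB I/O
	 * gives s = 32, q = 8, r = 0, bc = 0, and tot = 32 + 1 * 8 = 40:
	 * eight full rows of four data sectors plus one parity sector each.
	 * One more data sector (s = 33) gives q = 8, r = 1,
	 * bc = r + nparity = 2, and tot = 33 + 1 * 9 = 42; the two "big
	 * columns" each hold one extra sector.
	 */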

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_abd =
			    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
		}

		rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
		off = rm->rm_col[c].rc_size;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
			off += rm->rm_col[c].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);
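
	/*
	 * Editorial note (not in the original): the loop above recomputed Q
	 * as if column x were zero, so once the post-function below XORs in
	 * the real Q, dst holds 2^(rm_cols - 1 - x) * D_x. Raising it by
	 * exp = 255 - (rm_cols - 1 - x) divides out that coefficient
	 * (exponents are taken mod 255), leaving D_x.
	 */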

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;
	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

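	/*
	 * Editorial derivation sketch (not in the original): substituting
	 * D_y = (P + Pxy) + D_x into the Qxy equation eliminates D_y:
	 *
	 *	Q + Qxy = (2^(ndevs-1-x) + 2^(ndevs-1-y)) * D_x +
	 *	    2^(ndevs-1-y) * (P + Pxy)
	 *
	 * Dividing through by (2^(ndevs-1-x) + 2^(ndevs-1-y)) and factoring
	 * out 2^(ndevs-1-y) yields the A and B above. Below, "a" and "b"
	 * hold the power-of-2 factors of A and B, and "tmp" is the log of
	 * their common factor (2^(x - y) + 1)^-1.
	 */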
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}
	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

#ifdef illumos
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible.  In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots.  For that reason, this function eschews parity for
 * performance and simplicity.  The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned.  In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs.  8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requested offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, treat
	 * the on-disk format as if the only blocks that exist are complete
	 * 128 KB blocks.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);
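
		/*
		 * Worked example (illustrative): continuing the 32 KB write
		 * at offset 64 KB pictured above, each data column holds
		 * 32 KB of the 128 KB block.  Only the column whose
		 * coloffset range is [64 KB, 96 KB) overlaps [start, end),
		 * yielding colstart = 64 KB, colend = 96 KB,
		 * colsize = 32 KB, and colskip = 0, so exactly one child
		 * receives I/O.
		 */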

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}
#endif

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
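
/*
 * Worked example (illustrative, assuming ashift = 9, i.e. 512-byte sectors):
 * on a 5-wide RAIDZ1 vdev, psize = 32 KB is 64 data sectors; parity adds
 * nparity * ceil(64 / 4) = 16 sectors for 80 total, and
 * roundup(80, nparity + 1 = 2) leaves 80, so asize = 80 << 9 = 40 KB -- the
 * same 40 KB figure cited in the dump-device comment above.
 */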

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
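
/*
 * raidz_corrected[] is indexed by the reconstruction code, i.e. the bitmask
 * of parity columns used; for example, raidz_corrected[1 << VDEV_RAIDZ_P]
 * counts repairs that used only the P parity column.
 */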

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
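
/*
 * Illustrative walk-through (hypothetical geometry): with rm_cols = 5,
 * rm_firstdatacol = 2, and no columns reporting errors, the loops above try
 * the single data targets (2), (3), (4), then the pairs (0,2), (1,2), (0,3),
 * (1,3), (2,3), (0,4), (1,4), (2,4), (3,4) -- every combination containing
 * at least one data column -- reconstructing and re-checksumming each until
 * one verifies or all are exhausted.
 */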

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm->rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty, which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
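
/*
 * Example (illustrative numbers, assuming ashift = 9): for dcols = 5 and
 * nparity = 1, a 2-sector block starting at parent sector b = 19 gives
 * f = 19 % 5 = 4, so the s + nparity = 3 children probed are
 * (4 + 0) % 5 = 4, (4 + 1) % 5 = 0, and (4 + 2) % 5 = 1.
 */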

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}
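
/*
 * Worked example (illustrative): with width = 5, tgt_col = 2, and ashift = 9,
 * a logical range covering parent sectors [0, 5) places one sector on each
 * child.  Then b_start = 0 gives start_row = 0, and b_end = 5 > tgt_col gives
 * end_row = ((5 - 2 - 1) / 5) + 1 = 1, so child 2's physical range is the
 * single sector [0, 1 << 9).
 */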

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_need_resilver,
	NULL,
	NULL,
	NULL,
	vdev_raidz_xlate,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
