vdev_raidz.c (269407 → 274304)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 * o addition (+) is represented by a bitwise XOR
 * o subtraction (-) is therefore identical to addition: A + B = A - B
 * o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
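
/*
 * Illustrative sketch (not part of the original source): the bitwise
 * multiply-by-2 described above, written out as a helper. It is
 * equivalent to the VDEV_RAIDZ_MUL_2() macro defined below.
 */
static inline uint8_t
vdev_raidz_mul2_example(uint8_t a)
{
	/* Shift left; if bit 7 was set, fold the overflow back in as 0x1d. */
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}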

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
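
/*
 * Illustrative sketch (not part of the original source): the 64-bit macro
 * above is equivalent to applying VDEV_RAIDZ_MUL_2() to each of the eight
 * packed bytes independently, which this hypothetical check verifies.
 */
static void
vdev_raidz_64mul_check(uint64_t x)
{
	uint64_t y = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(y, mask);
	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;

		ASSERT3U((y >> (i * 8)) & 0xff, ==,
		    VDEV_RAIDZ_MUL_2(b) & 0xff);
	}
}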

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
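
/*
 * Illustrative sketch (not part of the original source): general
 * multiplication of two field elements via the tables, using the identity
 * A * B = 2^(log_2(A) + log_2(B)) noted in the block comment above.
 */
static uint8_t
vdev_raidz_mul_example(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);

	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}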

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		if (rm->rm_col[c].rc_data != NULL)
			zio_buf_free(rm->rm_col[c].rc_data,
			    rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
    boolean_t dofree, uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
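
/*
 * Worked example (illustrative, not in the original source): a 16 KB
 * write to a raidz1 vdev with dcols = 5 children and 512-byte sectors
 * (unit_shift = 9) gives s = 32, q = 32 / (5 - 1) = 8, r = 0 and bc = 0,
 * so each of the four data columns carries 8 sectors, the parity column
 * carries 8, and tot = 32 + 1 * 8 = 40 sectors.
 */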

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}
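
/*
 * Worked equation (illustrative, not in the original source): with
 * single parity, P = D_0 + ... + D_n-1, so a single missing column D_x
 * is recovered as D_x = P + (sum of the surviving data columns), which
 * is exactly the copy-then-XOR loop above.
 */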

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
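
/*
 * Worked equation (illustrative, not in the original source): the loop
 * above recomputes Q as if column x were zero (call it Q'), so
 * Q + Q' = 2^(ndevs - 1 - x) * D_x, and D_x is recovered by multiplying
 * by the inverse, 2^(255 - (ndevs - 1 - x)) -- the "exp" used above.
 */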

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
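
/*
 * Derivation sketch (illustrative, not in the original source): from the
 * identities above, P + Pxy = D_x + D_y and
 * Q + Qxy = 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y. Scaling
 * the first by 2^(ndevs - 1 - y) and adding it to the second eliminates
 * D_y, leaving a single linear equation in D_x whose leading coefficient
 * is inverted via the log/exp tables (the "tmp" exponent above); A and B
 * are the resulting multipliers applied per byte in the loop.
 */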

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and for
 * speedy computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
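
/*
 * Illustrative note (not in the original source): after the loop above,
 * rows[i][j] = 2^(map[i] * (n - 1 - j)) with exponent arithmetic mod 255,
 * i.e. the Vandermonde row for generator 2^map[i] (1, 2 or 4), matching
 * the dispersal matrix in the block comment above.
 */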

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
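
/*
 * Illustrative note (not in the original source): for each missing
 * column, the inner loop above accumulates the running sum
 * dst[cc] = SUM_i invrows[cc][i] * src_i using log-domain multiplication
 * (log + invlog, then pow2[]), i.e. it applies the inverted matrix rows
 * to the surviving data and parity columns.
 */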

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

#ifdef illumos
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent
	 * which contains the requisite offset of the data being read or
	 * written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
#endif

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
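
/*
 * Worked example (illustrative, not in the original source): with
 * psize = 16 KB, ashift = 9, cols = 5 and nparity = 1, asize starts at
 * 32 sectors, gains 32 / 4 = 8 parity sectors, and roundup(40, 2) = 40
 * sectors, or 20 KB -- matching the 40-sector map in the example after
 * vdev_raidz_map_alloc() above.
 */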

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
1729static int
1729static void
1730vdev_raidz_io_start(zio_t *zio)
1731{
1732 vdev_t *vd = zio->io_vd;
1733 vdev_t *tvd = vd->vdev_top;
1734 vdev_t *cvd;
1735 raidz_map_t *rm;
1736 raidz_col_t *rc;
1737 int c, i;
1738
1739 rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740 zio->io_type == ZIO_TYPE_FREE,
1741 tvd->vdev_ashift, vd->vdev_children,
1742 vd->vdev_nparity);
1743
1744 zio->io_vsd = rm;
1745 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1746
1747 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1748
1749 if (zio->io_type == ZIO_TYPE_FREE) {
1750 for (c = 0; c < rm->rm_cols; c++) {
1751 rc = &rm->rm_col[c];
1752 cvd = vd->vdev_child[rc->rc_devidx];
1753 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754 rc->rc_offset, rc->rc_data, rc->rc_size,
1755 zio->io_type, zio->io_priority, 0,
1756 vdev_raidz_child_done, rc));
1757 }
1758
1759 zio_interrupt(zio);
1760 return (ZIO_PIPELINE_STOP);
1759 zio_execute(zio);
1760 return;
1761 }
1762
1763 if (zio->io_type == ZIO_TYPE_WRITE) {
1764 vdev_raidz_generate_parity(rm);
1765
1766 for (c = 0; c < rm->rm_cols; c++) {
1767 rc = &rm->rm_col[c];
1768 cvd = vd->vdev_child[rc->rc_devidx];
1769 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1770 rc->rc_offset, rc->rc_data, rc->rc_size,
1771 zio->io_type, zio->io_priority, 0,
1772 vdev_raidz_child_done, rc));
1773 }
1774
1775 /*
1776 * Generate optional I/Os for any skipped sectors to improve
1777 * aggregation contiguity.
1778 */
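		/*
		 * The children created below are ZIO_FLAG_NODATA |
		 * ZIO_FLAG_OPTIONAL: they carry no data buffer, and the vdev
		 * queue is free to drop them, issuing them only when doing so
		 * lets it aggregate the real writes on either side of the
		 * padding into a single larger I/O.
		 */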
1779 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1780 ASSERT(c <= rm->rm_scols);
1781 if (c == rm->rm_scols)
1782 c = 0;
1783 rc = &rm->rm_col[c];
1784 cvd = vd->vdev_child[rc->rc_devidx];
1785 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1786 rc->rc_offset + rc->rc_size, NULL,
1787 1 << tvd->vdev_ashift,
1788 zio->io_type, zio->io_priority,
1789 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1790 }
1791
1792 zio_interrupt(zio);
1793 return (ZIO_PIPELINE_STOP);
1792 zio_execute(zio);
1793 return;
1794 }
1795
1796 ASSERT(zio->io_type == ZIO_TYPE_READ);
1797
1798 /*
1799 * Iterate over the columns in reverse order so that we hit the parity
1800 * last -- any errors along the way will force us to read the parity.
1801 */
1802 for (c = rm->rm_cols - 1; c >= 0; c--) {
1803 rc = &rm->rm_col[c];
1804 cvd = vd->vdev_child[rc->rc_devidx];
1805 if (!vdev_readable(cvd)) {
1806 if (c >= rm->rm_firstdatacol)
1807 rm->rm_missingdata++;
1808 else
1809 rm->rm_missingparity++;
1810 rc->rc_error = SET_ERROR(ENXIO);
1811 rc->rc_tried = 1; /* don't even try */
1812 rc->rc_skipped = 1;
1813 continue;
1814 }
1815 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1816 if (c >= rm->rm_firstdatacol)
1817 rm->rm_missingdata++;
1818 else
1819 rm->rm_missingparity++;
1820 rc->rc_error = SET_ERROR(ESTALE);
1821 rc->rc_skipped = 1;
1822 continue;
1823 }
1824 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1825 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1826 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1827 rc->rc_offset, rc->rc_data, rc->rc_size,
1828 zio->io_type, zio->io_priority, 0,
1829 vdev_raidz_child_done, rc));
1830 }
1831 }
1832
1833 zio_interrupt(zio);
1834 return (ZIO_PIPELINE_STOP);
1833 zio_execute(zio);
1835}
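
The change threaded through vdev_raidz_io_start() above, and repeated at each of its three exits, is this revision's new I/O-start contract: the routine now returns void and advances the zio itself via zio_execute(), where it previously called zio_interrupt() and reported ZIO_PIPELINE_STOP back to the pipeline. A minimal standalone sketch of the two conventions; everything in it (the struct, execute(), the enum) is a placeholder for illustration, not ZFS API:

#include <stdio.h>

typedef struct zio { int stage; } zio_t;

enum pipeline_rv { PIPELINE_CONTINUE, PIPELINE_STOP };

static void
execute(zio_t *zio)			/* stand-in for zio_execute() */
{
	zio->stage++;
}

/* Old contract: issue the children, then tell the pipeline it has stopped. */
static enum pipeline_rv
io_start_old(zio_t *zio)
{
	/* ... zio_nowait() the child I/Os here ... */
	execute(zio);			/* formerly via zio_interrupt() */
	return (PIPELINE_STOP);
}

/* New contract: the stage is void and re-dispatches the zio itself. */
static void
io_start_new(zio_t *zio)
{
	/* ... zio_nowait() the child I/Os here ... */
	execute(zio);
}

int
main(void)
{
	zio_t z = { 0 };

	(void) io_start_old(&z);
	io_start_new(&z);
	printf("advanced to stage %d\n", z.stage);
	return (0);
}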
1836
1837
1838/*
1839 * Report a checksum error for a child of a RAID-Z device.
1840 */
1841static void
1842raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1843{
1844 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1845
1846 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847 zio_bad_cksum_t zbc;
1848 raidz_map_t *rm = zio->io_vsd;
1849
1850 mutex_enter(&vd->vdev_stat_lock);
1851 vd->vdev_stat.vs_checksum_errors++;
1852 mutex_exit(&vd->vdev_stat_lock);
1853
1854 zbc.zbc_has_cksum = 0;
1855 zbc.zbc_injected = rm->rm_ecksuminjected;
1856
1857 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1859 &zbc);
1860 }
1861}
1862
1863/*
1864 * We keep track of whether or not there were any injected errors, so that
1865 * any ereports we generate can note it.
1866 */
1867static int
1868raidz_checksum_verify(zio_t *zio)
1869{
1870 zio_bad_cksum_t zbc;
1871 raidz_map_t *rm = zio->io_vsd;
1872
1873 int ret = zio_checksum_error(zio, &zbc);
1874 if (ret != 0 && zbc.zbc_injected != 0)
1875 rm->rm_ecksuminjected = 1;
1876
1877 return (ret);
1878}
1879
1880/*
1881 * Generate the parity from the data columns. If we tried and were able to
1882 * read the parity without error, verify that the generated parity matches the
1883 * data we read. If it doesn't, we fire off a checksum error. Return the
1884 * number of such failures.
1885 */
1886static int
1887raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1888{
1889 void *orig[VDEV_RAIDZ_MAXPARITY];
1890 int c, ret = 0;
1891 raidz_col_t *rc;
1892
1893 blkptr_t *bp = zio->io_bp;
1894 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1896
1897 if (checksum == ZIO_CHECKSUM_NOPARITY)
1898 return (ret);
1899
1900 for (c = 0; c < rm->rm_firstdatacol; c++) {
1901 rc = &rm->rm_col[c];
1902 if (!rc->rc_tried || rc->rc_error != 0)
1903 continue;
1904 orig[c] = zio_buf_alloc(rc->rc_size);
1905 bcopy(rc->rc_data, orig[c], rc->rc_size);
1906 }
1907
1908 vdev_raidz_generate_parity(rm);
1909
1910 for (c = 0; c < rm->rm_firstdatacol; c++) {
1911 rc = &rm->rm_col[c];
1912 if (!rc->rc_tried || rc->rc_error != 0)
1913 continue;
1914 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915 raidz_checksum_error(zio, rc, orig[c]);
1916 rc->rc_error = SET_ERROR(ECKSUM);
1917 ret++;
1918 }
1919 zio_buf_free(orig[c], rc->rc_size);
1920 }
1921
1922 return (ret);
1923}
1924
1925/*
1926 * Keep statistics on all the ways that we used parity to correct data.
1927 */
1928static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1929
1930static int
1931vdev_raidz_worst_error(raidz_map_t *rm)
1932{
1933 int error = 0;
1934
1935 for (int c = 0; c < rm->rm_cols; c++)
1936 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1937
1938 return (error);
1939}
1940
1941/*
1942 * Iterate over all combinations of bad data and attempt a reconstruction.
1943 * Note that the algorithm below is non-optimal because it doesn't take into
1944 * account how reconstruction is actually performed. For example, with
1945 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1946 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1947 * cases we'd only use parity information in column 0.
1948 */
1949static int
1950vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1951{
1952 raidz_map_t *rm = zio->io_vsd;
1953 raidz_col_t *rc;
1954 void *orig[VDEV_RAIDZ_MAXPARITY];
1955 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1956 int *tgts = &tstore[1];
1957 int current, next, i, c, n;
1958 int code, ret = 0;
1959
1960 ASSERT(total_errors < rm->rm_firstdatacol);
1961
1962 /*
1963 * This simplifies one edge condition.
1964 */
1965 tgts[-1] = -1;
1966
1967 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1968 /*
1969 * Initialize the targets array by finding the first n columns
1970 * that contain no error.
1971 *
1972 * If there were no data errors, we need to ensure that we're
1973 * always explicitly attempting to reconstruct at least one
1974 * data column. To do this, we simply push the highest target
1975 * up into the data columns.
1976 */
1977 for (c = 0, i = 0; i < n; i++) {
1978 if (i == n - 1 && data_errors == 0 &&
1979 c < rm->rm_firstdatacol) {
1980 c = rm->rm_firstdatacol;
1981 }
1982
1983 while (rm->rm_col[c].rc_error != 0) {
1984 c++;
1985 ASSERT3S(c, <, rm->rm_cols);
1986 }
1987
1988 tgts[i] = c++;
1989 }
1990
1991 /*
1992 * Setting tgts[n] simplifies the other edge condition.
1993 */
1994 tgts[n] = rm->rm_cols;
1995
1996 /*
1997 * These buffers were allocated in previous iterations.
1998 */
1999 for (i = 0; i < n - 1; i++) {
2000 ASSERT(orig[i] != NULL);
2001 }
2002
2003 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2004
2005 current = 0;
2006 next = tgts[current];
2007
2008 while (current != n) {
2009 tgts[current] = next;
2010 current = 0;
2011
2012 /*
2013 * Save off the original data that we're going to
2014 * attempt to reconstruct.
2015 */
2016 for (i = 0; i < n; i++) {
2017 ASSERT(orig[i] != NULL);
2018 c = tgts[i];
2019 ASSERT3S(c, >=, 0);
2020 ASSERT3S(c, <, rm->rm_cols);
2021 rc = &rm->rm_col[c];
2022 bcopy(rc->rc_data, orig[i], rc->rc_size);
2023 }
2024
2025 /*
2026 * Attempt a reconstruction and exit the outer loop on
2027 * success.
2028 */
2029 code = vdev_raidz_reconstruct(rm, tgts, n);
2030 if (raidz_checksum_verify(zio) == 0) {
2031 atomic_inc_64(&raidz_corrected[code]);
2032
2033 for (i = 0; i < n; i++) {
2034 c = tgts[i];
2035 rc = &rm->rm_col[c];
2036 ASSERT(rc->rc_error == 0);
2037 if (rc->rc_tried)
2038 raidz_checksum_error(zio, rc,
2039 orig[i]);
2040 rc->rc_error = SET_ERROR(ECKSUM);
2041 }
2042
2043 ret = code;
2044 goto done;
2045 }
2046
2047 /*
2048 * Restore the original data.
2049 */
2050 for (i = 0; i < n; i++) {
2051 c = tgts[i];
2052 rc = &rm->rm_col[c];
2053 bcopy(orig[i], rc->rc_data, rc->rc_size);
2054 }
2055
2056 do {
2057 /*
2058 * Find the next valid column after the current
2059 * position..
2060 */
2061 for (next = tgts[current] + 1;
2062 next < rm->rm_cols &&
2063 rm->rm_col[next].rc_error != 0; next++)
2064 continue;
2065
2066 ASSERT(next <= tgts[current + 1]);
2067
2068 /*
2069 * If that spot is available, we're done here.
2070 */
2071 if (next != tgts[current + 1])
2072 break;
2073
2074 /*
2075 * Otherwise, find the next valid column after
2076 * the previous position.
2077 */
2078 for (c = tgts[current - 1] + 1;
2079 rm->rm_col[c].rc_error != 0; c++)
2080 continue;
2081
2082 tgts[current] = c;
2083 current++;
2084
2085 } while (current != n);
2086 }
2087 }
2088 n--;
2089done:
2090 for (i = 0; i < n; i++) {
2091 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2092 }
2093
2094 return (ret);
2095}
2096
2097/*
2098 * Complete an IO operation on a RAIDZ VDev
2099 *
2100 * Outline:
2101 * - For write operations:
2102 * 1. Check for errors on the child IOs.
2103 * 2. Return, setting an error code if too few child VDevs were written
2104 * to reconstruct the data later. Note that partial writes are
2105 * considered successful if they can be reconstructed at all.
2106 * - For read operations:
2107 * 1. Check for errors on the child IOs.
2108 * 2. If data errors occurred:
2109 * a. Try to reassemble the data from the parity available.
2110 * b. If we haven't yet read the parity drives, read them now.
2111 * c. If all parity drives have been read but the data still doesn't
2112 * reassemble with a correct checksum, then try combinatorial
2113 * reconstruction.
2114 * d. If that doesn't work, return an error.
2115 * 3. If there were unexpected errors or this is a resilver operation,
2116 * rewrite the vdevs that had errors.
2117 */
2118static void
2119vdev_raidz_io_done(zio_t *zio)
2120{
2121 vdev_t *vd = zio->io_vd;
2122 vdev_t *cvd;
2123 raidz_map_t *rm = zio->io_vsd;
2124 raidz_col_t *rc;
2125 int unexpected_errors = 0;
2126 int parity_errors = 0;
2127 int parity_untried = 0;
2128 int data_errors = 0;
2129 int total_errors = 0;
2130 int n, c;
2131 int tgts[VDEV_RAIDZ_MAXPARITY];
2132 int code;
2133
2134 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2135
2136 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2137 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2138
2139 for (c = 0; c < rm->rm_cols; c++) {
2140 rc = &rm->rm_col[c];
2141
2142 if (rc->rc_error) {
2143 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2144
2145 if (c < rm->rm_firstdatacol)
2146 parity_errors++;
2147 else
2148 data_errors++;
2149
2150 if (!rc->rc_skipped)
2151 unexpected_errors++;
2152
2153 total_errors++;
2154 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2155 parity_untried++;
2156 }
2157 }
2158
2159 if (zio->io_type == ZIO_TYPE_WRITE) {
2160 /*
2161 * XXX -- for now, treat partial writes as a success.
2162 * (If we couldn't write enough columns to reconstruct
2163 * the data, the I/O failed. Otherwise, good enough.)
2164 *
2165 * Now that we support write reallocation, it would be better
2166 * to treat partial failure as real failure unless there are
2167 * no non-degraded top-level vdevs left, and not update DTLs
2168 * if we intend to reallocate.
2169 */
2170 /* XXPOLICY */
2171 if (total_errors > rm->rm_firstdatacol)
2172 zio->io_error = vdev_raidz_worst_error(rm);
2173
2174 return;
2175 } else if (zio->io_type == ZIO_TYPE_FREE) {
2176 return;
2177 }
2178
2179 ASSERT(zio->io_type == ZIO_TYPE_READ);
2180 /*
2181 * There are three potential phases for a read:
2182 * 1. produce valid data from the columns read
2183 * 2. read all disks and try again
2184 * 3. perform combinatorial reconstruction
2185 *
2186 * Each phase is progressively both more expensive and less likely to
2187 * occur. If we encounter more errors than we can repair or all phases
2188 * fail, we have no choice but to return an error.
2189 */
2190
2191 /*
2192 * If the number of errors we saw was correctable -- less than or equal
2193 * to the number of parity disks read -- attempt to produce data that
2194 * has a valid checksum. Naturally, this case applies in the absence of
2195 * any errors.
2196 */
2197 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2198 if (data_errors == 0) {
2199 if (raidz_checksum_verify(zio) == 0) {
2200 /*
2201 * If we read parity information (unnecessarily
2202 * as it happens since no reconstruction was
2203 * needed) regenerate and verify the parity.
2204 * We also regenerate parity when resilvering
2205 * so we can write it out to the failed device
2206 * later.
2207 */
2208 if (parity_errors + parity_untried <
2209 rm->rm_firstdatacol ||
2210 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2211 n = raidz_parity_verify(zio, rm);
2212 unexpected_errors += n;
2213 ASSERT(parity_errors + n <=
2214 rm->rm_firstdatacol);
2215 }
2216 goto done;
2217 }
2218 } else {
2219 /*
2220 * We either attempt to read all the parity columns or
2221 * none of them. If we didn't try to read parity, we
2222 * wouldn't be here in the correctable case. There must
2223 * also have been fewer parity errors than parity
2224 * columns or, again, we wouldn't be in this code path.
2225 */
2226 ASSERT(parity_untried == 0);
2227 ASSERT(parity_errors < rm->rm_firstdatacol);
2228
2229 /*
2230 * Identify the data columns that reported an error.
2231 */
2232 n = 0;
2233 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2234 rc = &rm->rm_col[c];
2235 if (rc->rc_error != 0) {
2236 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2237 tgts[n++] = c;
2238 }
2239 }
2240
2241 ASSERT(rm->rm_firstdatacol >= n);
2242
2243 code = vdev_raidz_reconstruct(rm, tgts, n);
2244
2245 if (raidz_checksum_verify(zio) == 0) {
2246 atomic_inc_64(&raidz_corrected[code]);
2247
2248 /*
2249 * If we read more parity disks than were used
2250 * for reconstruction, confirm that the other
2251 * parity disks produced correct data. This
2252 * routine is suboptimal in that it regenerates
2253 * the parity that we already used in addition
2254 * to the parity that we're attempting to
2255 * verify, but this should be a relatively
2256 * uncommon case, and can be optimized if it
2257 * becomes a problem. Note that we regenerate
2258 * parity when resilvering so we can write it
2259 * out to failed devices later.
2260 */
2261 if (parity_errors < rm->rm_firstdatacol - n ||
2262 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2263 n = raidz_parity_verify(zio, rm);
2264 unexpected_errors += n;
2265 ASSERT(parity_errors + n <=
2266 rm->rm_firstdatacol);
2267 }
2268
2269 goto done;
2270 }
2271 }
2272 }
2273
2274 /*
2275 * This isn't a typical situation -- either we got a read error or
2276 * a child silently returned bad data. Read every block so we can
2277 * try again with as much data and parity as we can track down. If
2278 * we've already been through once before, all children will be marked
2279 * as tried so we'll proceed to combinatorial reconstruction.
2280 */
2281 unexpected_errors = 1;
2282 rm->rm_missingdata = 0;
2283 rm->rm_missingparity = 0;
2284
2285 for (c = 0; c < rm->rm_cols; c++) {
2286 if (rm->rm_col[c].rc_tried)
2287 continue;
2288
2289 zio_vdev_io_redone(zio);
2290 do {
2291 rc = &rm->rm_col[c];
2292 if (rc->rc_tried)
2293 continue;
2294 zio_nowait(zio_vdev_child_io(zio, NULL,
2295 vd->vdev_child[rc->rc_devidx],
2296 rc->rc_offset, rc->rc_data, rc->rc_size,
2297 zio->io_type, zio->io_priority, 0,
2298 vdev_raidz_child_done, rc));
2299 } while (++c < rm->rm_cols);
2300
2301 return;
2302 }
2303
2304 /*
2305 * At this point we've attempted to reconstruct the data given the
2306 * errors we detected, and we've attempted to read all columns. There
2307 * must, therefore, be one or more additional problems -- silent errors
2308 * resulting in invalid data rather than explicit I/O errors resulting
2309 * in absent data. We check if there is enough additional data to
2310 * possibly reconstruct the data and then perform combinatorial
2311 * reconstruction over all possible combinations. If that fails,
2312 * we're cooked.
2313 */
2314 if (total_errors > rm->rm_firstdatacol) {
2315 zio->io_error = vdev_raidz_worst_error(rm);
2316
2317 } else if (total_errors < rm->rm_firstdatacol &&
2318 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2319 /*
2320 * If we didn't use all the available parity for the
2321 * combinatorial reconstruction, verify that the remaining
2322 * parity is correct.
2323 */
2324 if (code != (1 << rm->rm_firstdatacol) - 1)
2325 (void) raidz_parity_verify(zio, rm);
2326 } else {
2327 /*
2328 * We're here because either:
2329 *
2330 * total_errors == rm_first_datacol, or
2331 * vdev_raidz_combrec() failed
2332 *
2333 * In either case, there is enough bad data to prevent
2334 * reconstruction.
2335 *
2336 * Start checksum ereports for all children which haven't
2337 * failed, and the IO wasn't speculative.
2338 */
2339 zio->io_error = SET_ERROR(ECKSUM);
2340
2341 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2342 for (c = 0; c < rm->rm_cols; c++) {
2343 rc = &rm->rm_col[c];
2344 if (rc->rc_error == 0) {
2345 zio_bad_cksum_t zbc;
2346 zbc.zbc_has_cksum = 0;
2347 zbc.zbc_injected =
2348 rm->rm_ecksuminjected;
2349
2350 zfs_ereport_start_checksum(
2351 zio->io_spa,
2352 vd->vdev_child[rc->rc_devidx],
2353 zio, rc->rc_offset, rc->rc_size,
2354 (void *)(uintptr_t)c, &zbc);
2355 }
2356 }
2357 }
2358 }
2359
2360done:
2361 zio_checksum_verified(zio);
2362
2363 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2364 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2365 /*
2366 * Use the good data we have in hand to repair damaged children.
2367 */
2368 for (c = 0; c < rm->rm_cols; c++) {
2369 rc = &rm->rm_col[c];
2370 cvd = vd->vdev_child[rc->rc_devidx];
2371
2372 if (rc->rc_error == 0)
2373 continue;
2374
2375 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2376 rc->rc_offset, rc->rc_data, rc->rc_size,
2377 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2378 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2379 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2380 }
2381 }
2382}
2383
2384static void
2385vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2386{
2387 if (faulted > vd->vdev_nparity)
2388 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389 VDEV_AUX_NO_REPLICAS);
2390 else if (degraded + faulted != 0)
2391 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2392 else
2393 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2394}
2395
2396vdev_ops_t vdev_raidz_ops = {
2397 vdev_raidz_open,
2398 vdev_raidz_close,
2399 vdev_raidz_asize,
2400 vdev_raidz_io_start,
2401 vdev_raidz_io_done,
2402 vdev_raidz_state_change,
2403 NULL,
2404 NULL,
2405 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2406 B_FALSE /* not a leaf vdev */
2407};
1834}
1835
1836
1837/*
1838 * Report a checksum error for a child of a RAID-Z device.
1839 */
1840static void
1841raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1842{
1843 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1844
1845 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1846 zio_bad_cksum_t zbc;
1847 raidz_map_t *rm = zio->io_vsd;
1848
1849 mutex_enter(&vd->vdev_stat_lock);
1850 vd->vdev_stat.vs_checksum_errors++;
1851 mutex_exit(&vd->vdev_stat_lock);
1852
1853 zbc.zbc_has_cksum = 0;
1854 zbc.zbc_injected = rm->rm_ecksuminjected;
1855
1856 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1857 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1858 &zbc);
1859 }
1860}
1861
1862/*
1863 * We keep track of whether or not there were any injected errors, so that
1864 * any ereports we generate can note it.
1865 */
1866static int
1867raidz_checksum_verify(zio_t *zio)
1868{
1869 zio_bad_cksum_t zbc;
1870 raidz_map_t *rm = zio->io_vsd;
1871
1872 int ret = zio_checksum_error(zio, &zbc);
1873 if (ret != 0 && zbc.zbc_injected != 0)
1874 rm->rm_ecksuminjected = 1;
1875
1876 return (ret);
1877}
1878
1879/*
1880 * Generate the parity from the data columns. If we tried and were able to
1881 * read the parity without error, verify that the generated parity matches the
1882 * data we read. If it doesn't, we fire off a checksum error. Return the
1883	 * number of such failures.
1884 */
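/*
 * For example, if both parity columns of a raidz2 map were read without
 * error, the parity regenerated from the data columns is bcmp()ed against
 * the buffers read from disk; a mismatch on one of them posts a single
 * checksum ereport and makes this function return 1.
 */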
1885static int
1886raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1887{
1888 void *orig[VDEV_RAIDZ_MAXPARITY];
1889 int c, ret = 0;
1890 raidz_col_t *rc;
1891
1892 blkptr_t *bp = zio->io_bp;
1893 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1894 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1895
1896 if (checksum == ZIO_CHECKSUM_NOPARITY)
1897 return (ret);
1898
1899 for (c = 0; c < rm->rm_firstdatacol; c++) {
1900 rc = &rm->rm_col[c];
1901 if (!rc->rc_tried || rc->rc_error != 0)
1902 continue;
1903 orig[c] = zio_buf_alloc(rc->rc_size);
1904 bcopy(rc->rc_data, orig[c], rc->rc_size);
1905 }
1906
1907 vdev_raidz_generate_parity(rm);
1908
1909 for (c = 0; c < rm->rm_firstdatacol; c++) {
1910 rc = &rm->rm_col[c];
1911 if (!rc->rc_tried || rc->rc_error != 0)
1912 continue;
1913 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1914 raidz_checksum_error(zio, rc, orig[c]);
1915 rc->rc_error = SET_ERROR(ECKSUM);
1916 ret++;
1917 }
1918 zio_buf_free(orig[c], rc->rc_size);
1919 }
1920
1921 return (ret);
1922}
1923
1924/*
1925 * Keep statistics on all the ways that we used parity to correct data.
1926 */
1927static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
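/*
 * The index is the reconstruction code returned by vdev_raidz_reconstruct(),
 * a bitmask of the parity columns used for the repair; e.g. a repair that
 * used both P and Q is counted in slot
 * (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q) == 3.
 */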
1928
1929static int
1930vdev_raidz_worst_error(raidz_map_t *rm)
1931{
1932 int error = 0;
1933
1934 for (int c = 0; c < rm->rm_cols; c++)
1935 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1936
1937 return (error);
1938}
1939
1940/*
1941 * Iterate over all combinations of bad data and attempt a reconstruction.
1942 * Note that the algorithm below is non-optimal because it doesn't take into
1943 * account how reconstruction is actually performed. For example, with
1944 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1945 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1946 * cases we'd only use parity information in column 0.
1947 */
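/*
 * To illustrate the enumeration below: for a raidz2 map with columns
 * [P, Q, D0, D1, D2] (indices 0..4) and no columns already known to be
 * bad, the n == 1 pass tries targets (2), (3), (4) -- the last target is
 * always pushed into the data columns -- and the n == 2 pass tries
 * (0,2), (1,2), (0,3), (1,3), (2,3), (0,4), (1,4), (2,4), (3,4).
 */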
1948static int
1949vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1950{
1951 raidz_map_t *rm = zio->io_vsd;
1952 raidz_col_t *rc;
1953 void *orig[VDEV_RAIDZ_MAXPARITY];
1954 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1955 int *tgts = &tstore[1];
1956 int current, next, i, c, n;
1957 int code, ret = 0;
1958
1959 ASSERT(total_errors < rm->rm_firstdatacol);
1960
1961 /*
1962	 * This simplifies one edge condition: tgts[current - 1] + 1 below is valid even for current == 0.
1963 */
1964 tgts[-1] = -1;
1965
1966 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1967 /*
1968 * Initialize the targets array by finding the first n columns
1969 * that contain no error.
1970 *
1971 * If there were no data errors, we need to ensure that we're
1972 * always explicitly attempting to reconstruct at least one
1973 * data column. To do this, we simply push the highest target
1974 * up into the data columns.
1975 */
1976 for (c = 0, i = 0; i < n; i++) {
1977 if (i == n - 1 && data_errors == 0 &&
1978 c < rm->rm_firstdatacol) {
1979 c = rm->rm_firstdatacol;
1980 }
1981
1982 while (rm->rm_col[c].rc_error != 0) {
1983 c++;
1984 ASSERT3S(c, <, rm->rm_cols);
1985 }
1986
1987 tgts[i] = c++;
1988 }
1989
1990 /*
1992	 * Setting tgts[n] simplifies the other edge condition: it bounds the search for the next valid column.
1992 */
1993 tgts[n] = rm->rm_cols;
1994
1995 /*
1996 * These buffers were allocated in previous iterations.
1997 */
1998 for (i = 0; i < n - 1; i++) {
1999 ASSERT(orig[i] != NULL);
2000 }
2001
2002 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2003
2004 current = 0;
2005 next = tgts[current];
2006
2007 while (current != n) {
2008 tgts[current] = next;
2009 current = 0;
2010
2011 /*
2012 * Save off the original data that we're going to
2013 * attempt to reconstruct.
2014 */
2015 for (i = 0; i < n; i++) {
2016 ASSERT(orig[i] != NULL);
2017 c = tgts[i];
2018 ASSERT3S(c, >=, 0);
2019 ASSERT3S(c, <, rm->rm_cols);
2020 rc = &rm->rm_col[c];
2021 bcopy(rc->rc_data, orig[i], rc->rc_size);
2022 }
2023
2024 /*
2025 * Attempt a reconstruction and exit the outer loop on
2026 * success.
2027 */
2028 code = vdev_raidz_reconstruct(rm, tgts, n);
2029 if (raidz_checksum_verify(zio) == 0) {
2030 atomic_inc_64(&raidz_corrected[code]);
2031
2032 for (i = 0; i < n; i++) {
2033 c = tgts[i];
2034 rc = &rm->rm_col[c];
2035 ASSERT(rc->rc_error == 0);
2036 if (rc->rc_tried)
2037 raidz_checksum_error(zio, rc,
2038 orig[i]);
2039 rc->rc_error = SET_ERROR(ECKSUM);
2040 }
2041
2042 ret = code;
2043 goto done;
2044 }
2045
2046 /*
2047 * Restore the original data.
2048 */
2049 for (i = 0; i < n; i++) {
2050 c = tgts[i];
2051 rc = &rm->rm_col[c];
2052 bcopy(orig[i], rc->rc_data, rc->rc_size);
2053 }
2054
2055 do {
2056 /*
2057 * Find the next valid column after the current
2058	 * position.
2059 */
2060 for (next = tgts[current] + 1;
2061 next < rm->rm_cols &&
2062 rm->rm_col[next].rc_error != 0; next++)
2063 continue;
2064
2065 ASSERT(next <= tgts[current + 1]);
2066
2067 /*
2068 * If that spot is available, we're done here.
2069 */
2070 if (next != tgts[current + 1])
2071 break;
2072
2073 /*
2074 * Otherwise, find the next valid column after
2075 * the previous position.
2076 */
2077 for (c = tgts[current - 1] + 1;
2078 rm->rm_col[c].rc_error != 0; c++)
2079 continue;
2080
2081 tgts[current] = c;
2082 current++;
2083
2084 } while (current != n);
2085 }
2086 }
2087	n--;	/* undo the final loop increment so n counts the allocated buffers */
2088done:
2089 for (i = 0; i < n; i++) {
2090 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2091 }
2092
2093 return (ret);
2094}
2095
2096/*
2097 * Complete an IO operation on a RAIDZ VDev
2098 *
2099 * Outline:
2100 * - For write operations:
2101 * 1. Check for errors on the child IOs.
2102 * 2. Return, setting an error code if too few child VDevs were written
2103 * to reconstruct the data later. Note that partial writes are
2104 * considered successful if they can be reconstructed at all.
2105 * - For read operations:
2106 * 1. Check for errors on the child IOs.
2107 * 2. If data errors occurred:
2108 * a. Try to reassemble the data from the parity available.
2109 * b. If we haven't yet read the parity drives, read them now.
2110 * c. If all parity drives have been read but the data still doesn't
2111 * reassemble with a correct checksum, then try combinatorial
2112 * reconstruction.
2113 * d. If that doesn't work, return an error.
2114 * 3. If there were unexpected errors or this is a resilver operation,
2115 * rewrite the vdevs that had errors.
2116 */
2117static void
2118vdev_raidz_io_done(zio_t *zio)
2119{
2120 vdev_t *vd = zio->io_vd;
2121 vdev_t *cvd;
2122 raidz_map_t *rm = zio->io_vsd;
2123 raidz_col_t *rc;
2124 int unexpected_errors = 0;
2125 int parity_errors = 0;
2126 int parity_untried = 0;
2127 int data_errors = 0;
2128 int total_errors = 0;
2129 int n, c;
2130 int tgts[VDEV_RAIDZ_MAXPARITY];
2131 int code;
2132
2133 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2134
2135 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2136 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2137
2138 for (c = 0; c < rm->rm_cols; c++) {
2139 rc = &rm->rm_col[c];
2140
2141 if (rc->rc_error) {
2142 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2143
2144 if (c < rm->rm_firstdatacol)
2145 parity_errors++;
2146 else
2147 data_errors++;
2148
2149 if (!rc->rc_skipped)
2150 unexpected_errors++;
2151
2152 total_errors++;
2153 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2154 parity_untried++;
2155 }
2156 }
2157
2158 if (zio->io_type == ZIO_TYPE_WRITE) {
2159 /*
2160 * XXX -- for now, treat partial writes as a success.
2161 * (If we couldn't write enough columns to reconstruct
2162 * the data, the I/O failed. Otherwise, good enough.)
2163 *
2164 * Now that we support write reallocation, it would be better
2165 * to treat partial failure as real failure unless there are
2166 * no non-degraded top-level vdevs left, and not update DTLs
2167 * if we intend to reallocate.
2168 */
2169 /* XXPOLICY */
2170 if (total_errors > rm->rm_firstdatacol)
2171 zio->io_error = vdev_raidz_worst_error(rm);
2172
2173 return;
2174 } else if (zio->io_type == ZIO_TYPE_FREE) {
2175 return;
2176 }
2177
2178 ASSERT(zio->io_type == ZIO_TYPE_READ);
2179 /*
2180 * There are three potential phases for a read:
2181 * 1. produce valid data from the columns read
2182 * 2. read all disks and try again
2183 * 3. perform combinatorial reconstruction
2184 *
2185 * Each phase is progressively both more expensive and less likely to
2186 * occur. If we encounter more errors than we can repair or all phases
2187 * fail, we have no choice but to return an error.
2188 */
2189
2190 /*
2191 * If the number of errors we saw was correctable -- less than or equal
2192 * to the number of parity disks read -- attempt to produce data that
2193 * has a valid checksum. Naturally, this case applies in the absence of
2194 * any errors.
2195 */
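	/*
	 * For example, on the first pass of a healthy raidz2 read no parity
	 * is read (parity_untried == 2), so only total_errors == 0 passes
	 * this test; once all parity has been tried, up to rm_firstdatacol
	 * errors are tolerable here.
	 */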
2196 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2197 if (data_errors == 0) {
2198 if (raidz_checksum_verify(zio) == 0) {
2199 /*
2200 * If we read parity information (unnecessarily
2201 * as it happens since no reconstruction was
2202	 * needed), regenerate and verify the parity.
2203 * We also regenerate parity when resilvering
2204 * so we can write it out to the failed device
2205 * later.
2206 */
2207 if (parity_errors + parity_untried <
2208 rm->rm_firstdatacol ||
2209 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2210 n = raidz_parity_verify(zio, rm);
2211 unexpected_errors += n;
2212 ASSERT(parity_errors + n <=
2213 rm->rm_firstdatacol);
2214 }
2215 goto done;
2216 }
2217 } else {
2218 /*
2219 * We either attempt to read all the parity columns or
2220 * none of them. If we didn't try to read parity, we
2221 * wouldn't be here in the correctable case. There must
2222 * also have been fewer parity errors than parity
2223 * columns or, again, we wouldn't be in this code path.
2224 */
2225 ASSERT(parity_untried == 0);
2226 ASSERT(parity_errors < rm->rm_firstdatacol);
2227
2228 /*
2229 * Identify the data columns that reported an error.
2230 */
2231 n = 0;
2232 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2233 rc = &rm->rm_col[c];
2234 if (rc->rc_error != 0) {
2235 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2236 tgts[n++] = c;
2237 }
2238 }
2239
2240 ASSERT(rm->rm_firstdatacol >= n);
2241
2242 code = vdev_raidz_reconstruct(rm, tgts, n);
2243
2244 if (raidz_checksum_verify(zio) == 0) {
2245 atomic_inc_64(&raidz_corrected[code]);
2246
2247 /*
2248 * If we read more parity disks than were used
2249 * for reconstruction, confirm that the other
2250 * parity disks produced correct data. This
2251 * routine is suboptimal in that it regenerates
2252 * the parity that we already used in addition
2253 * to the parity that we're attempting to
2254 * verify, but this should be a relatively
2255 * uncommon case, and can be optimized if it
2256 * becomes a problem. Note that we regenerate
2257 * parity when resilvering so we can write it
2258 * out to failed devices later.
2259 */
2260 if (parity_errors < rm->rm_firstdatacol - n ||
2261 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2262 n = raidz_parity_verify(zio, rm);
2263 unexpected_errors += n;
2264 ASSERT(parity_errors + n <=
2265 rm->rm_firstdatacol);
2266 }
2267
2268 goto done;
2269 }
2270 }
2271 }
2272
2273 /*
2274 * This isn't a typical situation -- either we got a read error or
2275 * a child silently returned bad data. Read every block so we can
2276 * try again with as much data and parity as we can track down. If
2277 * we've already been through once before, all children will be marked
2278 * as tried so we'll proceed to combinatorial reconstruction.
2279 */
2280 unexpected_errors = 1;
2281 rm->rm_missingdata = 0;
2282 rm->rm_missingparity = 0;
2283
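	/*
	 * The outer loop only locates the first untried column; the inner
	 * do-while then issues reads for every remaining untried column,
	 * so zio_vdev_io_redone() runs at most once per retry pass.
	 */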
2284 for (c = 0; c < rm->rm_cols; c++) {
2285 if (rm->rm_col[c].rc_tried)
2286 continue;
2287
2288 zio_vdev_io_redone(zio);
2289 do {
2290 rc = &rm->rm_col[c];
2291 if (rc->rc_tried)
2292 continue;
2293 zio_nowait(zio_vdev_child_io(zio, NULL,
2294 vd->vdev_child[rc->rc_devidx],
2295 rc->rc_offset, rc->rc_data, rc->rc_size,
2296 zio->io_type, zio->io_priority, 0,
2297 vdev_raidz_child_done, rc));
2298 } while (++c < rm->rm_cols);
2299
2300 return;
2301 }
2302
2303 /*
2304 * At this point we've attempted to reconstruct the data given the
2305 * errors we detected, and we've attempted to read all columns. There
2306 * must, therefore, be one or more additional problems -- silent errors
2307 * resulting in invalid data rather than explicit I/O errors resulting
2308 * in absent data. We check if there is enough additional data to
2309 * possibly reconstruct the data and then perform combinatorial
2310 * reconstruction over all possible combinations. If that fails,
2311 * we're cooked.
2312 */
2313 if (total_errors > rm->rm_firstdatacol) {
2314 zio->io_error = vdev_raidz_worst_error(rm);
2315
2316 } else if (total_errors < rm->rm_firstdatacol &&
2317 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2318 /*
2319 * If we didn't use all the available parity for the
2320 * combinatorial reconstruction, verify that the remaining
2321 * parity is correct.
2322 */
2323 if (code != (1 << rm->rm_firstdatacol) - 1)
2324 (void) raidz_parity_verify(zio, rm);
2325 } else {
2326 /*
2327 * We're here because either:
2328 *
2329	 * total_errors == rm_firstdatacol, or
2330 * vdev_raidz_combrec() failed
2331 *
2332 * In either case, there is enough bad data to prevent
2333 * reconstruction.
2334 *
2335	 * Start checksum ereports for all children that haven't
2336	 * failed, provided the I/O wasn't speculative.
2337 */
2338 zio->io_error = SET_ERROR(ECKSUM);
2339
2340 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2341 for (c = 0; c < rm->rm_cols; c++) {
2342 rc = &rm->rm_col[c];
2343 if (rc->rc_error == 0) {
2344 zio_bad_cksum_t zbc;
2345 zbc.zbc_has_cksum = 0;
2346 zbc.zbc_injected =
2347 rm->rm_ecksuminjected;
2348
2349 zfs_ereport_start_checksum(
2350 zio->io_spa,
2351 vd->vdev_child[rc->rc_devidx],
2352 zio, rc->rc_offset, rc->rc_size,
2353 (void *)(uintptr_t)c, &zbc);
2354 }
2355 }
2356 }
2357 }
2358
2359done:
2360 zio_checksum_verified(zio);
2361
2362 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2363 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2364 /*
2365 * Use the good data we have in hand to repair damaged children.
2366 */
2367 for (c = 0; c < rm->rm_cols; c++) {
2368 rc = &rm->rm_col[c];
2369 cvd = vd->vdev_child[rc->rc_devidx];
2370
2371 if (rc->rc_error == 0)
2372 continue;
2373
2374 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2375 rc->rc_offset, rc->rc_data, rc->rc_size,
2376 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2377 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2378 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2379 }
2380 }
2381}
2382
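/*
 * For example, a raidz2 vdev (vdev_nparity == 2) with three faulted
 * children can no longer be opened, while with one or two children
 * faulted or degraded it remains usable but is marked degraded.
 */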
2383static void
2384vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2385{
2386 if (faulted > vd->vdev_nparity)
2387 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2388 VDEV_AUX_NO_REPLICAS);
2389 else if (degraded + faulted != 0)
2390 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2391 else
2392 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2393}
2394
2395vdev_ops_t vdev_raidz_ops = {
2396 vdev_raidz_open,
2397 vdev_raidz_close,
2398 vdev_raidz_asize,
2399 vdev_raidz_io_start,
2400 vdev_raidz_io_done,
2401 vdev_raidz_state_change,
2402	NULL,			/* vdev_op_hold */
2403	NULL,			/* vdev_op_rele */
2404 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2405 B_FALSE /* not a leaf vdev */
2406};