Deleted Added
sdiff udiff text old ( 251631 ) new ( 254591 )
full compact
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/vdev_impl.h>
30#include <sys/zio.h>
31#include <sys/zio_checksum.h>
32#include <sys/fs/zfs.h>
33#include <sys/fm/fs/zfs.h>
34
35/*
36 * Virtual device vector for RAID-Z.
37 *
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
45 * for writes.
46 *
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
53 * suffer.
54 *
55 * All of the methods above operate on a Galois field, defined over the
 56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
59 *
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 *
64 * (A * 2)_7 = A_6
65 * (A * 2)_6 = A_5
66 * (A * 2)_5 = A_4
67 * (A * 2)_4 = A_3 + A_7
68 * (A * 2)_3 = A_2 + A_7
69 * (A * 2)_2 = A_1 + A_7
70 * (A * 2)_1 = A_0
71 * (A * 2)_0 = A_7
72 *
73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74 * As an aside, this multiplication is derived from the error correcting
75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 *
77 * Observe that any number in the field (except for 0) can be expressed as a
78 * power of 2 -- a generator for the field. We store a table of the powers of
79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81 * than field addition). The inverse of a field element A (A^-1) is therefore
82 * A ^ (255 - 1) = A^254.
83 *
84 * The up-to-three parity columns, P, Q, R over several data columns,
85 * D_0, ... D_n-1, can be expressed by field operations:
86 *
87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 *
 93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95 * independent coefficients. (There are no additional coefficients that have
96 * this property which is why the uncorrected Plank method breaks down.)
97 *
 98 * See the reconstruction code below for how P, Q and R can be used individually
99 * or in concert to recover missing data columns.
100 */
101
/*
 * Per-column (i.e. per-child-device) state for a single RAID-Z I/O:
 * where the column lands on the child, its buffer, and the outcome of
 * the attempt.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
112
/*
 * Geometry and state of one logical I/O mapped onto a RAID-Z vdev.
 * Parity columns occupy rm_col[0 .. rm_firstdatacol - 1] and data columns
 * follow.  The struct is allocated with room for rm_scols entries
 * (kmem_alloc(offsetof(raidz_map_t, rm_col[scols])) in
 * vdev_raidz_map_alloc()); rm_col[1] is the pre-C99 flexible-array idiom,
 * so the declared bound must stay exactly 1.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
129
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one GF(2^8) element by 2 (respectively by 4, i.e. twice by 2).
 * The conditional XOR with 0x1d reduces the product modulo the primitive
 * polynomial x^8 + x^4 + x^3 + x^2 + 1 when the high bit was set.  The
 * result is only meaningful in the low 8 bits; callers assign it to a
 * uint8_t (or equivalent) to truncate.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Wrapped in do/while (0) so each macro expands to a single statement and
 * is safe in unbraced if/else bodies (the bare-brace form would leave a
 * stray ';' that breaks "if (c) MACRO(x, m); else ...").
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
156
/*
 * Force reconstruction to use the general purpose method.
 *
 * NOTE(review): patchable tunable global; its consumer is the
 * reconstruction dispatch further down this file (outside this excerpt) --
 * confirm against the full file before relying on the exact semantics.
 */
int vdev_raidz_default_to_general;
161
/*
 * Powers of 2 in the Galois field defined above.
 * vdev_raidz_pow2[i] == 2^i in GF(2^8); the sequence is periodic with
 * period 255, so the final entry (index 255) wraps back to 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above.
 * vdev_raidz_log2[a] is the exponent e with 2^e == a in GF(2^8).
 * Zero has no logarithm; entry 0 is 0 only as a placeholder and callers
 * must special-case a == 0 (see vdev_raidz_exp2()).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/* Forward declaration: needed by vdev_raidz_cksum_finish() below. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
234
235/*
236 * Multiply a given number by 2 raised to the given power.
237 */
238static uint8_t
239vdev_raidz_exp2(uint_t a, int exp)
240{
241 if (a == 0)
242 return (0);
243
244 ASSERT(exp >= 0);
245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246
247 exp += vdev_raidz_log2[a];
248 if (exp > 255)
249 exp -= 255;
250
251 return (vdev_raidz_pow2[exp]);
252}
253
254static void
255vdev_raidz_map_free(raidz_map_t *rm)
256{
257 int c;
258 size_t size;
259
260 for (c = 0; c < rm->rm_firstdatacol; c++) {
261 if (rm->rm_col[c].rc_data != NULL)
262 zio_buf_free(rm->rm_col[c].rc_data,
263 rm->rm_col[c].rc_size);
264
265 if (rm->rm_col[c].rc_gdata != NULL)
266 zio_buf_free(rm->rm_col[c].rc_gdata,
267 rm->rm_col[c].rc_size);
268 }
269
270 size = 0;
271 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
272 size += rm->rm_col[c].rc_size;
273
274 if (rm->rm_datacopy != NULL)
275 zio_buf_free(rm->rm_datacopy, size);
276
277 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
278}
279
280static void
281vdev_raidz_map_free_vsd(zio_t *zio)
282{
283 raidz_map_t *rm = zio->io_vsd;
284
285 ASSERT0(rm->rm_freed);
286 rm->rm_freed = 1;
287
288 if (rm->rm_reports == 0)
289 vdev_raidz_map_free(rm);
290}
291
292/*ARGSUSED*/
293static void
294vdev_raidz_cksum_free(void *arg, size_t ignored)
295{
296 raidz_map_t *rm = arg;
297
298 ASSERT3U(rm->rm_reports, >, 0);
299
300 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
301 vdev_raidz_map_free(rm);
302}
303
/*
 * Checksum-report "finish" callback.  Attach both the data we actually
 * read for column zcr->zcr_cbinfo (preserved in rm_datacopy by
 * vdev_raidz_cksum_report()) and the corresponding known-good data to the
 * ereport.  For a parity column the good data must be regenerated from
 * good_data; for a data column it is sliced directly out of good_data.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column index this report covers */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		/* No good data available -- file the ereport without it. */
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* data columns point back into the preserved copy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
376
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column the report covers */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* Only the first report needs to snapshot the data. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/*
	 * Copy each data column into the aside buffer and repoint the
	 * column at its copy; the columns stay contiguous within
	 * rm_datacopy in column order.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
429
/*
 * VSD ops vector handed to the ZIO layer via zio->io_vsd_ops in
 * vdev_raidz_map_alloc(): free the map when the zio completes, and
 * snapshot data for checksum-error reporting.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
434
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Computes the map geometry (which sectors land on which children),
 * allocates parity buffers, points data columns into the zio's buffer,
 * and installs the map as the zio's vdev-specific data (io_vsd).
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Lay out each column: columns that wrap past the last child
	 * start one sector deeper on their device; the first bc columns
	 * carry an extra sector; columns beyond acols are skipped
	 * (zero-sized).
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	 * Attach buffers: parity columns get fresh allocations, data
	 * columns carve up the zio's own buffer contiguously.  FREE zios
	 * carry no data payload to map.
	 */
	if (zio->io_type != ZIO_TYPE_FREE) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		rm->rm_col[c].rc_data = zio->io_data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
589
590static void
591vdev_raidz_generate_parity_p(raidz_map_t *rm)
592{
593 uint64_t *p, *src, pcount, ccount, i;
594 int c;
595
596 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
597
598 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
599 src = rm->rm_col[c].rc_data;
600 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
601 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
602
603 if (c == rm->rm_firstdatacol) {
604 ASSERT(ccount == pcount);
605 for (i = 0; i < ccount; i++, src++, p++) {
606 *p = *src;
607 }
608 } else {
609 ASSERT(ccount <= pcount);
610 for (i = 0; i < ccount; i++, src++, p++) {
611 *p ^= *src;
612 }
613 }
614 }
615}
616
617static void
618vdev_raidz_generate_parity_pq(raidz_map_t *rm)
619{
620 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
621 int c;
622
623 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
624 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
625 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
626
627 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
628 src = rm->rm_col[c].rc_data;
629 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
630 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
631
632 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
633
634 if (c == rm->rm_firstdatacol) {
635 ASSERT(ccnt == pcnt || ccnt == 0);
636 for (i = 0; i < ccnt; i++, src++, p++, q++) {
637 *p = *src;
638 *q = *src;
639 }
640 for (; i < pcnt; i++, src++, p++, q++) {
641 *p = 0;
642 *q = 0;
643 }
644 } else {
645 ASSERT(ccnt <= pcnt);
646
647 /*
648 * Apply the algorithm described above by multiplying
649 * the previous result and adding in the new value.
650 */
651 for (i = 0; i < ccnt; i++, src++, p++, q++) {
652 *p ^= *src;
653
654 VDEV_RAIDZ_64MUL_2(*q, mask);
655 *q ^= *src;
656 }
657
658 /*
659 * Treat short columns as though they are full of 0s.
660 * Note that there's therefore nothing needed for P.
661 */
662 for (; i < pcnt; i++, q++) {
663 VDEV_RAIDZ_64MUL_2(*q, mask);
664 }
665 }
666 }
667}
668
669static void
670vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
671{
672 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
673 int c;
674
675 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
676 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
677 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
678 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
679 rm->rm_col[VDEV_RAIDZ_R].rc_size);
680
681 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
682 src = rm->rm_col[c].rc_data;
683 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
684 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
685 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
686
687 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
688
689 if (c == rm->rm_firstdatacol) {
690 ASSERT(ccnt == pcnt || ccnt == 0);
691 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
692 *p = *src;
693 *q = *src;
694 *r = *src;
695 }
696 for (; i < pcnt; i++, src++, p++, q++, r++) {
697 *p = 0;
698 *q = 0;
699 *r = 0;
700 }
701 } else {
702 ASSERT(ccnt <= pcnt);
703
704 /*
705 * Apply the algorithm described above by multiplying
706 * the previous result and adding in the new value.
707 */
708 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
709 *p ^= *src;
710
711 VDEV_RAIDZ_64MUL_2(*q, mask);
712 *q ^= *src;
713
714 VDEV_RAIDZ_64MUL_4(*r, mask);
715 *r ^= *src;
716 }
717
718 /*
719 * Treat short columns as though they are full of 0s.
720 * Note that there's therefore nothing needed for P.
721 */
722 for (; i < pcnt; i++, q++, r++) {
723 VDEV_RAIDZ_64MUL_2(*q, mask);
724 VDEV_RAIDZ_64MUL_4(*r, mask);
725 }
726 }
727 }
728}
729
730/*
731 * Generate RAID parity in the first virtual columns according to the number of
732 * parity columns available.
733 */
734static void
735vdev_raidz_generate_parity(raidz_map_t *rm)
736{
737 switch (rm->rm_firstdatacol) {
738 case 1:
739 vdev_raidz_generate_parity_p(rm);
740 break;
741 case 2:
742 vdev_raidz_generate_parity_pq(rm);
743 break;
744 case 3:
745 vdev_raidz_generate_parity_pqr(rm);
746 break;
747 default:
748 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
749 }
750}
751
752static int
753vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
754{
755 uint64_t *dst, *src, xcount, ccount, count, i;
756 int x = tgts[0];
757 int c;
758
759 ASSERT(ntgts == 1);
760 ASSERT(x >= rm->rm_firstdatacol);
761 ASSERT(x < rm->rm_cols);
762
763 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
764 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
765 ASSERT(xcount > 0);
766
767 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
768 dst = rm->rm_col[x].rc_data;
769 for (i = 0; i < xcount; i++, dst++, src++) {
770 *dst = *src;
771 }
772
773 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
774 src = rm->rm_col[c].rc_data;
775 dst = rm->rm_col[x].rc_data;
776
777 if (c == x)
778 continue;
779
780 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
781 count = MIN(ccount, xcount);
782
783 for (i = 0; i < count; i++, dst++, src++) {
784 *dst ^= *src;
785 }
786 }
787
788 return (1 << VDEV_RAIDZ_P);
789}
790
/*
 * Rebuild a single missing data column (tgts[0]) from the Q parity.
 * We recompute a partial Q over the surviving data columns (treating
 * the missing column as zero), XOR it with the stored Q, then divide
 * out the missing column's coefficient 2^(ndevs - 1 - x) by multiplying
 * each byte by its inverse, 2^(255 - (ndevs - 1 - x)).  Returns a
 * bitmask identifying the parity column used (Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/*
	 * Horner-style accumulation into the missing column's buffer: on
	 * each pass, multiply the running value by 2 and XOR in the next
	 * column; the missing column (ccount forced to 0) contributes
	 * nothing except the multiply.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			/* Short columns act as zeros: multiply only. */
			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR with the stored Q, then multiply each byte by the inverse
	 * of the missing column's coefficient to recover the data.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
848
/*
 * Rebuild two missing data columns (tgts[0] < tgts[1]) using both the P
 * and Q parity, solving the 2x2 GF(2^8) linear system laid out in the
 * comment in the body.  Returns a bitmask identifying the parity columns
 * used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	/* Pxy/Qxy land in the temporary buffers just installed. */
	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients as log-domain exponents; tmp is log of (a ^ 1)^-1. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* Column y may be shorter than column x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
939
940/* BEGIN CSTYLED */
941/*
942 * In the general case of reconstruction, we must solve the system of linear
 943 * equations defined by the coefficients used to generate parity as well as
944 * the contents of the data and parity disks. This can be expressed with
945 * vectors for the original data (D) and the actual data (d) and parity (p)
946 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
947 *
948 * __ __ __ __
949 * | | __ __ | p_0 |
950 * | V | | D_0 | | p_m-1 |
951 * | | x | : | = | d_0 |
952 * | I | | D_n-1 | | : |
953 * | | ~~ ~~ | d_n-1 |
954 * ~~ ~~ ~~ ~~
955 *
 956 * I is simply a square identity matrix of size n, and V is a Vandermonde
 957 * matrix defined by the coefficients we chose for the various parity columns
958 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
959 * computation as well as linear separability.
960 *
961 * __ __ __ __
962 * | 1 .. 1 1 1 | | p_0 |
963 * | 2^n-1 .. 4 2 1 | __ __ | : |
964 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
965 * | 1 .. 0 0 0 | | D_1 | | d_0 |
966 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
967 * | : : : : | | : | | d_2 |
968 * | 0 .. 1 0 0 | | D_n-1 | | : |
969 * | 0 .. 0 1 0 | ~~ ~~ | : |
970 * | 0 .. 0 0 1 | | d_n-1 |
971 * ~~ ~~ ~~ ~~
972 *
973 * Note that I, V, d, and p are known. To compute D, we must invert the
974 * matrix and use the known data and parity values to reconstruct the unknown
975 * data values. We begin by removing the rows in V|I and d|p that correspond
976 * to failed or missing columns; we then make V|I square (n x n) and d|p
977 * sized n by removing rows corresponding to unused parity from the bottom up
978 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
979 * using Gauss-Jordan elimination. In the example below we use m=3 parity
980 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
981 * __ __
982 * | 1 1 1 1 1 1 1 1 |
983 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
984 * | 19 205 116 29 64 16 4 1 | / /
985 * | 1 0 0 0 0 0 0 0 | / /
986 * | 0 1 0 0 0 0 0 0 | <--' /
987 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
988 * | 0 0 0 1 0 0 0 0 |
989 * | 0 0 0 0 1 0 0 0 |
990 * | 0 0 0 0 0 1 0 0 |
991 * | 0 0 0 0 0 0 1 0 |
992 * | 0 0 0 0 0 0 0 1 |
993 * ~~ ~~
994 * __ __
995 * | 1 1 1 1 1 1 1 1 |
996 * | 128 64 32 16 8 4 2 1 |
997 * | 19 205 116 29 64 16 4 1 |
998 * | 1 0 0 0 0 0 0 0 |
999 * | 0 1 0 0 0 0 0 0 |
1000 * (V|I)' = | 0 0 1 0 0 0 0 0 |
1001 * | 0 0 0 1 0 0 0 0 |
1002 * | 0 0 0 0 1 0 0 0 |
1003 * | 0 0 0 0 0 1 0 0 |
1004 * | 0 0 0 0 0 0 1 0 |
1005 * | 0 0 0 0 0 0 0 1 |
1006 * ~~ ~~
1007 *
1008 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1009 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1010 * matrix is not singular.
1011 * __ __
1012 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1013 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1015 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1016 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1017 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1018 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1019 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1020 * ~~ ~~
1021 * __ __
1022 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1023 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1024 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1025 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1026 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1027 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1028 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1029 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1030 * ~~ ~~
1031 * __ __
1032 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1033 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1034 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1035 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1036 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1037 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1038 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1039 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1040 * ~~ ~~
1041 * __ __
1042 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1043 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1044 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1045 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1046 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1047 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1048 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1049 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1050 * ~~ ~~
1051 * __ __
1052 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1053 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1054 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1055 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1056 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1057 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1058 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1059 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1060 * ~~ ~~
1061 * __ __
1062 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1063 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1064 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1065 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1066 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1067 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1068 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1069 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1070 * ~~ ~~
1071 * __ __
1072 * | 0 0 1 0 0 0 0 0 |
1073 * | 167 100 5 41 159 169 217 208 |
1074 * | 166 100 4 40 158 168 216 209 |
1075 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1076 * | 0 0 0 0 1 0 0 0 |
1077 * | 0 0 0 0 0 1 0 0 |
1078 * | 0 0 0 0 0 0 1 0 |
1079 * | 0 0 0 0 0 0 0 1 |
1080 * ~~ ~~
1081 *
1082 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1083 * of the missing data.
1084 *
1085 * As is apparent from the example above, the only non-trivial rows in the
1086 * inverse matrix correspond to the data disks that we're trying to
1087 * reconstruct. Indeed, those are the only rows we need as the others would
1088 * only be useful for reconstructing data known or assumed to be valid. For
1089 * that reason, we only build the coefficients in the rows that correspond to
1090 * targeted columns.
1091 */
1092/* END CSTYLED */
1093
1094static void
1095vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1096 uint8_t **rows)
1097{
1098 int i, j;
1099 int pow;
1100
1101 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1102
1103 /*
1104 * Fill in the missing rows of interest.
1105 */
1106 for (i = 0; i < nmap; i++) {
1107 ASSERT3S(0, <=, map[i]);
1108 ASSERT3S(map[i], <=, 2);
1109
1110 pow = map[i] * n;
1111 if (pow > 255)
1112 pow -= 255;
1113 ASSERT(pow <= 255);
1114
1115 for (j = 0; j < n; j++) {
1116 pow -= map[i];
1117 if (pow < 0)
1118 pow += 255;
1119 rows[i][j] = vdev_raidz_pow2[pow];
1120 }
1121 }
1122}
1123
1124static void
1125vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1126 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1127{
1128 int i, j, ii, jj;
1129 uint8_t log;
1130
1131 /*
1132 * Assert that the first nmissing entries from the array of used
1133 * columns correspond to parity columns and that subsequent entries
1134 * correspond to data columns.
1135 */
1136 for (i = 0; i < nmissing; i++) {
1137 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1138 }
1139 for (; i < n; i++) {
1140 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1141 }
1142
1143 /*
1144 * First initialize the storage where we'll compute the inverse rows.
1145 */
1146 for (i = 0; i < nmissing; i++) {
1147 for (j = 0; j < n; j++) {
1148 invrows[i][j] = (i == j) ? 1 : 0;
1149 }
1150 }
1151
1152 /*
1153 * Subtract all trivial rows from the rows of consequence.
1154 */
1155 for (i = 0; i < nmissing; i++) {
1156 for (j = nmissing; j < n; j++) {
1157 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1158 jj = used[j] - rm->rm_firstdatacol;
1159 ASSERT3S(jj, <, n);
1160 invrows[i][j] = rows[i][jj];
1161 rows[i][jj] = 0;
1162 }
1163 }
1164
1165 /*
1166 * For each of the rows of interest, we must normalize it and subtract
1167 * a multiple of it from the other rows.
1168 */
1169 for (i = 0; i < nmissing; i++) {
1170 for (j = 0; j < missing[i]; j++) {
1171 ASSERT0(rows[i][j]);
1172 }
1173 ASSERT3U(rows[i][missing[i]], !=, 0);
1174
1175 /*
1176 * Compute the inverse of the first element and multiply each
1177 * element in the row by that value.
1178 */
1179 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1180
1181 for (j = 0; j < n; j++) {
1182 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1183 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1184 }
1185
1186 for (ii = 0; ii < nmissing; ii++) {
1187 if (i == ii)
1188 continue;
1189
1190 ASSERT3U(rows[ii][missing[i]], !=, 0);
1191
1192 log = vdev_raidz_log2[rows[ii][missing[i]]];
1193
1194 for (j = 0; j < n; j++) {
1195 rows[ii][j] ^=
1196 vdev_raidz_exp2(rows[i][j], log);
1197 invrows[ii][j] ^=
1198 vdev_raidz_exp2(invrows[i][j], log);
1199 }
1200 }
1201 }
1202
1203 /*
1204 * Verify that the data that is left in the rows are properly part of
1205 * an identity matrix.
1206 */
1207 for (i = 0; i < nmissing; i++) {
1208 for (j = 0; j < n; j++) {
1209 if (j == missing[i]) {
1210 ASSERT3U(rows[i][j], ==, 1);
1211 } else {
1212 ASSERT0(rows[i][j]);
1213 }
1214 }
1215 }
1216}
1217
1218static void
1219vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1220 int *missing, uint8_t **invrows, const uint8_t *used)
1221{
1222 int i, j, x, cc, c;
1223 uint8_t *src;
1224 uint64_t ccount;
1225 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1226 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1227 uint8_t log = 0;
1228 uint8_t val;
1229 int ll;
1230 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1231 uint8_t *p, *pp;
1232 size_t psize;
1233
1234 psize = sizeof (invlog[0][0]) * n * nmissing;
1235 p = kmem_alloc(psize, KM_SLEEP);
1236
1237 for (pp = p, i = 0; i < nmissing; i++) {
1238 invlog[i] = pp;
1239 pp += n;
1240 }
1241
1242 for (i = 0; i < nmissing; i++) {
1243 for (j = 0; j < n; j++) {
1244 ASSERT3U(invrows[i][j], !=, 0);
1245 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1246 }
1247 }
1248
1249 for (i = 0; i < n; i++) {
1250 c = used[i];
1251 ASSERT3U(c, <, rm->rm_cols);
1252
1253 src = rm->rm_col[c].rc_data;
1254 ccount = rm->rm_col[c].rc_size;
1255 for (j = 0; j < nmissing; j++) {
1256 cc = missing[j] + rm->rm_firstdatacol;
1257 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1258 ASSERT3U(cc, <, rm->rm_cols);
1259 ASSERT3U(cc, !=, c);
1260
1261 dst[j] = rm->rm_col[cc].rc_data;
1262 dcount[j] = rm->rm_col[cc].rc_size;
1263 }
1264
1265 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1266
1267 for (x = 0; x < ccount; x++, src++) {
1268 if (*src != 0)
1269 log = vdev_raidz_log2[*src];
1270
1271 for (cc = 0; cc < nmissing; cc++) {
1272 if (x >= dcount[cc])
1273 continue;
1274
1275 if (*src == 0) {
1276 val = 0;
1277 } else {
1278 if ((ll = log + invlog[cc][i]) >= 255)
1279 ll -= 255;
1280 val = vdev_raidz_pow2[ll];
1281 }
1282
1283 if (i == 0)
1284 dst[cc][x] = val;
1285 else
1286 dst[cc][x] ^= val;
1287 }
1288 }
1289 }
1290
1291 kmem_free(p, psize);
1292}
1293
1294static int
1295vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1296{
1297 int n, i, c, t, tt;
1298 int nmissing_rows;
1299 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1300 int parity_map[VDEV_RAIDZ_MAXPARITY];
1301
1302 uint8_t *p, *pp;
1303 size_t psize;
1304
1305 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1306 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1307 uint8_t *used;
1308
1309 int code = 0;
1310
1311
1312 n = rm->rm_cols - rm->rm_firstdatacol;
1313
1314 /*
1315 * Figure out which data columns are missing.
1316 */
1317 nmissing_rows = 0;
1318 for (t = 0; t < ntgts; t++) {
1319 if (tgts[t] >= rm->rm_firstdatacol) {
1320 missing_rows[nmissing_rows++] =
1321 tgts[t] - rm->rm_firstdatacol;
1322 }
1323 }
1324
1325 /*
1326 * Figure out which parity columns to use to help generate the missing
1327 * data columns.
1328 */
1329 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1330 ASSERT(tt < ntgts);
1331 ASSERT(c < rm->rm_firstdatacol);
1332
1333 /*
1334 * Skip any targeted parity columns.
1335 */
1336 if (c == tgts[tt]) {
1337 tt++;
1338 continue;
1339 }
1340
1341 code |= 1 << c;
1342
1343 parity_map[i] = c;
1344 i++;
1345 }
1346
1347 ASSERT(code != 0);
1348 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1349
1350 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1351 nmissing_rows * n + sizeof (used[0]) * n;
1352 p = kmem_alloc(psize, KM_SLEEP);
1353
1354 for (pp = p, i = 0; i < nmissing_rows; i++) {
1355 rows[i] = pp;
1356 pp += n;
1357 invrows[i] = pp;
1358 pp += n;
1359 }
1360 used = pp;
1361
1362 for (i = 0; i < nmissing_rows; i++) {
1363 used[i] = parity_map[i];
1364 }
1365
1366 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1367 if (tt < nmissing_rows &&
1368 c == missing_rows[tt] + rm->rm_firstdatacol) {
1369 tt++;
1370 continue;
1371 }
1372
1373 ASSERT3S(i, <, n);
1374 used[i] = c;
1375 i++;
1376 }
1377
1378 /*
1379 * Initialize the interesting rows of the matrix.
1380 */
1381 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1382
1383 /*
1384 * Invert the matrix.
1385 */
1386 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1387 invrows, used);
1388
1389 /*
1390 * Reconstruct the missing data using the generated matrix.
1391 */
1392 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1393 invrows, used);
1394
1395 kmem_free(p, psize);
1396
1397 return (code);
1398}
1399
1400static int
1401vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1402{
1403 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1404 int ntgts;
1405 int i, c;
1406 int code;
1407 int nbadparity, nbaddata;
1408 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1409
1410 /*
1411 * The tgts list must already be sorted.
1412 */
1413 for (i = 1; i < nt; i++) {
1414 ASSERT(t[i] > t[i - 1]);
1415 }
1416
1417 nbadparity = rm->rm_firstdatacol;
1418 nbaddata = rm->rm_cols - nbadparity;
1419 ntgts = 0;
1420 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1421 if (c < rm->rm_firstdatacol)
1422 parity_valid[c] = B_FALSE;
1423
1424 if (i < nt && c == t[i]) {
1425 tgts[ntgts++] = c;
1426 i++;
1427 } else if (rm->rm_col[c].rc_error != 0) {
1428 tgts[ntgts++] = c;
1429 } else if (c >= rm->rm_firstdatacol) {
1430 nbaddata--;
1431 } else {
1432 parity_valid[c] = B_TRUE;
1433 nbadparity--;
1434 }
1435 }
1436
1437 ASSERT(ntgts >= nt);
1438 ASSERT(nbaddata >= 0);
1439 ASSERT(nbaddata + nbadparity == ntgts);
1440
1441 dt = &tgts[nbadparity];
1442
1443 /*
1444 * See if we can use any of our optimized reconstruction routines.
1445 */
1446 if (!vdev_raidz_default_to_general) {
1447 switch (nbaddata) {
1448 case 1:
1449 if (parity_valid[VDEV_RAIDZ_P])
1450 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1451
1452 ASSERT(rm->rm_firstdatacol > 1);
1453
1454 if (parity_valid[VDEV_RAIDZ_Q])
1455 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1456
1457 ASSERT(rm->rm_firstdatacol > 2);
1458 break;
1459
1460 case 2:
1461 ASSERT(rm->rm_firstdatacol > 1);
1462
1463 if (parity_valid[VDEV_RAIDZ_P] &&
1464 parity_valid[VDEV_RAIDZ_Q])
1465 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1466
1467 ASSERT(rm->rm_firstdatacol > 2);
1468
1469 break;
1470 }
1471 }
1472
1473 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1474 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1475 ASSERT(code > 0);
1476 return (code);
1477}
1478
1479static int
1480vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1481 uint64_t *logical_ashift, uint64_t *physical_ashift)
1482{
1483 vdev_t *cvd;
1484 uint64_t nparity = vd->vdev_nparity;
1485 int c;
1486 int lasterror = 0;
1487 int numerrors = 0;
1488
1489 ASSERT(nparity > 0);
1490
1491 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1492 vd->vdev_children < nparity + 1) {
1493 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1494 return (SET_ERROR(EINVAL));
1495 }
1496
1497 vdev_open_children(vd);
1498
1499 for (c = 0; c < vd->vdev_children; c++) {
1500 cvd = vd->vdev_child[c];
1501
1502 if (cvd->vdev_open_error != 0) {
1503 lasterror = cvd->vdev_open_error;
1504 numerrors++;
1505 continue;
1506 }
1507
1508 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1509 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1510 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1511 *physical_ashift = MAX(*physical_ashift,
1512 cvd->vdev_physical_ashift);
1513 }
1514
1515 *asize *= vd->vdev_children;
1516 *max_asize *= vd->vdev_children;
1517
1518 if (numerrors > nparity) {
1519 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1520 return (lasterror);
1521 }
1522
1523 return (0);
1524}
1525
1526static void
1527vdev_raidz_close(vdev_t *vd)
1528{
1529 int c;
1530
1531 for (c = 0; c < vd->vdev_children; c++)
1532 vdev_close(vd->vdev_child[c]);
1533}
1534
1535static uint64_t
1536vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1537{
1538 uint64_t asize;
1539 uint64_t ashift = vd->vdev_top->vdev_ashift;
1540 uint64_t cols = vd->vdev_children;
1541 uint64_t nparity = vd->vdev_nparity;
1542
1543 asize = ((psize - 1) >> ashift) + 1;
1544 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1545 asize = roundup(asize, nparity + 1) << ashift;
1546
1547 return (asize);
1548}
1549
1550static void
1551vdev_raidz_child_done(zio_t *zio)
1552{
1553 raidz_col_t *rc = zio->io_private;
1554
1555 rc->rc_error = zio->io_error;
1556 rc->rc_tried = 1;
1557 rc->rc_skipped = 0;
1558}
1559
1560/*
1561 * Start an IO operation on a RAIDZ VDev
1562 *
1563 * Outline:
1564 * - For write operations:
1565 * 1. Generate the parity data
1566 * 2. Create child zio write operations to each column's vdev, for both
1567 * data and parity.
1568 * 3. If the column skips any sectors for padding, create optional dummy
1569 * write zio children for those areas to improve aggregation continuity.
1570 * - For read operations:
1571 * 1. Create child zio read operations to each data column's vdev to read
1572 * the range of data required for zio.
1573 * 2. If this is a scrub or resilver operation, or if any of the data
1574 * vdevs have had errors, then create zio read operations to the parity
1575 * columns' VDevs as well.
1576 */
1577static int
1578vdev_raidz_io_start(zio_t *zio)
1579{
1580 vdev_t *vd = zio->io_vd;
1581 vdev_t *tvd = vd->vdev_top;
1582 vdev_t *cvd;
1583 raidz_map_t *rm;
1584 raidz_col_t *rc;
1585 int c, i;
1586
1587 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1588 vd->vdev_nparity);
1589
1590 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1591
1592 if (zio->io_type == ZIO_TYPE_FREE) {
1593 for (c = 0; c < rm->rm_cols; c++) {
1594 rc = &rm->rm_col[c];
1595 cvd = vd->vdev_child[rc->rc_devidx];
1596 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597 rc->rc_offset, rc->rc_data, rc->rc_size,
1598 zio->io_type, zio->io_priority, 0,
1599 vdev_raidz_child_done, rc));
1600 }
1601 return (ZIO_PIPELINE_CONTINUE);
1602 }
1603
1604 if (zio->io_type == ZIO_TYPE_WRITE) {
1605 vdev_raidz_generate_parity(rm);
1606
1607 for (c = 0; c < rm->rm_cols; c++) {
1608 rc = &rm->rm_col[c];
1609 cvd = vd->vdev_child[rc->rc_devidx];
1610 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1611 rc->rc_offset, rc->rc_data, rc->rc_size,
1612 zio->io_type, zio->io_priority, 0,
1613 vdev_raidz_child_done, rc));
1614 }
1615
1616 /*
1617 * Generate optional I/Os for any skipped sectors to improve
1618 * aggregation contiguity.
1619 */
1620 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1621 ASSERT(c <= rm->rm_scols);
1622 if (c == rm->rm_scols)
1623 c = 0;
1624 rc = &rm->rm_col[c];
1625 cvd = vd->vdev_child[rc->rc_devidx];
1626 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1627 rc->rc_offset + rc->rc_size, NULL,
1628 1 << tvd->vdev_ashift,
1629 zio->io_type, zio->io_priority,
1630 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1631 }
1632
1633 return (ZIO_PIPELINE_CONTINUE);
1634 }
1635
1636 ASSERT(zio->io_type == ZIO_TYPE_READ);
1637
1638 /*
1639 * Iterate over the columns in reverse order so that we hit the parity
1640 * last -- any errors along the way will force us to read the parity.
1641 */
1642 for (c = rm->rm_cols - 1; c >= 0; c--) {
1643 rc = &rm->rm_col[c];
1644 cvd = vd->vdev_child[rc->rc_devidx];
1645 if (!vdev_readable(cvd)) {
1646 if (c >= rm->rm_firstdatacol)
1647 rm->rm_missingdata++;
1648 else
1649 rm->rm_missingparity++;
1650 rc->rc_error = SET_ERROR(ENXIO);
1651 rc->rc_tried = 1; /* don't even try */
1652 rc->rc_skipped = 1;
1653 continue;
1654 }
1655 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1656 if (c >= rm->rm_firstdatacol)
1657 rm->rm_missingdata++;
1658 else
1659 rm->rm_missingparity++;
1660 rc->rc_error = SET_ERROR(ESTALE);
1661 rc->rc_skipped = 1;
1662 continue;
1663 }
1664 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1665 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1666 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1667 rc->rc_offset, rc->rc_data, rc->rc_size,
1668 zio->io_type, zio->io_priority, 0,
1669 vdev_raidz_child_done, rc));
1670 }
1671 }
1672
1673 return (ZIO_PIPELINE_CONTINUE);
1674}
1675
1676
1677/*
1678 * Report a checksum error for a child of a RAID-Z device.
1679 */
1680static void
1681raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1682{
1683 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1684
1685 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1686 zio_bad_cksum_t zbc;
1687 raidz_map_t *rm = zio->io_vsd;
1688
1689 mutex_enter(&vd->vdev_stat_lock);
1690 vd->vdev_stat.vs_checksum_errors++;
1691 mutex_exit(&vd->vdev_stat_lock);
1692
1693 zbc.zbc_has_cksum = 0;
1694 zbc.zbc_injected = rm->rm_ecksuminjected;
1695
1696 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1697 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1698 &zbc);
1699 }
1700}
1701
1702/*
1703 * We keep track of whether or not there were any injected errors, so that
1704 * any ereports we generate can note it.
1705 */
1706static int
1707raidz_checksum_verify(zio_t *zio)
1708{
1709 zio_bad_cksum_t zbc;
1710 raidz_map_t *rm = zio->io_vsd;
1711
1712 int ret = zio_checksum_error(zio, &zbc);
1713 if (ret != 0 && zbc.zbc_injected != 0)
1714 rm->rm_ecksuminjected = 1;
1715
1716 return (ret);
1717}
1718
1719/*
1720 * Generate the parity from the data columns. If we tried and were able to
1721 * read the parity without error, verify that the generated parity matches the
1722 * data we read. If it doesn't, we fire off a checksum error. Return the
 1723 * number of such failures.
1724 */
1725static int
1726raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1727{
1728 void *orig[VDEV_RAIDZ_MAXPARITY];
1729 int c, ret = 0;
1730 raidz_col_t *rc;
1731
1732 for (c = 0; c < rm->rm_firstdatacol; c++) {
1733 rc = &rm->rm_col[c];
1734 if (!rc->rc_tried || rc->rc_error != 0)
1735 continue;
1736 orig[c] = zio_buf_alloc(rc->rc_size);
1737 bcopy(rc->rc_data, orig[c], rc->rc_size);
1738 }
1739
1740 vdev_raidz_generate_parity(rm);
1741
1742 for (c = 0; c < rm->rm_firstdatacol; c++) {
1743 rc = &rm->rm_col[c];
1744 if (!rc->rc_tried || rc->rc_error != 0)
1745 continue;
1746 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1747 raidz_checksum_error(zio, rc, orig[c]);
1748 rc->rc_error = SET_ERROR(ECKSUM);
1749 ret++;
1750 }
1751 zio_buf_free(orig[c], rc->rc_size);
1752 }
1753
1754 return (ret);
1755}
1756
1757/*
1758 * Keep statistics on all the ways that we used parity to correct data.
1759 */
1760static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1761
1762static int
1763vdev_raidz_worst_error(raidz_map_t *rm)
1764{
1765 int error = 0;
1766
1767 for (int c = 0; c < rm->rm_cols; c++)
1768 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1769
1770 return (error);
1771}
1772
1773/*
1774 * Iterate over all combinations of bad data and attempt a reconstruction.
1775 * Note that the algorithm below is non-optimal because it doesn't take into
1776 * account how reconstruction is actually performed. For example, with
1777 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1778 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1779 * cases we'd only use parity information in column 0.
1780 */
1781static int
1782vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1783{
1784 raidz_map_t *rm = zio->io_vsd;
1785 raidz_col_t *rc;
1786 void *orig[VDEV_RAIDZ_MAXPARITY];
1787 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1788 int *tgts = &tstore[1];
1789 int current, next, i, c, n;
1790 int code, ret = 0;
1791
1792 ASSERT(total_errors < rm->rm_firstdatacol);
1793
1794 /*
1795 * This simplifies one edge condition.
1796 */
1797 tgts[-1] = -1;
1798
1799 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1800 /*
1801 * Initialize the targets array by finding the first n columns
1802 * that contain no error.
1803 *
1804 * If there were no data errors, we need to ensure that we're
1805 * always explicitly attempting to reconstruct at least one
1806 * data column. To do this, we simply push the highest target
1807 * up into the data columns.
1808 */
1809 for (c = 0, i = 0; i < n; i++) {
1810 if (i == n - 1 && data_errors == 0 &&
1811 c < rm->rm_firstdatacol) {
1812 c = rm->rm_firstdatacol;
1813 }
1814
1815 while (rm->rm_col[c].rc_error != 0) {
1816 c++;
1817 ASSERT3S(c, <, rm->rm_cols);
1818 }
1819
1820 tgts[i] = c++;
1821 }
1822
1823 /*
1824 * Setting tgts[n] simplifies the other edge condition.
1825 */
1826 tgts[n] = rm->rm_cols;
1827
1828 /*
1829 * These buffers were allocated in previous iterations.
1830 */
1831 for (i = 0; i < n - 1; i++) {
1832 ASSERT(orig[i] != NULL);
1833 }
1834
1835 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1836
1837 current = 0;
1838 next = tgts[current];
1839
1840 while (current != n) {
1841 tgts[current] = next;
1842 current = 0;
1843
1844 /*
1845 * Save off the original data that we're going to
1846 * attempt to reconstruct.
1847 */
1848 for (i = 0; i < n; i++) {
1849 ASSERT(orig[i] != NULL);
1850 c = tgts[i];
1851 ASSERT3S(c, >=, 0);
1852 ASSERT3S(c, <, rm->rm_cols);
1853 rc = &rm->rm_col[c];
1854 bcopy(rc->rc_data, orig[i], rc->rc_size);
1855 }
1856
1857 /*
1858 * Attempt a reconstruction and exit the outer loop on
1859 * success.
1860 */
1861 code = vdev_raidz_reconstruct(rm, tgts, n);
1862 if (raidz_checksum_verify(zio) == 0) {
1863 atomic_inc_64(&raidz_corrected[code]);
1864
1865 for (i = 0; i < n; i++) {
1866 c = tgts[i];
1867 rc = &rm->rm_col[c];
1868 ASSERT(rc->rc_error == 0);
1869 if (rc->rc_tried)
1870 raidz_checksum_error(zio, rc,
1871 orig[i]);
1872 rc->rc_error = SET_ERROR(ECKSUM);
1873 }
1874
1875 ret = code;
1876 goto done;
1877 }
1878
1879 /*
1880 * Restore the original data.
1881 */
1882 for (i = 0; i < n; i++) {
1883 c = tgts[i];
1884 rc = &rm->rm_col[c];
1885 bcopy(orig[i], rc->rc_data, rc->rc_size);
1886 }
1887
1888 do {
1889 /*
1890 * Find the next valid column after the current
1891 * position..
1892 */
1893 for (next = tgts[current] + 1;
1894 next < rm->rm_cols &&
1895 rm->rm_col[next].rc_error != 0; next++)
1896 continue;
1897
1898 ASSERT(next <= tgts[current + 1]);
1899
1900 /*
1901 * If that spot is available, we're done here.
1902 */
1903 if (next != tgts[current + 1])
1904 break;
1905
1906 /*
1907 * Otherwise, find the next valid column after
1908 * the previous position.
1909 */
1910 for (c = tgts[current - 1] + 1;
1911 rm->rm_col[c].rc_error != 0; c++)
1912 continue;
1913
1914 tgts[current] = c;
1915 current++;
1916
1917 } while (current != n);
1918 }
1919 }
1920 n--;
1921done:
1922 for (i = 0; i < n; i++) {
1923 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1924 }
1925
1926 return (ret);
1927}
1928
1929/*
1930 * Complete an IO operation on a RAIDZ VDev
1931 *
1932 * Outline:
1933 * - For write operations:
1934 * 1. Check for errors on the child IOs.
1935 * 2. Return, setting an error code if too few child VDevs were written
1936 * to reconstruct the data later. Note that partial writes are
1937 * considered successful if they can be reconstructed at all.
1938 * - For read operations:
1939 * 1. Check for errors on the child IOs.
1940 * 2. If data errors occurred:
1941 * a. Try to reassemble the data from the parity available.
1942 * b. If we haven't yet read the parity drives, read them now.
1943 * c. If all parity drives have been read but the data still doesn't
1944 * reassemble with a correct checksum, then try combinatorial
1945 * reconstruction.
1946 * d. If that doesn't work, return an error.
1947 * 3. If there were unexpected errors or this is a resilver operation,
1948 * rewrite the vdevs that had errors.
1949 */
1950static void
1951vdev_raidz_io_done(zio_t *zio)
1952{
1953 vdev_t *vd = zio->io_vd;
1954 vdev_t *cvd;
1955 raidz_map_t *rm = zio->io_vsd;
1956 raidz_col_t *rc;
1957 int unexpected_errors = 0;
1958 int parity_errors = 0;
1959 int parity_untried = 0;
1960 int data_errors = 0;
1961 int total_errors = 0;
1962 int n, c;
1963 int tgts[VDEV_RAIDZ_MAXPARITY];
1964 int code;
1965
1966 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1967
1968 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1969 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1970
1971 for (c = 0; c < rm->rm_cols; c++) {
1972 rc = &rm->rm_col[c];
1973
1974 if (rc->rc_error) {
1975 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1976
1977 if (c < rm->rm_firstdatacol)
1978 parity_errors++;
1979 else
1980 data_errors++;
1981
1982 if (!rc->rc_skipped)
1983 unexpected_errors++;
1984
1985 total_errors++;
1986 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1987 parity_untried++;
1988 }
1989 }
1990
1991 if (zio->io_type == ZIO_TYPE_WRITE) {
1992 /*
1993 * XXX -- for now, treat partial writes as a success.
1994 * (If we couldn't write enough columns to reconstruct
1995 * the data, the I/O failed. Otherwise, good enough.)
1996 *
1997 * Now that we support write reallocation, it would be better
1998 * to treat partial failure as real failure unless there are
1999 * no non-degraded top-level vdevs left, and not update DTLs
2000 * if we intend to reallocate.
2001 */
2002 /* XXPOLICY */
2003 if (total_errors > rm->rm_firstdatacol)
2004 zio->io_error = vdev_raidz_worst_error(rm);
2005
2006 return;
2007 } else if (zio->io_type == ZIO_TYPE_FREE) {
2008 return;
2009 }
2010
2011 ASSERT(zio->io_type == ZIO_TYPE_READ);
2012 /*
2013 * There are three potential phases for a read:
2014 * 1. produce valid data from the columns read
2015 * 2. read all disks and try again
2016 * 3. perform combinatorial reconstruction
2017 *
2018 * Each phase is progressively both more expensive and less likely to
2019 * occur. If we encounter more errors than we can repair or all phases
2020 * fail, we have no choice but to return an error.
2021 */
2022
2023 /*
2024 * If the number of errors we saw was correctable -- less than or equal
2025 * to the number of parity disks read -- attempt to produce data that
2026 * has a valid checksum. Naturally, this case applies in the absence of
2027 * any errors.
2028 */
2029 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2030 if (data_errors == 0) {
2031 if (raidz_checksum_verify(zio) == 0) {
2032 /*
2033 * If we read parity information (unnecessarily
2034 * as it happens since no reconstruction was
2035 * needed) regenerate and verify the parity.
2036 * We also regenerate parity when resilvering
2037 * so we can write it out to the failed device
2038 * later.
2039 */
2040 if (parity_errors + parity_untried <
2041 rm->rm_firstdatacol ||
2042 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2043 n = raidz_parity_verify(zio, rm);
2044 unexpected_errors += n;
2045 ASSERT(parity_errors + n <=
2046 rm->rm_firstdatacol);
2047 }
2048 goto done;
2049 }
2050 } else {
2051 /*
2052 * We either attempt to read all the parity columns or
2053 * none of them. If we didn't try to read parity, we
2054 * wouldn't be here in the correctable case. There must
2055 * also have been fewer parity errors than parity
2056 * columns or, again, we wouldn't be in this code path.
2057 */
2058 ASSERT(parity_untried == 0);
2059 ASSERT(parity_errors < rm->rm_firstdatacol);
2060
2061 /*
2062 * Identify the data columns that reported an error.
2063 */
2064 n = 0;
2065 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2066 rc = &rm->rm_col[c];
2067 if (rc->rc_error != 0) {
2068 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2069 tgts[n++] = c;
2070 }
2071 }
2072
2073 ASSERT(rm->rm_firstdatacol >= n);
2074
2075 code = vdev_raidz_reconstruct(rm, tgts, n);
2076
2077 if (raidz_checksum_verify(zio) == 0) {
2078 atomic_inc_64(&raidz_corrected[code]);
2079
2080 /*
2081 * If we read more parity disks than were used
2082 * for reconstruction, confirm that the other
2083 * parity disks produced correct data. This
2084 * routine is suboptimal in that it regenerates
2085 * the parity that we already used in addition
2086 * to the parity that we're attempting to
2087 * verify, but this should be a relatively
2088 * uncommon case, and can be optimized if it
2089 * becomes a problem. Note that we regenerate
2090 * parity when resilvering so we can write it
2091 * out to failed devices later.
2092 */
2093 if (parity_errors < rm->rm_firstdatacol - n ||
2094 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2095 n = raidz_parity_verify(zio, rm);
2096 unexpected_errors += n;
2097 ASSERT(parity_errors + n <=
2098 rm->rm_firstdatacol);
2099 }
2100
2101 goto done;
2102 }
2103 }
2104 }
2105
2106 /*
2107 * This isn't a typical situation -- either we got a read error or
2108 * a child silently returned bad data. Read every block so we can
2109 * try again with as much data and parity as we can track down. If
2110 * we've already been through once before, all children will be marked
2111 * as tried so we'll proceed to combinatorial reconstruction.
2112 */
2113 unexpected_errors = 1;
2114 rm->rm_missingdata = 0;
2115 rm->rm_missingparity = 0;
2116
2117 for (c = 0; c < rm->rm_cols; c++) {
2118 if (rm->rm_col[c].rc_tried)
2119 continue;
2120
2121 zio_vdev_io_redone(zio);
2122 do {
2123 rc = &rm->rm_col[c];
2124 if (rc->rc_tried)
2125 continue;
2126 zio_nowait(zio_vdev_child_io(zio, NULL,
2127 vd->vdev_child[rc->rc_devidx],
2128 rc->rc_offset, rc->rc_data, rc->rc_size,
2129 zio->io_type, zio->io_priority, 0,
2130 vdev_raidz_child_done, rc));
2131 } while (++c < rm->rm_cols);
2132
2133 return;
2134 }
2135
2136 /*
2137 * At this point we've attempted to reconstruct the data given the
2138 * errors we detected, and we've attempted to read all columns. There
2139 * must, therefore, be one or more additional problems -- silent errors
2140 * resulting in invalid data rather than explicit I/O errors resulting
2141 * in absent data. We check if there is enough additional data to
2142 * possibly reconstruct the data and then perform combinatorial
2143 * reconstruction over all possible combinations. If that fails,
2144 * we're cooked.
2145 */
2146 if (total_errors > rm->rm_firstdatacol) {
2147 zio->io_error = vdev_raidz_worst_error(rm);
2148
2149 } else if (total_errors < rm->rm_firstdatacol &&
2150 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2151 /*
2152 * If we didn't use all the available parity for the
2153 * combinatorial reconstruction, verify that the remaining
2154 * parity is correct.
2155 */
2156 if (code != (1 << rm->rm_firstdatacol) - 1)
2157 (void) raidz_parity_verify(zio, rm);
2158 } else {
2159 /*
2160 * We're here because either:
2161 *
2162 * total_errors == rm_first_datacol, or
2163 * vdev_raidz_combrec() failed
2164 *
2165 * In either case, there is enough bad data to prevent
2166 * reconstruction.
2167 *
2168 * Start checksum ereports for all children which haven't
2169 * failed, and the IO wasn't speculative.
2170 */
2171 zio->io_error = SET_ERROR(ECKSUM);
2172
2173 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2174 for (c = 0; c < rm->rm_cols; c++) {
2175 rc = &rm->rm_col[c];
2176 if (rc->rc_error == 0) {
2177 zio_bad_cksum_t zbc;
2178 zbc.zbc_has_cksum = 0;
2179 zbc.zbc_injected =
2180 rm->rm_ecksuminjected;
2181
2182 zfs_ereport_start_checksum(
2183 zio->io_spa,
2184 vd->vdev_child[rc->rc_devidx],
2185 zio, rc->rc_offset, rc->rc_size,
2186 (void *)(uintptr_t)c, &zbc);
2187 }
2188 }
2189 }
2190 }
2191
2192done:
2193 zio_checksum_verified(zio);
2194
2195 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2196 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2197 /*
2198 * Use the good data we have in hand to repair damaged children.
2199 */
2200 for (c = 0; c < rm->rm_cols; c++) {
2201 rc = &rm->rm_col[c];
2202 cvd = vd->vdev_child[rc->rc_devidx];
2203
2204 if (rc->rc_error == 0)
2205 continue;
2206
2207 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2208 rc->rc_offset, rc->rc_data, rc->rc_size,
2209 ZIO_TYPE_WRITE, zio->io_priority,
2210 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2211 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2212 }
2213 }
2214}
2215
2216static void
2217vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2218{
2219 if (faulted > vd->vdev_nparity)
2220 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2221 VDEV_AUX_NO_REPLICAS);
2222 else if (degraded + faulted != 0)
2223 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2224 else
2225 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2226}
2227
/*
 * Operation vector registering RAID-Z with the vdev layer.  Entries
 * are bound positionally to the slots of vdev_ops_t; the first six
 * slots are filled by the vdev_raidz_* handlers named after their
 * roles (open, close, asize, io_start, io_done, state_change).
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* presumably the hold callback -- verify against vdev_ops_t */
	NULL,			/* presumably the rele callback -- verify against vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};