vdev_raidz.c revision 1.3
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 */ 28 29#include <sys/zfs_context.h> 30#include <sys/spa.h> 31#include <sys/vdev_impl.h> 32#ifndef __FreeBSD__ 33#include <sys/vdev_disk.h> 34#endif 35#include <sys/vdev_file.h> 36#include <sys/vdev_raidz.h> 37#include <sys/zio.h> 38#include <sys/zio_checksum.h> 39#include <sys/fs/zfs.h> 40#include <sys/fm/fs/zfs.h> 41#ifdef __FreeBSD__ 42#include <sys/bio.h> 43#endif 44 45/* 46 * Virtual device vector for RAID-Z. 47 * 48 * This vdev supports single, double, and triple parity. For single parity, 49 * we use a simple XOR of all the data columns. For double or triple parity, 50 * we use a special case of Reed-Solomon coding. This extends the 51 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 52 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 53 * Fault-Tolerance in RAID-like Systems" by James S. 
Plank on which the 54 * former is also based. The latter is designed to provide higher performance 55 * for writes. 56 * 57 * Note that the Plank paper claimed to support arbitrary N+M, but was then 58 * amended six years later identifying a critical flaw that invalidates its 59 * claims. Nevertheless, the technique can be adapted to work for up to 60 * triple parity. For additional parity, the amendment "Note: Correction to 61 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 62 * is viable, but the additional complexity means that write performance will 63 * suffer. 64 * 65 * All of the methods above operate on a Galois field, defined over the 66 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 67 * can be expressed with a single byte. Briefly, the operations on the 68 * field are defined as follows: 69 * 70 * o addition (+) is represented by a bitwise XOR 71 * o subtraction (-) is therefore identical to addition: A + B = A - B 72 * o multiplication of A by 2 is defined by the following bitwise expression: 73 * 74 * (A * 2)_7 = A_6 75 * (A * 2)_6 = A_5 76 * (A * 2)_5 = A_4 77 * (A * 2)_4 = A_3 + A_7 78 * (A * 2)_3 = A_2 + A_7 79 * (A * 2)_2 = A_1 + A_7 80 * (A * 2)_1 = A_0 81 * (A * 2)_0 = A_7 82 * 83 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 84 * As an aside, this multiplication is derived from the error correcting 85 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 86 * 87 * Observe that any number in the field (except for 0) can be expressed as a 88 * power of 2 -- a generator for the field. We store a table of the powers of 89 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 90 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 91 * than field addition). The inverse of a field element A (A^-1) is therefore 92 * A ^ (255 - 1) = A^254. 
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

/*
 * Per-column state for one RAID-Z I/O: which child vdev the column lands on,
 * where, how big it is, and the bookkeeping used during reconstruction.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of one logical I/O spread across the RAID-Z group.  Parity
 * columns occupy rm_col[0 .. rm_firstdatacol-1]; data columns follow.
 * The structure is allocated with rm_scols trailing raidz_col_t entries.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * GF(2^8) multiplication by the generators 2 and 4, one byte at a time.
 * See the field definition in the block comment above (primitive
 * polynomial x^8 + x^4 + x^3 + x^2 + 1, hence the 0x1d reduction).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	/* 0 has no logarithm; 0 * anything == 0 in the field. */
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* a * 2^exp == 2^(log2(a) + exp), reduced mod 255. */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Free a raidz map and every buffer it owns: the parity (and "good" parity)
 * buffers, and the data-copy buffer if one was made for checksum reporting.
 * Data column buffers are not freed here -- they either belong to the zio or
 * alias into rm_datacopy.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		if (rm->rm_col[c].rc_data != NULL)
			zio_buf_free(rm->rm_col[c].rc_data,
			    rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	/* rm_datacopy, when present, covers exactly the data columns. */
	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * zio_vsd_ops free callback.  The map may be kept alive past the zio by
 * outstanding checksum reports; in that case the last report release
 * (vdev_raidz_cksum_free) performs the actual free.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*
 * Drop one checksum-report reference; free the map once the zio has
 * already released it and this was the last report.
 */
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

/*
 * Checksum-report finish callback: given the known-good logical data,
 * produce the "good" image of column c (regenerating parity if c is a
 * parity column) and hand good/bad to zfs_ereport_finish_checksum().
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/* data columns point back into rm_datacopy */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* the data has already been copied aside for an earlier report */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* snapshot each data column and repoint it into the copy */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * When dofree is set, no column data buffers are attached -- the map is
 * only used to describe the geometry of the I/O.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
    boolean_t dofree, uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* wrap past the last child back to the first with a bump */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		/* parity columns get their own buffers ... */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/* ... data columns carve up the caller's buffer in order */
		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

/*
 * Compute single parity: P is the XOR of all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column initializes P */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/*
 * Compute double parity, P and Q, per the field math in the block comment
 * at the top of the file (Q uses generator 2).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Compute triple parity, P, Q and R (R uses generator 4); otherwise
 * identical in structure to the PQ case above.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Reconstruct a single data column (tgts[0]) from P by XORing P with all
 * surviving data columns.  Returns the parity mask used (P only).
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	/* start with P, then XOR in every other data column */
	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

/*
 * Reconstruct a single data column (tgts[0]) from Q: recompute Q over the
 * surviving data columns (treating the missing one as zero), fold in the
 * stored Q, and undo the column's generator coefficient.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* the missing column contributes zeros */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* Horner's rule: multiply prior sum by 2, add src */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * Fold in the stored Q and divide by this column's coefficient,
	 * 2^(ndevs - 1 - x), i.e. multiply by its inverse 2^(255 - ...).
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

/*
 * Reconstruct two data columns (tgts[0], tgts[1]) from P and Q combined.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |        __     __    |  p_0  |
 *            |  V  |        | D_0   |    | p_m-1 |
 *            |     |   x    |   :   | =  |  d_0  |
 *            |  I  |        | D_n-1 |    |   :   |
 *            |     |        ~~     ~~    | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __         __               __     __
 *      |   1   ..  1 1 1  |        |  p_0  |
 *      | 2^n-1 ..  4 2 1  |        |   :   |
 *      | 4^n-1 .. 16 4 1  |        | p_m-1 |
 *      |   1   ..  0 0 0  |        |  d_0  |
 *      |   0   ..  0 0 0  |   x    |  d_1  |
 *      |   :       : : :  |   =    |  d_2  |
 *      |   0   ..  1 0 0  |        |   :   |
 *      |   0   ..  0 1 0  |        |   :   |
 *      |   0   ..  0 0 1  |        | d_n-1 |
 *      ~~         ~~               ~~     ~~
 *      (column vector of D_0 .. D_n-1 elided for brevity)
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.
In the example below we use m=3 parity 990 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 991 * __ __ 992 * | 1 1 1 1 1 1 1 1 | 993 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 994 * | 19 205 116 29 64 16 4 1 | / / 995 * | 1 0 0 0 0 0 0 0 | / / 996 * | 0 1 0 0 0 0 0 0 | <--' / 997 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 998 * | 0 0 0 1 0 0 0 0 | 999 * | 0 0 0 0 1 0 0 0 | 1000 * | 0 0 0 0 0 1 0 0 | 1001 * | 0 0 0 0 0 0 1 0 | 1002 * | 0 0 0 0 0 0 0 1 | 1003 * ~~ ~~ 1004 * __ __ 1005 * | 1 1 1 1 1 1 1 1 | 1006 * | 19 205 116 29 64 16 4 1 | 1007 * | 1 0 0 0 0 0 0 0 | 1008 * (V|I)' = | 0 0 0 1 0 0 0 0 | 1009 * | 0 0 0 0 1 0 0 0 | 1010 * | 0 0 0 0 0 1 0 0 | 1011 * | 0 0 0 0 0 0 1 0 | 1012 * | 0 0 0 0 0 0 0 1 | 1013 * ~~ ~~ 1014 * 1015 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1016 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1017 * matrix is not singular. 1018 * __ __ 1019 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1020 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1021 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1022 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1023 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1024 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1025 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1026 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1027 * ~~ ~~ 1028 * __ __ 1029 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1030 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1031 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1032 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1033 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1034 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1035 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1036 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1037 * ~~ ~~ 1038 * __ __ 1039 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1040 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1041 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1042 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1043 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1044 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1045 * | 0 0 0 0 0 0 
1 0 0 0 0 0 0 0 1 0 | 1046 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1047 * ~~ ~~ 1048 * __ __ 1049 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1050 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1051 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1052 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1053 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1054 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1055 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1056 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1057 * ~~ ~~ 1058 * __ __ 1059 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1060 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1061 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1062 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1063 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1064 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1065 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1066 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1067 * ~~ ~~ 1068 * __ __ 1069 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1070 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1071 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1072 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1073 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1074 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1075 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1076 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1077 * ~~ ~~ 1078 * __ __ 1079 * | 0 0 1 0 0 0 0 0 | 1080 * | 167 100 5 41 159 169 217 208 | 1081 * | 166 100 4 40 158 168 216 209 | 1082 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1083 * | 0 0 0 0 1 0 0 0 | 1084 * | 0 0 0 0 0 1 0 0 | 1085 * | 0 0 0 0 0 0 1 0 | 1086 * | 0 0 0 0 0 0 0 1 | 1087 * ~~ ~~ 1088 * 1089 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1090 * of the missing data. 1091 * 1092 * As is apparent from the example above, the only non-trivial rows in the 1093 * inverse matrix correspond to the data disks that we're trying to 1094 * reconstruct. Indeed, those are the only rows we need as the others would 1095 * only be useful for reconstructing data known or assumed to be valid. 
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Fill in rows[] with the coefficient rows of the parity-generation
 * (Vandermonde-derived) matrix for the parity columns listed in map[]
 * (0 = P, 1 = Q, 2 = R, per the asserts below).  Coefficients are powers
 * of 2 in GF(2^8); exponent arithmetic is carried out mod 255 because the
 * multiplicative group of GF(2^8) has order 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;	/* current exponent of 2 for this coefficient */

	/* n must be the number of data columns in the map. */
	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		/* Only P (seed 1), Q (seed 2), and R (seed 4) exist. */
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start from the exponent for the leftmost (highest-order)
		 * data column, reducing mod 255 to stay within the table.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Invert the submatrix of interest via Gauss-Jordan elimination over
 * GF(2^8) (see the worked example in the large comment above).  On entry,
 * rows[] holds the parity coefficients for the surviving columns in
 * used[]; on return, invrows[] holds the corresponding rows of the
 * inverse -- the only rows needed to rebuild the missing data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * (Addition and subtraction in GF(2^8) are both XOR; moving a
	 * known column's coefficient into invrows records the dependency.)
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		/* The pivot must be non-zero or the matrix is singular. */
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  (255 - log2(x) is the
		 * logarithm of x's multiplicative inverse in GF(2^8).)
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

/*
 * Use the rows of the inverted matrix (invrows) to recompute the contents
 * of the missing data columns from the surviving columns listed in used[].
 * Multiplication in GF(2^8) is performed by adding discrete logarithms mod
 * 255 and looking the sum up in the power table.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch space holding logs of the inverse-row entries. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs so the inner loop only adds exponents. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into every missing column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Columns may be shorter than the source. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First term initializes; the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

/*
 * General RAID-Z reconstruction: rebuild the columns listed in tgts[]
 * using whichever parity columns were not themselves targeted.  Returns a
 * bitmask (code) of the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* One allocation carved up into rows, invrows, and used[]. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* i continues past the parity entries filled in above. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the columns in the sorted target list t[] (plus any columns
 * that already have errors recorded), dispatching to the optimized P/Q/PQ
 * routines when possible and falling back to the general matrix method
 * otherwise.  Returns the bitmask of parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/* Assume the worst and subtract as good columns are found. */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* Bad parity columns sort first; dt points at the bad data ones. */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Open a RAID-Z vdev: open all children and derive this vdev's size and
 * alignment from theirs.  Fails with EINVAL for an impossible geometry and
 * with the last child error if more children failed than parity can cover.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/* Clamp to the smallest child; widen to the largest shift. */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * Close a RAID-Z vdev by closing each of its children.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

#ifdef illumos
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible.  In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots.  For that reason, this function eschews parity for
 * performance and simplicity.  The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less
 * than 128 KB will not fill an entire block; in addition, they may not be
 * properly aligned.  In that case, this function uses the preallocated 128 KB
 * block and omits reading or writing any "empty" portions of that block, as
 * opposed to allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs.  8 KB of data would be written
 * to each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If
 * the I/O size is less than 128 KB, only the actual portions of data are
 * written.  In this example the data is written to the third data vdev since
 * that vdev contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example
 * above would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, this is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
#endif

/*
 * Convert a logical (physical) size into the amount of space this RAID-Z
 * vdev consumes for it: round up to whole sectors, add one parity sector
 * per (cols - nparity) data sectors, then round up to a multiple of
 * (nparity + 1) sectors.
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

/*
 * Completion callback for a child I/O: record its result in the
 * corresponding raidz column.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Pass the free straight through to every column. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	/* Speculative I/Os don't bump stats or post ereports. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* Dump-device blocks carry no parity; nothing to verify. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/* Save the parity we actually read before regenerating it. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

/*
 * Return the most severe error recorded across all columns of the map.
 */
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try every combination of n target columns, for growing n. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Odometer-style walk over all n-column combinations. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * A column we overwrote and had actually read
				 * must have held bad data: report it.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* All combinations failed; only n - 1 buffers were allocated. */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2.
 * Return, setting an error code if too few child VDevs were written
 * to reconstruct the data later.  Note that partial writes are
 * considered successful if they can be reconstructed at all.
 * - For read operations:
 *	1. Check for errors on the child IOs.
 *	2. If data errors occurred:
 *	   a. Try to reassemble the data from the parity available.
 *	   b. If we haven't yet read the parity drives, read them now.
 *	   c. If all parity drives have been read but the data still doesn't
 *	      reassemble with a correct checksum, then try combinatorial
 *	      reconstruction.
 *	   d. If that doesn't work, return an error.
 *	3. If there were unexpected errors or this is a resilver operation,
 *	   rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors not predicted by DTLs */
	int parity_errors = 0;		/* failed parity columns */
	int parity_untried = 0;		/* parity columns never read */
	int data_errors = 0;		/* failed data columns */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* data columns to reconstruct */
	int code;			/* reconstruction method used */

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally the outcome of every column.  Columns before rm_firstdatacol
	 * hold parity; the rest hold data.  A column that failed but was
	 * skipped on purpose (rc_skipped) is not "unexpected".
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	} else if (zio->io_type == ZIO_TYPE_FREE) {
		/* Nothing to verify or repair for a free. */
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			/* All data columns arrived intact; just verify. */
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 *
	 * Setting unexpected_errors here ensures the repair pass at "done:"
	 * runs if these retried reads eventually produce good data.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		/*
		 * At least one column was never read: reset the zio for a
		 * retry, then issue reads for every remaining untried
		 * column.  Note the inner do/while advances the outer index
		 * c; the function returns here and will be re-entered when
		 * the retried child I/Os complete.
		 */
		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		/* More failures than parity columns: unrecoverable. */
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	/*
	 * If the pool is writable and we recovered good data despite errors
	 * (or we're resilvering), write the known-good data back to the
	 * children that failed.
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

/*
 * Propagate child state up to this RAID-Z vdev: it cannot open once more
 * children have faulted than it has parity columns; any faulted or degraded
 * child leaves it degraded; otherwise it is healthy.
 */
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Operations vector for RAID-Z vdevs.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,		/* vdev_op_open */
	vdev_raidz_close,		/* vdev_op_close */
	vdev_raidz_asize,		/* vdev_op_asize */
	vdev_raidz_io_start,		/* vdev_op_io_start */
	vdev_raidz_io_done,		/* vdev_op_io_done */
	vdev_raidz_state_change,	/* vdev_op_state_change */
	NULL,				/* vdev_op_hold */
	NULL,				/* vdev_op_rele */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};