24 */ 25 26#include <sys/zfs_context.h> 27#include <sys/spa.h> 28#include <sys/vdev_impl.h> 29#include <sys/zio.h> 30#include <sys/zio_checksum.h> 31#include <sys/fs/zfs.h> 32#include <sys/fm/fs/zfs.h> 33 34/* 35 * Virtual device vector for RAID-Z. 36 * 37 * This vdev supports single, double, and triple parity. For single parity, 38 * we use a simple XOR of all the data columns. For double or triple parity, 39 * we use a special case of Reed-Solomon coding. This extends the 40 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 41 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 42 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 43 * former is also based. The latter is designed to provide higher performance 44 * for writes. 45 * 46 * Note that the Plank paper claimed to support arbitrary N+M, but was then 47 * amended six years later identifying a critical flaw that invalidates its 48 * claims. Nevertheless, the technique can be adapted to work for up to 49 * triple parity. For additional parity, the amendment "Note: Correction to 50 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 51 * is viable, but the additional complexity means that write performance will 52 * suffer. 53 * 54 * All of the methods above operate on a Galois field, defined over the 55 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 56 * can be expressed with a single byte. Briefly, the operations on the 57 * field are defined as follows: 58 * 59 * o addition (+) is represented by a bitwise XOR 60 * o subtraction (-) is therefore identical to addition: A + B = A - B 61 * o multiplication of A by 2 is defined by the following bitwise expression: 62 * (A * 2)_7 = A_6 63 * (A * 2)_6 = A_5 64 * (A * 2)_5 = A_4 65 * (A * 2)_4 = A_3 + A_7 66 * (A * 2)_3 = A_2 + A_7 67 * (A * 2)_2 = A_1 + A_7 68 * (A * 2)_1 = A_0 69 * (A * 2)_0 = A_7 70 * 71 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 72 * As an aside, this multiplication is derived from the error correcting 73 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 74 * 75 * Observe that any number in the field (except for 0) can be expressed as a 76 * power of 2 -- a generator for the field. We store a table of the powers of 77 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 78 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 79 * than field addition). The inverse of a field element A (A^-1) is therefore 80 * A ^ (255 - 1) = A^254. 81 * 82 * The up-to-three parity columns, P, Q, R over several data columns, 83 * D_0, ... D_n-1, can be expressed by field operations: 84 * 85 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 86 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 87 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 88 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 89 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 90 * 91 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 92 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 93 * independent coefficients. (There are no additional coefficients that have 94 * this property which is why the uncorrected Plank method breaks down.) 95 * 96 * See the reconstruction code below for how P, Q and R can used individually 97 * or in concert to recover missing data columns. 98 */ 99 100typedef struct raidz_col { 101 uint64_t rc_devidx; /* child device index for I/O */ 102 uint64_t rc_offset; /* device offset */ 103 uint64_t rc_size; /* I/O size */ 104 void *rc_data; /* I/O data */ 105 void *rc_gdata; /* used to store the "good" version */ 106 int rc_error; /* I/O error for this device */ 107 uint8_t rc_tried; /* Did we attempt this I/O column? */ 108 uint8_t rc_skipped; /* Did we skip this I/O column? */ 109} raidz_col_t; 110 111typedef struct raidz_map { 112 uint64_t rm_cols; /* Regular column count */ 113 uint64_t rm_scols; /* Count including skipped columns */ 114 uint64_t rm_bigcols; /* Number of oversized columns */ 115 uint64_t rm_asize; /* Actual total I/O size */ 116 uint64_t rm_missingdata; /* Count of missing data devices */ 117 uint64_t rm_missingparity; /* Count of missing parity devices */ 118 uint64_t rm_firstdatacol; /* First data column/parity count */ 119 uint64_t rm_nskip; /* Skipped sectors for padding */ 120 uint64_t rm_skipstart; /* Column index of padding start */ 121 void *rm_datacopy; /* rm_asize-buffer of copied data */ 122 uintptr_t rm_reports; /* # of referencing checksum reports */ 123 uint8_t rm_freed; /* map no longer has referencing ZIO */ 124 uint8_t rm_ecksuminjected; /* checksum error was injected */ 125 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 126} raidz_map_t; 127 128#define VDEV_RAIDZ_P 0 129#define VDEV_RAIDZ_Q 1 130#define VDEV_RAIDZ_R 2 131 132#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 133#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 134 135/* 136 * We provide a mechanism to perform the field multiplication operation on a 137 * 64-bit value all at once rather than a byte at a time. This works by 138 * creating a mask from the top bit in each byte and using that to 139 * conditionally apply the XOR of 0x1d. 140 */ 141#define VDEV_RAIDZ_64MUL_2(x, mask) \ 142{ \ 143 (mask) = (x) & 0x8080808080808080ULL; \ 144 (mask) = ((mask) << 1) - ((mask) >> 7); \ 145 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 146 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 147} 148 149#define VDEV_RAIDZ_64MUL_4(x, mask) \ 150{ \ 151 VDEV_RAIDZ_64MUL_2((x), mask); \ 152 VDEV_RAIDZ_64MUL_2((x), mask); \ 153} 154 155/* 156 * Force reconstruction to use the general purpose method. 157 */ 158int vdev_raidz_default_to_general; 159 160/* 161 * These two tables represent powers and logs of 2 in the Galois field defined 162 * above. These values were computed by repeatedly multiplying by 2 as above. 163 */ 164static const uint8_t vdev_raidz_pow2[256] = { 165 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 166 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 167 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 168 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 169 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 170 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 171 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 172 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 173 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 174 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 175 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 176 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 177 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 178 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 179 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 180 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 181 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 182 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 183 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 184 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 185 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 186 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 187 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 188 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 189 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 190 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 191 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 192 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 193 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 194 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 195 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 196 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 197}; 198static const uint8_t vdev_raidz_log2[256] = { 199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 231}; 232 233static void vdev_raidz_generate_parity(raidz_map_t *rm); 234 235/* 236 * Multiply a given number by 2 raised to the given power. 237 */ 238static uint8_t 239vdev_raidz_exp2(uint_t a, int exp) 240{ 241 if (a == 0) 242 return (0); 243 244 ASSERT(exp >= 0); 245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 246 247 exp += vdev_raidz_log2[a]; 248 if (exp > 255) 249 exp -= 255; 250 251 return (vdev_raidz_pow2[exp]); 252} 253 254static void 255vdev_raidz_map_free(raidz_map_t *rm) 256{ 257 int c; 258 size_t size; 259 260 for (c = 0; c < rm->rm_firstdatacol; c++) { 261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 262 263 if (rm->rm_col[c].rc_gdata != NULL) 264 zio_buf_free(rm->rm_col[c].rc_gdata, 265 rm->rm_col[c].rc_size); 266 } 267 268 size = 0; 269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 270 size += rm->rm_col[c].rc_size; 271 272 if (rm->rm_datacopy != NULL) 273 zio_buf_free(rm->rm_datacopy, size); 274 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 276} 277 278static void 279vdev_raidz_map_free_vsd(zio_t *zio) 280{ 281 raidz_map_t *rm = zio->io_vsd; 282 283 ASSERT3U(rm->rm_freed, ==, 0); 284 rm->rm_freed = 1; 285 286 if (rm->rm_reports == 0) 287 vdev_raidz_map_free(rm); 288} 289 290/*ARGSUSED*/ 291static void 292vdev_raidz_cksum_free(void *arg, size_t ignored) 293{ 294 raidz_map_t *rm = arg; 295 296 ASSERT3U(rm->rm_reports, >, 0); 297 298 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 299 vdev_raidz_map_free(rm); 300} 301 302static void 303vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 304{ 305 raidz_map_t *rm = zcr->zcr_cbdata; 306 size_t c = zcr->zcr_cbinfo; 307 size_t x; 308 309 const char *good = NULL; 310 const char *bad = rm->rm_col[c].rc_data; 311 312 if (good_data == NULL) { 313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 314 return; 315 } 316 317 if (c < rm->rm_firstdatacol) { 318 /* 319 * The first time through, calculate the parity blocks for 320 * the good data (this relies on the fact that the good 321 * data never changes for a given logical ZIO) 322 */ 323 if (rm->rm_col[0].rc_gdata == NULL) { 324 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 325 char *buf; 326 327 /* 328 * Set up the rm_col[]s to generate the parity for 329 * good_data, first saving the parity bufs and 330 * replacing them with buffers to hold the result. 331 */ 332 for (x = 0; x < rm->rm_firstdatacol; x++) { 333 bad_parity[x] = rm->rm_col[x].rc_data; 334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 335 zio_buf_alloc(rm->rm_col[x].rc_size); 336 } 337 338 /* fill in the data columns from good_data */ 339 buf = (char *)good_data; 340 for (; x < rm->rm_cols; x++) { 341 rm->rm_col[x].rc_data = buf; 342 buf += rm->rm_col[x].rc_size; 343 } 344 345 /* 346 * Construct the parity from the good data. 347 */ 348 vdev_raidz_generate_parity(rm); 349 350 /* restore everything back to its original state */ 351 for (x = 0; x < rm->rm_firstdatacol; x++) 352 rm->rm_col[x].rc_data = bad_parity[x]; 353 354 buf = rm->rm_datacopy; 355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 356 rm->rm_col[x].rc_data = buf; 357 buf += rm->rm_col[x].rc_size; 358 } 359 } 360 361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 362 good = rm->rm_col[c].rc_gdata; 363 } else { 364 /* adjust good_data to point at the start of our column */ 365 good = good_data; 366 367 for (x = rm->rm_firstdatacol; x < c; x++) 368 good += rm->rm_col[x].rc_size; 369 } 370 371 /* we drop the ereport if it ends up that the data was good */ 372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 373} 374 375/* 376 * Invoked indirectly by zfs_ereport_start_checksum(), called 377 * below when our read operation fails completely. The main point 378 * is to keep a copy of everything we read from disk, so that at 379 * vdev_raidz_cksum_finish() time we can compare it with the good data. 380 */ 381static void 382vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 383{ 384 size_t c = (size_t)(uintptr_t)arg; 385 caddr_t buf; 386 387 raidz_map_t *rm = zio->io_vsd; 388 size_t size; 389 390 /* set up the report and bump the refcount */ 391 zcr->zcr_cbdata = rm; 392 zcr->zcr_cbinfo = c; 393 zcr->zcr_finish = vdev_raidz_cksum_finish; 394 zcr->zcr_free = vdev_raidz_cksum_free; 395 396 rm->rm_reports++; 397 ASSERT3U(rm->rm_reports, >, 0); 398 399 if (rm->rm_datacopy != NULL) 400 return; 401 402 /* 403 * It's the first time we're called for this raidz_map_t, so we need 404 * to copy the data aside; there's no guarantee that our zio's buffer 405 * won't be re-used for something else. 406 * 407 * Our parity data is already in separate buffers, so there's no need 408 * to copy them. 409 */ 410 411 size = 0; 412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 413 size += rm->rm_col[c].rc_size; 414 415 buf = rm->rm_datacopy = zio_buf_alloc(size); 416 417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 418 raidz_col_t *col = &rm->rm_col[c]; 419 420 bcopy(col->rc_data, buf, col->rc_size); 421 col->rc_data = buf; 422 423 buf += col->rc_size; 424 } 425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 426} 427 428static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 429 vdev_raidz_map_free_vsd, 430 vdev_raidz_cksum_report 431}; 432 433static raidz_map_t * 434vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 435 uint64_t nparity) 436{ 437 raidz_map_t *rm; 438 uint64_t b = zio->io_offset >> unit_shift; 439 uint64_t s = zio->io_size >> unit_shift; 440 uint64_t f = b % dcols; 441 uint64_t o = (b / dcols) << unit_shift; 442 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 443 444 q = s / (dcols - nparity); 445 r = s - q * (dcols - nparity); 446 bc = (r == 0 ? 0 : r + nparity); 447 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 448 449 if (q == 0) { 450 acols = bc; 451 scols = MIN(dcols, roundup(bc, nparity + 1)); 452 } else { 453 acols = dcols; 454 scols = dcols; 455 } 456 457 ASSERT3U(acols, <=, scols); 458 459 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 460 461 rm->rm_cols = acols; 462 rm->rm_scols = scols; 463 rm->rm_bigcols = bc; 464 rm->rm_skipstart = bc; 465 rm->rm_missingdata = 0; 466 rm->rm_missingparity = 0; 467 rm->rm_firstdatacol = nparity; 468 rm->rm_datacopy = NULL; 469 rm->rm_reports = 0; 470 rm->rm_freed = 0; 471 rm->rm_ecksuminjected = 0; 472 473 asize = 0; 474 475 for (c = 0; c < scols; c++) { 476 col = f + c; 477 coff = o; 478 if (col >= dcols) { 479 col -= dcols; 480 coff += 1ULL << unit_shift; 481 } 482 rm->rm_col[c].rc_devidx = col; 483 rm->rm_col[c].rc_offset = coff; 484 rm->rm_col[c].rc_data = NULL; 485 rm->rm_col[c].rc_gdata = NULL; 486 rm->rm_col[c].rc_error = 0; 487 rm->rm_col[c].rc_tried = 0; 488 rm->rm_col[c].rc_skipped = 0; 489 490 if (c >= acols) 491 rm->rm_col[c].rc_size = 0; 492 else if (c < bc) 493 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 494 else 495 rm->rm_col[c].rc_size = q << unit_shift; 496 497 asize += rm->rm_col[c].rc_size; 498 } 499 500 ASSERT3U(asize, ==, tot << unit_shift); 501 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 502 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 503 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 504 ASSERT3U(rm->rm_nskip, <=, nparity); 505 506 for (c = 0; c < rm->rm_firstdatacol; c++) 507 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 508 509 rm->rm_col[c].rc_data = zio->io_data; 510 511 for (c = c + 1; c < acols; c++) 512 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 513 rm->rm_col[c - 1].rc_size; 514 515 /* 516 * If all data stored spans all columns, there's a danger that parity 517 * will always be on the same device and, since parity isn't read 518 * during normal operation, that that device's I/O bandwidth won't be 519 * used effectively. We therefore switch the parity every 1MB. 520 * 521 * ... at least that was, ostensibly, the theory. As a practical 522 * matter unless we juggle the parity between all devices evenly, we 523 * won't see any benefit. Further, occasional writes that aren't a 524 * multiple of the LCM of the number of children and the minimum 525 * stripe width are sufficient to avoid pessimal behavior. 526 * Unfortunately, this decision created an implicit on-disk format 527 * requirement that we need to support for all eternity, but only 528 * for single-parity RAID-Z. 529 * 530 * If we intend to skip a sector in the zeroth column for padding 531 * we must make sure to note this swap. We will never intend to 532 * skip the first column since at least one data and one parity 533 * column must appear in each row. 534 */ 535 ASSERT(rm->rm_cols >= 2); 536 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 537 538 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 539 devidx = rm->rm_col[0].rc_devidx; 540 o = rm->rm_col[0].rc_offset; 541 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 542 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 543 rm->rm_col[1].rc_devidx = devidx; 544 rm->rm_col[1].rc_offset = o; 545 546 if (rm->rm_skipstart == 0) 547 rm->rm_skipstart = 1; 548 } 549 550 zio->io_vsd = rm; 551 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 552 return (rm); 553} 554 555static void 556vdev_raidz_generate_parity_p(raidz_map_t *rm) 557{ 558 uint64_t *p, *src, pcount, ccount, i; 559 int c; 560 561 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 562 563 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 564 src = rm->rm_col[c].rc_data; 565 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 566 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 567 568 if (c == rm->rm_firstdatacol) { 569 ASSERT(ccount == pcount); 570 for (i = 0; i < ccount; i++, src++, p++) { 571 *p = *src; 572 } 573 } else { 574 ASSERT(ccount <= pcount); 575 for (i = 0; i < ccount; i++, src++, p++) { 576 *p ^= *src; 577 } 578 } 579 } 580} 581 582static void 583vdev_raidz_generate_parity_pq(raidz_map_t *rm) 584{ 585 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 586 int c; 587 588 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 589 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 590 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 591 592 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 593 src = rm->rm_col[c].rc_data; 594 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 595 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 596 597 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 598 599 if (c == rm->rm_firstdatacol) { 600 ASSERT(ccnt == pcnt || ccnt == 0); 601 for (i = 0; i < ccnt; i++, src++, p++, q++) { 602 *p = *src; 603 *q = *src; 604 } 605 for (; i < pcnt; i++, src++, p++, q++) { 606 *p = 0; 607 *q = 0; 608 } 609 } else { 610 ASSERT(ccnt <= pcnt); 611 612 /* 613 * Apply the algorithm described above by multiplying 614 * the previous result and adding in the new value. 615 */ 616 for (i = 0; i < ccnt; i++, src++, p++, q++) { 617 *p ^= *src; 618 619 VDEV_RAIDZ_64MUL_2(*q, mask); 620 *q ^= *src; 621 } 622 623 /* 624 * Treat short columns as though they are full of 0s. 625 * Note that there's therefore nothing needed for P. 626 */ 627 for (; i < pcnt; i++, q++) { 628 VDEV_RAIDZ_64MUL_2(*q, mask); 629 } 630 } 631 } 632} 633 634static void 635vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 636{ 637 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 638 int c; 639 640 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 641 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 642 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 643 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 644 rm->rm_col[VDEV_RAIDZ_R].rc_size); 645 646 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 647 src = rm->rm_col[c].rc_data; 648 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 649 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 650 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 651 652 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 653 654 if (c == rm->rm_firstdatacol) { 655 ASSERT(ccnt == pcnt || ccnt == 0); 656 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 657 *p = *src; 658 *q = *src; 659 *r = *src; 660 } 661 for (; i < pcnt; i++, src++, p++, q++, r++) { 662 *p = 0; 663 *q = 0; 664 *r = 0; 665 } 666 } else { 667 ASSERT(ccnt <= pcnt); 668 669 /* 670 * Apply the algorithm described above by multiplying 671 * the previous result and adding in the new value. 672 */ 673 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 674 *p ^= *src; 675 676 VDEV_RAIDZ_64MUL_2(*q, mask); 677 *q ^= *src; 678 679 VDEV_RAIDZ_64MUL_4(*r, mask); 680 *r ^= *src; 681 } 682 683 /* 684 * Treat short columns as though they are full of 0s. 685 * Note that there's therefore nothing needed for P. 686 */ 687 for (; i < pcnt; i++, q++, r++) { 688 VDEV_RAIDZ_64MUL_2(*q, mask); 689 VDEV_RAIDZ_64MUL_4(*r, mask); 690 } 691 } 692 } 693} 694 695/* 696 * Generate RAID parity in the first virtual columns according to the number of 697 * parity columns available. 698 */ 699static void 700vdev_raidz_generate_parity(raidz_map_t *rm) 701{ 702 switch (rm->rm_firstdatacol) { 703 case 1: 704 vdev_raidz_generate_parity_p(rm); 705 break; 706 case 2: 707 vdev_raidz_generate_parity_pq(rm); 708 break; 709 case 3: 710 vdev_raidz_generate_parity_pqr(rm); 711 break; 712 default: 713 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 714 } 715} 716 717static int 718vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 719{ 720 uint64_t *dst, *src, xcount, ccount, count, i; 721 int x = tgts[0]; 722 int c; 723 724 ASSERT(ntgts == 1); 725 ASSERT(x >= rm->rm_firstdatacol); 726 ASSERT(x < rm->rm_cols); 727 728 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 729 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 730 ASSERT(xcount > 0); 731 732 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 733 dst = rm->rm_col[x].rc_data; 734 for (i = 0; i < xcount; i++, dst++, src++) { 735 *dst = *src; 736 } 737 738 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 739 src = rm->rm_col[c].rc_data; 740 dst = rm->rm_col[x].rc_data; 741 742 if (c == x) 743 continue; 744 745 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 746 count = MIN(ccount, xcount); 747 748 for (i = 0; i < count; i++, dst++, src++) { 749 *dst ^= *src; 750 } 751 } 752 753 return (1 << VDEV_RAIDZ_P); 754} 755 756static int 757vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 758{ 759 uint64_t *dst, *src, xcount, ccount, count, mask, i; 760 uint8_t *b; 761 int x = tgts[0]; 762 int c, j, exp; 763 764 ASSERT(ntgts == 1); 765 766 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 767 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 768 769 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 770 src = rm->rm_col[c].rc_data; 771 dst = rm->rm_col[x].rc_data; 772 773 if (c == x) 774 ccount = 0; 775 else 776 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 777 778 count = MIN(ccount, xcount); 779 780 if (c == rm->rm_firstdatacol) { 781 for (i = 0; i < count; i++, dst++, src++) { 782 *dst = *src; 783 } 784 for (; i < xcount; i++, dst++) { 785 *dst = 0; 786 } 787 788 } else { 789 for (i = 0; i < count; i++, dst++, src++) { 790 VDEV_RAIDZ_64MUL_2(*dst, mask); 791 *dst ^= *src; 792 } 793 794 for (; i < xcount; i++, dst++) { 795 VDEV_RAIDZ_64MUL_2(*dst, mask); 796 } 797 } 798 } 799 800 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 801 dst = rm->rm_col[x].rc_data; 802 exp = 255 - (rm->rm_cols - 1 - x); 803 804 for (i = 0; i < xcount; i++, dst++, src++) { 805 *dst ^= *src; 806 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 807 *b = vdev_raidz_exp2(*b, exp); 808 } 809 } 810 811 return (1 << VDEV_RAIDZ_Q); 812} 813 814static int 815vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 816{ 817 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 818 void *pdata, *qdata; 819 uint64_t xsize, ysize, i; 820 int x = tgts[0]; 821 int y = tgts[1]; 822 823 ASSERT(ntgts == 2); 824 ASSERT(x < y); 825 ASSERT(x >= rm->rm_firstdatacol); 826 ASSERT(y < rm->rm_cols); 827 828 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 829 830 /* 831 * Move the parity data aside -- we're going to compute parity as 832 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 833 * reuse the parity generation mechanism without trashing the actual 834 * parity so we make those columns appear to be full of zeros by 835 * setting their lengths to zero. 836 */ 837 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 838 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 839 xsize = rm->rm_col[x].rc_size; 840 ysize = rm->rm_col[y].rc_size; 841 842 rm->rm_col[VDEV_RAIDZ_P].rc_data = 843 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 844 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 845 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 846 rm->rm_col[x].rc_size = 0; 847 rm->rm_col[y].rc_size = 0; 848 849 vdev_raidz_generate_parity_pq(rm); 850 851 rm->rm_col[x].rc_size = xsize; 852 rm->rm_col[y].rc_size = ysize; 853 854 p = pdata; 855 q = qdata; 856 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 857 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 858 xd = rm->rm_col[x].rc_data; 859 yd = rm->rm_col[y].rc_data; 860 861 /* 862 * We now have: 863 * Pxy = P + D_x + D_y 864 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 865 * 866 * We can then solve for D_x: 867 * D_x = A * (P + Pxy) + B * (Q + Qxy) 868 * where 869 * A = 2^(x - y) * (2^(x - y) + 1)^-1 870 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 871 * 872 * With D_x in hand, we can easily solve for D_y: 873 * D_y = P + Pxy + D_x 874 */ 875 876 a = vdev_raidz_pow2[255 + x - y]; 877 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 878 tmp = 255 - vdev_raidz_log2[a ^ 1]; 879 880 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 881 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 882 883 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 884 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 885 vdev_raidz_exp2(*q ^ *qxy, bexp); 886 887 if (i < ysize) 888 *yd = *p ^ *pxy ^ *xd; 889 } 890 891 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 892 rm->rm_col[VDEV_RAIDZ_P].rc_size); 893 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 894 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 895 896 /* 897 * Restore the saved parity data. 898 */ 899 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 900 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 901 902 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 903} 904 905/* BEGIN CSTYLED */ 906/* 907 * In the general case of reconstruction, we must solve the system of linear 908 * equations defined by the coeffecients used to generate parity as well as 909 * the contents of the data and parity disks. This can be expressed with 910 * vectors for the original data (D) and the actual data (d) and parity (p) 911 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 912 * 913 * __ __ __ __ 914 * | | __ __ | p_0 | 915 * | V | | D_0 | | p_m-1 | 916 * | | x | : | = | d_0 | 917 * | I | | D_n-1 | | : | 918 * | | ~~ ~~ | d_n-1 | 919 * ~~ ~~ ~~ ~~ 920 * 921 * I is simply a square identity matrix of size n, and V is a vandermonde 922 * matrix defined by the coeffecients we chose for the various parity columns 923 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 924 * computation as well as linear separability. 925 * 926 * __ __ __ __ 927 * | 1 .. 1 1 1 | | p_0 | 928 * | 2^n-1 .. 4 2 1 | __ __ | : | 929 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 930 * | 1 .. 0 0 0 | | D_1 | | d_0 | 931 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 932 * | : : : : | | : | | d_2 | 933 * | 0 .. 1 0 0 | | D_n-1 | | : | 934 * | 0 .. 0 1 0 | ~~ ~~ | : | 935 * | 0 .. 0 0 1 | | d_n-1 | 936 * ~~ ~~ ~~ ~~ 937 * 938 * Note that I, V, d, and p are known. To compute D, we must invert the 939 * matrix and use the known data and parity values to reconstruct the unknown 940 * data values. We begin by removing the rows in V|I and d|p that correspond 941 * to failed or missing columns; we then make V|I square (n x n) and d|p 942 * sized n by removing rows corresponding to unused parity from the bottom up 943 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 944 * using Gauss-Jordan elimination. In the example below we use m=3 parity 945 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 946 * __ __ 947 * | 1 1 1 1 1 1 1 1 | 948 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 949 * | 19 205 116 29 64 16 4 1 | / / 950 * | 1 0 0 0 0 0 0 0 | / / 951 * | 0 1 0 0 0 0 0 0 | <--' / 952 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 953 * | 0 0 0 1 0 0 0 0 | 954 * | 0 0 0 0 1 0 0 0 | 955 * | 0 0 0 0 0 1 0 0 | 956 * | 0 0 0 0 0 0 1 0 | 957 * | 0 0 0 0 0 0 0 1 | 958 * ~~ ~~ 959 * __ __ 960 * | 1 1 1 1 1 1 1 1 | 961 * | 128 64 32 16 8 4 2 1 | 962 * | 19 205 116 29 64 16 4 1 | 963 * | 1 0 0 0 0 0 0 0 | 964 * | 0 1 0 0 0 0 0 0 | 965 * (V|I)' = | 0 0 1 0 0 0 0 0 | 966 * | 0 0 0 1 0 0 0 0 | 967 * | 0 0 0 0 1 0 0 0 | 968 * | 0 0 0 0 0 1 0 0 | 969 * | 0 0 0 0 0 0 1 0 | 970 * | 0 0 0 0 0 0 0 1 | 971 * ~~ ~~ 972 * 973 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 974 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 975 * matrix is not singular. 976 * __ __ 977 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 978 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 979 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 980 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 981 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 982 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 983 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 984 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 985 * ~~ ~~ 986 * __ __ 987 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 988 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 989 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 990 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 991 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 992 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 993 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 994 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 995 * ~~ ~~ 996 * __ __ 997 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 998 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 999 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1000 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1001 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1002 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1003 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1004 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1005 * ~~ ~~ 1006 * __ __ 1007 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1008 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1009 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1015 * ~~ ~~ 1016 * __ __ 1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1018 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1019 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1025 * ~~ ~~ 1026 * __ __ 1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1028 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1029 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1035 * ~~ ~~ 1036 * __ __ 1037 * | 0 0 1 0 0 0 0 0 | 1038 * | 167 100 5 41 159 169 217 208 | 1039 * | 166 100 4 40 158 168 216 209 | 1040 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1041 * | 0 0 0 0 1 0 0 0 | 1042 * | 0 0 0 0 0 1 0 0 | 1043 * | 0 0 0 0 0 0 1 0 | 1044 * | 0 0 0 0 0 0 0 1 | 1045 * ~~ ~~ 1046 * 1047 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1048 * of the missing data. 1049 * 1050 * As is apparent from the example above, the only non-trivial rows in the 1051 * inverse matrix correspond to the data disks that we're trying to 1052 * reconstruct. Indeed, those are the only rows we need as the others would 1053 * only be useful for reconstructing data known or assumed to be valid. For 1054 * that reason, we only build the coefficients in the rows that correspond to 1055 * targeted columns. 1056 */ 1057/* END CSTYLED */ 1058 1059static void 1060vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1061 uint8_t **rows) 1062{ 1063 int i, j; 1064 int pow; 1065 1066 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1067 1068 /* 1069 * Fill in the missing rows of interest. 1070 */ 1071 for (i = 0; i < nmap; i++) { 1072 ASSERT3S(0, <=, map[i]); 1073 ASSERT3S(map[i], <=, 2); 1074 1075 pow = map[i] * n; 1076 if (pow > 255) 1077 pow -= 255; 1078 ASSERT(pow <= 255); 1079 1080 for (j = 0; j < n; j++) { 1081 pow -= map[i]; 1082 if (pow < 0) 1083 pow += 255; 1084 rows[i][j] = vdev_raidz_pow2[pow]; 1085 } 1086 } 1087} 1088 1089static void 1090vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1091 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1092{ 1093 int i, j, ii, jj; 1094 uint8_t log; 1095 1096 /* 1097 * Assert that the first nmissing entries from the array of used 1098 * columns correspond to parity columns and that subsequent entries 1099 * correspond to data columns. 1100 */ 1101 for (i = 0; i < nmissing; i++) { 1102 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1103 } 1104 for (; i < n; i++) { 1105 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1106 } 1107 1108 /* 1109 * First initialize the storage where we'll compute the inverse rows. 1110 */ 1111 for (i = 0; i < nmissing; i++) { 1112 for (j = 0; j < n; j++) { 1113 invrows[i][j] = (i == j) ? 1 : 0; 1114 } 1115 } 1116 1117 /* 1118 * Subtract all trivial rows from the rows of consequence. 1119 */ 1120 for (i = 0; i < nmissing; i++) { 1121 for (j = nmissing; j < n; j++) { 1122 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1123 jj = used[j] - rm->rm_firstdatacol; 1124 ASSERT3S(jj, <, n); 1125 invrows[i][j] = rows[i][jj]; 1126 rows[i][jj] = 0; 1127 } 1128 } 1129 1130 /* 1131 * For each of the rows of interest, we must normalize it and subtract 1132 * a multiple of it from the other rows. 1133 */ 1134 for (i = 0; i < nmissing; i++) { 1135 for (j = 0; j < missing[i]; j++) { 1136 ASSERT3U(rows[i][j], ==, 0); 1137 } 1138 ASSERT3U(rows[i][missing[i]], !=, 0); 1139 1140 /* 1141 * Compute the inverse of the first element and multiply each 1142 * element in the row by that value. 1143 */ 1144 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1145 1146 for (j = 0; j < n; j++) { 1147 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1148 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1149 } 1150 1151 for (ii = 0; ii < nmissing; ii++) { 1152 if (i == ii) 1153 continue; 1154 1155 ASSERT3U(rows[ii][missing[i]], !=, 0); 1156 1157 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1158 1159 for (j = 0; j < n; j++) { 1160 rows[ii][j] ^= 1161 vdev_raidz_exp2(rows[i][j], log); 1162 invrows[ii][j] ^= 1163 vdev_raidz_exp2(invrows[i][j], log); 1164 } 1165 } 1166 } 1167 1168 /* 1169 * Verify that the data that is left in the rows are properly part of 1170 * an identity matrix. 1171 */ 1172 for (i = 0; i < nmissing; i++) { 1173 for (j = 0; j < n; j++) { 1174 if (j == missing[i]) { 1175 ASSERT3U(rows[i][j], ==, 1); 1176 } else { 1177 ASSERT3U(rows[i][j], ==, 0); 1178 } 1179 } 1180 } 1181} 1182 1183static void 1184vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1185 int *missing, uint8_t **invrows, const uint8_t *used) 1186{ 1187 int i, j, x, cc, c; 1188 uint8_t *src; 1189 uint64_t ccount; 1190 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1191 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1192 uint8_t log, val; 1193 int ll; 1194 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1195 uint8_t *p, *pp; 1196 size_t psize; 1197 1198 psize = sizeof (invlog[0][0]) * n * nmissing; 1199 p = kmem_alloc(psize, KM_SLEEP); 1200 1201 for (pp = p, i = 0; i < nmissing; i++) { 1202 invlog[i] = pp; 1203 pp += n; 1204 } 1205 1206 for (i = 0; i < nmissing; i++) { 1207 for (j = 0; j < n; j++) { 1208 ASSERT3U(invrows[i][j], !=, 0); 1209 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1210 } 1211 } 1212 1213 for (i = 0; i < n; i++) { 1214 c = used[i]; 1215 ASSERT3U(c, <, rm->rm_cols); 1216 1217 src = rm->rm_col[c].rc_data; 1218 ccount = rm->rm_col[c].rc_size; 1219 for (j = 0; j < nmissing; j++) { 1220 cc = missing[j] + rm->rm_firstdatacol; 1221 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1222 ASSERT3U(cc, <, rm->rm_cols); 1223 ASSERT3U(cc, !=, c); 1224 1225 dst[j] = rm->rm_col[cc].rc_data; 1226 dcount[j] = rm->rm_col[cc].rc_size; 1227 } 1228 1229 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1230 1231 for (x = 0; x < ccount; x++, src++) { 1232 if (*src != 0) 1233 log = vdev_raidz_log2[*src]; 1234 1235 for (cc = 0; cc < nmissing; cc++) { 1236 if (x >= dcount[cc]) 1237 continue; 1238 1239 if (*src == 0) { 1240 val = 0; 1241 } else { 1242 if ((ll = log + invlog[cc][i]) >= 255) 1243 ll -= 255; 1244 val = vdev_raidz_pow2[ll]; 1245 } 1246 1247 if (i == 0) 1248 dst[cc][x] = val; 1249 else 1250 dst[cc][x] ^= val; 1251 } 1252 } 1253 } 1254 1255 kmem_free(p, psize); 1256} 1257 1258static int 1259vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1260{ 1261 int n, i, c, t, tt; 1262 int nmissing_rows; 1263 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1264 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1265 1266 uint8_t *p, *pp; 1267 size_t psize; 1268 1269 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1270 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1271 uint8_t *used; 1272 1273 int code = 0; 1274 1275 1276 n = rm->rm_cols - rm->rm_firstdatacol; 1277 1278 /* 1279 * Figure out which data columns are missing. 1280 */ 1281 nmissing_rows = 0; 1282 for (t = 0; t < ntgts; t++) { 1283 if (tgts[t] >= rm->rm_firstdatacol) { 1284 missing_rows[nmissing_rows++] = 1285 tgts[t] - rm->rm_firstdatacol; 1286 } 1287 } 1288 1289 /* 1290 * Figure out which parity columns to use to help generate the missing 1291 * data columns. 1292 */ 1293 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1294 ASSERT(tt < ntgts); 1295 ASSERT(c < rm->rm_firstdatacol); 1296 1297 /* 1298 * Skip any targeted parity columns. 1299 */ 1300 if (c == tgts[tt]) { 1301 tt++; 1302 continue; 1303 } 1304 1305 code |= 1 << c; 1306 1307 parity_map[i] = c; 1308 i++; 1309 } 1310 1311 ASSERT(code != 0); 1312 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1313 1314 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1315 nmissing_rows * n + sizeof (used[0]) * n; 1316 p = kmem_alloc(psize, KM_SLEEP); 1317 1318 for (pp = p, i = 0; i < nmissing_rows; i++) { 1319 rows[i] = pp; 1320 pp += n; 1321 invrows[i] = pp; 1322 pp += n; 1323 } 1324 used = pp; 1325 1326 for (i = 0; i < nmissing_rows; i++) { 1327 used[i] = parity_map[i]; 1328 } 1329 1330 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1331 if (tt < nmissing_rows && 1332 c == missing_rows[tt] + rm->rm_firstdatacol) { 1333 tt++; 1334 continue; 1335 } 1336 1337 ASSERT3S(i, <, n); 1338 used[i] = c; 1339 i++; 1340 } 1341 1342 /* 1343 * Initialize the interesting rows of the matrix. 1344 */ 1345 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1346 1347 /* 1348 * Invert the matrix. 1349 */ 1350 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1351 invrows, used); 1352 1353 /* 1354 * Reconstruct the missing data using the generated matrix. 1355 */ 1356 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1357 invrows, used); 1358 1359 kmem_free(p, psize); 1360 1361 return (code); 1362} 1363 1364static int 1365vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1366{ 1367 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1368 int ntgts; 1369 int i, c; 1370 int code; 1371 int nbadparity, nbaddata; 1372 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1373 1374 /* 1375 * The tgts list must already be sorted. 1376 */ 1377 for (i = 1; i < nt; i++) { 1378 ASSERT(t[i] > t[i - 1]); 1379 } 1380 1381 nbadparity = rm->rm_firstdatacol; 1382 nbaddata = rm->rm_cols - nbadparity; 1383 ntgts = 0; 1384 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1385 if (c < rm->rm_firstdatacol) 1386 parity_valid[c] = B_FALSE; 1387 1388 if (i < nt && c == t[i]) { 1389 tgts[ntgts++] = c; 1390 i++; 1391 } else if (rm->rm_col[c].rc_error != 0) { 1392 tgts[ntgts++] = c; 1393 } else if (c >= rm->rm_firstdatacol) { 1394 nbaddata--; 1395 } else { 1396 parity_valid[c] = B_TRUE; 1397 nbadparity--; 1398 } 1399 } 1400 1401 ASSERT(ntgts >= nt); 1402 ASSERT(nbaddata >= 0); 1403 ASSERT(nbaddata + nbadparity == ntgts); 1404 1405 dt = &tgts[nbadparity]; 1406 1407 /* 1408 * See if we can use any of our optimized reconstruction routines. 1409 */ 1410 if (!vdev_raidz_default_to_general) { 1411 switch (nbaddata) { 1412 case 1: 1413 if (parity_valid[VDEV_RAIDZ_P]) 1414 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1415 1416 ASSERT(rm->rm_firstdatacol > 1); 1417 1418 if (parity_valid[VDEV_RAIDZ_Q]) 1419 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1420 1421 ASSERT(rm->rm_firstdatacol > 2); 1422 break; 1423 1424 case 2: 1425 ASSERT(rm->rm_firstdatacol > 1); 1426 1427 if (parity_valid[VDEV_RAIDZ_P] && 1428 parity_valid[VDEV_RAIDZ_Q]) 1429 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1430 1431 ASSERT(rm->rm_firstdatacol > 2); 1432 1433 break; 1434 } 1435 } 1436 1437 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1438 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1439 ASSERT(code > 0); 1440 return (code); 1441} 1442 1443static int
| 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/spa.h> 29#include <sys/vdev_impl.h> 30#include <sys/zio.h> 31#include <sys/zio_checksum.h> 32#include <sys/fs/zfs.h> 33#include <sys/fm/fs/zfs.h> 34 35/* 36 * Virtual device vector for RAID-Z. 37 * 38 * This vdev supports single, double, and triple parity. For single parity, 39 * we use a simple XOR of all the data columns. For double or triple parity, 40 * we use a special case of Reed-Solomon coding. This extends the 41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 44 * former is also based. The latter is designed to provide higher performance 45 * for writes. 46 * 47 * Note that the Plank paper claimed to support arbitrary N+M, but was then 48 * amended six years later identifying a critical flaw that invalidates its 49 * claims. Nevertheless, the technique can be adapted to work for up to 50 * triple parity. For additional parity, the amendment "Note: Correction to 51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 52 * is viable, but the additional complexity means that write performance will 53 * suffer. 54 * 55 * All of the methods above operate on a Galois field, defined over the 56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 57 * can be expressed with a single byte. Briefly, the operations on the 58 * field are defined as follows: 59 * 60 * o addition (+) is represented by a bitwise XOR 61 * o subtraction (-) is therefore identical to addition: A + B = A - B 62 * o multiplication of A by 2 is defined by the following bitwise expression: 63 * (A * 2)_7 = A_6 64 * (A * 2)_6 = A_5 65 * (A * 2)_5 = A_4 66 * (A * 2)_4 = A_3 + A_7 67 * (A * 2)_3 = A_2 + A_7 68 * (A * 2)_2 = A_1 + A_7 69 * (A * 2)_1 = A_0 70 * (A * 2)_0 = A_7 71 * 72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 73 * As an aside, this multiplication is derived from the error correcting 74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 75 * 76 * Observe that any number in the field (except for 0) can be expressed as a 77 * power of 2 -- a generator for the field. We store a table of the powers of 78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 80 * than field addition). The inverse of a field element A (A^-1) is therefore 81 * A ^ (255 - 1) = A^254. 82 * 83 * The up-to-three parity columns, P, Q, R over several data columns, 84 * D_0, ... D_n-1, can be expressed by field operations: 85 * 86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 91 * 92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 94 * independent coefficients. (There are no additional coefficients that have 95 * this property which is why the uncorrected Plank method breaks down.) 96 * 97 * See the reconstruction code below for how P, Q and R can used individually 98 * or in concert to recover missing data columns. 99 */ 100 101typedef struct raidz_col { 102 uint64_t rc_devidx; /* child device index for I/O */ 103 uint64_t rc_offset; /* device offset */ 104 uint64_t rc_size; /* I/O size */ 105 void *rc_data; /* I/O data */ 106 void *rc_gdata; /* used to store the "good" version */ 107 int rc_error; /* I/O error for this device */ 108 uint8_t rc_tried; /* Did we attempt this I/O column? */ 109 uint8_t rc_skipped; /* Did we skip this I/O column? */ 110} raidz_col_t; 111 112typedef struct raidz_map { 113 uint64_t rm_cols; /* Regular column count */ 114 uint64_t rm_scols; /* Count including skipped columns */ 115 uint64_t rm_bigcols; /* Number of oversized columns */ 116 uint64_t rm_asize; /* Actual total I/O size */ 117 uint64_t rm_missingdata; /* Count of missing data devices */ 118 uint64_t rm_missingparity; /* Count of missing parity devices */ 119 uint64_t rm_firstdatacol; /* First data column/parity count */ 120 uint64_t rm_nskip; /* Skipped sectors for padding */ 121 uint64_t rm_skipstart; /* Column index of padding start */ 122 void *rm_datacopy; /* rm_asize-buffer of copied data */ 123 uintptr_t rm_reports; /* # of referencing checksum reports */ 124 uint8_t rm_freed; /* map no longer has referencing ZIO */ 125 uint8_t rm_ecksuminjected; /* checksum error was injected */ 126 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 127} raidz_map_t; 128 129#define VDEV_RAIDZ_P 0 130#define VDEV_RAIDZ_Q 1 131#define VDEV_RAIDZ_R 2 132 133#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 134#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 135 136/* 137 * We provide a mechanism to perform the field multiplication operation on a 138 * 64-bit value all at once rather than a byte at a time. This works by 139 * creating a mask from the top bit in each byte and using that to 140 * conditionally apply the XOR of 0x1d. 141 */ 142#define VDEV_RAIDZ_64MUL_2(x, mask) \ 143{ \ 144 (mask) = (x) & 0x8080808080808080ULL; \ 145 (mask) = ((mask) << 1) - ((mask) >> 7); \ 146 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 147 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 148} 149 150#define VDEV_RAIDZ_64MUL_4(x, mask) \ 151{ \ 152 VDEV_RAIDZ_64MUL_2((x), mask); \ 153 VDEV_RAIDZ_64MUL_2((x), mask); \ 154} 155 156/* 157 * Force reconstruction to use the general purpose method. 158 */ 159int vdev_raidz_default_to_general; 160 161/* 162 * These two tables represent powers and logs of 2 in the Galois field defined 163 * above. These values were computed by repeatedly multiplying by 2 as above. 164 */ 165static const uint8_t vdev_raidz_pow2[256] = { 166 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 167 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 168 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 169 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 170 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 171 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 172 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 173 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 174 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 175 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 176 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 177 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 178 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 179 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 180 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 181 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 182 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 183 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 184 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 185 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 186 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 187 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 188 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 189 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 190 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 191 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 192 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 193 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 194 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 195 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 196 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 197 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 198}; 199static const uint8_t vdev_raidz_log2[256] = { 200 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 201 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 202 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 203 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 204 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 205 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 206 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 207 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 208 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 209 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 210 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 211 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 212 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 213 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 214 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 215 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 216 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 217 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 218 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 219 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 220 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 221 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 222 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 223 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 224 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 225 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 226 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 227 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 228 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 229 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 230 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 231 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 232}; 233 234static void vdev_raidz_generate_parity(raidz_map_t *rm); 235 236/* 237 * Multiply a given number by 2 raised to the given power. 238 */ 239static uint8_t 240vdev_raidz_exp2(uint_t a, int exp) 241{ 242 if (a == 0) 243 return (0); 244 245 ASSERT(exp >= 0); 246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 247 248 exp += vdev_raidz_log2[a]; 249 if (exp > 255) 250 exp -= 255; 251 252 return (vdev_raidz_pow2[exp]); 253} 254 255static void 256vdev_raidz_map_free(raidz_map_t *rm) 257{ 258 int c; 259 size_t size; 260 261 for (c = 0; c < rm->rm_firstdatacol; c++) { 262 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 263 264 if (rm->rm_col[c].rc_gdata != NULL) 265 zio_buf_free(rm->rm_col[c].rc_gdata, 266 rm->rm_col[c].rc_size); 267 } 268 269 size = 0; 270 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 271 size += rm->rm_col[c].rc_size; 272 273 if (rm->rm_datacopy != NULL) 274 zio_buf_free(rm->rm_datacopy, size); 275 276 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 277} 278 279static void 280vdev_raidz_map_free_vsd(zio_t *zio) 281{ 282 raidz_map_t *rm = zio->io_vsd; 283 284 ASSERT3U(rm->rm_freed, ==, 0); 285 rm->rm_freed = 1; 286 287 if (rm->rm_reports == 0) 288 vdev_raidz_map_free(rm); 289} 290 291/*ARGSUSED*/ 292static void 293vdev_raidz_cksum_free(void *arg, size_t ignored) 294{ 295 raidz_map_t *rm = arg; 296 297 ASSERT3U(rm->rm_reports, >, 0); 298 299 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 300 vdev_raidz_map_free(rm); 301} 302 303static void 304vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 305{ 306 raidz_map_t *rm = zcr->zcr_cbdata; 307 size_t c = zcr->zcr_cbinfo; 308 size_t x; 309 310 const char *good = NULL; 311 const char *bad = rm->rm_col[c].rc_data; 312 313 if (good_data == NULL) { 314 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 315 return; 316 } 317 318 if (c < rm->rm_firstdatacol) { 319 /* 320 * The first time through, calculate the parity blocks for 321 * the good data (this relies on the fact that the good 322 * data never changes for a given logical ZIO) 323 */ 324 if (rm->rm_col[0].rc_gdata == NULL) { 325 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 326 char *buf; 327 328 /* 329 * Set up the rm_col[]s to generate the parity for 330 * good_data, first saving the parity bufs and 331 * replacing them with buffers to hold the result. 332 */ 333 for (x = 0; x < rm->rm_firstdatacol; x++) { 334 bad_parity[x] = rm->rm_col[x].rc_data; 335 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 336 zio_buf_alloc(rm->rm_col[x].rc_size); 337 } 338 339 /* fill in the data columns from good_data */ 340 buf = (char *)good_data; 341 for (; x < rm->rm_cols; x++) { 342 rm->rm_col[x].rc_data = buf; 343 buf += rm->rm_col[x].rc_size; 344 } 345 346 /* 347 * Construct the parity from the good data. 348 */ 349 vdev_raidz_generate_parity(rm); 350 351 /* restore everything back to its original state */ 352 for (x = 0; x < rm->rm_firstdatacol; x++) 353 rm->rm_col[x].rc_data = bad_parity[x]; 354 355 buf = rm->rm_datacopy; 356 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 357 rm->rm_col[x].rc_data = buf; 358 buf += rm->rm_col[x].rc_size; 359 } 360 } 361 362 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 363 good = rm->rm_col[c].rc_gdata; 364 } else { 365 /* adjust good_data to point at the start of our column */ 366 good = good_data; 367 368 for (x = rm->rm_firstdatacol; x < c; x++) 369 good += rm->rm_col[x].rc_size; 370 } 371 372 /* we drop the ereport if it ends up that the data was good */ 373 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 374} 375 376/* 377 * Invoked indirectly by zfs_ereport_start_checksum(), called 378 * below when our read operation fails completely. The main point 379 * is to keep a copy of everything we read from disk, so that at 380 * vdev_raidz_cksum_finish() time we can compare it with the good data. 381 */ 382static void 383vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 384{ 385 size_t c = (size_t)(uintptr_t)arg; 386 caddr_t buf; 387 388 raidz_map_t *rm = zio->io_vsd; 389 size_t size; 390 391 /* set up the report and bump the refcount */ 392 zcr->zcr_cbdata = rm; 393 zcr->zcr_cbinfo = c; 394 zcr->zcr_finish = vdev_raidz_cksum_finish; 395 zcr->zcr_free = vdev_raidz_cksum_free; 396 397 rm->rm_reports++; 398 ASSERT3U(rm->rm_reports, >, 0); 399 400 if (rm->rm_datacopy != NULL) 401 return; 402 403 /* 404 * It's the first time we're called for this raidz_map_t, so we need 405 * to copy the data aside; there's no guarantee that our zio's buffer 406 * won't be re-used for something else. 407 * 408 * Our parity data is already in separate buffers, so there's no need 409 * to copy them. 410 */ 411 412 size = 0; 413 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 414 size += rm->rm_col[c].rc_size; 415 416 buf = rm->rm_datacopy = zio_buf_alloc(size); 417 418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 419 raidz_col_t *col = &rm->rm_col[c]; 420 421 bcopy(col->rc_data, buf, col->rc_size); 422 col->rc_data = buf; 423 424 buf += col->rc_size; 425 } 426 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 427} 428 429static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 430 vdev_raidz_map_free_vsd, 431 vdev_raidz_cksum_report 432}; 433 434static raidz_map_t * 435vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 436 uint64_t nparity) 437{ 438 raidz_map_t *rm; 439 uint64_t b = zio->io_offset >> unit_shift; 440 uint64_t s = zio->io_size >> unit_shift; 441 uint64_t f = b % dcols; 442 uint64_t o = (b / dcols) << unit_shift; 443 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 444 445 q = s / (dcols - nparity); 446 r = s - q * (dcols - nparity); 447 bc = (r == 0 ? 0 : r + nparity); 448 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 449 450 if (q == 0) { 451 acols = bc; 452 scols = MIN(dcols, roundup(bc, nparity + 1)); 453 } else { 454 acols = dcols; 455 scols = dcols; 456 } 457 458 ASSERT3U(acols, <=, scols); 459 460 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 461 462 rm->rm_cols = acols; 463 rm->rm_scols = scols; 464 rm->rm_bigcols = bc; 465 rm->rm_skipstart = bc; 466 rm->rm_missingdata = 0; 467 rm->rm_missingparity = 0; 468 rm->rm_firstdatacol = nparity; 469 rm->rm_datacopy = NULL; 470 rm->rm_reports = 0; 471 rm->rm_freed = 0; 472 rm->rm_ecksuminjected = 0; 473 474 asize = 0; 475 476 for (c = 0; c < scols; c++) { 477 col = f + c; 478 coff = o; 479 if (col >= dcols) { 480 col -= dcols; 481 coff += 1ULL << unit_shift; 482 } 483 rm->rm_col[c].rc_devidx = col; 484 rm->rm_col[c].rc_offset = coff; 485 rm->rm_col[c].rc_data = NULL; 486 rm->rm_col[c].rc_gdata = NULL; 487 rm->rm_col[c].rc_error = 0; 488 rm->rm_col[c].rc_tried = 0; 489 rm->rm_col[c].rc_skipped = 0; 490 491 if (c >= acols) 492 rm->rm_col[c].rc_size = 0; 493 else if (c < bc) 494 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 495 else 496 rm->rm_col[c].rc_size = q << unit_shift; 497 498 asize += rm->rm_col[c].rc_size; 499 } 500 501 ASSERT3U(asize, ==, tot << unit_shift); 502 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 503 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 504 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 505 ASSERT3U(rm->rm_nskip, <=, nparity); 506 507 for (c = 0; c < rm->rm_firstdatacol; c++) 508 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 509 510 rm->rm_col[c].rc_data = zio->io_data; 511 512 for (c = c + 1; c < acols; c++) 513 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 514 rm->rm_col[c - 1].rc_size; 515 516 /* 517 * If all data stored spans all columns, there's a danger that parity 518 * will always be on the same device and, since parity isn't read 519 * during normal operation, that that device's I/O bandwidth won't be 520 * used effectively. We therefore switch the parity every 1MB. 521 * 522 * ... at least that was, ostensibly, the theory. As a practical 523 * matter unless we juggle the parity between all devices evenly, we 524 * won't see any benefit. Further, occasional writes that aren't a 525 * multiple of the LCM of the number of children and the minimum 526 * stripe width are sufficient to avoid pessimal behavior. 527 * Unfortunately, this decision created an implicit on-disk format 528 * requirement that we need to support for all eternity, but only 529 * for single-parity RAID-Z. 530 * 531 * If we intend to skip a sector in the zeroth column for padding 532 * we must make sure to note this swap. We will never intend to 533 * skip the first column since at least one data and one parity 534 * column must appear in each row. 535 */ 536 ASSERT(rm->rm_cols >= 2); 537 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 538 539 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 540 devidx = rm->rm_col[0].rc_devidx; 541 o = rm->rm_col[0].rc_offset; 542 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 543 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 544 rm->rm_col[1].rc_devidx = devidx; 545 rm->rm_col[1].rc_offset = o; 546 547 if (rm->rm_skipstart == 0) 548 rm->rm_skipstart = 1; 549 } 550 551 zio->io_vsd = rm; 552 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 553 return (rm); 554} 555 556static void 557vdev_raidz_generate_parity_p(raidz_map_t *rm) 558{ 559 uint64_t *p, *src, pcount, ccount, i; 560 int c; 561 562 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 563 564 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 565 src = rm->rm_col[c].rc_data; 566 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 567 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 568 569 if (c == rm->rm_firstdatacol) { 570 ASSERT(ccount == pcount); 571 for (i = 0; i < ccount; i++, src++, p++) { 572 *p = *src; 573 } 574 } else { 575 ASSERT(ccount <= pcount); 576 for (i = 0; i < ccount; i++, src++, p++) { 577 *p ^= *src; 578 } 579 } 580 } 581} 582 583static void 584vdev_raidz_generate_parity_pq(raidz_map_t *rm) 585{ 586 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 587 int c; 588 589 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 590 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 591 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 592 593 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 594 src = rm->rm_col[c].rc_data; 595 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 596 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 597 598 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 599 600 if (c == rm->rm_firstdatacol) { 601 ASSERT(ccnt == pcnt || ccnt == 0); 602 for (i = 0; i < ccnt; i++, src++, p++, q++) { 603 *p = *src; 604 *q = *src; 605 } 606 for (; i < pcnt; i++, src++, p++, q++) { 607 *p = 0; 608 *q = 0; 609 } 610 } else { 611 ASSERT(ccnt <= pcnt); 612 613 /* 614 * Apply the algorithm described above by multiplying 615 * the previous result and adding in the new value. 616 */ 617 for (i = 0; i < ccnt; i++, src++, p++, q++) { 618 *p ^= *src; 619 620 VDEV_RAIDZ_64MUL_2(*q, mask); 621 *q ^= *src; 622 } 623 624 /* 625 * Treat short columns as though they are full of 0s. 626 * Note that there's therefore nothing needed for P. 627 */ 628 for (; i < pcnt; i++, q++) { 629 VDEV_RAIDZ_64MUL_2(*q, mask); 630 } 631 } 632 } 633} 634 635static void 636vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 637{ 638 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 639 int c; 640 641 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 642 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 643 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 644 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 645 rm->rm_col[VDEV_RAIDZ_R].rc_size); 646 647 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 648 src = rm->rm_col[c].rc_data; 649 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 650 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 651 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 652 653 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 654 655 if (c == rm->rm_firstdatacol) { 656 ASSERT(ccnt == pcnt || ccnt == 0); 657 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 658 *p = *src; 659 *q = *src; 660 *r = *src; 661 } 662 for (; i < pcnt; i++, src++, p++, q++, r++) { 663 *p = 0; 664 *q = 0; 665 *r = 0; 666 } 667 } else { 668 ASSERT(ccnt <= pcnt); 669 670 /* 671 * Apply the algorithm described above by multiplying 672 * the previous result and adding in the new value. 673 */ 674 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 675 *p ^= *src; 676 677 VDEV_RAIDZ_64MUL_2(*q, mask); 678 *q ^= *src; 679 680 VDEV_RAIDZ_64MUL_4(*r, mask); 681 *r ^= *src; 682 } 683 684 /* 685 * Treat short columns as though they are full of 0s. 686 * Note that there's therefore nothing needed for P. 687 */ 688 for (; i < pcnt; i++, q++, r++) { 689 VDEV_RAIDZ_64MUL_2(*q, mask); 690 VDEV_RAIDZ_64MUL_4(*r, mask); 691 } 692 } 693 } 694} 695 696/* 697 * Generate RAID parity in the first virtual columns according to the number of 698 * parity columns available. 699 */ 700static void 701vdev_raidz_generate_parity(raidz_map_t *rm) 702{ 703 switch (rm->rm_firstdatacol) { 704 case 1: 705 vdev_raidz_generate_parity_p(rm); 706 break; 707 case 2: 708 vdev_raidz_generate_parity_pq(rm); 709 break; 710 case 3: 711 vdev_raidz_generate_parity_pqr(rm); 712 break; 713 default: 714 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 715 } 716} 717 718static int 719vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 720{ 721 uint64_t *dst, *src, xcount, ccount, count, i; 722 int x = tgts[0]; 723 int c; 724 725 ASSERT(ntgts == 1); 726 ASSERT(x >= rm->rm_firstdatacol); 727 ASSERT(x < rm->rm_cols); 728 729 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 730 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 731 ASSERT(xcount > 0); 732 733 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 734 dst = rm->rm_col[x].rc_data; 735 for (i = 0; i < xcount; i++, dst++, src++) { 736 *dst = *src; 737 } 738 739 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 740 src = rm->rm_col[c].rc_data; 741 dst = rm->rm_col[x].rc_data; 742 743 if (c == x) 744 continue; 745 746 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 747 count = MIN(ccount, xcount); 748 749 for (i = 0; i < count; i++, dst++, src++) { 750 *dst ^= *src; 751 } 752 } 753 754 return (1 << VDEV_RAIDZ_P); 755} 756 757static int 758vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 759{ 760 uint64_t *dst, *src, xcount, ccount, count, mask, i; 761 uint8_t *b; 762 int x = tgts[0]; 763 int c, j, exp; 764 765 ASSERT(ntgts == 1); 766 767 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 768 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 769 770 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 771 src = rm->rm_col[c].rc_data; 772 dst = rm->rm_col[x].rc_data; 773 774 if (c == x) 775 ccount = 0; 776 else 777 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 778 779 count = MIN(ccount, xcount); 780 781 if (c == rm->rm_firstdatacol) { 782 for (i = 0; i < count; i++, dst++, src++) { 783 *dst = *src; 784 } 785 for (; i < xcount; i++, dst++) { 786 *dst = 0; 787 } 788 789 } else { 790 for (i = 0; i < count; i++, dst++, src++) { 791 VDEV_RAIDZ_64MUL_2(*dst, mask); 792 *dst ^= *src; 793 } 794 795 for (; i < xcount; i++, dst++) { 796 VDEV_RAIDZ_64MUL_2(*dst, mask); 797 } 798 } 799 } 800 801 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 802 dst = rm->rm_col[x].rc_data; 803 exp = 255 - (rm->rm_cols - 1 - x); 804 805 for (i = 0; i < xcount; i++, dst++, src++) { 806 *dst ^= *src; 807 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 808 *b = vdev_raidz_exp2(*b, exp); 809 } 810 } 811 812 return (1 << VDEV_RAIDZ_Q); 813} 814 815static int 816vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 817{ 818 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 819 void *pdata, *qdata; 820 uint64_t xsize, ysize, i; 821 int x = tgts[0]; 822 int y = tgts[1]; 823 824 ASSERT(ntgts == 2); 825 ASSERT(x < y); 826 ASSERT(x >= rm->rm_firstdatacol); 827 ASSERT(y < rm->rm_cols); 828 829 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 830 831 /* 832 * Move the parity data aside -- we're going to compute parity as 833 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 834 * reuse the parity generation mechanism without trashing the actual 835 * parity so we make those columns appear to be full of zeros by 836 * setting their lengths to zero. 837 */ 838 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 839 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 840 xsize = rm->rm_col[x].rc_size; 841 ysize = rm->rm_col[y].rc_size; 842 843 rm->rm_col[VDEV_RAIDZ_P].rc_data = 844 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 845 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 846 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 847 rm->rm_col[x].rc_size = 0; 848 rm->rm_col[y].rc_size = 0; 849 850 vdev_raidz_generate_parity_pq(rm); 851 852 rm->rm_col[x].rc_size = xsize; 853 rm->rm_col[y].rc_size = ysize; 854 855 p = pdata; 856 q = qdata; 857 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 858 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 859 xd = rm->rm_col[x].rc_data; 860 yd = rm->rm_col[y].rc_data; 861 862 /* 863 * We now have: 864 * Pxy = P + D_x + D_y 865 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 866 * 867 * We can then solve for D_x: 868 * D_x = A * (P + Pxy) + B * (Q + Qxy) 869 * where 870 * A = 2^(x - y) * (2^(x - y) + 1)^-1 871 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 872 * 873 * With D_x in hand, we can easily solve for D_y: 874 * D_y = P + Pxy + D_x 875 */ 876 877 a = vdev_raidz_pow2[255 + x - y]; 878 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 879 tmp = 255 - vdev_raidz_log2[a ^ 1]; 880 881 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 882 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 883 884 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 885 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 886 vdev_raidz_exp2(*q ^ *qxy, bexp); 887 888 if (i < ysize) 889 *yd = *p ^ *pxy ^ *xd; 890 } 891 892 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 893 rm->rm_col[VDEV_RAIDZ_P].rc_size); 894 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 895 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 896 897 /* 898 * Restore the saved parity data. 899 */ 900 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 901 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 902 903 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 904} 905 906/* BEGIN CSTYLED */ 907/* 908 * In the general case of reconstruction, we must solve the system of linear 909 * equations defined by the coeffecients used to generate parity as well as 910 * the contents of the data and parity disks. This can be expressed with 911 * vectors for the original data (D) and the actual data (d) and parity (p) 912 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 913 * 914 * __ __ __ __ 915 * | | __ __ | p_0 | 916 * | V | | D_0 | | p_m-1 | 917 * | | x | : | = | d_0 | 918 * | I | | D_n-1 | | : | 919 * | | ~~ ~~ | d_n-1 | 920 * ~~ ~~ ~~ ~~ 921 * 922 * I is simply a square identity matrix of size n, and V is a vandermonde 923 * matrix defined by the coeffecients we chose for the various parity columns 924 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 925 * computation as well as linear separability. 926 * 927 * __ __ __ __ 928 * | 1 .. 1 1 1 | | p_0 | 929 * | 2^n-1 .. 4 2 1 | __ __ | : | 930 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 931 * | 1 .. 0 0 0 | | D_1 | | d_0 | 932 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 933 * | : : : : | | : | | d_2 | 934 * | 0 .. 1 0 0 | | D_n-1 | | : | 935 * | 0 .. 0 1 0 | ~~ ~~ | : | 936 * | 0 .. 0 0 1 | | d_n-1 | 937 * ~~ ~~ ~~ ~~ 938 * 939 * Note that I, V, d, and p are known. To compute D, we must invert the 940 * matrix and use the known data and parity values to reconstruct the unknown 941 * data values. We begin by removing the rows in V|I and d|p that correspond 942 * to failed or missing columns; we then make V|I square (n x n) and d|p 943 * sized n by removing rows corresponding to unused parity from the bottom up 944 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 945 * using Gauss-Jordan elimination. In the example below we use m=3 parity 946 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 947 * __ __ 948 * | 1 1 1 1 1 1 1 1 | 949 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 950 * | 19 205 116 29 64 16 4 1 | / / 951 * | 1 0 0 0 0 0 0 0 | / / 952 * | 0 1 0 0 0 0 0 0 | <--' / 953 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 954 * | 0 0 0 1 0 0 0 0 | 955 * | 0 0 0 0 1 0 0 0 | 956 * | 0 0 0 0 0 1 0 0 | 957 * | 0 0 0 0 0 0 1 0 | 958 * | 0 0 0 0 0 0 0 1 | 959 * ~~ ~~ 960 * __ __ 961 * | 1 1 1 1 1 1 1 1 | 962 * | 128 64 32 16 8 4 2 1 | 963 * | 19 205 116 29 64 16 4 1 | 964 * | 1 0 0 0 0 0 0 0 | 965 * | 0 1 0 0 0 0 0 0 | 966 * (V|I)' = | 0 0 1 0 0 0 0 0 | 967 * | 0 0 0 1 0 0 0 0 | 968 * | 0 0 0 0 1 0 0 0 | 969 * | 0 0 0 0 0 1 0 0 | 970 * | 0 0 0 0 0 0 1 0 | 971 * | 0 0 0 0 0 0 0 1 | 972 * ~~ ~~ 973 * 974 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 975 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 976 * matrix is not singular. 977 * __ __ 978 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 979 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 980 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 981 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 982 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 983 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 984 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 985 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 986 * ~~ ~~ 987 * __ __ 988 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 989 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 990 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 991 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 992 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 993 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 994 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 995 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 996 * ~~ ~~ 997 * __ __ 998 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 999 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1000 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1001 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1002 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1003 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1004 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1005 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1006 * ~~ ~~ 1007 * __ __ 1008 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1009 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1010 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1011 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1012 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1013 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1014 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1015 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1016 * ~~ ~~ 1017 * __ __ 1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1019 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1020 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1021 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1022 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1023 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1024 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1025 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1026 * ~~ ~~ 1027 * __ __ 1028 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1029 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1030 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1031 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1032 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1033 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1034 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1035 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1036 * ~~ ~~ 1037 * __ __ 1038 * | 0 0 1 0 0 0 0 0 | 1039 * | 167 100 5 41 159 169 217 208 | 1040 * | 166 100 4 40 158 168 216 209 | 1041 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1042 * | 0 0 0 0 1 0 0 0 | 1043 * | 0 0 0 0 0 1 0 0 | 1044 * | 0 0 0 0 0 0 1 0 | 1045 * | 0 0 0 0 0 0 0 1 | 1046 * ~~ ~~ 1047 * 1048 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1049 * of the missing data. 1050 * 1051 * As is apparent from the example above, the only non-trivial rows in the 1052 * inverse matrix correspond to the data disks that we're trying to 1053 * reconstruct. Indeed, those are the only rows we need as the others would 1054 * only be useful for reconstructing data known or assumed to be valid. For 1055 * that reason, we only build the coefficients in the rows that correspond to 1056 * targeted columns. 1057 */ 1058/* END CSTYLED */ 1059 1060static void 1061vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1062 uint8_t **rows) 1063{ 1064 int i, j; 1065 int pow; 1066 1067 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1068 1069 /* 1070 * Fill in the missing rows of interest. 1071 */ 1072 for (i = 0; i < nmap; i++) { 1073 ASSERT3S(0, <=, map[i]); 1074 ASSERT3S(map[i], <=, 2); 1075 1076 pow = map[i] * n; 1077 if (pow > 255) 1078 pow -= 255; 1079 ASSERT(pow <= 255); 1080 1081 for (j = 0; j < n; j++) { 1082 pow -= map[i]; 1083 if (pow < 0) 1084 pow += 255; 1085 rows[i][j] = vdev_raidz_pow2[pow]; 1086 } 1087 } 1088} 1089 1090static void 1091vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1092 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1093{ 1094 int i, j, ii, jj; 1095 uint8_t log; 1096 1097 /* 1098 * Assert that the first nmissing entries from the array of used 1099 * columns correspond to parity columns and that subsequent entries 1100 * correspond to data columns. 1101 */ 1102 for (i = 0; i < nmissing; i++) { 1103 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1104 } 1105 for (; i < n; i++) { 1106 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1107 } 1108 1109 /* 1110 * First initialize the storage where we'll compute the inverse rows. 1111 */ 1112 for (i = 0; i < nmissing; i++) { 1113 for (j = 0; j < n; j++) { 1114 invrows[i][j] = (i == j) ? 1 : 0; 1115 } 1116 } 1117 1118 /* 1119 * Subtract all trivial rows from the rows of consequence. 1120 */ 1121 for (i = 0; i < nmissing; i++) { 1122 for (j = nmissing; j < n; j++) { 1123 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1124 jj = used[j] - rm->rm_firstdatacol; 1125 ASSERT3S(jj, <, n); 1126 invrows[i][j] = rows[i][jj]; 1127 rows[i][jj] = 0; 1128 } 1129 } 1130 1131 /* 1132 * For each of the rows of interest, we must normalize it and subtract 1133 * a multiple of it from the other rows. 1134 */ 1135 for (i = 0; i < nmissing; i++) { 1136 for (j = 0; j < missing[i]; j++) { 1137 ASSERT3U(rows[i][j], ==, 0); 1138 } 1139 ASSERT3U(rows[i][missing[i]], !=, 0); 1140 1141 /* 1142 * Compute the inverse of the first element and multiply each 1143 * element in the row by that value. 1144 */ 1145 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1146 1147 for (j = 0; j < n; j++) { 1148 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1149 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1150 } 1151 1152 for (ii = 0; ii < nmissing; ii++) { 1153 if (i == ii) 1154 continue; 1155 1156 ASSERT3U(rows[ii][missing[i]], !=, 0); 1157 1158 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1159 1160 for (j = 0; j < n; j++) { 1161 rows[ii][j] ^= 1162 vdev_raidz_exp2(rows[i][j], log); 1163 invrows[ii][j] ^= 1164 vdev_raidz_exp2(invrows[i][j], log); 1165 } 1166 } 1167 } 1168 1169 /* 1170 * Verify that the data that is left in the rows are properly part of 1171 * an identity matrix. 1172 */ 1173 for (i = 0; i < nmissing; i++) { 1174 for (j = 0; j < n; j++) { 1175 if (j == missing[i]) { 1176 ASSERT3U(rows[i][j], ==, 1); 1177 } else { 1178 ASSERT3U(rows[i][j], ==, 0); 1179 } 1180 } 1181 } 1182} 1183 1184static void 1185vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1186 int *missing, uint8_t **invrows, const uint8_t *used) 1187{ 1188 int i, j, x, cc, c; 1189 uint8_t *src; 1190 uint64_t ccount; 1191 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1192 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1193 uint8_t log, val; 1194 int ll; 1195 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1196 uint8_t *p, *pp; 1197 size_t psize; 1198 1199 psize = sizeof (invlog[0][0]) * n * nmissing; 1200 p = kmem_alloc(psize, KM_SLEEP); 1201 1202 for (pp = p, i = 0; i < nmissing; i++) { 1203 invlog[i] = pp; 1204 pp += n; 1205 } 1206 1207 for (i = 0; i < nmissing; i++) { 1208 for (j = 0; j < n; j++) { 1209 ASSERT3U(invrows[i][j], !=, 0); 1210 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1211 } 1212 } 1213 1214 for (i = 0; i < n; i++) { 1215 c = used[i]; 1216 ASSERT3U(c, <, rm->rm_cols); 1217 1218 src = rm->rm_col[c].rc_data; 1219 ccount = rm->rm_col[c].rc_size; 1220 for (j = 0; j < nmissing; j++) { 1221 cc = missing[j] + rm->rm_firstdatacol; 1222 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1223 ASSERT3U(cc, <, rm->rm_cols); 1224 ASSERT3U(cc, !=, c); 1225 1226 dst[j] = rm->rm_col[cc].rc_data; 1227 dcount[j] = rm->rm_col[cc].rc_size; 1228 } 1229 1230 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1231 1232 for (x = 0; x < ccount; x++, src++) { 1233 if (*src != 0) 1234 log = vdev_raidz_log2[*src]; 1235 1236 for (cc = 0; cc < nmissing; cc++) { 1237 if (x >= dcount[cc]) 1238 continue; 1239 1240 if (*src == 0) { 1241 val = 0; 1242 } else { 1243 if ((ll = log + invlog[cc][i]) >= 255) 1244 ll -= 255; 1245 val = vdev_raidz_pow2[ll]; 1246 } 1247 1248 if (i == 0) 1249 dst[cc][x] = val; 1250 else 1251 dst[cc][x] ^= val; 1252 } 1253 } 1254 } 1255 1256 kmem_free(p, psize); 1257} 1258 1259static int 1260vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1261{ 1262 int n, i, c, t, tt; 1263 int nmissing_rows; 1264 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1265 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1266 1267 uint8_t *p, *pp; 1268 size_t psize; 1269 1270 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1271 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1272 uint8_t *used; 1273 1274 int code = 0; 1275 1276 1277 n = rm->rm_cols - rm->rm_firstdatacol; 1278 1279 /* 1280 * Figure out which data columns are missing. 1281 */ 1282 nmissing_rows = 0; 1283 for (t = 0; t < ntgts; t++) { 1284 if (tgts[t] >= rm->rm_firstdatacol) { 1285 missing_rows[nmissing_rows++] = 1286 tgts[t] - rm->rm_firstdatacol; 1287 } 1288 } 1289 1290 /* 1291 * Figure out which parity columns to use to help generate the missing 1292 * data columns. 1293 */ 1294 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1295 ASSERT(tt < ntgts); 1296 ASSERT(c < rm->rm_firstdatacol); 1297 1298 /* 1299 * Skip any targeted parity columns. 1300 */ 1301 if (c == tgts[tt]) { 1302 tt++; 1303 continue; 1304 } 1305 1306 code |= 1 << c; 1307 1308 parity_map[i] = c; 1309 i++; 1310 } 1311 1312 ASSERT(code != 0); 1313 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1314 1315 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1316 nmissing_rows * n + sizeof (used[0]) * n; 1317 p = kmem_alloc(psize, KM_SLEEP); 1318 1319 for (pp = p, i = 0; i < nmissing_rows; i++) { 1320 rows[i] = pp; 1321 pp += n; 1322 invrows[i] = pp; 1323 pp += n; 1324 } 1325 used = pp; 1326 1327 for (i = 0; i < nmissing_rows; i++) { 1328 used[i] = parity_map[i]; 1329 } 1330 1331 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1332 if (tt < nmissing_rows && 1333 c == missing_rows[tt] + rm->rm_firstdatacol) { 1334 tt++; 1335 continue; 1336 } 1337 1338 ASSERT3S(i, <, n); 1339 used[i] = c; 1340 i++; 1341 } 1342 1343 /* 1344 * Initialize the interesting rows of the matrix. 1345 */ 1346 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1347 1348 /* 1349 * Invert the matrix. 1350 */ 1351 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1352 invrows, used); 1353 1354 /* 1355 * Reconstruct the missing data using the generated matrix. 1356 */ 1357 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1358 invrows, used); 1359 1360 kmem_free(p, psize); 1361 1362 return (code); 1363} 1364 1365static int 1366vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1367{ 1368 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1369 int ntgts; 1370 int i, c; 1371 int code; 1372 int nbadparity, nbaddata; 1373 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1374 1375 /* 1376 * The tgts list must already be sorted. 1377 */ 1378 for (i = 1; i < nt; i++) { 1379 ASSERT(t[i] > t[i - 1]); 1380 } 1381 1382 nbadparity = rm->rm_firstdatacol; 1383 nbaddata = rm->rm_cols - nbadparity; 1384 ntgts = 0; 1385 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1386 if (c < rm->rm_firstdatacol) 1387 parity_valid[c] = B_FALSE; 1388 1389 if (i < nt && c == t[i]) { 1390 tgts[ntgts++] = c; 1391 i++; 1392 } else if (rm->rm_col[c].rc_error != 0) { 1393 tgts[ntgts++] = c; 1394 } else if (c >= rm->rm_firstdatacol) { 1395 nbaddata--; 1396 } else { 1397 parity_valid[c] = B_TRUE; 1398 nbadparity--; 1399 } 1400 } 1401 1402 ASSERT(ntgts >= nt); 1403 ASSERT(nbaddata >= 0); 1404 ASSERT(nbaddata + nbadparity == ntgts); 1405 1406 dt = &tgts[nbadparity]; 1407 1408 /* 1409 * See if we can use any of our optimized reconstruction routines. 1410 */ 1411 if (!vdev_raidz_default_to_general) { 1412 switch (nbaddata) { 1413 case 1: 1414 if (parity_valid[VDEV_RAIDZ_P]) 1415 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1416 1417 ASSERT(rm->rm_firstdatacol > 1); 1418 1419 if (parity_valid[VDEV_RAIDZ_Q]) 1420 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1421 1422 ASSERT(rm->rm_firstdatacol > 2); 1423 break; 1424 1425 case 2: 1426 ASSERT(rm->rm_firstdatacol > 1); 1427 1428 if (parity_valid[VDEV_RAIDZ_P] && 1429 parity_valid[VDEV_RAIDZ_Q]) 1430 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1431 1432 ASSERT(rm->rm_firstdatacol > 2); 1433 1434 break; 1435 } 1436 } 1437 1438 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1439 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1440 ASSERT(code > 0); 1441 return (code); 1442} 1443 1444static int
|
1476 1477 if (numerrors > nparity) { 1478 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1479 return (lasterror); 1480 } 1481 1482 return (0); 1483} 1484 1485static void 1486vdev_raidz_close(vdev_t *vd) 1487{ 1488 int c; 1489 1490 for (c = 0; c < vd->vdev_children; c++) 1491 vdev_close(vd->vdev_child[c]); 1492} 1493 1494static uint64_t 1495vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1496{ 1497 uint64_t asize; 1498 uint64_t ashift = vd->vdev_top->vdev_ashift; 1499 uint64_t cols = vd->vdev_children; 1500 uint64_t nparity = vd->vdev_nparity; 1501 1502 asize = ((psize - 1) >> ashift) + 1; 1503 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1504 asize = roundup(asize, nparity + 1) << ashift; 1505 1506 return (asize); 1507} 1508 1509static void 1510vdev_raidz_child_done(zio_t *zio) 1511{ 1512 raidz_col_t *rc = zio->io_private; 1513 1514 rc->rc_error = zio->io_error; 1515 rc->rc_tried = 1; 1516 rc->rc_skipped = 0; 1517} 1518 1519static int 1520vdev_raidz_io_start(zio_t *zio) 1521{ 1522 vdev_t *vd = zio->io_vd; 1523 vdev_t *tvd = vd->vdev_top; 1524 vdev_t *cvd; 1525 raidz_map_t *rm; 1526 raidz_col_t *rc; 1527 int c, i; 1528 1529 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1530 vd->vdev_nparity); 1531 1532 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1533 1534 if (zio->io_type == ZIO_TYPE_WRITE) { 1535 vdev_raidz_generate_parity(rm); 1536 1537 for (c = 0; c < rm->rm_cols; c++) { 1538 rc = &rm->rm_col[c]; 1539 cvd = vd->vdev_child[rc->rc_devidx]; 1540 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1541 rc->rc_offset, rc->rc_data, rc->rc_size, 1542 zio->io_type, zio->io_priority, 0, 1543 vdev_raidz_child_done, rc)); 1544 } 1545 1546 /* 1547 * Generate optional I/Os for any skipped sectors to improve 1548 * aggregation contiguity. 1549 */ 1550 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1551 ASSERT(c <= rm->rm_scols); 1552 if (c == rm->rm_scols) 1553 c = 0; 1554 rc = &rm->rm_col[c]; 1555 cvd = vd->vdev_child[rc->rc_devidx]; 1556 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1557 rc->rc_offset + rc->rc_size, NULL, 1558 1 << tvd->vdev_ashift, 1559 zio->io_type, zio->io_priority, 1560 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1561 } 1562 1563 return (ZIO_PIPELINE_CONTINUE); 1564 } 1565 1566 ASSERT(zio->io_type == ZIO_TYPE_READ); 1567 1568 /* 1569 * Iterate over the columns in reverse order so that we hit the parity 1570 * last -- any errors along the way will force us to read the parity. 1571 */ 1572 for (c = rm->rm_cols - 1; c >= 0; c--) { 1573 rc = &rm->rm_col[c]; 1574 cvd = vd->vdev_child[rc->rc_devidx]; 1575 if (!vdev_readable(cvd)) { 1576 if (c >= rm->rm_firstdatacol) 1577 rm->rm_missingdata++; 1578 else 1579 rm->rm_missingparity++; 1580 rc->rc_error = ENXIO; 1581 rc->rc_tried = 1; /* don't even try */ 1582 rc->rc_skipped = 1; 1583 continue; 1584 } 1585 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1586 if (c >= rm->rm_firstdatacol) 1587 rm->rm_missingdata++; 1588 else 1589 rm->rm_missingparity++; 1590 rc->rc_error = ESTALE; 1591 rc->rc_skipped = 1; 1592 continue; 1593 } 1594 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1595 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1596 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1597 rc->rc_offset, rc->rc_data, rc->rc_size, 1598 zio->io_type, zio->io_priority, 0, 1599 vdev_raidz_child_done, rc)); 1600 } 1601 } 1602 1603 return (ZIO_PIPELINE_CONTINUE); 1604} 1605 1606 1607/* 1608 * Report a checksum error for a child of a RAID-Z device. 1609 */ 1610static void 1611raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1612{ 1613 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1614 1615 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1616 zio_bad_cksum_t zbc; 1617 raidz_map_t *rm = zio->io_vsd; 1618 1619 mutex_enter(&vd->vdev_stat_lock); 1620 vd->vdev_stat.vs_checksum_errors++; 1621 mutex_exit(&vd->vdev_stat_lock); 1622 1623 zbc.zbc_has_cksum = 0; 1624 zbc.zbc_injected = rm->rm_ecksuminjected; 1625 1626 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1627 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1628 &zbc); 1629 } 1630} 1631 1632/* 1633 * We keep track of whether or not there were any injected errors, so that 1634 * any ereports we generate can note it. 1635 */ 1636static int 1637raidz_checksum_verify(zio_t *zio) 1638{ 1639 zio_bad_cksum_t zbc; 1640 raidz_map_t *rm = zio->io_vsd; 1641 1642 int ret = zio_checksum_error(zio, &zbc); 1643 if (ret != 0 && zbc.zbc_injected != 0) 1644 rm->rm_ecksuminjected = 1; 1645 1646 return (ret); 1647} 1648 1649/* 1650 * Generate the parity from the data columns. If we tried and were able to 1651 * read the parity without error, verify that the generated parity matches the 1652 * data we read. If it doesn't, we fire off a checksum error. Return the 1653 * number such failures. 1654 */ 1655static int 1656raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1657{ 1658 void *orig[VDEV_RAIDZ_MAXPARITY]; 1659 int c, ret = 0; 1660 raidz_col_t *rc; 1661 1662 for (c = 0; c < rm->rm_firstdatacol; c++) { 1663 rc = &rm->rm_col[c]; 1664 if (!rc->rc_tried || rc->rc_error != 0) 1665 continue; 1666 orig[c] = zio_buf_alloc(rc->rc_size); 1667 bcopy(rc->rc_data, orig[c], rc->rc_size); 1668 } 1669 1670 vdev_raidz_generate_parity(rm); 1671 1672 for (c = 0; c < rm->rm_firstdatacol; c++) { 1673 rc = &rm->rm_col[c]; 1674 if (!rc->rc_tried || rc->rc_error != 0) 1675 continue; 1676 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1677 raidz_checksum_error(zio, rc, orig[c]); 1678 rc->rc_error = ECKSUM; 1679 ret++; 1680 } 1681 zio_buf_free(orig[c], rc->rc_size); 1682 } 1683 1684 return (ret); 1685} 1686 1687/* 1688 * Keep statistics on all the ways that we used parity to correct data. 1689 */ 1690static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1691 1692static int 1693vdev_raidz_worst_error(raidz_map_t *rm) 1694{ 1695 int error = 0; 1696 1697 for (int c = 0; c < rm->rm_cols; c++) 1698 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1699 1700 return (error); 1701} 1702 1703/* 1704 * Iterate over all combinations of bad data and attempt a reconstruction. 1705 * Note that the algorithm below is non-optimal because it doesn't take into 1706 * account how reconstruction is actually performed. For example, with 1707 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1708 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1709 * cases we'd only use parity information in column 0. 1710 */ 1711static int 1712vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1713{ 1714 raidz_map_t *rm = zio->io_vsd; 1715 raidz_col_t *rc; 1716 void *orig[VDEV_RAIDZ_MAXPARITY]; 1717 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1718 int *tgts = &tstore[1]; 1719 int current, next, i, c, n; 1720 int code, ret = 0; 1721 1722 ASSERT(total_errors < rm->rm_firstdatacol); 1723 1724 /* 1725 * This simplifies one edge condition. 1726 */ 1727 tgts[-1] = -1; 1728 1729 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1730 /* 1731 * Initialize the targets array by finding the first n columns 1732 * that contain no error. 1733 * 1734 * If there were no data errors, we need to ensure that we're 1735 * always explicitly attempting to reconstruct at least one 1736 * data column. To do this, we simply push the highest target 1737 * up into the data columns. 1738 */ 1739 for (c = 0, i = 0; i < n; i++) { 1740 if (i == n - 1 && data_errors == 0 && 1741 c < rm->rm_firstdatacol) { 1742 c = rm->rm_firstdatacol; 1743 } 1744 1745 while (rm->rm_col[c].rc_error != 0) { 1746 c++; 1747 ASSERT3S(c, <, rm->rm_cols); 1748 } 1749 1750 tgts[i] = c++; 1751 } 1752 1753 /* 1754 * Setting tgts[n] simplifies the other edge condition. 1755 */ 1756 tgts[n] = rm->rm_cols; 1757 1758 /* 1759 * These buffers were allocated in previous iterations. 1760 */ 1761 for (i = 0; i < n - 1; i++) { 1762 ASSERT(orig[i] != NULL); 1763 } 1764 1765 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 1766 1767 current = 0; 1768 next = tgts[current]; 1769 1770 while (current != n) { 1771 tgts[current] = next; 1772 current = 0; 1773 1774 /* 1775 * Save off the original data that we're going to 1776 * attempt to reconstruct. 1777 */ 1778 for (i = 0; i < n; i++) { 1779 ASSERT(orig[i] != NULL); 1780 c = tgts[i]; 1781 ASSERT3S(c, >=, 0); 1782 ASSERT3S(c, <, rm->rm_cols); 1783 rc = &rm->rm_col[c]; 1784 bcopy(rc->rc_data, orig[i], rc->rc_size); 1785 } 1786 1787 /* 1788 * Attempt a reconstruction and exit the outer loop on 1789 * success. 1790 */ 1791 code = vdev_raidz_reconstruct(rm, tgts, n); 1792 if (raidz_checksum_verify(zio) == 0) { 1793 atomic_inc_64(&raidz_corrected[code]); 1794 1795 for (i = 0; i < n; i++) { 1796 c = tgts[i]; 1797 rc = &rm->rm_col[c]; 1798 ASSERT(rc->rc_error == 0); 1799 if (rc->rc_tried) 1800 raidz_checksum_error(zio, rc, 1801 orig[i]); 1802 rc->rc_error = ECKSUM; 1803 } 1804 1805 ret = code; 1806 goto done; 1807 } 1808 1809 /* 1810 * Restore the original data. 1811 */ 1812 for (i = 0; i < n; i++) { 1813 c = tgts[i]; 1814 rc = &rm->rm_col[c]; 1815 bcopy(orig[i], rc->rc_data, rc->rc_size); 1816 } 1817 1818 do { 1819 /* 1820 * Find the next valid column after the current 1821 * position.. 1822 */ 1823 for (next = tgts[current] + 1; 1824 next < rm->rm_cols && 1825 rm->rm_col[next].rc_error != 0; next++) 1826 continue; 1827 1828 ASSERT(next <= tgts[current + 1]); 1829 1830 /* 1831 * If that spot is available, we're done here. 1832 */ 1833 if (next != tgts[current + 1]) 1834 break; 1835 1836 /* 1837 * Otherwise, find the next valid column after 1838 * the previous position. 1839 */ 1840 for (c = tgts[current - 1] + 1; 1841 rm->rm_col[c].rc_error != 0; c++) 1842 continue; 1843 1844 tgts[current] = c; 1845 current++; 1846 1847 } while (current != n); 1848 } 1849 } 1850 n--; 1851done: 1852 for (i = 0; i < n; i++) { 1853 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1854 } 1855 1856 return (ret); 1857} 1858 1859static void 1860vdev_raidz_io_done(zio_t *zio) 1861{ 1862 vdev_t *vd = zio->io_vd; 1863 vdev_t *cvd; 1864 raidz_map_t *rm = zio->io_vsd; 1865 raidz_col_t *rc; 1866 int unexpected_errors = 0; 1867 int parity_errors = 0; 1868 int parity_untried = 0; 1869 int data_errors = 0; 1870 int total_errors = 0; 1871 int n, c; 1872 int tgts[VDEV_RAIDZ_MAXPARITY]; 1873 int code; 1874 1875 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1876 1877 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1878 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1879 1880 for (c = 0; c < rm->rm_cols; c++) { 1881 rc = &rm->rm_col[c]; 1882 1883 if (rc->rc_error) { 1884 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1885 1886 if (c < rm->rm_firstdatacol) 1887 parity_errors++; 1888 else 1889 data_errors++; 1890 1891 if (!rc->rc_skipped) 1892 unexpected_errors++; 1893 1894 total_errors++; 1895 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1896 parity_untried++; 1897 } 1898 } 1899 1900 if (zio->io_type == ZIO_TYPE_WRITE) { 1901 /* 1902 * XXX -- for now, treat partial writes as a success. 1903 * (If we couldn't write enough columns to reconstruct 1904 * the data, the I/O failed. Otherwise, good enough.) 1905 * 1906 * Now that we support write reallocation, it would be better 1907 * to treat partial failure as real failure unless there are 1908 * no non-degraded top-level vdevs left, and not update DTLs 1909 * if we intend to reallocate. 1910 */ 1911 /* XXPOLICY */ 1912 if (total_errors > rm->rm_firstdatacol) 1913 zio->io_error = vdev_raidz_worst_error(rm); 1914 1915 return; 1916 } 1917 1918 ASSERT(zio->io_type == ZIO_TYPE_READ); 1919 /* 1920 * There are three potential phases for a read: 1921 * 1. produce valid data from the columns read 1922 * 2. read all disks and try again 1923 * 3. perform combinatorial reconstruction 1924 * 1925 * Each phase is progressively both more expensive and less likely to 1926 * occur. If we encounter more errors than we can repair or all phases 1927 * fail, we have no choice but to return an error. 1928 */ 1929 1930 /* 1931 * If the number of errors we saw was correctable -- less than or equal 1932 * to the number of parity disks read -- attempt to produce data that 1933 * has a valid checksum. Naturally, this case applies in the absence of 1934 * any errors. 1935 */ 1936 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1937 if (data_errors == 0) { 1938 if (raidz_checksum_verify(zio) == 0) { 1939 /* 1940 * If we read parity information (unnecessarily 1941 * as it happens since no reconstruction was 1942 * needed) regenerate and verify the parity. 1943 * We also regenerate parity when resilvering 1944 * so we can write it out to the failed device 1945 * later. 1946 */ 1947 if (parity_errors + parity_untried < 1948 rm->rm_firstdatacol || 1949 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1950 n = raidz_parity_verify(zio, rm); 1951 unexpected_errors += n; 1952 ASSERT(parity_errors + n <= 1953 rm->rm_firstdatacol); 1954 } 1955 goto done; 1956 } 1957 } else { 1958 /* 1959 * We either attempt to read all the parity columns or 1960 * none of them. If we didn't try to read parity, we 1961 * wouldn't be here in the correctable case. There must 1962 * also have been fewer parity errors than parity 1963 * columns or, again, we wouldn't be in this code path. 1964 */ 1965 ASSERT(parity_untried == 0); 1966 ASSERT(parity_errors < rm->rm_firstdatacol); 1967 1968 /* 1969 * Identify the data columns that reported an error. 1970 */ 1971 n = 0; 1972 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1973 rc = &rm->rm_col[c]; 1974 if (rc->rc_error != 0) { 1975 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1976 tgts[n++] = c; 1977 } 1978 } 1979 1980 ASSERT(rm->rm_firstdatacol >= n); 1981 1982 code = vdev_raidz_reconstruct(rm, tgts, n); 1983 1984 if (raidz_checksum_verify(zio) == 0) { 1985 atomic_inc_64(&raidz_corrected[code]); 1986 1987 /* 1988 * If we read more parity disks than were used 1989 * for reconstruction, confirm that the other 1990 * parity disks produced correct data. This 1991 * routine is suboptimal in that it regenerates 1992 * the parity that we already used in addition 1993 * to the parity that we're attempting to 1994 * verify, but this should be a relatively 1995 * uncommon case, and can be optimized if it 1996 * becomes a problem. Note that we regenerate 1997 * parity when resilvering so we can write it 1998 * out to failed devices later. 1999 */ 2000 if (parity_errors < rm->rm_firstdatacol - n || 2001 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2002 n = raidz_parity_verify(zio, rm); 2003 unexpected_errors += n; 2004 ASSERT(parity_errors + n <= 2005 rm->rm_firstdatacol); 2006 } 2007 2008 goto done; 2009 } 2010 } 2011 } 2012 2013 /* 2014 * This isn't a typical situation -- either we got a read error or 2015 * a child silently returned bad data. Read every block so we can 2016 * try again with as much data and parity as we can track down. If 2017 * we've already been through once before, all children will be marked 2018 * as tried so we'll proceed to combinatorial reconstruction. 2019 */ 2020 unexpected_errors = 1; 2021 rm->rm_missingdata = 0; 2022 rm->rm_missingparity = 0; 2023 2024 for (c = 0; c < rm->rm_cols; c++) { 2025 if (rm->rm_col[c].rc_tried) 2026 continue; 2027 2028 zio_vdev_io_redone(zio); 2029 do { 2030 rc = &rm->rm_col[c]; 2031 if (rc->rc_tried) 2032 continue; 2033 zio_nowait(zio_vdev_child_io(zio, NULL, 2034 vd->vdev_child[rc->rc_devidx], 2035 rc->rc_offset, rc->rc_data, rc->rc_size, 2036 zio->io_type, zio->io_priority, 0, 2037 vdev_raidz_child_done, rc)); 2038 } while (++c < rm->rm_cols); 2039 2040 return; 2041 } 2042 2043 /* 2044 * At this point we've attempted to reconstruct the data given the 2045 * errors we detected, and we've attempted to read all columns. There 2046 * must, therefore, be one or more additional problems -- silent errors 2047 * resulting in invalid data rather than explicit I/O errors resulting 2048 * in absent data. We check if there is enough additional data to 2049 * possibly reconstruct the data and then perform combinatorial 2050 * reconstruction over all possible combinations. If that fails, 2051 * we're cooked. 2052 */ 2053 if (total_errors > rm->rm_firstdatacol) { 2054 zio->io_error = vdev_raidz_worst_error(rm); 2055 2056 } else if (total_errors < rm->rm_firstdatacol && 2057 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2058 /* 2059 * If we didn't use all the available parity for the 2060 * combinatorial reconstruction, verify that the remaining 2061 * parity is correct. 2062 */ 2063 if (code != (1 << rm->rm_firstdatacol) - 1) 2064 (void) raidz_parity_verify(zio, rm); 2065 } else { 2066 /* 2067 * We're here because either: 2068 * 2069 * total_errors == rm_first_datacol, or 2070 * vdev_raidz_combrec() failed 2071 * 2072 * In either case, there is enough bad data to prevent 2073 * reconstruction. 2074 * 2075 * Start checksum ereports for all children which haven't 2076 * failed, and the IO wasn't speculative. 2077 */ 2078 zio->io_error = ECKSUM; 2079 2080 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2081 for (c = 0; c < rm->rm_cols; c++) { 2082 rc = &rm->rm_col[c]; 2083 if (rc->rc_error == 0) { 2084 zio_bad_cksum_t zbc; 2085 zbc.zbc_has_cksum = 0; 2086 zbc.zbc_injected = 2087 rm->rm_ecksuminjected; 2088 2089 zfs_ereport_start_checksum( 2090 zio->io_spa, 2091 vd->vdev_child[rc->rc_devidx], 2092 zio, rc->rc_offset, rc->rc_size, 2093 (void *)(uintptr_t)c, &zbc); 2094 } 2095 } 2096 } 2097 } 2098 2099done: 2100 zio_checksum_verified(zio); 2101 2102 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2103 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2104 /* 2105 * Use the good data we have in hand to repair damaged children. 2106 */ 2107 for (c = 0; c < rm->rm_cols; c++) { 2108 rc = &rm->rm_col[c]; 2109 cvd = vd->vdev_child[rc->rc_devidx]; 2110 2111 if (rc->rc_error == 0) 2112 continue; 2113 2114 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2115 rc->rc_offset, rc->rc_data, rc->rc_size, 2116 ZIO_TYPE_WRITE, zio->io_priority, 2117 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2118 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2119 } 2120 } 2121} 2122 2123static void 2124vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2125{ 2126 if (faulted > vd->vdev_nparity) 2127 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2128 VDEV_AUX_NO_REPLICAS); 2129 else if (degraded + faulted != 0) 2130 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2131 else 2132 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2133} 2134 2135vdev_ops_t vdev_raidz_ops = { 2136 vdev_raidz_open, 2137 vdev_raidz_close, 2138 vdev_raidz_asize, 2139 vdev_raidz_io_start, 2140 vdev_raidz_io_done, 2141 vdev_raidz_state_change, 2142 NULL, 2143 NULL, 2144 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2145 B_FALSE /* not a leaf vdev */ 2146};
| 1480 1481 if (numerrors > nparity) { 1482 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1483 return (lasterror); 1484 } 1485 1486 return (0); 1487} 1488 1489static void 1490vdev_raidz_close(vdev_t *vd) 1491{ 1492 int c; 1493 1494 for (c = 0; c < vd->vdev_children; c++) 1495 vdev_close(vd->vdev_child[c]); 1496} 1497 1498static uint64_t 1499vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1500{ 1501 uint64_t asize; 1502 uint64_t ashift = vd->vdev_top->vdev_ashift; 1503 uint64_t cols = vd->vdev_children; 1504 uint64_t nparity = vd->vdev_nparity; 1505 1506 asize = ((psize - 1) >> ashift) + 1; 1507 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1508 asize = roundup(asize, nparity + 1) << ashift; 1509 1510 return (asize); 1511} 1512 1513static void 1514vdev_raidz_child_done(zio_t *zio) 1515{ 1516 raidz_col_t *rc = zio->io_private; 1517 1518 rc->rc_error = zio->io_error; 1519 rc->rc_tried = 1; 1520 rc->rc_skipped = 0; 1521} 1522 1523static int 1524vdev_raidz_io_start(zio_t *zio) 1525{ 1526 vdev_t *vd = zio->io_vd; 1527 vdev_t *tvd = vd->vdev_top; 1528 vdev_t *cvd; 1529 raidz_map_t *rm; 1530 raidz_col_t *rc; 1531 int c, i; 1532 1533 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 1534 vd->vdev_nparity); 1535 1536 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1537 1538 if (zio->io_type == ZIO_TYPE_WRITE) { 1539 vdev_raidz_generate_parity(rm); 1540 1541 for (c = 0; c < rm->rm_cols; c++) { 1542 rc = &rm->rm_col[c]; 1543 cvd = vd->vdev_child[rc->rc_devidx]; 1544 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1545 rc->rc_offset, rc->rc_data, rc->rc_size, 1546 zio->io_type, zio->io_priority, 0, 1547 vdev_raidz_child_done, rc)); 1548 } 1549 1550 /* 1551 * Generate optional I/Os for any skipped sectors to improve 1552 * aggregation contiguity. 1553 */ 1554 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1555 ASSERT(c <= rm->rm_scols); 1556 if (c == rm->rm_scols) 1557 c = 0; 1558 rc = &rm->rm_col[c]; 1559 cvd = vd->vdev_child[rc->rc_devidx]; 1560 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1561 rc->rc_offset + rc->rc_size, NULL, 1562 1 << tvd->vdev_ashift, 1563 zio->io_type, zio->io_priority, 1564 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1565 } 1566 1567 return (ZIO_PIPELINE_CONTINUE); 1568 } 1569 1570 ASSERT(zio->io_type == ZIO_TYPE_READ); 1571 1572 /* 1573 * Iterate over the columns in reverse order so that we hit the parity 1574 * last -- any errors along the way will force us to read the parity. 1575 */ 1576 for (c = rm->rm_cols - 1; c >= 0; c--) { 1577 rc = &rm->rm_col[c]; 1578 cvd = vd->vdev_child[rc->rc_devidx]; 1579 if (!vdev_readable(cvd)) { 1580 if (c >= rm->rm_firstdatacol) 1581 rm->rm_missingdata++; 1582 else 1583 rm->rm_missingparity++; 1584 rc->rc_error = ENXIO; 1585 rc->rc_tried = 1; /* don't even try */ 1586 rc->rc_skipped = 1; 1587 continue; 1588 } 1589 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1590 if (c >= rm->rm_firstdatacol) 1591 rm->rm_missingdata++; 1592 else 1593 rm->rm_missingparity++; 1594 rc->rc_error = ESTALE; 1595 rc->rc_skipped = 1; 1596 continue; 1597 } 1598 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1599 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1600 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1601 rc->rc_offset, rc->rc_data, rc->rc_size, 1602 zio->io_type, zio->io_priority, 0, 1603 vdev_raidz_child_done, rc)); 1604 } 1605 } 1606 1607 return (ZIO_PIPELINE_CONTINUE); 1608} 1609 1610 1611/* 1612 * Report a checksum error for a child of a RAID-Z device. 1613 */ 1614static void 1615raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1616{ 1617 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1618 1619 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1620 zio_bad_cksum_t zbc; 1621 raidz_map_t *rm = zio->io_vsd; 1622 1623 mutex_enter(&vd->vdev_stat_lock); 1624 vd->vdev_stat.vs_checksum_errors++; 1625 mutex_exit(&vd->vdev_stat_lock); 1626 1627 zbc.zbc_has_cksum = 0; 1628 zbc.zbc_injected = rm->rm_ecksuminjected; 1629 1630 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1631 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1632 &zbc); 1633 } 1634} 1635 1636/* 1637 * We keep track of whether or not there were any injected errors, so that 1638 * any ereports we generate can note it. 1639 */ 1640static int 1641raidz_checksum_verify(zio_t *zio) 1642{ 1643 zio_bad_cksum_t zbc; 1644 raidz_map_t *rm = zio->io_vsd; 1645 1646 int ret = zio_checksum_error(zio, &zbc); 1647 if (ret != 0 && zbc.zbc_injected != 0) 1648 rm->rm_ecksuminjected = 1; 1649 1650 return (ret); 1651} 1652 1653/* 1654 * Generate the parity from the data columns. If we tried and were able to 1655 * read the parity without error, verify that the generated parity matches the 1656 * data we read. If it doesn't, we fire off a checksum error. Return the 1657 * number such failures. 1658 */ 1659static int 1660raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1661{ 1662 void *orig[VDEV_RAIDZ_MAXPARITY]; 1663 int c, ret = 0; 1664 raidz_col_t *rc; 1665 1666 for (c = 0; c < rm->rm_firstdatacol; c++) { 1667 rc = &rm->rm_col[c]; 1668 if (!rc->rc_tried || rc->rc_error != 0) 1669 continue; 1670 orig[c] = zio_buf_alloc(rc->rc_size); 1671 bcopy(rc->rc_data, orig[c], rc->rc_size); 1672 } 1673 1674 vdev_raidz_generate_parity(rm); 1675 1676 for (c = 0; c < rm->rm_firstdatacol; c++) { 1677 rc = &rm->rm_col[c]; 1678 if (!rc->rc_tried || rc->rc_error != 0) 1679 continue; 1680 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1681 raidz_checksum_error(zio, rc, orig[c]); 1682 rc->rc_error = ECKSUM; 1683 ret++; 1684 } 1685 zio_buf_free(orig[c], rc->rc_size); 1686 } 1687 1688 return (ret); 1689} 1690 1691/* 1692 * Keep statistics on all the ways that we used parity to correct data. 1693 */ 1694static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1695 1696static int 1697vdev_raidz_worst_error(raidz_map_t *rm) 1698{ 1699 int error = 0; 1700 1701 for (int c = 0; c < rm->rm_cols; c++) 1702 error = zio_worst_error(error, rm->rm_col[c].rc_error); 1703 1704 return (error); 1705} 1706 1707/* 1708 * Iterate over all combinations of bad data and attempt a reconstruction. 1709 * Note that the algorithm below is non-optimal because it doesn't take into 1710 * account how reconstruction is actually performed. For example, with 1711 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1712 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1713 * cases we'd only use parity information in column 0. 1714 */ 1715static int 1716vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1717{ 1718 raidz_map_t *rm = zio->io_vsd; 1719 raidz_col_t *rc; 1720 void *orig[VDEV_RAIDZ_MAXPARITY]; 1721 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1722 int *tgts = &tstore[1]; 1723 int current, next, i, c, n; 1724 int code, ret = 0; 1725 1726 ASSERT(total_errors < rm->rm_firstdatacol); 1727 1728 /* 1729 * This simplifies one edge condition. 1730 */ 1731 tgts[-1] = -1; 1732 1733 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1734 /* 1735 * Initialize the targets array by finding the first n columns 1736 * that contain no error. 1737 * 1738 * If there were no data errors, we need to ensure that we're 1739 * always explicitly attempting to reconstruct at least one 1740 * data column. To do this, we simply push the highest target 1741 * up into the data columns. 1742 */ 1743 for (c = 0, i = 0; i < n; i++) { 1744 if (i == n - 1 && data_errors == 0 && 1745 c < rm->rm_firstdatacol) { 1746 c = rm->rm_firstdatacol; 1747 } 1748 1749 while (rm->rm_col[c].rc_error != 0) { 1750 c++; 1751 ASSERT3S(c, <, rm->rm_cols); 1752 } 1753 1754 tgts[i] = c++; 1755 } 1756 1757 /* 1758 * Setting tgts[n] simplifies the other edge condition. 1759 */ 1760 tgts[n] = rm->rm_cols; 1761 1762 /* 1763 * These buffers were allocated in previous iterations. 1764 */ 1765 for (i = 0; i < n - 1; i++) { 1766 ASSERT(orig[i] != NULL); 1767 } 1768 1769 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 1770 1771 current = 0; 1772 next = tgts[current]; 1773 1774 while (current != n) { 1775 tgts[current] = next; 1776 current = 0; 1777 1778 /* 1779 * Save off the original data that we're going to 1780 * attempt to reconstruct. 1781 */ 1782 for (i = 0; i < n; i++) { 1783 ASSERT(orig[i] != NULL); 1784 c = tgts[i]; 1785 ASSERT3S(c, >=, 0); 1786 ASSERT3S(c, <, rm->rm_cols); 1787 rc = &rm->rm_col[c]; 1788 bcopy(rc->rc_data, orig[i], rc->rc_size); 1789 } 1790 1791 /* 1792 * Attempt a reconstruction and exit the outer loop on 1793 * success. 1794 */ 1795 code = vdev_raidz_reconstruct(rm, tgts, n); 1796 if (raidz_checksum_verify(zio) == 0) { 1797 atomic_inc_64(&raidz_corrected[code]); 1798 1799 for (i = 0; i < n; i++) { 1800 c = tgts[i]; 1801 rc = &rm->rm_col[c]; 1802 ASSERT(rc->rc_error == 0); 1803 if (rc->rc_tried) 1804 raidz_checksum_error(zio, rc, 1805 orig[i]); 1806 rc->rc_error = ECKSUM; 1807 } 1808 1809 ret = code; 1810 goto done; 1811 } 1812 1813 /* 1814 * Restore the original data. 1815 */ 1816 for (i = 0; i < n; i++) { 1817 c = tgts[i]; 1818 rc = &rm->rm_col[c]; 1819 bcopy(orig[i], rc->rc_data, rc->rc_size); 1820 } 1821 1822 do { 1823 /* 1824 * Find the next valid column after the current 1825 * position.. 1826 */ 1827 for (next = tgts[current] + 1; 1828 next < rm->rm_cols && 1829 rm->rm_col[next].rc_error != 0; next++) 1830 continue; 1831 1832 ASSERT(next <= tgts[current + 1]); 1833 1834 /* 1835 * If that spot is available, we're done here. 1836 */ 1837 if (next != tgts[current + 1]) 1838 break; 1839 1840 /* 1841 * Otherwise, find the next valid column after 1842 * the previous position. 1843 */ 1844 for (c = tgts[current - 1] + 1; 1845 rm->rm_col[c].rc_error != 0; c++) 1846 continue; 1847 1848 tgts[current] = c; 1849 current++; 1850 1851 } while (current != n); 1852 } 1853 } 1854 n--; 1855done: 1856 for (i = 0; i < n; i++) { 1857 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 1858 } 1859 1860 return (ret); 1861} 1862 1863static void 1864vdev_raidz_io_done(zio_t *zio) 1865{ 1866 vdev_t *vd = zio->io_vd; 1867 vdev_t *cvd; 1868 raidz_map_t *rm = zio->io_vsd; 1869 raidz_col_t *rc; 1870 int unexpected_errors = 0; 1871 int parity_errors = 0; 1872 int parity_untried = 0; 1873 int data_errors = 0; 1874 int total_errors = 0; 1875 int n, c; 1876 int tgts[VDEV_RAIDZ_MAXPARITY]; 1877 int code; 1878 1879 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 1880 1881 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1882 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1883 1884 for (c = 0; c < rm->rm_cols; c++) { 1885 rc = &rm->rm_col[c]; 1886 1887 if (rc->rc_error) { 1888 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1889 1890 if (c < rm->rm_firstdatacol) 1891 parity_errors++; 1892 else 1893 data_errors++; 1894 1895 if (!rc->rc_skipped) 1896 unexpected_errors++; 1897 1898 total_errors++; 1899 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1900 parity_untried++; 1901 } 1902 } 1903 1904 if (zio->io_type == ZIO_TYPE_WRITE) { 1905 /* 1906 * XXX -- for now, treat partial writes as a success. 1907 * (If we couldn't write enough columns to reconstruct 1908 * the data, the I/O failed. Otherwise, good enough.) 1909 * 1910 * Now that we support write reallocation, it would be better 1911 * to treat partial failure as real failure unless there are 1912 * no non-degraded top-level vdevs left, and not update DTLs 1913 * if we intend to reallocate. 1914 */ 1915 /* XXPOLICY */ 1916 if (total_errors > rm->rm_firstdatacol) 1917 zio->io_error = vdev_raidz_worst_error(rm); 1918 1919 return; 1920 } 1921 1922 ASSERT(zio->io_type == ZIO_TYPE_READ); 1923 /* 1924 * There are three potential phases for a read: 1925 * 1. produce valid data from the columns read 1926 * 2. read all disks and try again 1927 * 3. perform combinatorial reconstruction 1928 * 1929 * Each phase is progressively both more expensive and less likely to 1930 * occur. If we encounter more errors than we can repair or all phases 1931 * fail, we have no choice but to return an error. 1932 */ 1933 1934 /* 1935 * If the number of errors we saw was correctable -- less than or equal 1936 * to the number of parity disks read -- attempt to produce data that 1937 * has a valid checksum. Naturally, this case applies in the absence of 1938 * any errors. 1939 */ 1940 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1941 if (data_errors == 0) { 1942 if (raidz_checksum_verify(zio) == 0) { 1943 /* 1944 * If we read parity information (unnecessarily 1945 * as it happens since no reconstruction was 1946 * needed) regenerate and verify the parity. 1947 * We also regenerate parity when resilvering 1948 * so we can write it out to the failed device 1949 * later. 1950 */ 1951 if (parity_errors + parity_untried < 1952 rm->rm_firstdatacol || 1953 (zio->io_flags & ZIO_FLAG_RESILVER)) { 1954 n = raidz_parity_verify(zio, rm); 1955 unexpected_errors += n; 1956 ASSERT(parity_errors + n <= 1957 rm->rm_firstdatacol); 1958 } 1959 goto done; 1960 } 1961 } else { 1962 /* 1963 * We either attempt to read all the parity columns or 1964 * none of them. If we didn't try to read parity, we 1965 * wouldn't be here in the correctable case. There must 1966 * also have been fewer parity errors than parity 1967 * columns or, again, we wouldn't be in this code path. 1968 */ 1969 ASSERT(parity_untried == 0); 1970 ASSERT(parity_errors < rm->rm_firstdatacol); 1971 1972 /* 1973 * Identify the data columns that reported an error. 1974 */ 1975 n = 0; 1976 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1977 rc = &rm->rm_col[c]; 1978 if (rc->rc_error != 0) { 1979 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1980 tgts[n++] = c; 1981 } 1982 } 1983 1984 ASSERT(rm->rm_firstdatacol >= n); 1985 1986 code = vdev_raidz_reconstruct(rm, tgts, n); 1987 1988 if (raidz_checksum_verify(zio) == 0) { 1989 atomic_inc_64(&raidz_corrected[code]); 1990 1991 /* 1992 * If we read more parity disks than were used 1993 * for reconstruction, confirm that the other 1994 * parity disks produced correct data. This 1995 * routine is suboptimal in that it regenerates 1996 * the parity that we already used in addition 1997 * to the parity that we're attempting to 1998 * verify, but this should be a relatively 1999 * uncommon case, and can be optimized if it 2000 * becomes a problem. Note that we regenerate 2001 * parity when resilvering so we can write it 2002 * out to failed devices later. 2003 */ 2004 if (parity_errors < rm->rm_firstdatacol - n || 2005 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2006 n = raidz_parity_verify(zio, rm); 2007 unexpected_errors += n; 2008 ASSERT(parity_errors + n <= 2009 rm->rm_firstdatacol); 2010 } 2011 2012 goto done; 2013 } 2014 } 2015 } 2016 2017 /* 2018 * This isn't a typical situation -- either we got a read error or 2019 * a child silently returned bad data. Read every block so we can 2020 * try again with as much data and parity as we can track down. If 2021 * we've already been through once before, all children will be marked 2022 * as tried so we'll proceed to combinatorial reconstruction. 2023 */ 2024 unexpected_errors = 1; 2025 rm->rm_missingdata = 0; 2026 rm->rm_missingparity = 0; 2027 2028 for (c = 0; c < rm->rm_cols; c++) { 2029 if (rm->rm_col[c].rc_tried) 2030 continue; 2031 2032 zio_vdev_io_redone(zio); 2033 do { 2034 rc = &rm->rm_col[c]; 2035 if (rc->rc_tried) 2036 continue; 2037 zio_nowait(zio_vdev_child_io(zio, NULL, 2038 vd->vdev_child[rc->rc_devidx], 2039 rc->rc_offset, rc->rc_data, rc->rc_size, 2040 zio->io_type, zio->io_priority, 0, 2041 vdev_raidz_child_done, rc)); 2042 } while (++c < rm->rm_cols); 2043 2044 return; 2045 } 2046 2047 /* 2048 * At this point we've attempted to reconstruct the data given the 2049 * errors we detected, and we've attempted to read all columns. There 2050 * must, therefore, be one or more additional problems -- silent errors 2051 * resulting in invalid data rather than explicit I/O errors resulting 2052 * in absent data. We check if there is enough additional data to 2053 * possibly reconstruct the data and then perform combinatorial 2054 * reconstruction over all possible combinations. If that fails, 2055 * we're cooked. 2056 */ 2057 if (total_errors > rm->rm_firstdatacol) { 2058 zio->io_error = vdev_raidz_worst_error(rm); 2059 2060 } else if (total_errors < rm->rm_firstdatacol && 2061 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2062 /* 2063 * If we didn't use all the available parity for the 2064 * combinatorial reconstruction, verify that the remaining 2065 * parity is correct. 2066 */ 2067 if (code != (1 << rm->rm_firstdatacol) - 1) 2068 (void) raidz_parity_verify(zio, rm); 2069 } else { 2070 /* 2071 * We're here because either: 2072 * 2073 * total_errors == rm_first_datacol, or 2074 * vdev_raidz_combrec() failed 2075 * 2076 * In either case, there is enough bad data to prevent 2077 * reconstruction. 2078 * 2079 * Start checksum ereports for all children which haven't 2080 * failed, and the IO wasn't speculative. 2081 */ 2082 zio->io_error = ECKSUM; 2083 2084 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2085 for (c = 0; c < rm->rm_cols; c++) { 2086 rc = &rm->rm_col[c]; 2087 if (rc->rc_error == 0) { 2088 zio_bad_cksum_t zbc; 2089 zbc.zbc_has_cksum = 0; 2090 zbc.zbc_injected = 2091 rm->rm_ecksuminjected; 2092 2093 zfs_ereport_start_checksum( 2094 zio->io_spa, 2095 vd->vdev_child[rc->rc_devidx], 2096 zio, rc->rc_offset, rc->rc_size, 2097 (void *)(uintptr_t)c, &zbc); 2098 } 2099 } 2100 } 2101 } 2102 2103done: 2104 zio_checksum_verified(zio); 2105 2106 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2107 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2108 /* 2109 * Use the good data we have in hand to repair damaged children. 2110 */ 2111 for (c = 0; c < rm->rm_cols; c++) { 2112 rc = &rm->rm_col[c]; 2113 cvd = vd->vdev_child[rc->rc_devidx]; 2114 2115 if (rc->rc_error == 0) 2116 continue; 2117 2118 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2119 rc->rc_offset, rc->rc_data, rc->rc_size, 2120 ZIO_TYPE_WRITE, zio->io_priority, 2121 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2122 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2123 } 2124 } 2125} 2126 2127static void 2128vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2129{ 2130 if (faulted > vd->vdev_nparity) 2131 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2132 VDEV_AUX_NO_REPLICAS); 2133 else if (degraded + faulted != 0) 2134 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2135 else 2136 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2137} 2138 2139vdev_ops_t vdev_raidz_ops = { 2140 vdev_raidz_open, 2141 vdev_raidz_close, 2142 vdev_raidz_asize, 2143 vdev_raidz_io_start, 2144 vdev_raidz_io_done, 2145 vdev_raidz_state_change, 2146 NULL, 2147 NULL, 2148 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2149 B_FALSE /* not a leaf vdev */ 2150};
|