 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise
 *	  expression:
 *
 *		(A * 2)_7 = A_6
 *		(A * 2)_6 = A_5
 *		(A * 2)_5 = A_4
 *		(A * 2)_4 = A_3 + A_7
 *		(A * 2)_3 = A_2 + A_7
 *		(A * 2)_2 = A_1 + A_7
 *		(A * 2)_1 = A_0
 *		(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * rather than field addition). The inverse of a field element A (A^-1) is
 * therefore A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
104 * 105 * See the reconstruction code below for how P, Q and R can used individually 106 * or in concert to recover missing data columns. 107 */ 108 109typedef struct raidz_col { 110 uint64_t rc_devidx; /* child device index for I/O */ 111 uint64_t rc_offset; /* device offset */ 112 uint64_t rc_size; /* I/O size */ 113 void *rc_data; /* I/O data */ 114 void *rc_gdata; /* used to store the "good" version */ 115 int rc_error; /* I/O error for this device */ 116 uint8_t rc_tried; /* Did we attempt this I/O column? */ 117 uint8_t rc_skipped; /* Did we skip this I/O column? */ 118} raidz_col_t; 119 120typedef struct raidz_map { 121 uint64_t rm_cols; /* Regular column count */ 122 uint64_t rm_scols; /* Count including skipped columns */ 123 uint64_t rm_bigcols; /* Number of oversized columns */ 124 uint64_t rm_asize; /* Actual total I/O size */ 125 uint64_t rm_missingdata; /* Count of missing data devices */ 126 uint64_t rm_missingparity; /* Count of missing parity devices */ 127 uint64_t rm_firstdatacol; /* First data column/parity count */ 128 uint64_t rm_nskip; /* Skipped sectors for padding */ 129 uint64_t rm_skipstart; /* Column index of padding start */ 130 void *rm_datacopy; /* rm_asize-buffer of copied data */ 131 uintptr_t rm_reports; /* # of referencing checksum reports */ 132 uint8_t rm_freed; /* map no longer has referencing ZIO */ 133 uint8_t rm_ecksuminjected; /* checksum error was injected */ 134 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 135} raidz_map_t; 136 137#define VDEV_RAIDZ_P 0 138#define VDEV_RAIDZ_Q 1 139#define VDEV_RAIDZ_R 2 140 141#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 142#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 143 144/* 145 * We provide a mechanism to perform the field multiplication operation on a 146 * 64-bit value all at once rather than a byte at a time. 
This works by 147 * creating a mask from the top bit in each byte and using that to 148 * conditionally apply the XOR of 0x1d. 149 */ 150#define VDEV_RAIDZ_64MUL_2(x, mask) \ 151{ \ 152 (mask) = (x) & 0x8080808080808080ULL; \ 153 (mask) = ((mask) << 1) - ((mask) >> 7); \ 154 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 155 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 156} 157 158#define VDEV_RAIDZ_64MUL_4(x, mask) \ 159{ \ 160 VDEV_RAIDZ_64MUL_2((x), mask); \ 161 VDEV_RAIDZ_64MUL_2((x), mask); \ 162} 163 164#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 165 166/* 167 * Force reconstruction to use the general purpose method. 168 */ 169int vdev_raidz_default_to_general; 170 171/* Powers of 2 in the Galois field defined above. */ 172static const uint8_t vdev_raidz_pow2[256] = { 173 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 174 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 175 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 176 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 177 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 178 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 179 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 180 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 181 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 182 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 183 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 184 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 185 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 186 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 187 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 188 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 189 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 190 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 191 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 192 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 193 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 194 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 195 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 196 0x95, 0x37, 
0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 197 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 198 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 199 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 200 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 201 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 202 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 203 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 204 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 205}; 206/* Logs of 2 in the Galois field defined above. */ 207static const uint8_t vdev_raidz_log2[256] = { 208 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 209 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 210 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 211 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 212 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 213 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 214 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 215 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 216 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 217 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 218 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 219 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 220 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 221 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 222 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 223 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 224 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 225 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 226 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 227 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 228 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 229 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 230 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 231 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 232 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 233 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 234 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 235 0xfb, 0x60, 0x86, 0xb1, 0xbb, 
0xcc, 0x3e, 0x5a, 236 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 237 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 238 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 239 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 240}; 241 242static void vdev_raidz_generate_parity(raidz_map_t *rm); 243 244/* 245 * Multiply a given number by 2 raised to the given power. 246 */ 247static uint8_t 248vdev_raidz_exp2(uint_t a, int exp) 249{ 250 if (a == 0) 251 return (0); 252 253 ASSERT(exp >= 0); 254 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 255 256 exp += vdev_raidz_log2[a]; 257 if (exp > 255) 258 exp -= 255; 259 260 return (vdev_raidz_pow2[exp]); 261} 262 263static void 264vdev_raidz_map_free(raidz_map_t *rm) 265{ 266 int c; 267 size_t size; 268 269 for (c = 0; c < rm->rm_firstdatacol; c++) { 270 if (rm->rm_col[c].rc_data != NULL) 271 zio_buf_free(rm->rm_col[c].rc_data, 272 rm->rm_col[c].rc_size); 273 274 if (rm->rm_col[c].rc_gdata != NULL) 275 zio_buf_free(rm->rm_col[c].rc_gdata, 276 rm->rm_col[c].rc_size); 277 } 278 279 size = 0; 280 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 281 size += rm->rm_col[c].rc_size; 282 283 if (rm->rm_datacopy != NULL) 284 zio_buf_free(rm->rm_datacopy, size); 285 286 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 287} 288 289static void 290vdev_raidz_map_free_vsd(zio_t *zio) 291{ 292 raidz_map_t *rm = zio->io_vsd; 293 294 ASSERT0(rm->rm_freed); 295 rm->rm_freed = 1; 296 297 if (rm->rm_reports == 0) 298 vdev_raidz_map_free(rm); 299} 300 301/*ARGSUSED*/ 302static void 303vdev_raidz_cksum_free(void *arg, size_t ignored) 304{ 305 raidz_map_t *rm = arg; 306 307 ASSERT3U(rm->rm_reports, >, 0); 308 309 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 310 vdev_raidz_map_free(rm); 311} 312 313static void 314vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 315{ 316 raidz_map_t *rm = zcr->zcr_cbdata; 317 size_t c = zcr->zcr_cbinfo; 318 size_t x; 319 320 const char *good = NULL; 321 const char *bad = 
rm->rm_col[c].rc_data; 322 323 if (good_data == NULL) { 324 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 325 return; 326 } 327 328 if (c < rm->rm_firstdatacol) { 329 /* 330 * The first time through, calculate the parity blocks for 331 * the good data (this relies on the fact that the good 332 * data never changes for a given logical ZIO) 333 */ 334 if (rm->rm_col[0].rc_gdata == NULL) { 335 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 336 char *buf; 337 338 /* 339 * Set up the rm_col[]s to generate the parity for 340 * good_data, first saving the parity bufs and 341 * replacing them with buffers to hold the result. 342 */ 343 for (x = 0; x < rm->rm_firstdatacol; x++) { 344 bad_parity[x] = rm->rm_col[x].rc_data; 345 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 346 zio_buf_alloc(rm->rm_col[x].rc_size); 347 } 348 349 /* fill in the data columns from good_data */ 350 buf = (char *)good_data; 351 for (; x < rm->rm_cols; x++) { 352 rm->rm_col[x].rc_data = buf; 353 buf += rm->rm_col[x].rc_size; 354 } 355 356 /* 357 * Construct the parity from the good data. 358 */ 359 vdev_raidz_generate_parity(rm); 360 361 /* restore everything back to its original state */ 362 for (x = 0; x < rm->rm_firstdatacol; x++) 363 rm->rm_col[x].rc_data = bad_parity[x]; 364 365 buf = rm->rm_datacopy; 366 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 367 rm->rm_col[x].rc_data = buf; 368 buf += rm->rm_col[x].rc_size; 369 } 370 } 371 372 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 373 good = rm->rm_col[c].rc_gdata; 374 } else { 375 /* adjust good_data to point at the start of our column */ 376 good = good_data; 377 378 for (x = rm->rm_firstdatacol; x < c; x++) 379 good += rm->rm_col[x].rc_size; 380 } 381 382 /* we drop the ereport if it ends up that the data was good */ 383 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 384} 385 386/* 387 * Invoked indirectly by zfs_ereport_start_checksum(), called 388 * below when our read operation fails completely. 
The main point 389 * is to keep a copy of everything we read from disk, so that at 390 * vdev_raidz_cksum_finish() time we can compare it with the good data. 391 */ 392static void 393vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 394{ 395 size_t c = (size_t)(uintptr_t)arg; 396 caddr_t buf; 397 398 raidz_map_t *rm = zio->io_vsd; 399 size_t size; 400 401 /* set up the report and bump the refcount */ 402 zcr->zcr_cbdata = rm; 403 zcr->zcr_cbinfo = c; 404 zcr->zcr_finish = vdev_raidz_cksum_finish; 405 zcr->zcr_free = vdev_raidz_cksum_free; 406 407 rm->rm_reports++; 408 ASSERT3U(rm->rm_reports, >, 0); 409 410 if (rm->rm_datacopy != NULL) 411 return; 412 413 /* 414 * It's the first time we're called for this raidz_map_t, so we need 415 * to copy the data aside; there's no guarantee that our zio's buffer 416 * won't be re-used for something else. 417 * 418 * Our parity data is already in separate buffers, so there's no need 419 * to copy them. 420 */ 421 422 size = 0; 423 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 424 size += rm->rm_col[c].rc_size; 425 426 buf = rm->rm_datacopy = zio_buf_alloc(size); 427 428 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 429 raidz_col_t *col = &rm->rm_col[c]; 430 431 bcopy(col->rc_data, buf, col->rc_size); 432 col->rc_data = buf; 433 434 buf += col->rc_size; 435 } 436 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 437} 438 439static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 440 vdev_raidz_map_free_vsd, 441 vdev_raidz_cksum_report 442}; 443 444/* 445 * Divides the IO evenly across all child vdevs; usually, dcols is 446 * the number of children in the target vdev. 447 */ 448static raidz_map_t * 449vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, 450 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 451{ 452 raidz_map_t *rm; 453 /* The starting RAIDZ (parent) vdev sector of the block. 
*/ 454 uint64_t b = offset >> unit_shift; 455 /* The zio's size in units of the vdev's minimum sector size. */ 456 uint64_t s = size >> unit_shift; 457 /* The first column for this stripe. */ 458 uint64_t f = b % dcols; 459 /* The starting byte offset on each child vdev. */ 460 uint64_t o = (b / dcols) << unit_shift; 461 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 462 463 /* 464 * "Quotient": The number of data sectors for this stripe on all but 465 * the "big column" child vdevs that also contain "remainder" data. 466 */ 467 q = s / (dcols - nparity); 468 469 /* 470 * "Remainder": The number of partial stripe data sectors in this I/O. 471 * This will add a sector to some, but not all, child vdevs. 472 */ 473 r = s - q * (dcols - nparity); 474 475 /* The number of "big columns" - those which contain remainder data. */ 476 bc = (r == 0 ? 0 : r + nparity); 477 478 /* 479 * The total number of data and parity sectors associated with 480 * this I/O. 481 */ 482 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 483 484 /* acols: The columns that will be accessed. */ 485 /* scols: The columns that will be accessed or skipped. */ 486 if (q == 0) { 487 /* Our I/O request doesn't span all child vdevs. 
*/ 488 acols = bc; 489 scols = MIN(dcols, roundup(bc, nparity + 1)); 490 } else { 491 acols = dcols; 492 scols = dcols; 493 } 494 495 ASSERT3U(acols, <=, scols); 496 497 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 498 499 rm->rm_cols = acols; 500 rm->rm_scols = scols; 501 rm->rm_bigcols = bc; 502 rm->rm_skipstart = bc; 503 rm->rm_missingdata = 0; 504 rm->rm_missingparity = 0; 505 rm->rm_firstdatacol = nparity; 506 rm->rm_datacopy = NULL; 507 rm->rm_reports = 0; 508 rm->rm_freed = 0; 509 rm->rm_ecksuminjected = 0; 510 511 asize = 0; 512 513 for (c = 0; c < scols; c++) { 514 col = f + c; 515 coff = o; 516 if (col >= dcols) { 517 col -= dcols; 518 coff += 1ULL << unit_shift; 519 } 520 rm->rm_col[c].rc_devidx = col; 521 rm->rm_col[c].rc_offset = coff; 522 rm->rm_col[c].rc_data = NULL; 523 rm->rm_col[c].rc_gdata = NULL; 524 rm->rm_col[c].rc_error = 0; 525 rm->rm_col[c].rc_tried = 0; 526 rm->rm_col[c].rc_skipped = 0; 527 528 if (c >= acols) 529 rm->rm_col[c].rc_size = 0; 530 else if (c < bc) 531 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 532 else 533 rm->rm_col[c].rc_size = q << unit_shift; 534 535 asize += rm->rm_col[c].rc_size; 536 } 537 538 ASSERT3U(asize, ==, tot << unit_shift); 539 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 540 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 541 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 542 ASSERT3U(rm->rm_nskip, <=, nparity); 543 544 if (!dofree) { 545 for (c = 0; c < rm->rm_firstdatacol; c++) { 546 rm->rm_col[c].rc_data = 547 zio_buf_alloc(rm->rm_col[c].rc_size); 548 } 549 550 rm->rm_col[c].rc_data = data; 551 552 for (c = c + 1; c < acols; c++) { 553 rm->rm_col[c].rc_data = 554 (char *)rm->rm_col[c - 1].rc_data + 555 rm->rm_col[c - 1].rc_size; 556 } 557 } 558 559 /* 560 * If all data stored spans all columns, there's a danger that parity 561 * will always be on the same device and, since parity isn't read 562 * during normal operation, that that device's I/O 
bandwidth won't be 563 * used effectively. We therefore switch the parity every 1MB. 564 * 565 * ... at least that was, ostensibly, the theory. As a practical 566 * matter unless we juggle the parity between all devices evenly, we 567 * won't see any benefit. Further, occasional writes that aren't a 568 * multiple of the LCM of the number of children and the minimum 569 * stripe width are sufficient to avoid pessimal behavior. 570 * Unfortunately, this decision created an implicit on-disk format 571 * requirement that we need to support for all eternity, but only 572 * for single-parity RAID-Z. 573 * 574 * If we intend to skip a sector in the zeroth column for padding 575 * we must make sure to note this swap. We will never intend to 576 * skip the first column since at least one data and one parity 577 * column must appear in each row. 578 */ 579 ASSERT(rm->rm_cols >= 2); 580 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 581 582 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 583 devidx = rm->rm_col[0].rc_devidx; 584 o = rm->rm_col[0].rc_offset; 585 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 586 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 587 rm->rm_col[1].rc_devidx = devidx; 588 rm->rm_col[1].rc_offset = o; 589 590 if (rm->rm_skipstart == 0) 591 rm->rm_skipstart = 1; 592 } 593 594 return (rm); 595} 596 597static void 598vdev_raidz_generate_parity_p(raidz_map_t *rm) 599{ 600 uint64_t *p, *src, pcount, ccount, i; 601 int c; 602 603 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 604 605 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 606 src = rm->rm_col[c].rc_data; 607 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 608 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 609 610 if (c == rm->rm_firstdatacol) { 611 ASSERT(ccount == pcount); 612 for (i = 0; i < ccount; i++, src++, p++) { 613 *p = *src; 614 } 615 } else { 616 ASSERT(ccount <= pcount); 617 for (i = 0; i < ccount; i++, src++, p++) { 618 *p ^= *src; 619 } 620 } 
621 } 622} 623 624static void 625vdev_raidz_generate_parity_pq(raidz_map_t *rm) 626{ 627 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 628 int c; 629 630 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 631 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 632 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 633 634 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 635 src = rm->rm_col[c].rc_data; 636 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 637 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 638 639 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 640 641 if (c == rm->rm_firstdatacol) { 642 ASSERT(ccnt == pcnt || ccnt == 0); 643 for (i = 0; i < ccnt; i++, src++, p++, q++) { 644 *p = *src; 645 *q = *src; 646 } 647 for (; i < pcnt; i++, src++, p++, q++) { 648 *p = 0; 649 *q = 0; 650 } 651 } else { 652 ASSERT(ccnt <= pcnt); 653 654 /* 655 * Apply the algorithm described above by multiplying 656 * the previous result and adding in the new value. 657 */ 658 for (i = 0; i < ccnt; i++, src++, p++, q++) { 659 *p ^= *src; 660 661 VDEV_RAIDZ_64MUL_2(*q, mask); 662 *q ^= *src; 663 } 664 665 /* 666 * Treat short columns as though they are full of 0s. 667 * Note that there's therefore nothing needed for P. 
668 */ 669 for (; i < pcnt; i++, q++) { 670 VDEV_RAIDZ_64MUL_2(*q, mask); 671 } 672 } 673 } 674} 675 676static void 677vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 678{ 679 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 680 int c; 681 682 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 683 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 684 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 685 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 686 rm->rm_col[VDEV_RAIDZ_R].rc_size); 687 688 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 689 src = rm->rm_col[c].rc_data; 690 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 691 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 692 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 693 694 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 695 696 if (c == rm->rm_firstdatacol) { 697 ASSERT(ccnt == pcnt || ccnt == 0); 698 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 699 *p = *src; 700 *q = *src; 701 *r = *src; 702 } 703 for (; i < pcnt; i++, src++, p++, q++, r++) { 704 *p = 0; 705 *q = 0; 706 *r = 0; 707 } 708 } else { 709 ASSERT(ccnt <= pcnt); 710 711 /* 712 * Apply the algorithm described above by multiplying 713 * the previous result and adding in the new value. 714 */ 715 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 716 *p ^= *src; 717 718 VDEV_RAIDZ_64MUL_2(*q, mask); 719 *q ^= *src; 720 721 VDEV_RAIDZ_64MUL_4(*r, mask); 722 *r ^= *src; 723 } 724 725 /* 726 * Treat short columns as though they are full of 0s. 727 * Note that there's therefore nothing needed for P. 728 */ 729 for (; i < pcnt; i++, q++, r++) { 730 VDEV_RAIDZ_64MUL_2(*q, mask); 731 VDEV_RAIDZ_64MUL_4(*r, mask); 732 } 733 } 734 } 735} 736 737/* 738 * Generate RAID parity in the first virtual columns according to the number of 739 * parity columns available. 
740 */ 741static void 742vdev_raidz_generate_parity(raidz_map_t *rm) 743{ 744 switch (rm->rm_firstdatacol) { 745 case 1: 746 vdev_raidz_generate_parity_p(rm); 747 break; 748 case 2: 749 vdev_raidz_generate_parity_pq(rm); 750 break; 751 case 3: 752 vdev_raidz_generate_parity_pqr(rm); 753 break; 754 default: 755 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 756 } 757} 758 759static int 760vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 761{ 762 uint64_t *dst, *src, xcount, ccount, count, i; 763 int x = tgts[0]; 764 int c; 765 766 ASSERT(ntgts == 1); 767 ASSERT(x >= rm->rm_firstdatacol); 768 ASSERT(x < rm->rm_cols); 769 770 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 771 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 772 ASSERT(xcount > 0); 773 774 src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 775 dst = rm->rm_col[x].rc_data; 776 for (i = 0; i < xcount; i++, dst++, src++) { 777 *dst = *src; 778 } 779 780 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 781 src = rm->rm_col[c].rc_data; 782 dst = rm->rm_col[x].rc_data; 783 784 if (c == x) 785 continue; 786 787 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 788 count = MIN(ccount, xcount); 789 790 for (i = 0; i < count; i++, dst++, src++) { 791 *dst ^= *src; 792 } 793 } 794 795 return (1 << VDEV_RAIDZ_P); 796} 797 798static int 799vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 800{ 801 uint64_t *dst, *src, xcount, ccount, count, mask, i; 802 uint8_t *b; 803 int x = tgts[0]; 804 int c, j, exp; 805 806 ASSERT(ntgts == 1); 807 808 xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 809 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 810 811 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 812 src = rm->rm_col[c].rc_data; 813 dst = rm->rm_col[x].rc_data; 814 815 if (c == x) 816 ccount = 0; 817 else 818 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 819 820 count = MIN(ccount, xcount); 821 822 if (c == rm->rm_firstdatacol) { 823 
for (i = 0; i < count; i++, dst++, src++) { 824 *dst = *src; 825 } 826 for (; i < xcount; i++, dst++) { 827 *dst = 0; 828 } 829 830 } else { 831 for (i = 0; i < count; i++, dst++, src++) { 832 VDEV_RAIDZ_64MUL_2(*dst, mask); 833 *dst ^= *src; 834 } 835 836 for (; i < xcount; i++, dst++) { 837 VDEV_RAIDZ_64MUL_2(*dst, mask); 838 } 839 } 840 } 841 842 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 843 dst = rm->rm_col[x].rc_data; 844 exp = 255 - (rm->rm_cols - 1 - x); 845 846 for (i = 0; i < xcount; i++, dst++, src++) { 847 *dst ^= *src; 848 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 849 *b = vdev_raidz_exp2(*b, exp); 850 } 851 } 852 853 return (1 << VDEV_RAIDZ_Q); 854} 855 856static int 857vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 858{ 859 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 860 void *pdata, *qdata; 861 uint64_t xsize, ysize, i; 862 int x = tgts[0]; 863 int y = tgts[1]; 864 865 ASSERT(ntgts == 2); 866 ASSERT(x < y); 867 ASSERT(x >= rm->rm_firstdatacol); 868 ASSERT(y < rm->rm_cols); 869 870 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 871 872 /* 873 * Move the parity data aside -- we're going to compute parity as 874 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 875 * reuse the parity generation mechanism without trashing the actual 876 * parity so we make those columns appear to be full of zeros by 877 * setting their lengths to zero. 
878 */ 879 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 880 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 881 xsize = rm->rm_col[x].rc_size; 882 ysize = rm->rm_col[y].rc_size; 883 884 rm->rm_col[VDEV_RAIDZ_P].rc_data = 885 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 886 rm->rm_col[VDEV_RAIDZ_Q].rc_data = 887 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 888 rm->rm_col[x].rc_size = 0; 889 rm->rm_col[y].rc_size = 0; 890 891 vdev_raidz_generate_parity_pq(rm); 892 893 rm->rm_col[x].rc_size = xsize; 894 rm->rm_col[y].rc_size = ysize; 895 896 p = pdata; 897 q = qdata; 898 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 899 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 900 xd = rm->rm_col[x].rc_data; 901 yd = rm->rm_col[y].rc_data; 902 903 /* 904 * We now have: 905 * Pxy = P + D_x + D_y 906 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 907 * 908 * We can then solve for D_x: 909 * D_x = A * (P + Pxy) + B * (Q + Qxy) 910 * where 911 * A = 2^(x - y) * (2^(x - y) + 1)^-1 912 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 913 * 914 * With D_x in hand, we can easily solve for D_y: 915 * D_y = P + Pxy + D_x 916 */ 917 918 a = vdev_raidz_pow2[255 + x - y]; 919 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 920 tmp = 255 - vdev_raidz_log2[a ^ 1]; 921 922 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 923 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 924 925 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 926 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 927 vdev_raidz_exp2(*q ^ *qxy, bexp); 928 929 if (i < ysize) 930 *yd = *p ^ *pxy ^ *xd; 931 } 932 933 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 934 rm->rm_col[VDEV_RAIDZ_P].rc_size); 935 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 936 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 937 938 /* 939 * Restore the saved parity data. 
940 */ 941 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 942 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 943 944 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 945} 946 947/* BEGIN CSTYLED */ 948/* 949 * In the general case of reconstruction, we must solve the system of linear 950 * equations defined by the coeffecients used to generate parity as well as 951 * the contents of the data and parity disks. This can be expressed with 952 * vectors for the original data (D) and the actual data (d) and parity (p) 953 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 954 * 955 * __ __ __ __ 956 * | | __ __ | p_0 | 957 * | V | | D_0 | | p_m-1 | 958 * | | x | : | = | d_0 | 959 * | I | | D_n-1 | | : | 960 * | | ~~ ~~ | d_n-1 | 961 * ~~ ~~ ~~ ~~ 962 * 963 * I is simply a square identity matrix of size n, and V is a vandermonde 964 * matrix defined by the coeffecients we chose for the various parity columns 965 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 966 * computation as well as linear separability. 967 * 968 * __ __ __ __ 969 * | 1 .. 1 1 1 | | p_0 | 970 * | 2^n-1 .. 4 2 1 | __ __ | : | 971 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 972 * | 1 .. 0 0 0 | | D_1 | | d_0 | 973 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 974 * | : : : : | | : | | d_2 | 975 * | 0 .. 1 0 0 | | D_n-1 | | : | 976 * | 0 .. 0 1 0 | ~~ ~~ | : | 977 * | 0 .. 0 0 1 | | d_n-1 | 978 * ~~ ~~ ~~ ~~ 979 * 980 * Note that I, V, d, and p are known. To compute D, we must invert the 981 * matrix and use the known data and parity values to reconstruct the unknown 982 * data values. We begin by removing the rows in V|I and d|p that correspond 983 * to failed or missing columns; we then make V|I square (n x n) and d|p 984 * sized n by removing rows corresponding to unused parity from the bottom up 985 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 986 * using Gauss-Jordan elimination. 
 In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1    1    1   1   1   1   1   1 |
 *           | 128  64   32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205  116  29  64  16   4   1 |      / /
 *           |  1    0    0   0   0   0   0   0 |     / /
 *           |  0    1    0   0   0   0   0   0 | <--' /
 *  (V|I)  = |  0    0    1   0   0   0   0   0 | <---'
 *           |  0    0    0   1   0   0   0   0 |
 *           |  0    0    0   0   1   0   0   0 |
 *           |  0    0    0   0   0   1   0   0 |
 *           |  0    0    0   0   0   0   1   0 |
 *           |  0    0    0   0   0   0   0   1 |
 *           ~~                               ~~
 *           __                               __
 *           |  1    1    1   1   1   1   1   1 |
 *           |  19 205  116  29  64  16   4   1 |
 *           |  1    0    0   0   0   0   0   0 |
 *  (V|I)' = |  0    0    0   1   0   0   0   0 |
 *           |  0    0    0   0   1   0   0   0 |
 *           |  0    0    0   0   0   1   0   0 |
 *           |  0    0    0   0   0   0   1   0 |
 *           |  0    0    0   0   0   0   0   1 |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1    1    1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205  116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  1    1    1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205  116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  0    1    1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |  0  205  116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  0    1    1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |  0    0  185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  0    1    1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |  0    0    1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1    0    0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |  0    1    0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |  0    0    1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |  0    0    0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |  0    0    0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |  0    0    0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |  0    0    0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |  0    0    0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *               __                               __
 *               |   0   0   1   0   0   0   0   0 |
 *               | 167 100   5  41 159 169 217 208 |
 *               | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =  |   0   0   0   1   0   0   0   0 |
 *               |   0   0   0   0   1   0   0   0 |
 *               |   0   0   0   0   0   1   0   0 |
 *               |   0   0   0   0   0   0   1   0 |
 *               |   0   0   0   0   0   0   0   1 |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid.
For 1093 * that reason, we only build the coefficients in the rows that correspond to 1094 * targeted columns. 1095 */ 1096/* END CSTYLED */ 1097 1098static void 1099vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1100 uint8_t **rows) 1101{ 1102 int i, j; 1103 int pow; 1104 1105 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1106 1107 /* 1108 * Fill in the missing rows of interest. 1109 */ 1110 for (i = 0; i < nmap; i++) { 1111 ASSERT3S(0, <=, map[i]); 1112 ASSERT3S(map[i], <=, 2); 1113 1114 pow = map[i] * n; 1115 if (pow > 255) 1116 pow -= 255; 1117 ASSERT(pow <= 255); 1118 1119 for (j = 0; j < n; j++) { 1120 pow -= map[i]; 1121 if (pow < 0) 1122 pow += 255; 1123 rows[i][j] = vdev_raidz_pow2[pow]; 1124 } 1125 } 1126} 1127 1128static void 1129vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1130 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1131{ 1132 int i, j, ii, jj; 1133 uint8_t log; 1134 1135 /* 1136 * Assert that the first nmissing entries from the array of used 1137 * columns correspond to parity columns and that subsequent entries 1138 * correspond to data columns. 1139 */ 1140 for (i = 0; i < nmissing; i++) { 1141 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1142 } 1143 for (; i < n; i++) { 1144 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1145 } 1146 1147 /* 1148 * First initialize the storage where we'll compute the inverse rows. 1149 */ 1150 for (i = 0; i < nmissing; i++) { 1151 for (j = 0; j < n; j++) { 1152 invrows[i][j] = (i == j) ? 1 : 0; 1153 } 1154 } 1155 1156 /* 1157 * Subtract all trivial rows from the rows of consequence. 
1158 */ 1159 for (i = 0; i < nmissing; i++) { 1160 for (j = nmissing; j < n; j++) { 1161 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1162 jj = used[j] - rm->rm_firstdatacol; 1163 ASSERT3S(jj, <, n); 1164 invrows[i][j] = rows[i][jj]; 1165 rows[i][jj] = 0; 1166 } 1167 } 1168 1169 /* 1170 * For each of the rows of interest, we must normalize it and subtract 1171 * a multiple of it from the other rows. 1172 */ 1173 for (i = 0; i < nmissing; i++) { 1174 for (j = 0; j < missing[i]; j++) { 1175 ASSERT0(rows[i][j]); 1176 } 1177 ASSERT3U(rows[i][missing[i]], !=, 0); 1178 1179 /* 1180 * Compute the inverse of the first element and multiply each 1181 * element in the row by that value. 1182 */ 1183 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1184 1185 for (j = 0; j < n; j++) { 1186 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1187 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1188 } 1189 1190 for (ii = 0; ii < nmissing; ii++) { 1191 if (i == ii) 1192 continue; 1193 1194 ASSERT3U(rows[ii][missing[i]], !=, 0); 1195 1196 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1197 1198 for (j = 0; j < n; j++) { 1199 rows[ii][j] ^= 1200 vdev_raidz_exp2(rows[i][j], log); 1201 invrows[ii][j] ^= 1202 vdev_raidz_exp2(invrows[i][j], log); 1203 } 1204 } 1205 } 1206 1207 /* 1208 * Verify that the data that is left in the rows are properly part of 1209 * an identity matrix. 
1210 */ 1211 for (i = 0; i < nmissing; i++) { 1212 for (j = 0; j < n; j++) { 1213 if (j == missing[i]) { 1214 ASSERT3U(rows[i][j], ==, 1); 1215 } else { 1216 ASSERT0(rows[i][j]); 1217 } 1218 } 1219 } 1220} 1221 1222static void 1223vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1224 int *missing, uint8_t **invrows, const uint8_t *used) 1225{ 1226 int i, j, x, cc, c; 1227 uint8_t *src; 1228 uint64_t ccount; 1229 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1230 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1231 uint8_t log = 0; 1232 uint8_t val; 1233 int ll; 1234 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1235 uint8_t *p, *pp; 1236 size_t psize; 1237 1238 psize = sizeof (invlog[0][0]) * n * nmissing; 1239 p = kmem_alloc(psize, KM_SLEEP); 1240 1241 for (pp = p, i = 0; i < nmissing; i++) { 1242 invlog[i] = pp; 1243 pp += n; 1244 } 1245 1246 for (i = 0; i < nmissing; i++) { 1247 for (j = 0; j < n; j++) { 1248 ASSERT3U(invrows[i][j], !=, 0); 1249 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1250 } 1251 } 1252 1253 for (i = 0; i < n; i++) { 1254 c = used[i]; 1255 ASSERT3U(c, <, rm->rm_cols); 1256 1257 src = rm->rm_col[c].rc_data; 1258 ccount = rm->rm_col[c].rc_size; 1259 for (j = 0; j < nmissing; j++) { 1260 cc = missing[j] + rm->rm_firstdatacol; 1261 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1262 ASSERT3U(cc, <, rm->rm_cols); 1263 ASSERT3U(cc, !=, c); 1264 1265 dst[j] = rm->rm_col[cc].rc_data; 1266 dcount[j] = rm->rm_col[cc].rc_size; 1267 } 1268 1269 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1270 1271 for (x = 0; x < ccount; x++, src++) { 1272 if (*src != 0) 1273 log = vdev_raidz_log2[*src]; 1274 1275 for (cc = 0; cc < nmissing; cc++) { 1276 if (x >= dcount[cc]) 1277 continue; 1278 1279 if (*src == 0) { 1280 val = 0; 1281 } else { 1282 if ((ll = log + invlog[cc][i]) >= 255) 1283 ll -= 255; 1284 val = vdev_raidz_pow2[ll]; 1285 } 1286 1287 if (i == 0) 1288 dst[cc][x] = val; 1289 else 1290 dst[cc][x] ^= val; 1291 } 1292 } 1293 } 1294 1295 kmem_free(p, 
psize); 1296} 1297 1298static int 1299vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1300{ 1301 int n, i, c, t, tt; 1302 int nmissing_rows; 1303 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1304 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1305 1306 uint8_t *p, *pp; 1307 size_t psize; 1308 1309 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1310 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1311 uint8_t *used; 1312 1313 int code = 0; 1314 1315 1316 n = rm->rm_cols - rm->rm_firstdatacol; 1317 1318 /* 1319 * Figure out which data columns are missing. 1320 */ 1321 nmissing_rows = 0; 1322 for (t = 0; t < ntgts; t++) { 1323 if (tgts[t] >= rm->rm_firstdatacol) { 1324 missing_rows[nmissing_rows++] = 1325 tgts[t] - rm->rm_firstdatacol; 1326 } 1327 } 1328 1329 /* 1330 * Figure out which parity columns to use to help generate the missing 1331 * data columns. 1332 */ 1333 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1334 ASSERT(tt < ntgts); 1335 ASSERT(c < rm->rm_firstdatacol); 1336 1337 /* 1338 * Skip any targeted parity columns. 1339 */ 1340 if (c == tgts[tt]) { 1341 tt++; 1342 continue; 1343 } 1344 1345 code |= 1 << c; 1346 1347 parity_map[i] = c; 1348 i++; 1349 } 1350 1351 ASSERT(code != 0); 1352 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1353 1354 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1355 nmissing_rows * n + sizeof (used[0]) * n; 1356 p = kmem_alloc(psize, KM_SLEEP); 1357 1358 for (pp = p, i = 0; i < nmissing_rows; i++) { 1359 rows[i] = pp; 1360 pp += n; 1361 invrows[i] = pp; 1362 pp += n; 1363 } 1364 used = pp; 1365 1366 for (i = 0; i < nmissing_rows; i++) { 1367 used[i] = parity_map[i]; 1368 } 1369 1370 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1371 if (tt < nmissing_rows && 1372 c == missing_rows[tt] + rm->rm_firstdatacol) { 1373 tt++; 1374 continue; 1375 } 1376 1377 ASSERT3S(i, <, n); 1378 used[i] = c; 1379 i++; 1380 } 1381 1382 /* 1383 * Initialize the interesting rows of the matrix. 
1384 */ 1385 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1386 1387 /* 1388 * Invert the matrix. 1389 */ 1390 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1391 invrows, used); 1392 1393 /* 1394 * Reconstruct the missing data using the generated matrix. 1395 */ 1396 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1397 invrows, used); 1398 1399 kmem_free(p, psize); 1400 1401 return (code); 1402} 1403 1404static int 1405vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1406{ 1407 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1408 int ntgts; 1409 int i, c; 1410 int code; 1411 int nbadparity, nbaddata; 1412 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1413 1414 /* 1415 * The tgts list must already be sorted. 1416 */ 1417 for (i = 1; i < nt; i++) { 1418 ASSERT(t[i] > t[i - 1]); 1419 } 1420 1421 nbadparity = rm->rm_firstdatacol; 1422 nbaddata = rm->rm_cols - nbadparity; 1423 ntgts = 0; 1424 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1425 if (c < rm->rm_firstdatacol) 1426 parity_valid[c] = B_FALSE; 1427 1428 if (i < nt && c == t[i]) { 1429 tgts[ntgts++] = c; 1430 i++; 1431 } else if (rm->rm_col[c].rc_error != 0) { 1432 tgts[ntgts++] = c; 1433 } else if (c >= rm->rm_firstdatacol) { 1434 nbaddata--; 1435 } else { 1436 parity_valid[c] = B_TRUE; 1437 nbadparity--; 1438 } 1439 } 1440 1441 ASSERT(ntgts >= nt); 1442 ASSERT(nbaddata >= 0); 1443 ASSERT(nbaddata + nbadparity == ntgts); 1444 1445 dt = &tgts[nbadparity]; 1446 1447 /* 1448 * See if we can use any of our optimized reconstruction routines. 
1449 */ 1450 if (!vdev_raidz_default_to_general) { 1451 switch (nbaddata) { 1452 case 1: 1453 if (parity_valid[VDEV_RAIDZ_P]) 1454 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1455 1456 ASSERT(rm->rm_firstdatacol > 1); 1457 1458 if (parity_valid[VDEV_RAIDZ_Q]) 1459 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1460 1461 ASSERT(rm->rm_firstdatacol > 2); 1462 break; 1463 1464 case 2: 1465 ASSERT(rm->rm_firstdatacol > 1); 1466 1467 if (parity_valid[VDEV_RAIDZ_P] && 1468 parity_valid[VDEV_RAIDZ_Q]) 1469 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1470 1471 ASSERT(rm->rm_firstdatacol > 2); 1472 1473 break; 1474 } 1475 } 1476 1477 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1478 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1479 ASSERT(code > 0); 1480 return (code); 1481} 1482 1483static int 1484vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1485 uint64_t *logical_ashift, uint64_t *physical_ashift) 1486{ 1487 vdev_t *cvd; 1488 uint64_t nparity = vd->vdev_nparity; 1489 int c; 1490 int lasterror = 0; 1491 int numerrors = 0; 1492 1493 ASSERT(nparity > 0); 1494 1495 if (nparity > VDEV_RAIDZ_MAXPARITY || 1496 vd->vdev_children < nparity + 1) { 1497 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1498 return (SET_ERROR(EINVAL)); 1499 } 1500 1501 vdev_open_children(vd); 1502 1503 for (c = 0; c < vd->vdev_children; c++) { 1504 cvd = vd->vdev_child[c]; 1505 1506 if (cvd->vdev_open_error != 0) { 1507 lasterror = cvd->vdev_open_error; 1508 numerrors++; 1509 continue; 1510 } 1511 1512 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1513 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1514 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1515 *physical_ashift = MAX(*physical_ashift, 1516 cvd->vdev_physical_ashift); 1517 } 1518 1519 *asize *= vd->vdev_children; 1520 *max_asize *= vd->vdev_children; 1521 1522 if (numerrors > nparity) { 1523 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1524 return (lasterror); 1525 } 1526 1527 
return (0); 1528} 1529 1530static void 1531vdev_raidz_close(vdev_t *vd) 1532{ 1533 int c; 1534 1535 for (c = 0; c < vd->vdev_children; c++) 1536 vdev_close(vd->vdev_child[c]); 1537} 1538 1539#ifdef illumos 1540/* 1541 * Handle a read or write I/O to a RAID-Z dump device. 1542 * 1543 * The dump device is in a unique situation compared to other ZFS datasets: 1544 * writing to this device should be as simple and fast as possible. In 1545 * addition, durability matters much less since the dump will be extracted 1546 * once the machine reboots. For that reason, this function eschews parity for 1547 * performance and simplicity. The dump device uses the checksum setting 1548 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1549 * dataset. 1550 * 1551 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1552 * 128 KB will not fill an entire block; in addition, they may not be properly 1553 * aligned. In that case, this function uses the preallocated 128 KB block and 1554 * omits reading or writing any "empty" portions of that block, as opposed to 1555 * allocating a fresh appropriately-sized block. 1556 * 1557 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1558 * 1559 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1560 * 1561 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1562 * allocated which spans all five child vdevs. 8 KB of data would be written to 1563 * each of four vdevs, with the fifth containing the parity bits. 
1564 * 1565 * parity data data data data 1566 * | PP | XX | XX | XX | XX | 1567 * ^ ^ ^ ^ ^ 1568 * | | | | | 1569 * 8 KB parity ------8 KB data blocks------ 1570 * 1571 * However, when writing to the dump device, the behavior is different: 1572 * 1573 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1574 * 1575 * Unlike the normal RAID-Z case in which the block is allocated based on the 1576 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1577 * I/O size is less than 128 KB, only the actual portions of data are written. 1578 * In this example the data is written to the third data vdev since that vdev 1579 * contains the offset [64 KB, 96 KB). 1580 * 1581 * parity data data data data 1582 * | | | | XX | | 1583 * ^ 1584 * | 1585 * 32 KB data block 1586 * 1587 * As a result, an individual I/O may not span all child vdevs; moreover, a 1588 * small I/O may only operate on a single child vdev. 1589 * 1590 * Note that since there are no parity bits calculated or written, this format 1591 * remains the same no matter how many parity bits are used in a normal RAID-Z 1592 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1593 * would look like: 1594 * 1595 * parity parity parity data data data data 1596 * | | | | | | XX | | 1597 * ^ 1598 * | 1599 * 32 KB data block 1600 */ 1601int 1602vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1603 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1604{ 1605 vdev_t *tvd = vd->vdev_top; 1606 vdev_t *cvd; 1607 raidz_map_t *rm; 1608 raidz_col_t *rc; 1609 int c, err = 0; 1610 1611 uint64_t start, end, colstart, colend; 1612 uint64_t coloffset, colsize, colskip; 1613 1614 int flags = doread ? 
BIO_READ : BIO_WRITE; 1615 1616#ifdef _KERNEL 1617 1618 /* 1619 * Don't write past the end of the block 1620 */ 1621 VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); 1622 1623 start = offset; 1624 end = start + size; 1625 1626 /* 1627 * Allocate a RAID-Z map for this block. Note that this block starts 1628 * from the "original" offset, this is, the offset of the extent which 1629 * contains the requisite offset of the data being read or written. 1630 * 1631 * Even if this I/O operation doesn't span the full block size, let's 1632 * treat the on-disk format as if the only blocks are the complete 128 1633 * KB size. 1634 */ 1635 rm = vdev_raidz_map_alloc(data - (offset - origoffset), 1636 SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children, 1637 vd->vdev_nparity); 1638 1639 coloffset = origoffset; 1640 1641 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1642 c++, coloffset += rc->rc_size) { 1643 rc = &rm->rm_col[c]; 1644 cvd = vd->vdev_child[rc->rc_devidx]; 1645 1646 /* 1647 * Find the start and end of this column in the RAID-Z map, 1648 * keeping in mind that the stated size and offset of the 1649 * operation may not fill the entire column for this vdev. 1650 * 1651 * If any portion of the data spans this column, issue the 1652 * appropriate operation to the vdev. 1653 */ 1654 if (coloffset + rc->rc_size <= start) 1655 continue; 1656 if (coloffset >= end) 1657 continue; 1658 1659 colstart = MAX(coloffset, start); 1660 colend = MIN(end, coloffset + rc->rc_size); 1661 colsize = colend - colstart; 1662 colskip = colstart - coloffset; 1663 1664 VERIFY3U(colsize, <=, rc->rc_size); 1665 VERIFY3U(colskip, <=, rc->rc_size); 1666 1667 /* 1668 * Note that the child vdev will have a vdev label at the start 1669 * of its range of offsets, hence the need for 1670 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1671 * example of why this calculation is needed. 
1672 */ 1673 if ((err = vdev_disk_physio(cvd, 1674 ((char *)rc->rc_data) + colskip, colsize, 1675 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1676 flags, isdump)) != 0) 1677 break; 1678 } 1679 1680 vdev_raidz_map_free(rm); 1681#endif /* KERNEL */ 1682 1683 return (err); 1684} 1685#endif 1686 1687static uint64_t 1688vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1689{ 1690 uint64_t asize; 1691 uint64_t ashift = vd->vdev_top->vdev_ashift; 1692 uint64_t cols = vd->vdev_children; 1693 uint64_t nparity = vd->vdev_nparity; 1694 1695 asize = ((psize - 1) >> ashift) + 1; 1696 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1697 asize = roundup(asize, nparity + 1) << ashift; 1698 1699 return (asize); 1700} 1701 1702static void 1703vdev_raidz_child_done(zio_t *zio) 1704{ 1705 raidz_col_t *rc = zio->io_private; 1706 1707 rc->rc_error = zio->io_error; 1708 rc->rc_tried = 1; 1709 rc->rc_skipped = 0; 1710} 1711 1712/* 1713 * Start an IO operation on a RAIDZ VDev 1714 * 1715 * Outline: 1716 * - For write operations: 1717 * 1. Generate the parity data 1718 * 2. Create child zio write operations to each column's vdev, for both 1719 * data and parity. 1720 * 3. If the column skips any sectors for padding, create optional dummy 1721 * write zio children for those areas to improve aggregation continuity. 1722 * - For read operations: 1723 * 1. Create child zio read operations to each data column's vdev to read 1724 * the range of data required for zio. 1725 * 2. If this is a scrub or resilver operation, or if any of the data 1726 * vdevs have had errors, then create zio read operations to the parity 1727 * columns' VDevs as well. 1728 */
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
104 * 105 * See the reconstruction code below for how P, Q and R can used individually 106 * or in concert to recover missing data columns. 107 */ 108 109typedef struct raidz_col { 110 uint64_t rc_devidx; /* child device index for I/O */ 111 uint64_t rc_offset; /* device offset */ 112 uint64_t rc_size; /* I/O size */ 113 void *rc_data; /* I/O data */ 114 void *rc_gdata; /* used to store the "good" version */ 115 int rc_error; /* I/O error for this device */ 116 uint8_t rc_tried; /* Did we attempt this I/O column? */ 117 uint8_t rc_skipped; /* Did we skip this I/O column? */ 118} raidz_col_t; 119 120typedef struct raidz_map { 121 uint64_t rm_cols; /* Regular column count */ 122 uint64_t rm_scols; /* Count including skipped columns */ 123 uint64_t rm_bigcols; /* Number of oversized columns */ 124 uint64_t rm_asize; /* Actual total I/O size */ 125 uint64_t rm_missingdata; /* Count of missing data devices */ 126 uint64_t rm_missingparity; /* Count of missing parity devices */ 127 uint64_t rm_firstdatacol; /* First data column/parity count */ 128 uint64_t rm_nskip; /* Skipped sectors for padding */ 129 uint64_t rm_skipstart; /* Column index of padding start */ 130 void *rm_datacopy; /* rm_asize-buffer of copied data */ 131 uintptr_t rm_reports; /* # of referencing checksum reports */ 132 uint8_t rm_freed; /* map no longer has referencing ZIO */ 133 uint8_t rm_ecksuminjected; /* checksum error was injected */ 134 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 135} raidz_map_t; 136 137#define VDEV_RAIDZ_P 0 138#define VDEV_RAIDZ_Q 1 139#define VDEV_RAIDZ_R 2 140 141#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 142#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 143 144/* 145 * We provide a mechanism to perform the field multiplication operation on a 146 * 64-bit value all at once rather than a byte at a time. 
This works by 147 * creating a mask from the top bit in each byte and using that to 148 * conditionally apply the XOR of 0x1d. 149 */ 150#define VDEV_RAIDZ_64MUL_2(x, mask) \ 151{ \ 152 (mask) = (x) & 0x8080808080808080ULL; \ 153 (mask) = ((mask) << 1) - ((mask) >> 7); \ 154 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 155 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 156} 157 158#define VDEV_RAIDZ_64MUL_4(x, mask) \ 159{ \ 160 VDEV_RAIDZ_64MUL_2((x), mask); \ 161 VDEV_RAIDZ_64MUL_2((x), mask); \ 162} 163 164#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 165 166/* 167 * Force reconstruction to use the general purpose method. 168 */ 169int vdev_raidz_default_to_general; 170 171/* Powers of 2 in the Galois field defined above. */ 172static const uint8_t vdev_raidz_pow2[256] = { 173 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 174 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 175 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 176 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 177 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 178 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 179 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 180 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 181 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 182 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 183 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 184 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 185 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 186 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 187 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 188 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 189 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 190 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 191 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 192 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 193 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 194 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 195 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 196 0x95, 0x37, 
0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 197 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 198 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 199 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 200 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 201 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 202 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 203 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 204 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 205}; 206/* Logs of 2 in the Galois field defined above. */ 207static const uint8_t vdev_raidz_log2[256] = { 208 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 209 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 210 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 211 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 212 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 213 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 214 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 215 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 216 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 217 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 218 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 219 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 220 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 221 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 222 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 223 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 224 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 225 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 226 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 227 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 228 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 229 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 230 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 231 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 232 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 233 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 234 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 235 0xfb, 0x60, 0x86, 0xb1, 0xbb, 
0xcc, 0x3e, 0x5a, 236 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 237 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 238 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 239 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 240}; 241 242static void vdev_raidz_generate_parity(raidz_map_t *rm); 243 244/* 245 * Multiply a given number by 2 raised to the given power. 246 */ 247static uint8_t 248vdev_raidz_exp2(uint_t a, int exp) 249{ 250 if (a == 0) 251 return (0); 252 253 ASSERT(exp >= 0); 254 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 255 256 exp += vdev_raidz_log2[a]; 257 if (exp > 255) 258 exp -= 255; 259 260 return (vdev_raidz_pow2[exp]); 261} 262 263static void 264vdev_raidz_map_free(raidz_map_t *rm) 265{ 266 int c; 267 size_t size; 268 269 for (c = 0; c < rm->rm_firstdatacol; c++) { 270 if (rm->rm_col[c].rc_data != NULL) 271 zio_buf_free(rm->rm_col[c].rc_data, 272 rm->rm_col[c].rc_size); 273 274 if (rm->rm_col[c].rc_gdata != NULL) 275 zio_buf_free(rm->rm_col[c].rc_gdata, 276 rm->rm_col[c].rc_size); 277 } 278 279 size = 0; 280 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 281 size += rm->rm_col[c].rc_size; 282 283 if (rm->rm_datacopy != NULL) 284 zio_buf_free(rm->rm_datacopy, size); 285 286 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 287} 288 289static void 290vdev_raidz_map_free_vsd(zio_t *zio) 291{ 292 raidz_map_t *rm = zio->io_vsd; 293 294 ASSERT0(rm->rm_freed); 295 rm->rm_freed = 1; 296 297 if (rm->rm_reports == 0) 298 vdev_raidz_map_free(rm); 299} 300 301/*ARGSUSED*/ 302static void 303vdev_raidz_cksum_free(void *arg, size_t ignored) 304{ 305 raidz_map_t *rm = arg; 306 307 ASSERT3U(rm->rm_reports, >, 0); 308 309 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 310 vdev_raidz_map_free(rm); 311} 312 313static void 314vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 315{ 316 raidz_map_t *rm = zcr->zcr_cbdata; 317 size_t c = zcr->zcr_cbinfo; 318 size_t x; 319 320 const char *good = NULL; 321 const char *bad = 
rm->rm_col[c].rc_data; 322 323 if (good_data == NULL) { 324 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 325 return; 326 } 327 328 if (c < rm->rm_firstdatacol) { 329 /* 330 * The first time through, calculate the parity blocks for 331 * the good data (this relies on the fact that the good 332 * data never changes for a given logical ZIO) 333 */ 334 if (rm->rm_col[0].rc_gdata == NULL) { 335 char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 336 char *buf; 337 338 /* 339 * Set up the rm_col[]s to generate the parity for 340 * good_data, first saving the parity bufs and 341 * replacing them with buffers to hold the result. 342 */ 343 for (x = 0; x < rm->rm_firstdatacol; x++) { 344 bad_parity[x] = rm->rm_col[x].rc_data; 345 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 346 zio_buf_alloc(rm->rm_col[x].rc_size); 347 } 348 349 /* fill in the data columns from good_data */ 350 buf = (char *)good_data; 351 for (; x < rm->rm_cols; x++) { 352 rm->rm_col[x].rc_data = buf; 353 buf += rm->rm_col[x].rc_size; 354 } 355 356 /* 357 * Construct the parity from the good data. 358 */ 359 vdev_raidz_generate_parity(rm); 360 361 /* restore everything back to its original state */ 362 for (x = 0; x < rm->rm_firstdatacol; x++) 363 rm->rm_col[x].rc_data = bad_parity[x]; 364 365 buf = rm->rm_datacopy; 366 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 367 rm->rm_col[x].rc_data = buf; 368 buf += rm->rm_col[x].rc_size; 369 } 370 } 371 372 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 373 good = rm->rm_col[c].rc_gdata; 374 } else { 375 /* adjust good_data to point at the start of our column */ 376 good = good_data; 377 378 for (x = rm->rm_firstdatacol; x < c; x++) 379 good += rm->rm_col[x].rc_size; 380 } 381 382 /* we drop the ereport if it ends up that the data was good */ 383 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 384} 385 386/* 387 * Invoked indirectly by zfs_ereport_start_checksum(), called 388 * below when our read operation fails completely. 
 * The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* only the first report for this map needs to copy the data aside */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* wrapped past the last child: next row, next offset */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* For a free, no data buffers need to be set up. */
	if (!dofree) {
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

/*
 * Generate single parity (P): the simple XOR of all data columns,
 * accumulated 64 bits at a time.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column seeds P with a straight copy */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/*
 * Generate double parity (P and Q).  P is the plain XOR of the data
 * columns; Q folds each successive column in after multiplying the
 * running value by 2 in GF(2^8), per the Q definition in the block
 * comment at the top of this file.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column seeds P and Q; zero-fill any excess */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Generate triple parity (P, Q, and R).  Like the PQ case, but R uses
 * a multiply-by-4 per column (the generator 4 described in the block
 * comment at the top of this file).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* first column seeds P, Q, and R; zero-fill excess */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	/* rm_firstdatacol equals the number of parity columns */
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/*
 * Reconstruct a single missing data column (tgts[0]) from the P parity
 * column: copy P into the target and XOR in every surviving data
 * column.  Returns a bitmask identifying the parity used (P).
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

/*
 * Reconstruct a single missing data column from the Q parity column:
 * recompute the Q contribution of the surviving columns (treating the
 * missing one as zeros), XOR with Q, then divide out the missing
 * column's coefficient byte by byte with vdev_raidz_exp2().
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* the missing column contributes zeros */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	/* exponent of the inverse of this column's Q coefficient */
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

/*
 * Reconstruct two missing data columns (x < y) using both the P and Q
 * parity columns, solving the two-equation linear system described in
 * the comment within.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* 255 + x - y keeps the pow2 index non-negative since x < y */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   |   :   |
 *            |     |    x    |   :   | = | p_m-1 |
 *            |  I  |         | D_n-1 |   |  d_0  |
 *            |     |         ~~     ~~   |   :   |
 *            ~~   ~~                     | d_n-1 |
 *                                        ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __       __                   __     __
 *      | 1 ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 .. 4 2 1 |  __     __ |   :   |
 *      | 4^n-1 .. 16 4 1 | |  D_0  | | p_m-1 |
 *      | 1 ..  0 0 0 |    |  D_1  |  |  d_0  |
 *      | 0 ..  0 0 0 | x  |  D_2  | =|  d_1  |
 *      | : :  : : : |     |   :   |  |  d_2  |
 *      | 0 ..  1 0 0 |    | D_n-1 |  |   :   |
 *      | 0 ..  0 1 0 |    ~~     ~~  |   :   |
 *      | 0 ..  0 0 1 |               | d_n-1 |
 *      ~~       ~~                   ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |   1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |   1   0   0   0   0   0   0   0  |     / /
 *           |   0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |   0   0   1   0   0   0   0   0  | <---'
 *           |   0   0   0   1   0   0   0   0  |
 *           |   0   0   0   0   1   0   0   0  |
 *           |   0   0   0   0   0   1   0   0  |
 *           |   0   0   0   0   0   0   1   0  |
 *           |   0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |   1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |   1   0   0   0   0   0   0   0  |
 *  (V|I)' = |   0   0   0   1   0   0   0   0  |
 *           |   0   0   0   0   1   0   0   0  |
 *           |   0   0   0   0   0   1   0   0  |
 *           |   0   0   0   0   0   0   1   0  |
 *           |   0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *               __                               __
 *               |   0   0   1   0   0   0   0   0  |
 *               | 167 100   5  41 159 169 217 208  |
 *               | 166 100   4  40 158 168 216 209  |
 * (V|I)'^-1 =   |   0   0   0   1   0   0   0   0  |
 *               |   0   0   0   0   1   0   0   0  |
 *               |   0   0   0   0   0   1   0   0  |
 *               |   0   0   0   0   0   0   1   0  |
 *               |   0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid.
For 1093 * that reason, we only build the coefficients in the rows that correspond to 1094 * targeted columns. 1095 */ 1096/* END CSTYLED */ 1097 1098static void 1099vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1100 uint8_t **rows) 1101{ 1102 int i, j; 1103 int pow; 1104 1105 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1106 1107 /* 1108 * Fill in the missing rows of interest. 1109 */ 1110 for (i = 0; i < nmap; i++) { 1111 ASSERT3S(0, <=, map[i]); 1112 ASSERT3S(map[i], <=, 2); 1113 1114 pow = map[i] * n; 1115 if (pow > 255) 1116 pow -= 255; 1117 ASSERT(pow <= 255); 1118 1119 for (j = 0; j < n; j++) { 1120 pow -= map[i]; 1121 if (pow < 0) 1122 pow += 255; 1123 rows[i][j] = vdev_raidz_pow2[pow]; 1124 } 1125 } 1126} 1127 1128static void 1129vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1130 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1131{ 1132 int i, j, ii, jj; 1133 uint8_t log; 1134 1135 /* 1136 * Assert that the first nmissing entries from the array of used 1137 * columns correspond to parity columns and that subsequent entries 1138 * correspond to data columns. 1139 */ 1140 for (i = 0; i < nmissing; i++) { 1141 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1142 } 1143 for (; i < n; i++) { 1144 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1145 } 1146 1147 /* 1148 * First initialize the storage where we'll compute the inverse rows. 1149 */ 1150 for (i = 0; i < nmissing; i++) { 1151 for (j = 0; j < n; j++) { 1152 invrows[i][j] = (i == j) ? 1 : 0; 1153 } 1154 } 1155 1156 /* 1157 * Subtract all trivial rows from the rows of consequence. 
1158 */ 1159 for (i = 0; i < nmissing; i++) { 1160 for (j = nmissing; j < n; j++) { 1161 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1162 jj = used[j] - rm->rm_firstdatacol; 1163 ASSERT3S(jj, <, n); 1164 invrows[i][j] = rows[i][jj]; 1165 rows[i][jj] = 0; 1166 } 1167 } 1168 1169 /* 1170 * For each of the rows of interest, we must normalize it and subtract 1171 * a multiple of it from the other rows. 1172 */ 1173 for (i = 0; i < nmissing; i++) { 1174 for (j = 0; j < missing[i]; j++) { 1175 ASSERT0(rows[i][j]); 1176 } 1177 ASSERT3U(rows[i][missing[i]], !=, 0); 1178 1179 /* 1180 * Compute the inverse of the first element and multiply each 1181 * element in the row by that value. 1182 */ 1183 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1184 1185 for (j = 0; j < n; j++) { 1186 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1187 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1188 } 1189 1190 for (ii = 0; ii < nmissing; ii++) { 1191 if (i == ii) 1192 continue; 1193 1194 ASSERT3U(rows[ii][missing[i]], !=, 0); 1195 1196 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1197 1198 for (j = 0; j < n; j++) { 1199 rows[ii][j] ^= 1200 vdev_raidz_exp2(rows[i][j], log); 1201 invrows[ii][j] ^= 1202 vdev_raidz_exp2(invrows[i][j], log); 1203 } 1204 } 1205 } 1206 1207 /* 1208 * Verify that the data that is left in the rows are properly part of 1209 * an identity matrix. 
1210 */ 1211 for (i = 0; i < nmissing; i++) { 1212 for (j = 0; j < n; j++) { 1213 if (j == missing[i]) { 1214 ASSERT3U(rows[i][j], ==, 1); 1215 } else { 1216 ASSERT0(rows[i][j]); 1217 } 1218 } 1219 } 1220} 1221 1222static void 1223vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1224 int *missing, uint8_t **invrows, const uint8_t *used) 1225{ 1226 int i, j, x, cc, c; 1227 uint8_t *src; 1228 uint64_t ccount; 1229 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1230 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1231 uint8_t log = 0; 1232 uint8_t val; 1233 int ll; 1234 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1235 uint8_t *p, *pp; 1236 size_t psize; 1237 1238 psize = sizeof (invlog[0][0]) * n * nmissing; 1239 p = kmem_alloc(psize, KM_SLEEP); 1240 1241 for (pp = p, i = 0; i < nmissing; i++) { 1242 invlog[i] = pp; 1243 pp += n; 1244 } 1245 1246 for (i = 0; i < nmissing; i++) { 1247 for (j = 0; j < n; j++) { 1248 ASSERT3U(invrows[i][j], !=, 0); 1249 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1250 } 1251 } 1252 1253 for (i = 0; i < n; i++) { 1254 c = used[i]; 1255 ASSERT3U(c, <, rm->rm_cols); 1256 1257 src = rm->rm_col[c].rc_data; 1258 ccount = rm->rm_col[c].rc_size; 1259 for (j = 0; j < nmissing; j++) { 1260 cc = missing[j] + rm->rm_firstdatacol; 1261 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1262 ASSERT3U(cc, <, rm->rm_cols); 1263 ASSERT3U(cc, !=, c); 1264 1265 dst[j] = rm->rm_col[cc].rc_data; 1266 dcount[j] = rm->rm_col[cc].rc_size; 1267 } 1268 1269 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1270 1271 for (x = 0; x < ccount; x++, src++) { 1272 if (*src != 0) 1273 log = vdev_raidz_log2[*src]; 1274 1275 for (cc = 0; cc < nmissing; cc++) { 1276 if (x >= dcount[cc]) 1277 continue; 1278 1279 if (*src == 0) { 1280 val = 0; 1281 } else { 1282 if ((ll = log + invlog[cc][i]) >= 255) 1283 ll -= 255; 1284 val = vdev_raidz_pow2[ll]; 1285 } 1286 1287 if (i == 0) 1288 dst[cc][x] = val; 1289 else 1290 dst[cc][x] ^= val; 1291 } 1292 } 1293 } 1294 1295 kmem_free(p, 
	    psize);
}

/*
 * General-case reconstruction of up to VDEV_RAIDZ_MAXPARITY missing
 * columns: pick the parity columns to use, build and invert the
 * reduced matrix, and rebuild the missing data.  Returns a bitmask of
 * the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* one allocation holds rows, invrows, and the used[] array */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* the remaining used[] entries are the surviving data columns */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Top-level reconstruction entry point: merge the caller's (sorted)
 * target list with any columns that already have errors, then dispatch
 * to an optimized routine when possible, falling back to the general
 * matrix method.  Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/* data targets follow the parity targets in the sorted tgts[] */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * vdev open entry point: validate the parity configuration, open all
 * children, and derive this vdev's asize/ashift from theirs.  Succeeds
 * as long as no more than nparity children failed to open.
 */
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	/* need at least one data column in addition to the parity columns */
	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		/* clamp to the smallest child; keep the largest ashift */
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

/*
 * vdev close entry point: close all children.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

#ifdef illumos
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *     parity    data     data     data     data
 *   |   PP   |   XX   |   XX   |   XX   |   XX   |
 *       ^        ^        ^        ^        ^
 *       |        |        |        |        |
 * 8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *     parity    data     data     data     data
 *   |        |        |        |   XX   |        |
 *                                   ^
 *                                   |
 *                            32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *     parity   parity   parity    data     data     data     data
 *   |        |        |        |        |        |   XX   |        |
 *                                                    ^
 *                                                    |
 *                                             32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, this is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk the data columns only (rm_firstdatacol skips the parity
	 * columns).  rc is assigned at the top of the body, so its use in
	 * the loop-increment expression is always initialized.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		/* Clamp the I/O to the portion overlapping this column. */
		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
#endif	/* illumos */

/*
 * Return the amount of space allocated (asize) for a block of the given
 * logical size (psize) on this raidz vdev: the data sectors plus one parity
 * sector per parity device for each (cols - nparity)-sector data stripe,
 * rounded up to a multiple of (nparity + 1) sectors so a block can never
 * leave a remainder too small to hold its own parity.
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	/* Number of sectors needed for the data itself (round up). */
	asize = ((psize - 1) >> ashift) + 1;
	/* Add nparity parity sectors for each full or partial data stripe. */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	/* Round up to a multiple of the minimum allocation unit. */
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

/*
 * Completion callback for a child zio: record its result on the raidz
 * column it serviced.  Marking rc_tried and clearing rc_skipped tells the
 * later reconstruction/verification code this column holds real I/O status.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
|