1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved. 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 */ 28 29#include <sys/zfs_context.h> 30#include <sys/spa.h> 31#include <sys/vdev_impl.h> 32#ifdef illumos 33#include <sys/vdev_disk.h> 34#endif 35#include <sys/vdev_file.h> 36#include <sys/vdev_raidz.h> 37#include <sys/zio.h> 38#include <sys/zio_checksum.h> 39#include <sys/abd.h> 40#include <sys/fs/zfs.h> 41#include <sys/fm/fs/zfs.h> 42#include <sys/bio.h> 43 44#ifdef ZFS_DEBUG 45#include <sys/vdev_initialize.h> /* vdev_xlate testing */ 46#endif 47 48/* 49 * Virtual device vector for RAID-Z. 50 * 51 * This vdev supports single, double, and triple parity. For single parity, 52 * we use a simple XOR of all the data columns. For double or triple parity, 53 * we use a special case of Reed-Solomon coding. This extends the 54 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 55 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 56 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 57 * former is also based. The latter is designed to provide higher performance 58 * for writes. 59 * 60 * Note that the Plank paper claimed to support arbitrary N+M, but was then 61 * amended six years later identifying a critical flaw that invalidates its 62 * claims. Nevertheless, the technique can be adapted to work for up to 63 * triple parity. For additional parity, the amendment "Note: Correction to 64 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 65 * is viable, but the additional complexity means that write performance will 66 * suffer. 67 * 68 * All of the methods above operate on a Galois field, defined over the 69 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 70 * can be expressed with a single byte. Briefly, the operations on the 71 * field are defined as follows: 72 * 73 * o addition (+) is represented by a bitwise XOR 74 * o subtraction (-) is therefore identical to addition: A + B = A - B 75 * o multiplication of A by 2 is defined by the following bitwise expression: 76 * 77 * (A * 2)_7 = A_6 78 * (A * 2)_6 = A_5 79 * (A * 2)_5 = A_4 80 * (A * 2)_4 = A_3 + A_7 81 * (A * 2)_3 = A_2 + A_7 82 * (A * 2)_2 = A_1 + A_7 83 * (A * 2)_1 = A_0 84 * (A * 2)_0 = A_7 85 * 86 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 87 * As an aside, this multiplication is derived from the error correcting 88 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 89 * 90 * Observe that any number in the field (except for 0) can be expressed as a 91 * power of 2 -- a generator for the field. We store a table of the powers of 92 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 93 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 94 * than field addition). The inverse of a field element A (A^-1) is therefore 95 * A ^ (255 - 1) = A^254. 96 * 97 * The up-to-three parity columns, P, Q, R over several data columns, 98 * D_0, ... D_n-1, can be expressed by field operations: 99 * 100 * P = D_0 + D_1 + ... + D_n-2 + D_n-1 101 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 102 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 103 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 104 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 105 * 106 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 107 * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 108 * independent coefficients. (There are no additional coefficients that have 109 * this property which is why the uncorrected Plank method breaks down.) 110 * 111 * See the reconstruction code below for how P, Q and R can used individually 112 * or in concert to recover missing data columns. 113 */ 114 115typedef struct raidz_col { 116 uint64_t rc_devidx; /* child device index for I/O */ 117 uint64_t rc_offset; /* device offset */ 118 uint64_t rc_size; /* I/O size */ 119 abd_t *rc_abd; /* I/O data */ 120 void *rc_gdata; /* used to store the "good" version */ 121 int rc_error; /* I/O error for this device */ 122 uint8_t rc_tried; /* Did we attempt this I/O column? */ 123 uint8_t rc_skipped; /* Did we skip this I/O column? */ 124} raidz_col_t; 125 126typedef struct raidz_map { 127 uint64_t rm_cols; /* Regular column count */ 128 uint64_t rm_scols; /* Count including skipped columns */ 129 uint64_t rm_bigcols; /* Number of oversized columns */ 130 uint64_t rm_asize; /* Actual total I/O size */ 131 uint64_t rm_missingdata; /* Count of missing data devices */ 132 uint64_t rm_missingparity; /* Count of missing parity devices */ 133 uint64_t rm_firstdatacol; /* First data column/parity count */ 134 uint64_t rm_nskip; /* Skipped sectors for padding */ 135 uint64_t rm_skipstart; /* Column index of padding start */ 136 abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ 137 uintptr_t rm_reports; /* # of referencing checksum reports */ 138 uint8_t rm_freed; /* map no longer has referencing ZIO */ 139 uint8_t rm_ecksuminjected; /* checksum error was injected */ 140 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 141} raidz_map_t; 142 143#define VDEV_RAIDZ_P 0 144#define VDEV_RAIDZ_Q 1 145#define VDEV_RAIDZ_R 2 146 147#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 148#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 149 150/* 151 * We provide a mechanism to perform the field multiplication operation on a 152 * 64-bit value all at once rather than a byte at a time. This works by 153 * creating a mask from the top bit in each byte and using that to 154 * conditionally apply the XOR of 0x1d. 155 */ 156#define VDEV_RAIDZ_64MUL_2(x, mask) \ 157{ \ 158 (mask) = (x) & 0x8080808080808080ULL; \ 159 (mask) = ((mask) << 1) - ((mask) >> 7); \ 160 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 161 ((mask) & 0x1d1d1d1d1d1d1d1d); \ 162} 163 164#define VDEV_RAIDZ_64MUL_4(x, mask) \ 165{ \ 166 VDEV_RAIDZ_64MUL_2((x), mask); \ 167 VDEV_RAIDZ_64MUL_2((x), mask); \ 168} 169 170#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 171 172/* 173 * Force reconstruction to use the general purpose method. 174 */ 175int vdev_raidz_default_to_general; 176 177/* Powers of 2 in the Galois field defined above. */ 178static const uint8_t vdev_raidz_pow2[256] = { 179 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 180 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 181 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 182 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 183 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 184 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 185 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 186 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 187 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 188 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 189 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 190 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 191 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 192 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 193 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 194 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 195 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 196 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 197 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 198 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 199 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 200 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 201 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 202 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 203 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 204 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 205 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 206 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 207 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 208 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 209 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 210 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 211}; 212/* Logs of 2 in the Galois field defined above. */ 213static const uint8_t vdev_raidz_log2[256] = { 214 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 215 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 216 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 217 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 218 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 219 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 220 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 221 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 222 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 223 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 224 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 225 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 226 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 227 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 228 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 229 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 230 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 231 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 232 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 233 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 234 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 235 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 236 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 237 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 238 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 239 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 240 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 241 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 242 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 243 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 244 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 245 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 246}; 247 248static void vdev_raidz_generate_parity(raidz_map_t *rm); 249 250/* 251 * Multiply a given number by 2 raised to the given power. 252 */ 253static uint8_t 254vdev_raidz_exp2(uint_t a, int exp) 255{ 256 if (a == 0) 257 return (0); 258 259 ASSERT(exp >= 0); 260 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 261 262 exp += vdev_raidz_log2[a]; 263 if (exp > 255) 264 exp -= 255; 265 266 return (vdev_raidz_pow2[exp]); 267} 268 269static void 270vdev_raidz_map_free(raidz_map_t *rm) 271{ 272 int c; 273 size_t size; 274 275 for (c = 0; c < rm->rm_firstdatacol; c++) { 276 if (rm->rm_col[c].rc_abd != NULL) 277 abd_free(rm->rm_col[c].rc_abd); 278 279 if (rm->rm_col[c].rc_gdata != NULL) 280 zio_buf_free(rm->rm_col[c].rc_gdata, 281 rm->rm_col[c].rc_size); 282 } 283 284 size = 0; 285 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 286 if (rm->rm_col[c].rc_abd != NULL) 287 abd_put(rm->rm_col[c].rc_abd); 288 size += rm->rm_col[c].rc_size; 289 } 290 291 if (rm->rm_abd_copy != NULL) 292 abd_free(rm->rm_abd_copy); 293 294 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 295} 296 297static void 298vdev_raidz_map_free_vsd(zio_t *zio) 299{ 300 raidz_map_t *rm = zio->io_vsd; 301 302 ASSERT0(rm->rm_freed); 303 rm->rm_freed = 1; 304 305 if (rm->rm_reports == 0) 306 vdev_raidz_map_free(rm); 307} 308 309/*ARGSUSED*/ 310static void 311vdev_raidz_cksum_free(void *arg, size_t ignored) 312{ 313 raidz_map_t *rm = arg; 314 315 ASSERT3U(rm->rm_reports, >, 0); 316 317 if (--rm->rm_reports == 0 && rm->rm_freed != 0) 318 vdev_raidz_map_free(rm); 319} 320 321static void 322vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 323{ 324 raidz_map_t *rm = zcr->zcr_cbdata; 325 size_t c = zcr->zcr_cbinfo; 326 size_t x; 327 328 const char *good = NULL; 329 char *bad; 330 331 if (good_data == NULL) { 332 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 333 return; 334 } 335 336 if (c < rm->rm_firstdatacol) { 337 /* 338 * The first time through, calculate the parity blocks for 339 * the good data (this relies on the fact that the good 340 * data never changes for a given logical ZIO) 341 */ 342 if (rm->rm_col[0].rc_gdata == NULL) { 343 abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; 344 char *buf; 345 int offset; 346 347 /* 348 * Set up the rm_col[]s to generate the parity for 349 * good_data, first saving the parity bufs and 350 * replacing them with buffers to hold the result. 351 */ 352 for (x = 0; x < rm->rm_firstdatacol; x++) { 353 bad_parity[x] = rm->rm_col[x].rc_abd; 354 rm->rm_col[x].rc_gdata = 355 zio_buf_alloc(rm->rm_col[x].rc_size); 356 rm->rm_col[x].rc_abd = 357 abd_get_from_buf(rm->rm_col[x].rc_gdata, 358 rm->rm_col[x].rc_size); 359 } 360 361 /* fill in the data columns from good_data */ 362 buf = (char *)good_data; 363 for (; x < rm->rm_cols; x++) { 364 abd_put(rm->rm_col[x].rc_abd); 365 rm->rm_col[x].rc_abd = abd_get_from_buf(buf, 366 rm->rm_col[x].rc_size); 367 buf += rm->rm_col[x].rc_size; 368 } 369 370 /* 371 * Construct the parity from the good data. 372 */ 373 vdev_raidz_generate_parity(rm); 374 375 /* restore everything back to its original state */ 376 for (x = 0; x < rm->rm_firstdatacol; x++) { 377 abd_put(rm->rm_col[x].rc_abd); 378 rm->rm_col[x].rc_abd = bad_parity[x]; 379 } 380 381 offset = 0; 382 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 383 abd_put(rm->rm_col[x].rc_abd); 384 rm->rm_col[x].rc_abd = abd_get_offset( 385 rm->rm_abd_copy, offset); 386 offset += rm->rm_col[x].rc_size; 387 } 388 } 389 390 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 391 good = rm->rm_col[c].rc_gdata; 392 } else { 393 /* adjust good_data to point at the start of our column */ 394 good = good_data; 395 396 for (x = rm->rm_firstdatacol; x < c; x++) 397 good += rm->rm_col[x].rc_size; 398 } 399 400 bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); 401 /* we drop the ereport if it ends up that the data was good */ 402 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 403 abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); 404} 405 406/* 407 * Invoked indirectly by zfs_ereport_start_checksum(), called 408 * below when our read operation fails completely. The main point 409 * is to keep a copy of everything we read from disk, so that at 410 * vdev_raidz_cksum_finish() time we can compare it with the good data. 411 */ 412static void 413vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 414{ 415 size_t c = (size_t)(uintptr_t)arg; 416 size_t offset; 417 418 raidz_map_t *rm = zio->io_vsd; 419 size_t size; 420 421 /* set up the report and bump the refcount */ 422 zcr->zcr_cbdata = rm; 423 zcr->zcr_cbinfo = c; 424 zcr->zcr_finish = vdev_raidz_cksum_finish; 425 zcr->zcr_free = vdev_raidz_cksum_free; 426 427 rm->rm_reports++; 428 ASSERT3U(rm->rm_reports, >, 0); 429 430 if (rm->rm_abd_copy != NULL) 431 return; 432 433 /* 434 * It's the first time we're called for this raidz_map_t, so we need 435 * to copy the data aside; there's no guarantee that our zio's buffer 436 * won't be re-used for something else. 437 * 438 * Our parity data is already in separate buffers, so there's no need 439 * to copy them. 440 */ 441 442 size = 0; 443 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 444 size += rm->rm_col[c].rc_size; 445 446 rm->rm_abd_copy = 447 abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); 448 449 for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 450 raidz_col_t *col = &rm->rm_col[c]; 451 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); 452 453 abd_copy(tmp, col->rc_abd, col->rc_size); 454 abd_put(col->rc_abd); 455 col->rc_abd = tmp; 456 457 offset += col->rc_size; 458 } 459 ASSERT3U(offset, ==, size); 460} 461 462static const zio_vsd_ops_t vdev_raidz_vsd_ops = { 463 vdev_raidz_map_free_vsd, 464 vdev_raidz_cksum_report 465}; 466 467/* 468 * Divides the IO evenly across all child vdevs; usually, dcols is 469 * the number of children in the target vdev. 470 */ 471static raidz_map_t * 472vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, 473 uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 474{ 475 raidz_map_t *rm; 476 /* The starting RAIDZ (parent) vdev sector of the block. */ 477 uint64_t b = offset >> unit_shift; 478 /* The zio's size in units of the vdev's minimum sector size. */ 479 uint64_t s = size >> unit_shift; 480 /* The first column for this stripe. */ 481 uint64_t f = b % dcols; 482 /* The starting byte offset on each child vdev. */ 483 uint64_t o = (b / dcols) << unit_shift; 484 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 485 uint64_t off = 0; 486 487 /* 488 * "Quotient": The number of data sectors for this stripe on all but 489 * the "big column" child vdevs that also contain "remainder" data. 490 */ 491 q = s / (dcols - nparity); 492 493 /* 494 * "Remainder": The number of partial stripe data sectors in this I/O. 495 * This will add a sector to some, but not all, child vdevs. 496 */ 497 r = s - q * (dcols - nparity); 498 499 /* The number of "big columns" - those which contain remainder data. */ 500 bc = (r == 0 ? 0 : r + nparity); 501 502 /* 503 * The total number of data and parity sectors associated with 504 * this I/O. 505 */ 506 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 507 508 /* acols: The columns that will be accessed. */ 509 /* scols: The columns that will be accessed or skipped. */ 510 if (q == 0) { 511 /* Our I/O request doesn't span all child vdevs. */ 512 acols = bc; 513 scols = MIN(dcols, roundup(bc, nparity + 1)); 514 } else { 515 acols = dcols; 516 scols = dcols; 517 } 518 519 ASSERT3U(acols, <=, scols); 520 521 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 522 523 rm->rm_cols = acols; 524 rm->rm_scols = scols; 525 rm->rm_bigcols = bc; 526 rm->rm_skipstart = bc; 527 rm->rm_missingdata = 0; 528 rm->rm_missingparity = 0; 529 rm->rm_firstdatacol = nparity; 530 rm->rm_abd_copy = NULL; 531 rm->rm_reports = 0; 532 rm->rm_freed = 0; 533 rm->rm_ecksuminjected = 0; 534 535 asize = 0; 536 537 for (c = 0; c < scols; c++) { 538 col = f + c; 539 coff = o; 540 if (col >= dcols) { 541 col -= dcols; 542 coff += 1ULL << unit_shift; 543 } 544 rm->rm_col[c].rc_devidx = col; 545 rm->rm_col[c].rc_offset = coff; 546 rm->rm_col[c].rc_abd = NULL; 547 rm->rm_col[c].rc_gdata = NULL; 548 rm->rm_col[c].rc_error = 0; 549 rm->rm_col[c].rc_tried = 0; 550 rm->rm_col[c].rc_skipped = 0; 551 552 if (c >= acols) 553 rm->rm_col[c].rc_size = 0; 554 else if (c < bc) 555 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 556 else 557 rm->rm_col[c].rc_size = q << unit_shift; 558 559 asize += rm->rm_col[c].rc_size; 560 } 561 562 ASSERT3U(asize, ==, tot << unit_shift); 563 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 564 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 565 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 566 ASSERT3U(rm->rm_nskip, <=, nparity); 567 568 if (!dofree) { 569 for (c = 0; c < rm->rm_firstdatacol; c++) { 570 rm->rm_col[c].rc_abd = 571 abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); 572 } 573 574 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); 575 off = rm->rm_col[c].rc_size; 576 577 for (c = c + 1; c < acols; c++) { 578 rm->rm_col[c].rc_abd = abd_get_offset(abd, off); 579 off += rm->rm_col[c].rc_size; 580 } 581 } 582 583 /* 584 * If all data stored spans all columns, there's a danger that parity 585 * will always be on the same device and, since parity isn't read 586 * during normal operation, that that device's I/O bandwidth won't be 587 * used effectively. We therefore switch the parity every 1MB. 588 * 589 * ... at least that was, ostensibly, the theory. As a practical 590 * matter unless we juggle the parity between all devices evenly, we 591 * won't see any benefit. Further, occasional writes that aren't a 592 * multiple of the LCM of the number of children and the minimum 593 * stripe width are sufficient to avoid pessimal behavior. 594 * Unfortunately, this decision created an implicit on-disk format 595 * requirement that we need to support for all eternity, but only 596 * for single-parity RAID-Z. 597 * 598 * If we intend to skip a sector in the zeroth column for padding 599 * we must make sure to note this swap. We will never intend to 600 * skip the first column since at least one data and one parity 601 * column must appear in each row. 602 */ 603 ASSERT(rm->rm_cols >= 2); 604 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 605 606 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 607 devidx = rm->rm_col[0].rc_devidx; 608 o = rm->rm_col[0].rc_offset; 609 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 610 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 611 rm->rm_col[1].rc_devidx = devidx; 612 rm->rm_col[1].rc_offset = o; 613 614 if (rm->rm_skipstart == 0) 615 rm->rm_skipstart = 1; 616 } 617 618 return (rm); 619} 620 621struct pqr_struct { 622 uint64_t *p; 623 uint64_t *q; 624 uint64_t *r; 625}; 626 627static int 628vdev_raidz_p_func(void *buf, size_t size, void *private) 629{ 630 struct pqr_struct *pqr = private; 631 const uint64_t *src = buf; 632 int i, cnt = size / sizeof (src[0]); 633 634 ASSERT(pqr->p && !pqr->q && !pqr->r); 635 636 for (i = 0; i < cnt; i++, src++, pqr->p++) 637 *pqr->p ^= *src; 638 639 return (0); 640} 641 642static int 643vdev_raidz_pq_func(void *buf, size_t size, void *private) 644{ 645 struct pqr_struct *pqr = private; 646 const uint64_t *src = buf; 647 uint64_t mask; 648 int i, cnt = size / sizeof (src[0]); 649 650 ASSERT(pqr->p && pqr->q && !pqr->r); 651 652 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { 653 *pqr->p ^= *src; 654 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 655 *pqr->q ^= *src; 656 } 657 658 return (0); 659} 660 661static int 662vdev_raidz_pqr_func(void *buf, size_t size, void *private) 663{ 664 struct pqr_struct *pqr = private; 665 const uint64_t *src = buf; 666 uint64_t mask; 667 int i, cnt = size / sizeof (src[0]); 668 669 ASSERT(pqr->p && pqr->q && pqr->r); 670 671 for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { 672 *pqr->p ^= *src; 673 VDEV_RAIDZ_64MUL_2(*pqr->q, mask); 674 *pqr->q ^= *src; 675 VDEV_RAIDZ_64MUL_4(*pqr->r, mask); 676 *pqr->r ^= *src; 677 } 678 679 return (0); 680} 681 682static void 683vdev_raidz_generate_parity_p(raidz_map_t *rm) 684{ 685 uint64_t *p; 686 int c; 687 abd_t *src; 688 689 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 690 src = rm->rm_col[c].rc_abd; 691 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 692 693 if (c == rm->rm_firstdatacol) { 694 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 695 } else { 696 struct pqr_struct pqr = { p, NULL, NULL }; 697 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 698 vdev_raidz_p_func, &pqr); 699 } 700 } 701} 702 703static void 704vdev_raidz_generate_parity_pq(raidz_map_t *rm) 705{ 706 uint64_t *p, *q, pcnt, ccnt, mask, i; 707 int c; 708 abd_t *src; 709 710 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 711 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 712 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 713 714 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 715 src = rm->rm_col[c].rc_abd; 716 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 717 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 718 719 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 720 721 if (c == rm->rm_firstdatacol) { 722 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 723 (void) memcpy(q, p, rm->rm_col[c].rc_size); 724 } else { 725 struct pqr_struct pqr = { p, q, NULL }; 726 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 727 vdev_raidz_pq_func, &pqr); 728 } 729 730 if (c == rm->rm_firstdatacol) { 731 for (i = ccnt; i < pcnt; i++) { 732 p[i] = 0; 733 q[i] = 0; 734 } 735 } else { 736 /* 737 * Treat short columns as though they are full of 0s. 738 * Note that there's therefore nothing needed for P. 739 */ 740 for (i = ccnt; i < pcnt; i++) { 741 VDEV_RAIDZ_64MUL_2(q[i], mask); 742 } 743 } 744 } 745} 746 747static void 748vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 749{ 750 uint64_t *p, *q, *r, pcnt, ccnt, mask, i; 751 int c; 752 abd_t *src; 753 754 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); 755 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 756 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 757 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 758 rm->rm_col[VDEV_RAIDZ_R].rc_size); 759 760 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 761 src = rm->rm_col[c].rc_abd; 762 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 763 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 764 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); 765 766 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); 767 768 if (c == rm->rm_firstdatacol) { 769 abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); 770 (void) memcpy(q, p, rm->rm_col[c].rc_size); 771 (void) memcpy(r, p, rm->rm_col[c].rc_size); 772 } else { 773 struct pqr_struct pqr = { p, q, r }; 774 (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, 775 vdev_raidz_pqr_func, &pqr); 776 } 777 778 if (c == rm->rm_firstdatacol) { 779 for (i = ccnt; i < pcnt; i++) { 780 p[i] = 0; 781 q[i] = 0; 782 r[i] = 0; 783 } 784 } else { 785 /* 786 * Treat short columns as though they are full of 0s. 787 * Note that there's therefore nothing needed for P. 788 */ 789 for (i = ccnt; i < pcnt; i++) { 790 VDEV_RAIDZ_64MUL_2(q[i], mask); 791 VDEV_RAIDZ_64MUL_4(r[i], mask); 792 } 793 } 794 } 795} 796 797/* 798 * Generate RAID parity in the first virtual columns according to the number of 799 * parity columns available. 800 */ 801static void 802vdev_raidz_generate_parity(raidz_map_t *rm) 803{ 804 switch (rm->rm_firstdatacol) { 805 case 1: 806 vdev_raidz_generate_parity_p(rm); 807 break; 808 case 2: 809 vdev_raidz_generate_parity_pq(rm); 810 break; 811 case 3: 812 vdev_raidz_generate_parity_pqr(rm); 813 break; 814 default: 815 cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 816 } 817} 818 819/* ARGSUSED */ 820static int 821vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) 822{ 823 uint64_t *dst = dbuf; 824 uint64_t *src = sbuf; 825 int cnt = size / sizeof (src[0]); 826 827 for (int i = 0; i < cnt; i++) { 828 dst[i] ^= src[i]; 829 } 830 831 return (0); 832} 833 834/* ARGSUSED */ 835static int 836vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, 837 void *private) 838{ 839 uint64_t *dst = dbuf; 840 uint64_t *src = sbuf; 841 uint64_t mask; 842 int cnt = size / sizeof (dst[0]); 843 844 for (int i = 0; i < cnt; i++, dst++, src++) { 845 VDEV_RAIDZ_64MUL_2(*dst, mask); 846 *dst ^= *src; 847 } 848 849 return (0); 850} 851 852/* ARGSUSED */ 853static int 854vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) 855{ 856 uint64_t *dst = buf; 857 uint64_t mask; 858 int cnt = size / sizeof (dst[0]); 859 860 for (int i = 0; i < cnt; i++, dst++) { 861 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ 862 VDEV_RAIDZ_64MUL_2(*dst, mask); 863 } 864 865 return (0); 866} 867 868struct reconst_q_struct { 869 uint64_t *q; 870 int exp; 871}; 872 873static int 874vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) 875{ 876 struct reconst_q_struct *rq = private; 877 uint64_t *dst = buf; 878 int cnt = size / sizeof (dst[0]); 879 880 for (int i = 0; i < cnt; i++, dst++, rq->q++) { 881 *dst ^= *rq->q; 882 883 int j; 884 uint8_t *b; 885 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 886 *b = vdev_raidz_exp2(*b, rq->exp); 887 } 888 } 889 890 return (0); 891} 892 893struct reconst_pq_struct { 894 uint8_t *p; 895 uint8_t *q; 896 uint8_t *pxy; 897 uint8_t *qxy; 898 int aexp; 899 int bexp; 900}; 901 902static int 903vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) 904{ 905 struct reconst_pq_struct *rpq = private; 906 uint8_t *xd = xbuf; 907 uint8_t *yd = ybuf; 908 909 for (int i = 0; i < size; 910 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { 911 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 912 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 913 *yd = *rpq->p ^ *rpq->pxy ^ *xd; 914 } 915 916 return (0); 917} 918 919static int 920vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) 921{ 922 struct reconst_pq_struct *rpq = private; 923 uint8_t *xd = xbuf; 924 925 for (int i = 0; i < size; 926 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { 927 /* same operation as vdev_raidz_reconst_pq_func() on xd */ 928 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ 929 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); 930 } 931 932 return (0); 933} 934 935static int 936vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 937{ 938 int x = tgts[0]; 939 int c; 940 abd_t *dst, *src; 941 942 ASSERT(ntgts == 1); 943 ASSERT(x >= rm->rm_firstdatacol); 944 ASSERT(x < rm->rm_cols); 945 946 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); 947 ASSERT(rm->rm_col[x].rc_size > 0); 948 949 src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 950 dst = rm->rm_col[x].rc_abd; 951 952 abd_copy(dst, src, rm->rm_col[x].rc_size); 953 954 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 955 uint64_t size = MIN(rm->rm_col[x].rc_size, 956 rm->rm_col[c].rc_size); 957 958 src = rm->rm_col[c].rc_abd; 959 dst = rm->rm_col[x].rc_abd; 960 961 if (c == x) 962 continue; 963 964 (void) abd_iterate_func2(dst, src, 0, 0, size, 965 vdev_raidz_reconst_p_func, NULL); 966 } 967 968 return (1 << VDEV_RAIDZ_P); 969} 970 971static int 972vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 973{ 974 int x = tgts[0]; 975 int c, exp; 976 abd_t *dst, *src; 977 978 ASSERT(ntgts == 1); 979 980 ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); 981 982 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 983 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, 984 rm->rm_col[c].rc_size); 985 986 src = rm->rm_col[c].rc_abd; 987 dst = rm->rm_col[x].rc_abd; 988 989 if (c == rm->rm_firstdatacol) { 990 abd_copy(dst, src, size); 991 if (rm->rm_col[x].rc_size > size) 992 abd_zero_off(dst, size, 993 rm->rm_col[x].rc_size - size); 994 } else { 995 ASSERT3U(size, <=, rm->rm_col[x].rc_size); 996 (void) abd_iterate_func2(dst, src, 0, 0, size, 997 vdev_raidz_reconst_q_pre_func, NULL); 998 (void) abd_iterate_func(dst, 999 size, rm->rm_col[x].rc_size - size, 1000 vdev_raidz_reconst_q_pre_tail_func, NULL); 1001 } 1002 } 1003 1004 src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 1005 dst = rm->rm_col[x].rc_abd; 1006 exp = 255 - (rm->rm_cols - 1 - x); 1007 1008 struct reconst_q_struct rq = { abd_to_buf(src), exp }; 1009 (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, 1010 vdev_raidz_reconst_q_post_func, &rq); 1011 1012 return (1 << VDEV_RAIDZ_Q); 1013} 1014 1015static int 1016vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 1017{ 1018 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; 1019 abd_t *pdata, *qdata; 1020 uint64_t xsize, ysize; 1021 int x = tgts[0]; 1022 int y = tgts[1]; 1023 abd_t *xd, *yd; 1024 1025 ASSERT(ntgts == 2); 1026 ASSERT(x < y); 1027 ASSERT(x >= rm->rm_firstdatacol); 1028 ASSERT(y < rm->rm_cols); 1029 1030 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 1031 1032 /* 1033 * Move the parity data aside -- we're going to compute parity as 1034 * though columns x and y were full of zeros -- Pxy and Qxy. We want to 1035 * reuse the parity generation mechanism without trashing the actual 1036 * parity so we make those columns appear to be full of zeros by 1037 * setting their lengths to zero. 1038 */ 1039 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; 1040 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; 1041 xsize = rm->rm_col[x].rc_size; 1042 ysize = rm->rm_col[y].rc_size; 1043 1044 rm->rm_col[VDEV_RAIDZ_P].rc_abd = 1045 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); 1046 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = 1047 abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); 1048 rm->rm_col[x].rc_size = 0; 1049 rm->rm_col[y].rc_size = 0; 1050 1051 vdev_raidz_generate_parity_pq(rm); 1052 1053 rm->rm_col[x].rc_size = xsize; 1054 rm->rm_col[y].rc_size = ysize; 1055 1056 p = abd_to_buf(pdata); 1057 q = abd_to_buf(qdata); 1058 pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1059 qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1060 xd = rm->rm_col[x].rc_abd; 1061 yd = rm->rm_col[y].rc_abd; 1062 1063 /* 1064 * We now have: 1065 * Pxy = P + D_x + D_y 1066 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 1067 * 1068 * We can then solve for D_x: 1069 * D_x = A * (P + Pxy) + B * (Q + Qxy) 1070 * where 1071 * A = 2^(x - y) * (2^(x - y) + 1)^-1 1072 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 1073 * 1074 * With D_x in hand, we can easily solve for D_y: 1075 * D_y = P + Pxy + D_x 1076 */ 1077 1078 a = vdev_raidz_pow2[255 + x - y]; 1079 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 1080 tmp = 255 - vdev_raidz_log2[a ^ 1]; 1081 1082 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 1083 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 1084 1085 ASSERT3U(xsize, >=, ysize); 1086 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; 1087 (void) abd_iterate_func2(xd, yd, 0, 0, ysize, 1088 vdev_raidz_reconst_pq_func, &rpq); 1089 (void) abd_iterate_func(xd, ysize, xsize - ysize, 1090 vdev_raidz_reconst_pq_tail_func, &rpq); 1091 1092 abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); 1093 abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); 1094 1095 /* 1096 * Restore the saved parity data. 1097 */ 1098 rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; 1099 rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; 1100 1101 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 1102} 1103 1104/* BEGIN CSTYLED */ 1105/* 1106 * In the general case of reconstruction, we must solve the system of linear 1107 * equations defined by the coeffecients used to generate parity as well as 1108 * the contents of the data and parity disks. This can be expressed with 1109 * vectors for the original data (D) and the actual data (d) and parity (p) 1110 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 1111 * 1112 * __ __ __ __ 1113 * | | __ __ | p_0 | 1114 * | V | | D_0 | | p_m-1 | 1115 * | | x | : | = | d_0 | 1116 * | I | | D_n-1 | | : | 1117 * | | ~~ ~~ | d_n-1 | 1118 * ~~ ~~ ~~ ~~ 1119 * 1120 * I is simply a square identity matrix of size n, and V is a vandermonde 1121 * matrix defined by the coeffecients we chose for the various parity columns 1122 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 1123 * computation as well as linear separability. 1124 * 1125 * __ __ __ __ 1126 * | 1 .. 1 1 1 | | p_0 | 1127 * | 2^n-1 .. 4 2 1 | __ __ | : | 1128 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 1129 * | 1 .. 0 0 0 | | D_1 | | d_0 | 1130 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 1131 * | : : : : | | : | | d_2 | 1132 * | 0 .. 1 0 0 | | D_n-1 | | : | 1133 * | 0 .. 0 1 0 | ~~ ~~ | : | 1134 * | 0 .. 0 0 1 | | d_n-1 | 1135 * ~~ ~~ ~~ ~~ 1136 * 1137 * Note that I, V, d, and p are known. To compute D, we must invert the 1138 * matrix and use the known data and parity values to reconstruct the unknown 1139 * data values. We begin by removing the rows in V|I and d|p that correspond 1140 * to failed or missing columns; we then make V|I square (n x n) and d|p 1141 * sized n by removing rows corresponding to unused parity from the bottom up 1142 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 1143 * using Gauss-Jordan elimination. In the example below we use m=3 parity 1144 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 1145 * __ __ 1146 * | 1 1 1 1 1 1 1 1 | 1147 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 1148 * | 19 205 116 29 64 16 4 1 | / / 1149 * | 1 0 0 0 0 0 0 0 | / / 1150 * | 0 1 0 0 0 0 0 0 | <--' / 1151 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 1152 * | 0 0 0 1 0 0 0 0 | 1153 * | 0 0 0 0 1 0 0 0 | 1154 * | 0 0 0 0 0 1 0 0 | 1155 * | 0 0 0 0 0 0 1 0 | 1156 * | 0 0 0 0 0 0 0 1 | 1157 * ~~ ~~ 1158 * __ __ 1159 * | 1 1 1 1 1 1 1 1 | 1160 * | 19 205 116 29 64 16 4 1 | 1161 * | 1 0 0 0 0 0 0 0 | 1162 * (V|I)' = | 0 0 0 1 0 0 0 0 | 1163 * | 0 0 0 0 1 0 0 0 | 1164 * | 0 0 0 0 0 1 0 0 | 1165 * | 0 0 0 0 0 0 1 0 | 1166 * | 0 0 0 0 0 0 0 1 | 1167 * ~~ ~~ 1168 * 1169 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1170 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1171 * matrix is not singular. 1172 * __ __ 1173 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1174 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1175 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1176 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1177 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1178 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1179 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1180 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1181 * ~~ ~~ 1182 * __ __ 1183 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1184 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1185 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1186 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1187 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1188 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1189 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1190 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1191 * ~~ ~~ 1192 * __ __ 1193 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1194 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1195 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1196 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1197 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1198 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1199 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1200 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1201 * ~~ ~~ 1202 * __ __ 1203 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1204 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1205 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1206 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1207 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1208 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1209 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1210 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1211 * ~~ ~~ 1212 * __ __ 1213 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1214 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1215 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1216 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1217 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1218 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1219 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1220 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1221 * ~~ ~~ 1222 * __ __ 1223 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1224 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1225 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1226 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1227 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1228 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1229 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1230 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1231 * ~~ ~~ 1232 * __ __ 1233 * | 0 0 1 0 0 0 0 0 | 1234 * | 167 100 5 41 159 169 217 208 | 1235 * | 166 100 4 40 158 168 216 209 | 1236 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1237 * | 0 0 0 0 1 0 0 0 | 1238 * | 0 0 0 0 0 1 0 0 | 1239 * | 0 0 0 0 0 0 1 0 | 1240 * | 0 0 0 0 0 0 0 1 | 1241 * ~~ ~~ 1242 * 1243 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1244 * of the missing data. 1245 * 1246 * As is apparent from the example above, the only non-trivial rows in the 1247 * inverse matrix correspond to the data disks that we're trying to 1248 * reconstruct. Indeed, those are the only rows we need as the others would 1249 * only be useful for reconstructing data known or assumed to be valid. For 1250 * that reason, we only build the coefficients in the rows that correspond to 1251 * targeted columns. 1252 */ 1253/* END CSTYLED */ 1254 1255static void 1256vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1257 uint8_t **rows) 1258{ 1259 int i, j; 1260 int pow; 1261 1262 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1263 1264 /* 1265 * Fill in the missing rows of interest. 1266 */ 1267 for (i = 0; i < nmap; i++) { 1268 ASSERT3S(0, <=, map[i]); 1269 ASSERT3S(map[i], <=, 2); 1270 1271 pow = map[i] * n; 1272 if (pow > 255) 1273 pow -= 255; 1274 ASSERT(pow <= 255); 1275 1276 for (j = 0; j < n; j++) { 1277 pow -= map[i]; 1278 if (pow < 0) 1279 pow += 255; 1280 rows[i][j] = vdev_raidz_pow2[pow]; 1281 } 1282 } 1283} 1284 1285static void 1286vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1287 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1288{ 1289 int i, j, ii, jj; 1290 uint8_t log; 1291 1292 /* 1293 * Assert that the first nmissing entries from the array of used 1294 * columns correspond to parity columns and that subsequent entries 1295 * correspond to data columns. 1296 */ 1297 for (i = 0; i < nmissing; i++) { 1298 ASSERT3S(used[i], <, rm->rm_firstdatacol); 1299 } 1300 for (; i < n; i++) { 1301 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1302 } 1303 1304 /* 1305 * First initialize the storage where we'll compute the inverse rows. 1306 */ 1307 for (i = 0; i < nmissing; i++) { 1308 for (j = 0; j < n; j++) { 1309 invrows[i][j] = (i == j) ? 1 : 0; 1310 } 1311 } 1312 1313 /* 1314 * Subtract all trivial rows from the rows of consequence. 1315 */ 1316 for (i = 0; i < nmissing; i++) { 1317 for (j = nmissing; j < n; j++) { 1318 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1319 jj = used[j] - rm->rm_firstdatacol; 1320 ASSERT3S(jj, <, n); 1321 invrows[i][j] = rows[i][jj]; 1322 rows[i][jj] = 0; 1323 } 1324 } 1325 1326 /* 1327 * For each of the rows of interest, we must normalize it and subtract 1328 * a multiple of it from the other rows. 1329 */ 1330 for (i = 0; i < nmissing; i++) { 1331 for (j = 0; j < missing[i]; j++) { 1332 ASSERT0(rows[i][j]); 1333 } 1334 ASSERT3U(rows[i][missing[i]], !=, 0); 1335 1336 /* 1337 * Compute the inverse of the first element and multiply each 1338 * element in the row by that value. 1339 */ 1340 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1341 1342 for (j = 0; j < n; j++) { 1343 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1344 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1345 } 1346 1347 for (ii = 0; ii < nmissing; ii++) { 1348 if (i == ii) 1349 continue; 1350 1351 ASSERT3U(rows[ii][missing[i]], !=, 0); 1352 1353 log = vdev_raidz_log2[rows[ii][missing[i]]]; 1354 1355 for (j = 0; j < n; j++) { 1356 rows[ii][j] ^= 1357 vdev_raidz_exp2(rows[i][j], log); 1358 invrows[ii][j] ^= 1359 vdev_raidz_exp2(invrows[i][j], log); 1360 } 1361 } 1362 } 1363 1364 /* 1365 * Verify that the data that is left in the rows are properly part of 1366 * an identity matrix. 1367 */ 1368 for (i = 0; i < nmissing; i++) { 1369 for (j = 0; j < n; j++) { 1370 if (j == missing[i]) { 1371 ASSERT3U(rows[i][j], ==, 1); 1372 } else { 1373 ASSERT0(rows[i][j]); 1374 } 1375 } 1376 } 1377} 1378 1379static void 1380vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1381 int *missing, uint8_t **invrows, const uint8_t *used) 1382{ 1383 int i, j, x, cc, c; 1384 uint8_t *src; 1385 uint64_t ccount; 1386 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1387 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1388 uint8_t log = 0; 1389 uint8_t val; 1390 int ll; 1391 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1392 uint8_t *p, *pp; 1393 size_t psize; 1394 1395 psize = sizeof (invlog[0][0]) * n * nmissing; 1396 p = kmem_alloc(psize, KM_SLEEP); 1397 1398 for (pp = p, i = 0; i < nmissing; i++) { 1399 invlog[i] = pp; 1400 pp += n; 1401 } 1402 1403 for (i = 0; i < nmissing; i++) { 1404 for (j = 0; j < n; j++) { 1405 ASSERT3U(invrows[i][j], !=, 0); 1406 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1407 } 1408 } 1409 1410 for (i = 0; i < n; i++) { 1411 c = used[i]; 1412 ASSERT3U(c, <, rm->rm_cols); 1413 1414 src = abd_to_buf(rm->rm_col[c].rc_abd); 1415 ccount = rm->rm_col[c].rc_size; 1416 for (j = 0; j < nmissing; j++) { 1417 cc = missing[j] + rm->rm_firstdatacol; 1418 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1419 ASSERT3U(cc, <, rm->rm_cols); 1420 ASSERT3U(cc, !=, c); 1421 1422 dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); 1423 dcount[j] = rm->rm_col[cc].rc_size; 1424 } 1425 1426 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1427 1428 for (x = 0; x < ccount; x++, src++) { 1429 if (*src != 0) 1430 log = vdev_raidz_log2[*src]; 1431 1432 for (cc = 0; cc < nmissing; cc++) { 1433 if (x >= dcount[cc]) 1434 continue; 1435 1436 if (*src == 0) { 1437 val = 0; 1438 } else { 1439 if ((ll = log + invlog[cc][i]) >= 255) 1440 ll -= 255; 1441 val = vdev_raidz_pow2[ll]; 1442 } 1443 1444 if (i == 0) 1445 dst[cc][x] = val; 1446 else 1447 dst[cc][x] ^= val; 1448 } 1449 } 1450 } 1451 1452 kmem_free(p, psize); 1453} 1454 1455static int 1456vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1457{ 1458 int n, i, c, t, tt; 1459 int nmissing_rows; 1460 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1461 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1462 1463 uint8_t *p, *pp; 1464 size_t psize; 1465 1466 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1467 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1468 uint8_t *used; 1469 1470 abd_t **bufs = NULL; 1471 1472 int code = 0; 1473 1474 /* 1475 * Matrix reconstruction can't use scatter ABDs yet, so we allocate 1476 * temporary linear ABDs. 1477 */ 1478 if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { 1479 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); 1480 1481 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1482 raidz_col_t *col = &rm->rm_col[c]; 1483 1484 bufs[c] = col->rc_abd; 1485 col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); 1486 abd_copy(col->rc_abd, bufs[c], col->rc_size); 1487 } 1488 } 1489 1490 n = rm->rm_cols - rm->rm_firstdatacol; 1491 1492 /* 1493 * Figure out which data columns are missing. 1494 */ 1495 nmissing_rows = 0; 1496 for (t = 0; t < ntgts; t++) { 1497 if (tgts[t] >= rm->rm_firstdatacol) { 1498 missing_rows[nmissing_rows++] = 1499 tgts[t] - rm->rm_firstdatacol; 1500 } 1501 } 1502 1503 /* 1504 * Figure out which parity columns to use to help generate the missing 1505 * data columns. 1506 */ 1507 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1508 ASSERT(tt < ntgts); 1509 ASSERT(c < rm->rm_firstdatacol); 1510 1511 /* 1512 * Skip any targeted parity columns. 1513 */ 1514 if (c == tgts[tt]) { 1515 tt++; 1516 continue; 1517 } 1518 1519 code |= 1 << c; 1520 1521 parity_map[i] = c; 1522 i++; 1523 } 1524 1525 ASSERT(code != 0); 1526 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1527 1528 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1529 nmissing_rows * n + sizeof (used[0]) * n; 1530 p = kmem_alloc(psize, KM_SLEEP); 1531 1532 for (pp = p, i = 0; i < nmissing_rows; i++) { 1533 rows[i] = pp; 1534 pp += n; 1535 invrows[i] = pp; 1536 pp += n; 1537 } 1538 used = pp; 1539 1540 for (i = 0; i < nmissing_rows; i++) { 1541 used[i] = parity_map[i]; 1542 } 1543 1544 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1545 if (tt < nmissing_rows && 1546 c == missing_rows[tt] + rm->rm_firstdatacol) { 1547 tt++; 1548 continue; 1549 } 1550 1551 ASSERT3S(i, <, n); 1552 used[i] = c; 1553 i++; 1554 } 1555 1556 /* 1557 * Initialize the interesting rows of the matrix. 1558 */ 1559 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1560 1561 /* 1562 * Invert the matrix. 1563 */ 1564 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1565 invrows, used); 1566 1567 /* 1568 * Reconstruct the missing data using the generated matrix. 1569 */ 1570 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1571 invrows, used); 1572 1573 kmem_free(p, psize); 1574 1575 /* 1576 * copy back from temporary linear abds and free them 1577 */ 1578 if (bufs) { 1579 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1580 raidz_col_t *col = &rm->rm_col[c]; 1581 1582 abd_copy(bufs[c], col->rc_abd, col->rc_size); 1583 abd_free(col->rc_abd); 1584 col->rc_abd = bufs[c]; 1585 } 1586 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); 1587 } 1588 1589 return (code); 1590} 1591 1592static int 1593vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1594{ 1595 int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1596 int ntgts; 1597 int i, c; 1598 int code; 1599 int nbadparity, nbaddata; 1600 int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1601 1602 /* 1603 * The tgts list must already be sorted. 1604 */ 1605 for (i = 1; i < nt; i++) { 1606 ASSERT(t[i] > t[i - 1]); 1607 } 1608 1609 nbadparity = rm->rm_firstdatacol; 1610 nbaddata = rm->rm_cols - nbadparity; 1611 ntgts = 0; 1612 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1613 if (c < rm->rm_firstdatacol) 1614 parity_valid[c] = B_FALSE; 1615 1616 if (i < nt && c == t[i]) { 1617 tgts[ntgts++] = c; 1618 i++; 1619 } else if (rm->rm_col[c].rc_error != 0) { 1620 tgts[ntgts++] = c; 1621 } else if (c >= rm->rm_firstdatacol) { 1622 nbaddata--; 1623 } else { 1624 parity_valid[c] = B_TRUE; 1625 nbadparity--; 1626 } 1627 } 1628 1629 ASSERT(ntgts >= nt); 1630 ASSERT(nbaddata >= 0); 1631 ASSERT(nbaddata + nbadparity == ntgts); 1632 1633 dt = &tgts[nbadparity]; 1634 1635 /* 1636 * See if we can use any of our optimized reconstruction routines. 1637 */ 1638 if (!vdev_raidz_default_to_general) { 1639 switch (nbaddata) { 1640 case 1: 1641 if (parity_valid[VDEV_RAIDZ_P]) 1642 return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1643 1644 ASSERT(rm->rm_firstdatacol > 1); 1645 1646 if (parity_valid[VDEV_RAIDZ_Q]) 1647 return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1648 1649 ASSERT(rm->rm_firstdatacol > 2); 1650 break; 1651 1652 case 2: 1653 ASSERT(rm->rm_firstdatacol > 1); 1654 1655 if (parity_valid[VDEV_RAIDZ_P] && 1656 parity_valid[VDEV_RAIDZ_Q]) 1657 return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1658 1659 ASSERT(rm->rm_firstdatacol > 2); 1660 1661 break; 1662 } 1663 } 1664 1665 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1666 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1667 ASSERT(code > 0); 1668 return (code); 1669} 1670 1671static int 1672vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1673 uint64_t *logical_ashift, uint64_t *physical_ashift) 1674{ 1675 vdev_t *cvd; 1676 uint64_t nparity = vd->vdev_nparity; 1677 int c; 1678 int lasterror = 0; 1679 int numerrors = 0; 1680 1681 ASSERT(nparity > 0); 1682 1683 if (nparity > VDEV_RAIDZ_MAXPARITY || 1684 vd->vdev_children < nparity + 1) { 1685 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1686 return (SET_ERROR(EINVAL)); 1687 } 1688 1689 vdev_open_children(vd); 1690 1691 for (c = 0; c < vd->vdev_children; c++) { 1692 cvd = vd->vdev_child[c]; 1693 1694 if (cvd->vdev_open_error != 0) { 1695 lasterror = cvd->vdev_open_error; 1696 numerrors++; 1697 continue; 1698 } 1699 1700 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1701 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1702 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1703 *physical_ashift = MAX(*physical_ashift, 1704 cvd->vdev_physical_ashift); 1705 } 1706 1707 *asize *= vd->vdev_children; 1708 *max_asize *= vd->vdev_children; 1709 1710 if (numerrors > nparity) { 1711 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1712 return (lasterror); 1713 } 1714 1715 return (0); 1716} 1717 1718static void 1719vdev_raidz_close(vdev_t *vd) 1720{ 1721 int c; 1722 1723 for (c = 0; c < vd->vdev_children; c++) 1724 vdev_close(vd->vdev_child[c]); 1725} 1726 1727#ifdef illumos 1728/* 1729 * Handle a read or write I/O to a RAID-Z dump device. 1730 * 1731 * The dump device is in a unique situation compared to other ZFS datasets: 1732 * writing to this device should be as simple and fast as possible. In 1733 * addition, durability matters much less since the dump will be extracted 1734 * once the machine reboots. For that reason, this function eschews parity for 1735 * performance and simplicity. The dump device uses the checksum setting 1736 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1737 * dataset. 1738 * 1739 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1740 * 128 KB will not fill an entire block; in addition, they may not be properly 1741 * aligned. In that case, this function uses the preallocated 128 KB block and 1742 * omits reading or writing any "empty" portions of that block, as opposed to 1743 * allocating a fresh appropriately-sized block. 1744 * 1745 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1746 * 1747 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1748 * 1749 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1750 * allocated which spans all five child vdevs. 8 KB of data would be written to 1751 * each of four vdevs, with the fifth containing the parity bits. 1752 * 1753 * parity data data data data 1754 * | PP | XX | XX | XX | XX | 1755 * ^ ^ ^ ^ ^ 1756 * | | | | | 1757 * 8 KB parity ------8 KB data blocks------ 1758 * 1759 * However, when writing to the dump device, the behavior is different: 1760 * 1761 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1762 * 1763 * Unlike the normal RAID-Z case in which the block is allocated based on the 1764 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1765 * I/O size is less than 128 KB, only the actual portions of data are written. 1766 * In this example the data is written to the third data vdev since that vdev 1767 * contains the offset [64 KB, 96 KB). 1768 * 1769 * parity data data data data 1770 * | | | | XX | | 1771 * ^ 1772 * | 1773 * 32 KB data block 1774 * 1775 * As a result, an individual I/O may not span all child vdevs; moreover, a 1776 * small I/O may only operate on a single child vdev. 1777 * 1778 * Note that since there are no parity bits calculated or written, this format 1779 * remains the same no matter how many parity bits are used in a normal RAID-Z 1780 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1781 * would look like: 1782 * 1783 * parity parity parity data data data data 1784 * | | | | | | XX | | 1785 * ^ 1786 * | 1787 * 32 KB data block 1788 */ 1789int 1790vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1791 uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1792{ 1793 vdev_t *tvd = vd->vdev_top; 1794 vdev_t *cvd; 1795 raidz_map_t *rm; 1796 raidz_col_t *rc; 1797 int c, err = 0; 1798 1799 uint64_t start, end, colstart, colend; 1800 uint64_t coloffset, colsize, colskip; 1801 1802 int flags = doread ? BIO_READ : BIO_WRITE; 1803 1804#ifdef _KERNEL 1805 1806 /* 1807 * Don't write past the end of the block 1808 */ 1809 VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1810 1811 start = offset; 1812 end = start + size; 1813 1814 /* 1815 * Allocate a RAID-Z map for this block. Note that this block starts 1816 * from the "original" offset, this is, the offset of the extent which 1817 * contains the requisite offset of the data being read or written. 1818 * 1819 * Even if this I/O operation doesn't span the full block size, let's 1820 * treat the on-disk format as if the only blocks are the complete 128 1821 * KB size. 1822 */ 1823 abd_t *abd = abd_get_from_buf(data - (offset - origoffset), 1824 SPA_OLD_MAXBLOCKSIZE); 1825 rm = vdev_raidz_map_alloc(abd, 1826 SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, 1827 vd->vdev_children, vd->vdev_nparity); 1828 1829 coloffset = origoffset; 1830 1831 for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1832 c++, coloffset += rc->rc_size) { 1833 rc = &rm->rm_col[c]; 1834 cvd = vd->vdev_child[rc->rc_devidx]; 1835 1836 /* 1837 * Find the start and end of this column in the RAID-Z map, 1838 * keeping in mind that the stated size and offset of the 1839 * operation may not fill the entire column for this vdev. 1840 * 1841 * If any portion of the data spans this column, issue the 1842 * appropriate operation to the vdev. 1843 */ 1844 if (coloffset + rc->rc_size <= start) 1845 continue; 1846 if (coloffset >= end) 1847 continue; 1848 1849 colstart = MAX(coloffset, start); 1850 colend = MIN(end, coloffset + rc->rc_size); 1851 colsize = colend - colstart; 1852 colskip = colstart - coloffset; 1853 1854 VERIFY3U(colsize, <=, rc->rc_size); 1855 VERIFY3U(colskip, <=, rc->rc_size); 1856 1857 /* 1858 * Note that the child vdev will have a vdev label at the start 1859 * of its range of offsets, hence the need for 1860 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1861 * example of why this calculation is needed. 1862 */ 1863 if ((err = vdev_disk_physio(cvd, 1864 ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, 1865 VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1866 flags, isdump)) != 0) 1867 break; 1868 } 1869 1870 vdev_raidz_map_free(rm); 1871 abd_put(abd); 1872#endif /* KERNEL */ 1873 1874 return (err); 1875} 1876#endif 1877 1878static uint64_t 1879vdev_raidz_asize(vdev_t *vd, uint64_t psize) 1880{ 1881 uint64_t asize; 1882 uint64_t ashift = vd->vdev_top->vdev_ashift; 1883 uint64_t cols = vd->vdev_children; 1884 uint64_t nparity = vd->vdev_nparity; 1885 1886 asize = ((psize - 1) >> ashift) + 1; 1887 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1888 asize = roundup(asize, nparity + 1) << ashift; 1889 1890 return (asize); 1891} 1892 1893static void 1894vdev_raidz_child_done(zio_t *zio) 1895{ 1896 raidz_col_t *rc = zio->io_private; 1897 1898 rc->rc_error = zio->io_error; 1899 rc->rc_tried = 1; 1900 rc->rc_skipped = 0; 1901} 1902 1903static void 1904vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) 1905{ 1906#ifdef ZFS_DEBUG 1907 vdev_t *vd = zio->io_vd; 1908 vdev_t *tvd = vd->vdev_top; 1909 1910 range_seg_t logical_rs, physical_rs; 1911 logical_rs.rs_start = zio->io_offset; 1912 logical_rs.rs_end = logical_rs.rs_start + 1913 vdev_raidz_asize(zio->io_vd, zio->io_size); 1914 1915 raidz_col_t *rc = &rm->rm_col[col]; 1916 vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; 1917 1918 vdev_xlate(cvd, &logical_rs, &physical_rs); 1919 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); 1920 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); 1921 /* 1922 * It would be nice to assert that rs_end is equal 1923 * to rc_offset + rc_size but there might be an 1924 * optional I/O at the end that is not accounted in 1925 * rc_size. 1926 */ 1927 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { 1928 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + 1929 rc->rc_size + (1 << tvd->vdev_ashift)); 1930 } else { 1931 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); 1932 } 1933#endif 1934} 1935 1936/* 1937 * Start an IO operation on a RAIDZ VDev 1938 * 1939 * Outline: 1940 * - For write operations: 1941 * 1. Generate the parity data 1942 * 2. Create child zio write operations to each column's vdev, for both 1943 * data and parity. 1944 * 3. If the column skips any sectors for padding, create optional dummy 1945 * write zio children for those areas to improve aggregation continuity. 1946 * - For read operations: 1947 * 1. Create child zio read operations to each data column's vdev to read 1948 * the range of data required for zio. 1949 * 2. If this is a scrub or resilver operation, or if any of the data 1950 * vdevs have had errors, then create zio read operations to the parity 1951 * columns' VDevs as well. 1952 */ 1953static void 1954vdev_raidz_io_start(zio_t *zio) 1955{ 1956 vdev_t *vd = zio->io_vd; 1957 vdev_t *tvd = vd->vdev_top; 1958 vdev_t *cvd; 1959 raidz_map_t *rm; 1960 raidz_col_t *rc; 1961 int c, i; 1962 1963 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, 1964 zio->io_type == ZIO_TYPE_FREE, 1965 tvd->vdev_ashift, vd->vdev_children, 1966 vd->vdev_nparity); 1967 1968 zio->io_vsd = rm; 1969 zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1970 1971 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1972 1973 if (zio->io_type == ZIO_TYPE_FREE) { 1974 for (c = 0; c < rm->rm_cols; c++) { 1975 rc = &rm->rm_col[c]; 1976 cvd = vd->vdev_child[rc->rc_devidx]; 1977 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1978 rc->rc_offset, rc->rc_abd, rc->rc_size, 1979 zio->io_type, zio->io_priority, 0, 1980 vdev_raidz_child_done, rc)); 1981 } 1982 1983 zio_execute(zio); 1984 return; 1985 } 1986 1987 if (zio->io_type == ZIO_TYPE_WRITE) { 1988 vdev_raidz_generate_parity(rm); 1989 1990 for (c = 0; c < rm->rm_cols; c++) { 1991 rc = &rm->rm_col[c]; 1992 cvd = vd->vdev_child[rc->rc_devidx]; 1993 1994 /* 1995 * Verify physical to logical translation. 1996 */ 1997 vdev_raidz_io_verify(zio, rm, c); 1998 1999 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2000 rc->rc_offset, rc->rc_abd, rc->rc_size, 2001 zio->io_type, zio->io_priority, 0, 2002 vdev_raidz_child_done, rc)); 2003 } 2004 2005 /* 2006 * Generate optional I/Os for any skipped sectors to improve 2007 * aggregation contiguity. 2008 */ 2009 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 2010 ASSERT(c <= rm->rm_scols); 2011 if (c == rm->rm_scols) 2012 c = 0; 2013 rc = &rm->rm_col[c]; 2014 cvd = vd->vdev_child[rc->rc_devidx]; 2015 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2016 rc->rc_offset + rc->rc_size, NULL, 2017 1 << tvd->vdev_ashift, 2018 zio->io_type, zio->io_priority, 2019 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 2020 } 2021 2022 zio_execute(zio); 2023 return; 2024 } 2025 2026 ASSERT(zio->io_type == ZIO_TYPE_READ); 2027 2028 /* 2029 * Iterate over the columns in reverse order so that we hit the parity 2030 * last -- any errors along the way will force us to read the parity. 2031 */ 2032 for (c = rm->rm_cols - 1; c >= 0; c--) { 2033 rc = &rm->rm_col[c]; 2034 cvd = vd->vdev_child[rc->rc_devidx]; 2035 if (!vdev_readable(cvd)) { 2036 if (c >= rm->rm_firstdatacol) 2037 rm->rm_missingdata++; 2038 else 2039 rm->rm_missingparity++; 2040 rc->rc_error = SET_ERROR(ENXIO); 2041 rc->rc_tried = 1; /* don't even try */ 2042 rc->rc_skipped = 1; 2043 continue; 2044 } 2045 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 2046 if (c >= rm->rm_firstdatacol) 2047 rm->rm_missingdata++; 2048 else 2049 rm->rm_missingparity++; 2050 rc->rc_error = SET_ERROR(ESTALE); 2051 rc->rc_skipped = 1; 2052 continue; 2053 } 2054 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 2055 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 2056 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2057 rc->rc_offset, rc->rc_abd, rc->rc_size, 2058 zio->io_type, zio->io_priority, 0, 2059 vdev_raidz_child_done, rc)); 2060 } 2061 } 2062 2063 zio_execute(zio); 2064} 2065 2066 2067/* 2068 * Report a checksum error for a child of a RAID-Z device. 2069 */ 2070static void 2071raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 2072{ 2073 void *buf; 2074 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 2075 2076 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2077 zio_bad_cksum_t zbc; 2078 raidz_map_t *rm = zio->io_vsd; 2079 2080 mutex_enter(&vd->vdev_stat_lock); 2081 vd->vdev_stat.vs_checksum_errors++; 2082 mutex_exit(&vd->vdev_stat_lock); 2083 2084 zbc.zbc_has_cksum = 0; 2085 zbc.zbc_injected = rm->rm_ecksuminjected; 2086 2087 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); 2088 zfs_ereport_post_checksum(zio->io_spa, vd, zio, 2089 rc->rc_offset, rc->rc_size, buf, bad_data, 2090 &zbc); 2091 abd_return_buf(rc->rc_abd, buf, rc->rc_size); 2092 } 2093} 2094 2095/* 2096 * We keep track of whether or not there were any injected errors, so that 2097 * any ereports we generate can note it. 2098 */ 2099static int 2100raidz_checksum_verify(zio_t *zio) 2101{ 2102 zio_bad_cksum_t zbc; 2103 raidz_map_t *rm = zio->io_vsd; 2104 2105 int ret = zio_checksum_error(zio, &zbc); 2106 if (ret != 0 && zbc.zbc_injected != 0) 2107 rm->rm_ecksuminjected = 1; 2108 2109 return (ret); 2110} 2111 2112/* 2113 * Generate the parity from the data columns. If we tried and were able to 2114 * read the parity without error, verify that the generated parity matches the 2115 * data we read. If it doesn't, we fire off a checksum error. Return the 2116 * number such failures. 2117 */ 2118static int 2119raidz_parity_verify(zio_t *zio, raidz_map_t *rm) 2120{ 2121 void *orig[VDEV_RAIDZ_MAXPARITY]; 2122 int c, ret = 0; 2123 raidz_col_t *rc; 2124 2125 blkptr_t *bp = zio->io_bp; 2126 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 2127 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 2128 2129 if (checksum == ZIO_CHECKSUM_NOPARITY) 2130 return (ret); 2131 2132 for (c = 0; c < rm->rm_firstdatacol; c++) { 2133 rc = &rm->rm_col[c]; 2134 if (!rc->rc_tried || rc->rc_error != 0) 2135 continue; 2136 orig[c] = zio_buf_alloc(rc->rc_size); 2137 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); 2138 } 2139 2140 vdev_raidz_generate_parity(rm); 2141 2142 for (c = 0; c < rm->rm_firstdatacol; c++) { 2143 rc = &rm->rm_col[c]; 2144 if (!rc->rc_tried || rc->rc_error != 0) 2145 continue; 2146 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { 2147 raidz_checksum_error(zio, rc, orig[c]); 2148 rc->rc_error = SET_ERROR(ECKSUM); 2149 ret++; 2150 } 2151 zio_buf_free(orig[c], rc->rc_size); 2152 } 2153 2154 return (ret); 2155} 2156 2157/* 2158 * Keep statistics on all the ways that we used parity to correct data. 2159 */ 2160static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 2161 2162static int 2163vdev_raidz_worst_error(raidz_map_t *rm) 2164{ 2165 int error = 0; 2166 2167 for (int c = 0; c < rm->rm_cols; c++) 2168 error = zio_worst_error(error, rm->rm_col[c].rc_error); 2169 2170 return (error); 2171} 2172 2173/* 2174 * Iterate over all combinations of bad data and attempt a reconstruction. 2175 * Note that the algorithm below is non-optimal because it doesn't take into 2176 * account how reconstruction is actually performed. For example, with 2177 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 2178 * is targeted as invalid as if columns 1 and 4 are targeted since in both 2179 * cases we'd only use parity information in column 0. 2180 */ 2181static int 2182vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 2183{ 2184 raidz_map_t *rm = zio->io_vsd; 2185 raidz_col_t *rc; 2186 void *orig[VDEV_RAIDZ_MAXPARITY]; 2187 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 2188 int *tgts = &tstore[1]; 2189 int current, next, i, c, n; 2190 int code, ret = 0; 2191 2192 ASSERT(total_errors < rm->rm_firstdatacol); 2193 2194 /* 2195 * This simplifies one edge condition. 2196 */ 2197 tgts[-1] = -1; 2198 2199 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 2200 /* 2201 * Initialize the targets array by finding the first n columns 2202 * that contain no error. 2203 * 2204 * If there were no data errors, we need to ensure that we're 2205 * always explicitly attempting to reconstruct at least one 2206 * data column. To do this, we simply push the highest target 2207 * up into the data columns. 2208 */ 2209 for (c = 0, i = 0; i < n; i++) { 2210 if (i == n - 1 && data_errors == 0 && 2211 c < rm->rm_firstdatacol) { 2212 c = rm->rm_firstdatacol; 2213 } 2214 2215 while (rm->rm_col[c].rc_error != 0) { 2216 c++; 2217 ASSERT3S(c, <, rm->rm_cols); 2218 } 2219 2220 tgts[i] = c++; 2221 } 2222 2223 /* 2224 * Setting tgts[n] simplifies the other edge condition. 2225 */ 2226 tgts[n] = rm->rm_cols; 2227 2228 /* 2229 * These buffers were allocated in previous iterations. 2230 */ 2231 for (i = 0; i < n - 1; i++) { 2232 ASSERT(orig[i] != NULL); 2233 } 2234 2235 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2236 2237 current = 0; 2238 next = tgts[current]; 2239 2240 while (current != n) { 2241 tgts[current] = next; 2242 current = 0; 2243 2244 /* 2245 * Save off the original data that we're going to 2246 * attempt to reconstruct. 2247 */ 2248 for (i = 0; i < n; i++) { 2249 ASSERT(orig[i] != NULL); 2250 c = tgts[i]; 2251 ASSERT3S(c, >=, 0); 2252 ASSERT3S(c, <, rm->rm_cols); 2253 rc = &rm->rm_col[c]; 2254 abd_copy_to_buf(orig[i], rc->rc_abd, 2255 rc->rc_size); 2256 } 2257 2258 /* 2259 * Attempt a reconstruction and exit the outer loop on 2260 * success. 2261 */ 2262 code = vdev_raidz_reconstruct(rm, tgts, n); 2263 if (raidz_checksum_verify(zio) == 0) { 2264 atomic_inc_64(&raidz_corrected[code]); 2265 2266 for (i = 0; i < n; i++) { 2267 c = tgts[i]; 2268 rc = &rm->rm_col[c]; 2269 ASSERT(rc->rc_error == 0); 2270 if (rc->rc_tried) 2271 raidz_checksum_error(zio, rc, 2272 orig[i]); 2273 rc->rc_error = SET_ERROR(ECKSUM); 2274 } 2275 2276 ret = code; 2277 goto done; 2278 } 2279 2280 /* 2281 * Restore the original data. 2282 */ 2283 for (i = 0; i < n; i++) { 2284 c = tgts[i]; 2285 rc = &rm->rm_col[c]; 2286 abd_copy_from_buf(rc->rc_abd, orig[i], 2287 rc->rc_size); 2288 } 2289 2290 do { 2291 /* 2292 * Find the next valid column after the current 2293 * position.. 2294 */ 2295 for (next = tgts[current] + 1; 2296 next < rm->rm_cols && 2297 rm->rm_col[next].rc_error != 0; next++) 2298 continue; 2299 2300 ASSERT(next <= tgts[current + 1]); 2301 2302 /* 2303 * If that spot is available, we're done here. 2304 */ 2305 if (next != tgts[current + 1]) 2306 break; 2307 2308 /* 2309 * Otherwise, find the next valid column after 2310 * the previous position. 2311 */ 2312 for (c = tgts[current - 1] + 1; 2313 rm->rm_col[c].rc_error != 0; c++) 2314 continue; 2315 2316 tgts[current] = c; 2317 current++; 2318 2319 } while (current != n); 2320 } 2321 } 2322 n--; 2323done: 2324 for (i = 0; i < n; i++) { 2325 zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2326 } 2327 2328 return (ret); 2329} 2330 2331/* 2332 * Complete an IO operation on a RAIDZ VDev 2333 * 2334 * Outline: 2335 * - For write operations: 2336 * 1. Check for errors on the child IOs. 2337 * 2. Return, setting an error code if too few child VDevs were written 2338 * to reconstruct the data later. Note that partial writes are 2339 * considered successful if they can be reconstructed at all. 2340 * - For read operations: 2341 * 1. Check for errors on the child IOs. 2342 * 2. If data errors occurred: 2343 * a. Try to reassemble the data from the parity available. 2344 * b. If we haven't yet read the parity drives, read them now. 2345 * c. If all parity drives have been read but the data still doesn't 2346 * reassemble with a correct checksum, then try combinatorial 2347 * reconstruction. 2348 * d. If that doesn't work, return an error. 2349 * 3. If there were unexpected errors or this is a resilver operation, 2350 * rewrite the vdevs that had errors. 2351 */ 2352static void 2353vdev_raidz_io_done(zio_t *zio) 2354{ 2355 vdev_t *vd = zio->io_vd; 2356 vdev_t *cvd; 2357 raidz_map_t *rm = zio->io_vsd; 2358 raidz_col_t *rc; 2359 int unexpected_errors = 0; 2360 int parity_errors = 0; 2361 int parity_untried = 0; 2362 int data_errors = 0; 2363 int total_errors = 0; 2364 int n, c; 2365 int tgts[VDEV_RAIDZ_MAXPARITY]; 2366 int code; 2367 2368 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2369 2370 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2371 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2372 2373 for (c = 0; c < rm->rm_cols; c++) { 2374 rc = &rm->rm_col[c]; 2375 2376 if (rc->rc_error) { 2377 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2378 2379 if (c < rm->rm_firstdatacol) 2380 parity_errors++; 2381 else 2382 data_errors++; 2383 2384 if (!rc->rc_skipped) 2385 unexpected_errors++; 2386 2387 total_errors++; 2388 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2389 parity_untried++; 2390 } 2391 } 2392 2393 if (zio->io_type == ZIO_TYPE_WRITE) { 2394 /* 2395 * XXX -- for now, treat partial writes as a success. 2396 * (If we couldn't write enough columns to reconstruct 2397 * the data, the I/O failed. Otherwise, good enough.) 2398 * 2399 * Now that we support write reallocation, it would be better 2400 * to treat partial failure as real failure unless there are 2401 * no non-degraded top-level vdevs left, and not update DTLs 2402 * if we intend to reallocate. 2403 */ 2404 /* XXPOLICY */ 2405 if (total_errors > rm->rm_firstdatacol) 2406 zio->io_error = vdev_raidz_worst_error(rm); 2407 2408 return; 2409 } else if (zio->io_type == ZIO_TYPE_FREE) { 2410 return; 2411 } 2412 2413 ASSERT(zio->io_type == ZIO_TYPE_READ); 2414 /* 2415 * There are three potential phases for a read: 2416 * 1. produce valid data from the columns read 2417 * 2. read all disks and try again 2418 * 3. perform combinatorial reconstruction 2419 * 2420 * Each phase is progressively both more expensive and less likely to 2421 * occur. If we encounter more errors than we can repair or all phases 2422 * fail, we have no choice but to return an error. 2423 */ 2424 2425 /* 2426 * If the number of errors we saw was correctable -- less than or equal 2427 * to the number of parity disks read -- attempt to produce data that 2428 * has a valid checksum. Naturally, this case applies in the absence of 2429 * any errors. 2430 */ 2431 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2432 if (data_errors == 0) { 2433 if (raidz_checksum_verify(zio) == 0) { 2434 /* 2435 * If we read parity information (unnecessarily 2436 * as it happens since no reconstruction was 2437 * needed) regenerate and verify the parity. 2438 * We also regenerate parity when resilvering 2439 * so we can write it out to the failed device 2440 * later. 2441 */ 2442 if (parity_errors + parity_untried < 2443 rm->rm_firstdatacol || 2444 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2445 n = raidz_parity_verify(zio, rm); 2446 unexpected_errors += n; 2447 ASSERT(parity_errors + n <= 2448 rm->rm_firstdatacol); 2449 } 2450 goto done; 2451 } 2452 } else { 2453 /* 2454 * We either attempt to read all the parity columns or 2455 * none of them. If we didn't try to read parity, we 2456 * wouldn't be here in the correctable case. There must 2457 * also have been fewer parity errors than parity 2458 * columns or, again, we wouldn't be in this code path. 2459 */ 2460 ASSERT(parity_untried == 0); 2461 ASSERT(parity_errors < rm->rm_firstdatacol); 2462 2463 /* 2464 * Identify the data columns that reported an error. 2465 */ 2466 n = 0; 2467 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2468 rc = &rm->rm_col[c]; 2469 if (rc->rc_error != 0) { 2470 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2471 tgts[n++] = c; 2472 } 2473 } 2474 2475 ASSERT(rm->rm_firstdatacol >= n); 2476 2477 code = vdev_raidz_reconstruct(rm, tgts, n); 2478 2479 if (raidz_checksum_verify(zio) == 0) { 2480 atomic_inc_64(&raidz_corrected[code]); 2481 2482 /* 2483 * If we read more parity disks than were used 2484 * for reconstruction, confirm that the other 2485 * parity disks produced correct data. This 2486 * routine is suboptimal in that it regenerates 2487 * the parity that we already used in addition 2488 * to the parity that we're attempting to 2489 * verify, but this should be a relatively 2490 * uncommon case, and can be optimized if it 2491 * becomes a problem. Note that we regenerate 2492 * parity when resilvering so we can write it 2493 * out to failed devices later. 2494 */ 2495 if (parity_errors < rm->rm_firstdatacol - n || 2496 (zio->io_flags & ZIO_FLAG_RESILVER)) { 2497 n = raidz_parity_verify(zio, rm); 2498 unexpected_errors += n; 2499 ASSERT(parity_errors + n <= 2500 rm->rm_firstdatacol); 2501 } 2502 2503 goto done; 2504 } 2505 } 2506 } 2507 2508 /* 2509 * This isn't a typical situation -- either we got a read error or 2510 * a child silently returned bad data. Read every block so we can 2511 * try again with as much data and parity as we can track down. If 2512 * we've already been through once before, all children will be marked 2513 * as tried so we'll proceed to combinatorial reconstruction. 2514 */ 2515 unexpected_errors = 1; 2516 rm->rm_missingdata = 0; 2517 rm->rm_missingparity = 0; 2518 2519 for (c = 0; c < rm->rm_cols; c++) { 2520 if (rm->rm_col[c].rc_tried) 2521 continue; 2522 2523 zio_vdev_io_redone(zio); 2524 do { 2525 rc = &rm->rm_col[c]; 2526 if (rc->rc_tried) 2527 continue; 2528 zio_nowait(zio_vdev_child_io(zio, NULL, 2529 vd->vdev_child[rc->rc_devidx], 2530 rc->rc_offset, rc->rc_abd, rc->rc_size, 2531 zio->io_type, zio->io_priority, 0, 2532 vdev_raidz_child_done, rc)); 2533 } while (++c < rm->rm_cols); 2534 2535 return; 2536 } 2537 2538 /* 2539 * At this point we've attempted to reconstruct the data given the 2540 * errors we detected, and we've attempted to read all columns. There 2541 * must, therefore, be one or more additional problems -- silent errors 2542 * resulting in invalid data rather than explicit I/O errors resulting 2543 * in absent data. We check if there is enough additional data to 2544 * possibly reconstruct the data and then perform combinatorial 2545 * reconstruction over all possible combinations. If that fails, 2546 * we're cooked. 2547 */ 2548 if (total_errors > rm->rm_firstdatacol) { 2549 zio->io_error = vdev_raidz_worst_error(rm); 2550 2551 } else if (total_errors < rm->rm_firstdatacol && 2552 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2553 /* 2554 * If we didn't use all the available parity for the 2555 * combinatorial reconstruction, verify that the remaining 2556 * parity is correct. 2557 */ 2558 if (code != (1 << rm->rm_firstdatacol) - 1) 2559 (void) raidz_parity_verify(zio, rm); 2560 } else { 2561 /* 2562 * We're here because either: 2563 * 2564 * total_errors == rm_first_datacol, or 2565 * vdev_raidz_combrec() failed 2566 * 2567 * In either case, there is enough bad data to prevent 2568 * reconstruction. 2569 * 2570 * Start checksum ereports for all children which haven't 2571 * failed, and the IO wasn't speculative. 2572 */ 2573 zio->io_error = SET_ERROR(ECKSUM); 2574 2575 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2576 for (c = 0; c < rm->rm_cols; c++) { 2577 rc = &rm->rm_col[c]; 2578 if (rc->rc_error == 0) { 2579 zio_bad_cksum_t zbc; 2580 zbc.zbc_has_cksum = 0; 2581 zbc.zbc_injected = 2582 rm->rm_ecksuminjected; 2583 2584 zfs_ereport_start_checksum( 2585 zio->io_spa, 2586 vd->vdev_child[rc->rc_devidx], 2587 zio, rc->rc_offset, rc->rc_size, 2588 (void *)(uintptr_t)c, &zbc); 2589 } 2590 } 2591 } 2592 } 2593 2594done: 2595 zio_checksum_verified(zio); 2596 2597 if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2598 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2599 /* 2600 * Use the good data we have in hand to repair damaged children. 2601 */ 2602 for (c = 0; c < rm->rm_cols; c++) { 2603 rc = &rm->rm_col[c]; 2604 cvd = vd->vdev_child[rc->rc_devidx]; 2605 2606 if (rc->rc_error == 0) 2607 continue; 2608 2609 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2610 rc->rc_offset, rc->rc_abd, rc->rc_size, 2611 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2612 ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2613 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2614 } 2615 } 2616} 2617 2618static void 2619vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2620{ 2621 if (faulted > vd->vdev_nparity) 2622 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2623 VDEV_AUX_NO_REPLICAS); 2624 else if (degraded + faulted != 0) 2625 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2626 else 2627 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2628} 2629 2630/* 2631 * Determine if any portion of the provided block resides on a child vdev 2632 * with a dirty DTL and therefore needs to be resilvered. The function 2633 * assumes that at least one DTL is dirty which imples that full stripe 2634 * width blocks must be resilvered. 2635 */ 2636static boolean_t 2637vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) 2638{ 2639 uint64_t dcols = vd->vdev_children; 2640 uint64_t nparity = vd->vdev_nparity; 2641 uint64_t ashift = vd->vdev_top->vdev_ashift; 2642 /* The starting RAIDZ (parent) vdev sector of the block. */ 2643 uint64_t b = offset >> ashift; 2644 /* The zio's size in units of the vdev's minimum sector size. */ 2645 uint64_t s = ((psize - 1) >> ashift) + 1; 2646 /* The first column for this stripe. */ 2647 uint64_t f = b % dcols; 2648 2649 if (s + nparity >= dcols) 2650 return (B_TRUE); 2651 2652 for (uint64_t c = 0; c < s + nparity; c++) { 2653 uint64_t devidx = (f + c) % dcols; 2654 vdev_t *cvd = vd->vdev_child[devidx]; 2655 2656 /* 2657 * dsl_scan_need_resilver() already checked vd with 2658 * vdev_dtl_contains(). So here just check cvd with 2659 * vdev_dtl_empty(), cheaper and a good approximation. 2660 */ 2661 if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) 2662 return (B_TRUE); 2663 } 2664 2665 return (B_FALSE); 2666} 2667 2668static void 2669vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) 2670{ 2671 vdev_t *raidvd = cvd->vdev_parent; 2672 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); 2673 2674 uint64_t width = raidvd->vdev_children; 2675 uint64_t tgt_col = cvd->vdev_id; 2676 uint64_t ashift = raidvd->vdev_top->vdev_ashift; 2677 2678 /* make sure the offsets are block-aligned */ 2679 ASSERT0(in->rs_start % (1 << ashift)); 2680 ASSERT0(in->rs_end % (1 << ashift)); 2681 uint64_t b_start = in->rs_start >> ashift; 2682 uint64_t b_end = in->rs_end >> ashift; 2683 2684 uint64_t start_row = 0; 2685 if (b_start > tgt_col) /* avoid underflow */ 2686 start_row = ((b_start - tgt_col - 1) / width) + 1; 2687 2688 uint64_t end_row = 0; 2689 if (b_end > tgt_col) 2690 end_row = ((b_end - tgt_col - 1) / width) + 1; 2691 2692 res->rs_start = start_row << ashift; 2693 res->rs_end = end_row << ashift; 2694 2695 ASSERT3U(res->rs_start, <=, in->rs_start); 2696 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); 2697} 2698 2699vdev_ops_t vdev_raidz_ops = { 2700 vdev_raidz_open, 2701 vdev_raidz_close, 2702 vdev_raidz_asize, 2703 vdev_raidz_io_start, 2704 vdev_raidz_io_done, 2705 vdev_raidz_state_change, 2706 vdev_raidz_need_resilver, 2707 NULL, 2708 NULL, 2709 NULL, 2710 vdev_raidz_xlate, 2711 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2712 B_FALSE /* not a leaf vdev */ 2713}; 2714