vdev_raidz.c revision 185029
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23185029Spjd * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24168404Spjd * Use is subject to license terms. 25168404Spjd */ 26168404Spjd 27168404Spjd#include <sys/zfs_context.h> 28168404Spjd#include <sys/spa.h> 29168404Spjd#include <sys/vdev_impl.h> 30168404Spjd#include <sys/zio.h> 31168404Spjd#include <sys/zio_checksum.h> 32168404Spjd#include <sys/fs/zfs.h> 33168404Spjd#include <sys/fm/fs/zfs.h> 34168404Spjd 35168404Spjd/* 36168404Spjd * Virtual device vector for RAID-Z. 37168404Spjd * 38168404Spjd * This vdev supports both single and double parity. For single parity, we 39168404Spjd * use a simple XOR of all the data columns. For double parity, we use both 40168404Spjd * the simple XOR as well as a technique described in "The mathematics of 41168404Spjd * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), 42168404Spjd * over the integers expressable in a single byte. 
Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
74168404Spjd */ 75168404Spjd 76168404Spjdtypedef struct raidz_col { 77168404Spjd uint64_t rc_devidx; /* child device index for I/O */ 78168404Spjd uint64_t rc_offset; /* device offset */ 79168404Spjd uint64_t rc_size; /* I/O size */ 80168404Spjd void *rc_data; /* I/O data */ 81168404Spjd int rc_error; /* I/O error for this device */ 82168404Spjd uint8_t rc_tried; /* Did we attempt this I/O column? */ 83168404Spjd uint8_t rc_skipped; /* Did we skip this I/O column? */ 84168404Spjd} raidz_col_t; 85168404Spjd 86168404Spjdtypedef struct raidz_map { 87168404Spjd uint64_t rm_cols; /* Column count */ 88168404Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 89168404Spjd uint64_t rm_asize; /* Actual total I/O size */ 90168404Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 91168404Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 92168404Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 93168404Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 94168404Spjd} raidz_map_t; 95168404Spjd 96168404Spjd#define VDEV_RAIDZ_P 0 97168404Spjd#define VDEV_RAIDZ_Q 1 98168404Spjd 99168404Spjd#define VDEV_RAIDZ_MAXPARITY 2 100168404Spjd 101168404Spjd#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) 102168404Spjd 103168404Spjd/* 104168404Spjd * These two tables represent powers and logs of 2 in the Galois field defined 105168404Spjd * above. These values were computed by repeatedly multiplying by 2 as above. 
106168404Spjd */ 107168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = { 108168404Spjd 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 109168404Spjd 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 110168404Spjd 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 111168404Spjd 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 112168404Spjd 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 113168404Spjd 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 114168404Spjd 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 115168404Spjd 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 116168404Spjd 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 117168404Spjd 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 118168404Spjd 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 119168404Spjd 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 120168404Spjd 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 121168404Spjd 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 122168404Spjd 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 123168404Spjd 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 124168404Spjd 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 125168404Spjd 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 126168404Spjd 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 127168404Spjd 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 128168404Spjd 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 129168404Spjd 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 130168404Spjd 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 131168404Spjd 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 132168404Spjd 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 133168404Spjd 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 134168404Spjd 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 135168404Spjd 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 136168404Spjd 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 137168404Spjd 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 138168404Spjd 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 
139168404Spjd 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 140168404Spjd}; 141168404Spjdstatic const uint8_t vdev_raidz_log2[256] = { 142168404Spjd 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 143168404Spjd 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 144168404Spjd 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 145168404Spjd 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 146168404Spjd 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 147168404Spjd 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 148168404Spjd 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 149168404Spjd 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 150168404Spjd 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 151168404Spjd 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 152168404Spjd 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 153168404Spjd 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 154168404Spjd 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 155168404Spjd 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 156168404Spjd 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 157168404Spjd 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 158168404Spjd 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 159168404Spjd 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 160168404Spjd 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 161168404Spjd 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 162168404Spjd 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 163168404Spjd 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 164168404Spjd 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 165168404Spjd 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 166168404Spjd 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 167168404Spjd 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 168168404Spjd 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 169168404Spjd 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 170168404Spjd 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 171168404Spjd 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 
172168404Spjd 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 173168404Spjd 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 174168404Spjd}; 175168404Spjd 176168404Spjd/* 177168404Spjd * Multiply a given number by 2 raised to the given power. 178168404Spjd */ 179168404Spjdstatic uint8_t 180168404Spjdvdev_raidz_exp2(uint_t a, int exp) 181168404Spjd{ 182168404Spjd if (a == 0) 183168404Spjd return (0); 184168404Spjd 185168404Spjd ASSERT(exp >= 0); 186168404Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 187168404Spjd 188168404Spjd exp += vdev_raidz_log2[a]; 189168404Spjd if (exp > 255) 190168404Spjd exp -= 255; 191168404Spjd 192168404Spjd return (vdev_raidz_pow2[exp]); 193168404Spjd} 194168404Spjd 195185029Spjdstatic void 196185029Spjdvdev_raidz_map_free(zio_t *zio) 197185029Spjd{ 198185029Spjd raidz_map_t *rm = zio->io_vsd; 199185029Spjd int c; 200185029Spjd 201185029Spjd for (c = 0; c < rm->rm_firstdatacol; c++) 202185029Spjd zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 203185029Spjd 204185029Spjd kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 205185029Spjd} 206185029Spjd 207168404Spjdstatic raidz_map_t * 208168404Spjdvdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, 209168404Spjd uint64_t nparity) 210168404Spjd{ 211168404Spjd raidz_map_t *rm; 212168404Spjd uint64_t b = zio->io_offset >> unit_shift; 213168404Spjd uint64_t s = zio->io_size >> unit_shift; 214168404Spjd uint64_t f = b % dcols; 215168404Spjd uint64_t o = (b / dcols) << unit_shift; 216168404Spjd uint64_t q, r, c, bc, col, acols, coff, devidx; 217168404Spjd 218168404Spjd q = s / (dcols - nparity); 219168404Spjd r = s - q * (dcols - nparity); 220168404Spjd bc = (r == 0 ? 0 : r + nparity); 221168404Spjd 222168404Spjd acols = (q == 0 ? 
bc : dcols); 223168404Spjd 224168404Spjd rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 225168404Spjd 226168404Spjd rm->rm_cols = acols; 227168404Spjd rm->rm_bigcols = bc; 228168404Spjd rm->rm_asize = 0; 229168404Spjd rm->rm_missingdata = 0; 230168404Spjd rm->rm_missingparity = 0; 231168404Spjd rm->rm_firstdatacol = nparity; 232168404Spjd 233168404Spjd for (c = 0; c < acols; c++) { 234168404Spjd col = f + c; 235168404Spjd coff = o; 236168404Spjd if (col >= dcols) { 237168404Spjd col -= dcols; 238168404Spjd coff += 1ULL << unit_shift; 239168404Spjd } 240168404Spjd rm->rm_col[c].rc_devidx = col; 241168404Spjd rm->rm_col[c].rc_offset = coff; 242168404Spjd rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 243168404Spjd rm->rm_col[c].rc_data = NULL; 244168404Spjd rm->rm_col[c].rc_error = 0; 245168404Spjd rm->rm_col[c].rc_tried = 0; 246168404Spjd rm->rm_col[c].rc_skipped = 0; 247168404Spjd rm->rm_asize += rm->rm_col[c].rc_size; 248168404Spjd } 249168404Spjd 250168404Spjd rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); 251168404Spjd 252168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) 253168404Spjd rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 254168404Spjd 255168404Spjd rm->rm_col[c].rc_data = zio->io_data; 256168404Spjd 257168404Spjd for (c = c + 1; c < acols; c++) 258168404Spjd rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 259168404Spjd rm->rm_col[c - 1].rc_size; 260168404Spjd 261168404Spjd /* 262168404Spjd * If all data stored spans all columns, there's a danger that parity 263168404Spjd * will always be on the same device and, since parity isn't read 264168404Spjd * during normal operation, that that device's I/O bandwidth won't be 265168404Spjd * used effectively. We therefore switch the parity every 1MB. 266168404Spjd * 267168404Spjd * ... at least that was, ostensibly, the theory. 
As a practical 268168404Spjd * matter unless we juggle the parity between all devices evenly, we 269168404Spjd * won't see any benefit. Further, occasional writes that aren't a 270168404Spjd * multiple of the LCM of the number of children and the minimum 271168404Spjd * stripe width are sufficient to avoid pessimal behavior. 272168404Spjd * Unfortunately, this decision created an implicit on-disk format 273168404Spjd * requirement that we need to support for all eternity, but only 274168404Spjd * for single-parity RAID-Z. 275168404Spjd */ 276168404Spjd ASSERT(rm->rm_cols >= 2); 277168404Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 278168404Spjd 279168404Spjd if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { 280168404Spjd devidx = rm->rm_col[0].rc_devidx; 281168404Spjd o = rm->rm_col[0].rc_offset; 282168404Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 283168404Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 284168404Spjd rm->rm_col[1].rc_devidx = devidx; 285168404Spjd rm->rm_col[1].rc_offset = o; 286168404Spjd } 287168404Spjd 288168404Spjd zio->io_vsd = rm; 289185029Spjd zio->io_vsd_free = vdev_raidz_map_free; 290168404Spjd return (rm); 291168404Spjd} 292168404Spjd 293168404Spjdstatic void 294168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 295168404Spjd{ 296168404Spjd uint64_t *p, *src, pcount, ccount, i; 297168404Spjd int c; 298168404Spjd 299168404Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 300168404Spjd 301168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 302168404Spjd src = rm->rm_col[c].rc_data; 303168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 304168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 305168404Spjd 306168404Spjd if (c == rm->rm_firstdatacol) { 307168404Spjd ASSERT(ccount == pcount); 308168404Spjd for (i = 0; i < ccount; i++, p++, src++) { 309168404Spjd *p = *src; 310168404Spjd } 311168404Spjd } else { 312168404Spjd ASSERT(ccount <= pcount); 
313168404Spjd for (i = 0; i < ccount; i++, p++, src++) { 314168404Spjd *p ^= *src; 315168404Spjd } 316168404Spjd } 317168404Spjd } 318168404Spjd} 319168404Spjd 320168404Spjdstatic void 321168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 322168404Spjd{ 323168404Spjd uint64_t *q, *p, *src, pcount, ccount, mask, i; 324168404Spjd int c; 325168404Spjd 326168404Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 327168404Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 328168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 329168404Spjd 330168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 331168404Spjd src = rm->rm_col[c].rc_data; 332168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 333168404Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 334168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 335168404Spjd 336168404Spjd if (c == rm->rm_firstdatacol) { 337168404Spjd ASSERT(ccount == pcount || ccount == 0); 338168404Spjd for (i = 0; i < ccount; i++, p++, q++, src++) { 339168404Spjd *q = *src; 340168404Spjd *p = *src; 341168404Spjd } 342168404Spjd for (; i < pcount; i++, p++, q++, src++) { 343168404Spjd *q = 0; 344168404Spjd *p = 0; 345168404Spjd } 346168404Spjd } else { 347168404Spjd ASSERT(ccount <= pcount); 348168404Spjd 349168404Spjd /* 350168404Spjd * Rather than multiplying each byte individually (as 351168404Spjd * described above), we are able to handle 8 at once 352168404Spjd * by generating a mask based on the high bit in each 353168404Spjd * byte and using that to conditionally XOR in 0x1d. 354168404Spjd */ 355168404Spjd for (i = 0; i < ccount; i++, p++, q++, src++) { 356168404Spjd mask = *q & 0x8080808080808080ULL; 357168404Spjd mask = (mask << 1) - (mask >> 7); 358168404Spjd *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 359168404Spjd (mask & 0x1d1d1d1d1d1d1d1dULL); 360168404Spjd *q ^= *src; 361168404Spjd *p ^= *src; 362168404Spjd } 363168404Spjd 364168404Spjd /* 365168404Spjd * Treat short columns as though they are full of 0s. 
366168404Spjd */ 367168404Spjd for (; i < pcount; i++, q++) { 368168404Spjd mask = *q & 0x8080808080808080ULL; 369168404Spjd mask = (mask << 1) - (mask >> 7); 370168404Spjd *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ 371168404Spjd (mask & 0x1d1d1d1d1d1d1d1dULL); 372168404Spjd } 373168404Spjd } 374168404Spjd } 375168404Spjd} 376168404Spjd 377168404Spjdstatic void 378168404Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int x) 379168404Spjd{ 380168404Spjd uint64_t *dst, *src, xcount, ccount, count, i; 381168404Spjd int c; 382168404Spjd 383168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 384168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 385168404Spjd ASSERT(xcount > 0); 386168404Spjd 387168404Spjd src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 388168404Spjd dst = rm->rm_col[x].rc_data; 389168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 390168404Spjd *dst = *src; 391168404Spjd } 392168404Spjd 393168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 394168404Spjd src = rm->rm_col[c].rc_data; 395168404Spjd dst = rm->rm_col[x].rc_data; 396168404Spjd 397168404Spjd if (c == x) 398168404Spjd continue; 399168404Spjd 400168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 401168404Spjd count = MIN(ccount, xcount); 402168404Spjd 403168404Spjd for (i = 0; i < count; i++, dst++, src++) { 404168404Spjd *dst ^= *src; 405168404Spjd } 406168404Spjd } 407168404Spjd} 408168404Spjd 409168404Spjdstatic void 410168404Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int x) 411168404Spjd{ 412168404Spjd uint64_t *dst, *src, xcount, ccount, count, mask, i; 413168404Spjd uint8_t *b; 414168404Spjd int c, j, exp; 415168404Spjd 416168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 417168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 418168404Spjd 419168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 420168404Spjd src = rm->rm_col[c].rc_data; 421168404Spjd dst = rm->rm_col[x].rc_data; 
422168404Spjd 423168404Spjd if (c == x) 424168404Spjd ccount = 0; 425168404Spjd else 426168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 427168404Spjd 428168404Spjd count = MIN(ccount, xcount); 429168404Spjd 430168404Spjd if (c == rm->rm_firstdatacol) { 431168404Spjd for (i = 0; i < count; i++, dst++, src++) { 432168404Spjd *dst = *src; 433168404Spjd } 434168404Spjd for (; i < xcount; i++, dst++) { 435168404Spjd *dst = 0; 436168404Spjd } 437168404Spjd 438168404Spjd } else { 439168404Spjd /* 440168404Spjd * For an explanation of this, see the comment in 441168404Spjd * vdev_raidz_generate_parity_pq() above. 442168404Spjd */ 443168404Spjd for (i = 0; i < count; i++, dst++, src++) { 444168404Spjd mask = *dst & 0x8080808080808080ULL; 445168404Spjd mask = (mask << 1) - (mask >> 7); 446168404Spjd *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 447168404Spjd (mask & 0x1d1d1d1d1d1d1d1dULL); 448168404Spjd *dst ^= *src; 449168404Spjd } 450168404Spjd 451168404Spjd for (; i < xcount; i++, dst++) { 452168404Spjd mask = *dst & 0x8080808080808080ULL; 453168404Spjd mask = (mask << 1) - (mask >> 7); 454168404Spjd *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ 455168404Spjd (mask & 0x1d1d1d1d1d1d1d1dULL); 456168404Spjd } 457168404Spjd } 458168404Spjd } 459168404Spjd 460168404Spjd src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 461168404Spjd dst = rm->rm_col[x].rc_data; 462168404Spjd exp = 255 - (rm->rm_cols - 1 - x); 463168404Spjd 464168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 465168404Spjd *dst ^= *src; 466168404Spjd for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 467168404Spjd *b = vdev_raidz_exp2(*b, exp); 468168404Spjd } 469168404Spjd } 470168404Spjd} 471168404Spjd 472168404Spjdstatic void 473168404Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) 474168404Spjd{ 475168404Spjd uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 476168404Spjd void *pdata, *qdata; 477168404Spjd uint64_t xsize, ysize, i; 478168404Spjd 479168404Spjd ASSERT(x < 
y); 480168404Spjd ASSERT(x >= rm->rm_firstdatacol); 481168404Spjd ASSERT(y < rm->rm_cols); 482168404Spjd 483168404Spjd ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 484168404Spjd 485168404Spjd /* 486168404Spjd * Move the parity data aside -- we're going to compute parity as 487168404Spjd * though columns x and y were full of zeros -- Pxy and Qxy. We want to 488168404Spjd * reuse the parity generation mechanism without trashing the actual 489168404Spjd * parity so we make those columns appear to be full of zeros by 490168404Spjd * setting their lengths to zero. 491168404Spjd */ 492168404Spjd pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 493168404Spjd qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 494168404Spjd xsize = rm->rm_col[x].rc_size; 495168404Spjd ysize = rm->rm_col[y].rc_size; 496168404Spjd 497168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = 498168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 499168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = 500168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 501168404Spjd rm->rm_col[x].rc_size = 0; 502168404Spjd rm->rm_col[y].rc_size = 0; 503168404Spjd 504168404Spjd vdev_raidz_generate_parity_pq(rm); 505168404Spjd 506168404Spjd rm->rm_col[x].rc_size = xsize; 507168404Spjd rm->rm_col[y].rc_size = ysize; 508168404Spjd 509168404Spjd p = pdata; 510168404Spjd q = qdata; 511168404Spjd pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 512168404Spjd qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 513168404Spjd xd = rm->rm_col[x].rc_data; 514168404Spjd yd = rm->rm_col[y].rc_data; 515168404Spjd 516168404Spjd /* 517168404Spjd * We now have: 518168404Spjd * Pxy = P + D_x + D_y 519168404Spjd * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 520168404Spjd * 521168404Spjd * We can then solve for D_x: 522168404Spjd * D_x = A * (P + Pxy) + B * (Q + Qxy) 523168404Spjd * where 524168404Spjd * A = 2^(x - y) * (2^(x - y) + 1)^-1 525168404Spjd * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 526168404Spjd * 527168404Spjd * With D_x in 
hand, we can easily solve for D_y: 528168404Spjd * D_y = P + Pxy + D_x 529168404Spjd */ 530168404Spjd 531168404Spjd a = vdev_raidz_pow2[255 + x - y]; 532168404Spjd b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 533168404Spjd tmp = 255 - vdev_raidz_log2[a ^ 1]; 534168404Spjd 535168404Spjd aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 536168404Spjd bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 537168404Spjd 538168404Spjd for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 539168404Spjd *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 540168404Spjd vdev_raidz_exp2(*q ^ *qxy, bexp); 541168404Spjd 542168404Spjd if (i < ysize) 543168404Spjd *yd = *p ^ *pxy ^ *xd; 544168404Spjd } 545168404Spjd 546168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 547168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_size); 548168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 549168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 550168404Spjd 551168404Spjd /* 552168404Spjd * Restore the saved parity data. 
553168404Spjd */ 554168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 555168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 556168404Spjd} 557168404Spjd 558168404Spjd 559168404Spjdstatic int 560168404Spjdvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 561168404Spjd{ 562168404Spjd vdev_t *cvd; 563168404Spjd uint64_t nparity = vd->vdev_nparity; 564168404Spjd int c, error; 565168404Spjd int lasterror = 0; 566168404Spjd int numerrors = 0; 567168404Spjd 568168404Spjd ASSERT(nparity > 0); 569168404Spjd 570168404Spjd if (nparity > VDEV_RAIDZ_MAXPARITY || 571168404Spjd vd->vdev_children < nparity + 1) { 572168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 573168404Spjd return (EINVAL); 574168404Spjd } 575168404Spjd 576168404Spjd for (c = 0; c < vd->vdev_children; c++) { 577168404Spjd cvd = vd->vdev_child[c]; 578168404Spjd 579168404Spjd if ((error = vdev_open(cvd)) != 0) { 580168404Spjd lasterror = error; 581168404Spjd numerrors++; 582168404Spjd continue; 583168404Spjd } 584168404Spjd 585168404Spjd *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 586168404Spjd *ashift = MAX(*ashift, cvd->vdev_ashift); 587168404Spjd } 588168404Spjd 589168404Spjd *asize *= vd->vdev_children; 590168404Spjd 591168404Spjd if (numerrors > nparity) { 592168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 593168404Spjd return (lasterror); 594168404Spjd } 595168404Spjd 596168404Spjd return (0); 597168404Spjd} 598168404Spjd 599168404Spjdstatic void 600168404Spjdvdev_raidz_close(vdev_t *vd) 601168404Spjd{ 602168404Spjd int c; 603168404Spjd 604168404Spjd for (c = 0; c < vd->vdev_children; c++) 605168404Spjd vdev_close(vd->vdev_child[c]); 606168404Spjd} 607168404Spjd 608168404Spjdstatic uint64_t 609168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize) 610168404Spjd{ 611168404Spjd uint64_t asize; 612168404Spjd uint64_t ashift = vd->vdev_top->vdev_ashift; 613168404Spjd uint64_t cols = vd->vdev_children; 614168404Spjd uint64_t nparity = vd->vdev_nparity; 615168404Spjd 
616168404Spjd asize = ((psize - 1) >> ashift) + 1; 617168404Spjd asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 618168404Spjd asize = roundup(asize, nparity + 1) << ashift; 619168404Spjd 620168404Spjd return (asize); 621168404Spjd} 622168404Spjd 623168404Spjdstatic void 624168404Spjdvdev_raidz_child_done(zio_t *zio) 625168404Spjd{ 626168404Spjd raidz_col_t *rc = zio->io_private; 627168404Spjd 628168404Spjd rc->rc_error = zio->io_error; 629168404Spjd rc->rc_tried = 1; 630168404Spjd rc->rc_skipped = 0; 631168404Spjd} 632168404Spjd 633185029Spjdstatic int 634168404Spjdvdev_raidz_io_start(zio_t *zio) 635168404Spjd{ 636168404Spjd vdev_t *vd = zio->io_vd; 637168404Spjd vdev_t *tvd = vd->vdev_top; 638168404Spjd vdev_t *cvd; 639168404Spjd blkptr_t *bp = zio->io_bp; 640168404Spjd raidz_map_t *rm; 641168404Spjd raidz_col_t *rc; 642168404Spjd int c; 643168404Spjd 644168404Spjd rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, 645168404Spjd vd->vdev_nparity); 646168404Spjd 647168404Spjd ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 648168404Spjd 649168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 650168404Spjd /* 651168404Spjd * Generate RAID parity in the first virtual columns. 
652168404Spjd */ 653168404Spjd if (rm->rm_firstdatacol == 1) 654168404Spjd vdev_raidz_generate_parity_p(rm); 655168404Spjd else 656168404Spjd vdev_raidz_generate_parity_pq(rm); 657168404Spjd 658168404Spjd for (c = 0; c < rm->rm_cols; c++) { 659168404Spjd rc = &rm->rm_col[c]; 660168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 661168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 662168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 663185029Spjd zio->io_type, zio->io_priority, 0, 664168404Spjd vdev_raidz_child_done, rc)); 665168404Spjd } 666185029Spjd 667185029Spjd return (ZIO_PIPELINE_CONTINUE); 668168404Spjd } 669168404Spjd 670168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 671168404Spjd 672168404Spjd /* 673168404Spjd * Iterate over the columns in reverse order so that we hit the parity 674168404Spjd * last -- any errors along the way will force us to read the parity 675168404Spjd * data. 676168404Spjd */ 677168404Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 678168404Spjd rc = &rm->rm_col[c]; 679168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 680185029Spjd if (!vdev_readable(cvd)) { 681168404Spjd if (c >= rm->rm_firstdatacol) 682168404Spjd rm->rm_missingdata++; 683168404Spjd else 684168404Spjd rm->rm_missingparity++; 685168404Spjd rc->rc_error = ENXIO; 686168404Spjd rc->rc_tried = 1; /* don't even try */ 687168404Spjd rc->rc_skipped = 1; 688168404Spjd continue; 689168404Spjd } 690168404Spjd if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 691168404Spjd if (c >= rm->rm_firstdatacol) 692168404Spjd rm->rm_missingdata++; 693168404Spjd else 694168404Spjd rm->rm_missingparity++; 695168404Spjd rc->rc_error = ESTALE; 696168404Spjd rc->rc_skipped = 1; 697168404Spjd continue; 698168404Spjd } 699168404Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 700168404Spjd (zio->io_flags & ZIO_FLAG_SCRUB)) { 701168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 702168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 703185029Spjd zio->io_type, 
zio->io_priority, 0, 704168404Spjd vdev_raidz_child_done, rc)); 705168404Spjd } 706168404Spjd } 707168404Spjd 708185029Spjd return (ZIO_PIPELINE_CONTINUE); 709168404Spjd} 710168404Spjd 711168404Spjd/* 712168404Spjd * Report a checksum error for a child of a RAID-Z device. 713168404Spjd */ 714168404Spjdstatic void 715168404Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc) 716168404Spjd{ 717168404Spjd vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 718168404Spjd 719168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 720168404Spjd mutex_enter(&vd->vdev_stat_lock); 721168404Spjd vd->vdev_stat.vs_checksum_errors++; 722168404Spjd mutex_exit(&vd->vdev_stat_lock); 723168404Spjd } 724168404Spjd 725168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 726168404Spjd zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 727168404Spjd zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 728168404Spjd} 729168404Spjd 730168404Spjd/* 731168404Spjd * Generate the parity from the data columns. If we tried and were able to 732168404Spjd * read the parity without error, verify that the generated parity matches the 733168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 734168404Spjd * number such failures. 
735168404Spjd */ 736168404Spjdstatic int 737168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm) 738168404Spjd{ 739168404Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 740168404Spjd int c, ret = 0; 741168404Spjd raidz_col_t *rc; 742168404Spjd 743168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 744168404Spjd rc = &rm->rm_col[c]; 745168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 746168404Spjd continue; 747168404Spjd orig[c] = zio_buf_alloc(rc->rc_size); 748168404Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 749168404Spjd } 750168404Spjd 751168404Spjd if (rm->rm_firstdatacol == 1) 752168404Spjd vdev_raidz_generate_parity_p(rm); 753168404Spjd else 754168404Spjd vdev_raidz_generate_parity_pq(rm); 755168404Spjd 756168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 757168404Spjd rc = &rm->rm_col[c]; 758168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 759168404Spjd continue; 760168404Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 761168404Spjd raidz_checksum_error(zio, rc); 762168404Spjd rc->rc_error = ECKSUM; 763168404Spjd ret++; 764168404Spjd } 765168404Spjd zio_buf_free(orig[c], rc->rc_size); 766168404Spjd } 767168404Spjd 768168404Spjd return (ret); 769168404Spjd} 770168404Spjd 771168404Spjdstatic uint64_t raidz_corrected_p; 772168404Spjdstatic uint64_t raidz_corrected_q; 773168404Spjdstatic uint64_t raidz_corrected_pq; 774168404Spjd 775185029Spjdstatic int 776185029Spjdvdev_raidz_worst_error(raidz_map_t *rm) 777185029Spjd{ 778185029Spjd int error = 0; 779185029Spjd 780185029Spjd for (int c = 0; c < rm->rm_cols; c++) 781185029Spjd error = zio_worst_error(error, rm->rm_col[c].rc_error); 782185029Spjd 783185029Spjd return (error); 784185029Spjd} 785185029Spjd 786168404Spjdstatic void 787168404Spjdvdev_raidz_io_done(zio_t *zio) 788168404Spjd{ 789168404Spjd vdev_t *vd = zio->io_vd; 790168404Spjd vdev_t *cvd; 791168404Spjd raidz_map_t *rm = zio->io_vsd; 792168404Spjd raidz_col_t *rc, *rc1; 793168404Spjd int unexpected_errors = 0; 794168404Spjd int 
parity_errors = 0; 795168404Spjd int parity_untried = 0; 796168404Spjd int data_errors = 0; 797185029Spjd int total_errors = 0; 798168404Spjd int n, c, c1; 799168404Spjd 800168404Spjd ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 801168404Spjd 802168404Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 803168404Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 804168404Spjd 805168404Spjd for (c = 0; c < rm->rm_cols; c++) { 806168404Spjd rc = &rm->rm_col[c]; 807168404Spjd 808168404Spjd if (rc->rc_error) { 809185029Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 810168404Spjd 811168404Spjd if (c < rm->rm_firstdatacol) 812168404Spjd parity_errors++; 813168404Spjd else 814168404Spjd data_errors++; 815168404Spjd 816168404Spjd if (!rc->rc_skipped) 817168404Spjd unexpected_errors++; 818168404Spjd 819185029Spjd total_errors++; 820168404Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 821168404Spjd parity_untried++; 822168404Spjd } 823168404Spjd } 824168404Spjd 825168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 826168404Spjd /* 827185029Spjd * XXX -- for now, treat partial writes as a success. 828185029Spjd * (If we couldn't write enough columns to reconstruct 829185029Spjd * the data, the I/O failed. Otherwise, good enough.) 830185029Spjd * 831185029Spjd * Now that we support write reallocation, it would be better 832185029Spjd * to treat partial failure as real failure unless there are 833185029Spjd * no non-degraded top-level vdevs left, and not update DTLs 834185029Spjd * if we intend to reallocate. 835168404Spjd */ 836168404Spjd /* XXPOLICY */ 837185029Spjd if (total_errors > rm->rm_firstdatacol) 838185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 839168404Spjd 840168404Spjd return; 841168404Spjd } 842168404Spjd 843168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 844168404Spjd /* 845168404Spjd * There are three potential phases for a read: 846168404Spjd * 1. 
produce valid data from the columns read 847168404Spjd * 2. read all disks and try again 848168404Spjd * 3. perform combinatorial reconstruction 849168404Spjd * 850168404Spjd * Each phase is progressively both more expensive and less likely to 851168404Spjd * occur. If we encounter more errors than we can repair or all phases 852168404Spjd * fail, we have no choice but to return an error. 853168404Spjd */ 854168404Spjd 855168404Spjd /* 856168404Spjd * If the number of errors we saw was correctable -- less than or equal 857168404Spjd * to the number of parity disks read -- attempt to produce data that 858168404Spjd * has a valid checksum. Naturally, this case applies in the absence of 859168404Spjd * any errors. 860168404Spjd */ 861185029Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 862168404Spjd switch (data_errors) { 863168404Spjd case 0: 864168404Spjd if (zio_checksum_error(zio) == 0) { 865168738Spjd /* 866168738Spjd * If we read parity information (unnecessarily 867168738Spjd * as it happens since no reconstruction was 868168738Spjd * needed) regenerate and verify the parity. 869168738Spjd * We also regenerate parity when resilvering 870168738Spjd * so we can write it out to the failed device 871168738Spjd * later. 872168738Spjd */ 873168404Spjd if (parity_errors + parity_untried < 874168738Spjd rm->rm_firstdatacol || 875168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 876168404Spjd n = raidz_parity_verify(zio, rm); 877168404Spjd unexpected_errors += n; 878168404Spjd ASSERT(parity_errors + n <= 879168404Spjd rm->rm_firstdatacol); 880168404Spjd } 881168404Spjd goto done; 882168404Spjd } 883168404Spjd break; 884168404Spjd 885168404Spjd case 1: 886168404Spjd /* 887168404Spjd * We either attempt to read all the parity columns or 888168404Spjd * none of them. If we didn't try to read parity, we 889168404Spjd * wouldn't be here in the correctable case. 
There must 890168404Spjd * also have been fewer parity errors than parity 891168404Spjd * columns or, again, we wouldn't be in this code path. 892168404Spjd */ 893168404Spjd ASSERT(parity_untried == 0); 894168404Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 895168404Spjd 896168404Spjd /* 897168404Spjd * Find the column that reported the error. 898168404Spjd */ 899168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 900168404Spjd rc = &rm->rm_col[c]; 901168404Spjd if (rc->rc_error != 0) 902168404Spjd break; 903168404Spjd } 904168404Spjd ASSERT(c != rm->rm_cols); 905168404Spjd ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 906168404Spjd rc->rc_error == ESTALE); 907168404Spjd 908168404Spjd if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 909168404Spjd vdev_raidz_reconstruct_p(rm, c); 910168404Spjd } else { 911168404Spjd ASSERT(rm->rm_firstdatacol > 1); 912168404Spjd vdev_raidz_reconstruct_q(rm, c); 913168404Spjd } 914168404Spjd 915168404Spjd if (zio_checksum_error(zio) == 0) { 916168404Spjd if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) 917168404Spjd atomic_inc_64(&raidz_corrected_p); 918168404Spjd else 919168404Spjd atomic_inc_64(&raidz_corrected_q); 920168404Spjd 921168404Spjd /* 922168404Spjd * If there's more than one parity disk that 923168404Spjd * was successfully read, confirm that the 924168404Spjd * other parity disk produced the correct data. 925168404Spjd * This routine is suboptimal in that it 926168404Spjd * regenerates both the parity we wish to test 927168404Spjd * as well as the parity we just used to 928168404Spjd * perform the reconstruction, but this should 929168404Spjd * be a relatively uncommon case, and can be 930168404Spjd * optimized if it becomes a problem. 931168738Spjd * We also regenerate parity when resilvering 932168738Spjd * so we can write it out to the failed device 933168738Spjd * later. 
934168404Spjd */ 935168738Spjd if (parity_errors < rm->rm_firstdatacol - 1 || 936168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 937168404Spjd n = raidz_parity_verify(zio, rm); 938168404Spjd unexpected_errors += n; 939168404Spjd ASSERT(parity_errors + n <= 940168404Spjd rm->rm_firstdatacol); 941168404Spjd } 942168404Spjd 943168404Spjd goto done; 944168404Spjd } 945168404Spjd break; 946168404Spjd 947168404Spjd case 2: 948168404Spjd /* 949168404Spjd * Two data column errors require double parity. 950168404Spjd */ 951168404Spjd ASSERT(rm->rm_firstdatacol == 2); 952168404Spjd 953168404Spjd /* 954168404Spjd * Find the two columns that reported errors. 955168404Spjd */ 956168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 957168404Spjd rc = &rm->rm_col[c]; 958168404Spjd if (rc->rc_error != 0) 959168404Spjd break; 960168404Spjd } 961168404Spjd ASSERT(c != rm->rm_cols); 962168404Spjd ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 963168404Spjd rc->rc_error == ESTALE); 964168404Spjd 965168404Spjd for (c1 = c++; c < rm->rm_cols; c++) { 966168404Spjd rc = &rm->rm_col[c]; 967168404Spjd if (rc->rc_error != 0) 968168404Spjd break; 969168404Spjd } 970168404Spjd ASSERT(c != rm->rm_cols); 971168404Spjd ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || 972168404Spjd rc->rc_error == ESTALE); 973168404Spjd 974168404Spjd vdev_raidz_reconstruct_pq(rm, c1, c); 975168404Spjd 976168404Spjd if (zio_checksum_error(zio) == 0) { 977168404Spjd atomic_inc_64(&raidz_corrected_pq); 978168404Spjd goto done; 979168404Spjd } 980168404Spjd break; 981168404Spjd 982168404Spjd default: 983168404Spjd ASSERT(rm->rm_firstdatacol <= 2); 984168404Spjd ASSERT(0); 985168404Spjd } 986168404Spjd } 987168404Spjd 988168404Spjd /* 989168404Spjd * This isn't a typical situation -- either we got a read error or 990168404Spjd * a child silently returned bad data. Read every block so we can 991168404Spjd * try again with as much data and parity as we can track down. 
If 992168404Spjd * we've already been through once before, all children will be marked 993168404Spjd * as tried so we'll proceed to combinatorial reconstruction. 994168404Spjd */ 995168404Spjd unexpected_errors = 1; 996168404Spjd rm->rm_missingdata = 0; 997168404Spjd rm->rm_missingparity = 0; 998168404Spjd 999168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1000168404Spjd if (rm->rm_col[c].rc_tried) 1001168404Spjd continue; 1002168404Spjd 1003168404Spjd zio_vdev_io_redone(zio); 1004168404Spjd do { 1005168404Spjd rc = &rm->rm_col[c]; 1006168404Spjd if (rc->rc_tried) 1007168404Spjd continue; 1008168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, 1009168404Spjd vd->vdev_child[rc->rc_devidx], 1010168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1011185029Spjd zio->io_type, zio->io_priority, 0, 1012168404Spjd vdev_raidz_child_done, rc)); 1013168404Spjd } while (++c < rm->rm_cols); 1014185029Spjd 1015168404Spjd return; 1016168404Spjd } 1017168404Spjd 1018168404Spjd /* 1019168404Spjd * At this point we've attempted to reconstruct the data given the 1020168404Spjd * errors we detected, and we've attempted to read all columns. There 1021168404Spjd * must, therefore, be one or more additional problems -- silent errors 1022168404Spjd * resulting in invalid data rather than explicit I/O errors resulting 1023168404Spjd * in absent data. Before we attempt combinatorial reconstruction make 1024168404Spjd * sure we have a chance of coming up with the right answer. 1025168404Spjd */ 1026185029Spjd if (total_errors >= rm->rm_firstdatacol) { 1027185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 1028185029Spjd /* 1029185029Spjd * If there were exactly as many device errors as parity 1030185029Spjd * columns, yet we couldn't reconstruct the data, then at 1031185029Spjd * least one device must have returned bad data silently. 
1032185029Spjd */ 1033185029Spjd if (total_errors == rm->rm_firstdatacol) 1034185029Spjd zio->io_error = zio_worst_error(zio->io_error, ECKSUM); 1035168404Spjd goto done; 1036168404Spjd } 1037168404Spjd 1038168404Spjd if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { 1039168404Spjd /* 1040168404Spjd * Attempt to reconstruct the data from parity P. 1041168404Spjd */ 1042168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1043168404Spjd void *orig; 1044168404Spjd rc = &rm->rm_col[c]; 1045168404Spjd 1046168404Spjd orig = zio_buf_alloc(rc->rc_size); 1047168404Spjd bcopy(rc->rc_data, orig, rc->rc_size); 1048168404Spjd vdev_raidz_reconstruct_p(rm, c); 1049168404Spjd 1050168404Spjd if (zio_checksum_error(zio) == 0) { 1051168404Spjd zio_buf_free(orig, rc->rc_size); 1052168404Spjd atomic_inc_64(&raidz_corrected_p); 1053168404Spjd 1054168404Spjd /* 1055168404Spjd * If this child didn't know that it returned 1056168404Spjd * bad data, inform it. 1057168404Spjd */ 1058168404Spjd if (rc->rc_tried && rc->rc_error == 0) 1059168404Spjd raidz_checksum_error(zio, rc); 1060168404Spjd rc->rc_error = ECKSUM; 1061168404Spjd goto done; 1062168404Spjd } 1063168404Spjd 1064168404Spjd bcopy(orig, rc->rc_data, rc->rc_size); 1065168404Spjd zio_buf_free(orig, rc->rc_size); 1066168404Spjd } 1067168404Spjd } 1068168404Spjd 1069168404Spjd if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1070168404Spjd /* 1071168404Spjd * Attempt to reconstruct the data from parity Q. 
1072168404Spjd */ 1073168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1074168404Spjd void *orig; 1075168404Spjd rc = &rm->rm_col[c]; 1076168404Spjd 1077168404Spjd orig = zio_buf_alloc(rc->rc_size); 1078168404Spjd bcopy(rc->rc_data, orig, rc->rc_size); 1079168404Spjd vdev_raidz_reconstruct_q(rm, c); 1080168404Spjd 1081168404Spjd if (zio_checksum_error(zio) == 0) { 1082168404Spjd zio_buf_free(orig, rc->rc_size); 1083168404Spjd atomic_inc_64(&raidz_corrected_q); 1084168404Spjd 1085168404Spjd /* 1086168404Spjd * If this child didn't know that it returned 1087168404Spjd * bad data, inform it. 1088168404Spjd */ 1089168404Spjd if (rc->rc_tried && rc->rc_error == 0) 1090168404Spjd raidz_checksum_error(zio, rc); 1091168404Spjd rc->rc_error = ECKSUM; 1092168404Spjd goto done; 1093168404Spjd } 1094168404Spjd 1095168404Spjd bcopy(orig, rc->rc_data, rc->rc_size); 1096168404Spjd zio_buf_free(orig, rc->rc_size); 1097168404Spjd } 1098168404Spjd } 1099168404Spjd 1100168404Spjd if (rm->rm_firstdatacol > 1 && 1101168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && 1102168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { 1103168404Spjd /* 1104168404Spjd * Attempt to reconstruct the data from both P and Q. 
1105168404Spjd */ 1106168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { 1107168404Spjd void *orig, *orig1; 1108168404Spjd rc = &rm->rm_col[c]; 1109168404Spjd 1110168404Spjd orig = zio_buf_alloc(rc->rc_size); 1111168404Spjd bcopy(rc->rc_data, orig, rc->rc_size); 1112168404Spjd 1113168404Spjd for (c1 = c + 1; c1 < rm->rm_cols; c1++) { 1114168404Spjd rc1 = &rm->rm_col[c1]; 1115168404Spjd 1116168404Spjd orig1 = zio_buf_alloc(rc1->rc_size); 1117168404Spjd bcopy(rc1->rc_data, orig1, rc1->rc_size); 1118168404Spjd 1119168404Spjd vdev_raidz_reconstruct_pq(rm, c, c1); 1120168404Spjd 1121168404Spjd if (zio_checksum_error(zio) == 0) { 1122168404Spjd zio_buf_free(orig, rc->rc_size); 1123168404Spjd zio_buf_free(orig1, rc1->rc_size); 1124168404Spjd atomic_inc_64(&raidz_corrected_pq); 1125168404Spjd 1126168404Spjd /* 1127168404Spjd * If these children didn't know they 1128168404Spjd * returned bad data, inform them. 1129168404Spjd */ 1130168404Spjd if (rc->rc_tried && rc->rc_error == 0) 1131168404Spjd raidz_checksum_error(zio, rc); 1132168404Spjd if (rc1->rc_tried && rc1->rc_error == 0) 1133168404Spjd raidz_checksum_error(zio, rc1); 1134168404Spjd 1135168404Spjd rc->rc_error = ECKSUM; 1136168404Spjd rc1->rc_error = ECKSUM; 1137168404Spjd 1138168404Spjd goto done; 1139168404Spjd } 1140168404Spjd 1141168404Spjd bcopy(orig1, rc1->rc_data, rc1->rc_size); 1142168404Spjd zio_buf_free(orig1, rc1->rc_size); 1143168404Spjd } 1144168404Spjd 1145168404Spjd bcopy(orig, rc->rc_data, rc->rc_size); 1146168404Spjd zio_buf_free(orig, rc->rc_size); 1147168404Spjd } 1148168404Spjd } 1149168404Spjd 1150168404Spjd /* 1151168404Spjd * All combinations failed to checksum. Generate checksum ereports for 1152168404Spjd * all children. 
1153168404Spjd */ 1154168404Spjd zio->io_error = ECKSUM; 1155185029Spjd 1156168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1157168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1158168404Spjd rc = &rm->rm_col[c]; 1159168404Spjd zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1160168404Spjd zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, 1161168404Spjd rc->rc_offset, rc->rc_size); 1162168404Spjd } 1163168404Spjd } 1164168404Spjd 1165168404Spjddone: 1166168404Spjd zio_checksum_verified(zio); 1167168404Spjd 1168168404Spjd if (zio->io_error == 0 && (spa_mode & FWRITE) && 1169168404Spjd (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 1170168404Spjd /* 1171168404Spjd * Use the good data we have in hand to repair damaged children. 1172168404Spjd */ 1173168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1174168404Spjd rc = &rm->rm_col[c]; 1175168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1176168404Spjd 1177168404Spjd if (rc->rc_error == 0) 1178168404Spjd continue; 1179168404Spjd 1180185029Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1181168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1182168404Spjd ZIO_TYPE_WRITE, zio->io_priority, 1183185029Spjd ZIO_FLAG_IO_REPAIR, NULL, NULL)); 1184168404Spjd } 1185168404Spjd } 1186168404Spjd} 1187168404Spjd 1188168404Spjdstatic void 1189168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 1190168404Spjd{ 1191168404Spjd if (faulted > vd->vdev_nparity) 1192168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1193168404Spjd VDEV_AUX_NO_REPLICAS); 1194168404Spjd else if (degraded + faulted != 0) 1195168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 1196168404Spjd else 1197168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 1198168404Spjd} 1199168404Spjd 1200168404Spjdvdev_ops_t vdev_raidz_ops = { 1201168404Spjd vdev_raidz_open, 1202168404Spjd vdev_raidz_close, 1203168404Spjd vdev_raidz_asize, 1204168404Spjd vdev_raidz_io_start, 
1205168404Spjd vdev_raidz_io_done, 1206168404Spjd vdev_raidz_state_change, 1207168404Spjd VDEV_TYPE_RAIDZ, /* name of this vdev type */ 1208168404Spjd B_FALSE /* not a leaf vdev */ 1209168404Spjd}; 1210