1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24249643Smm * Copyright (c) 2013 by Delphix. All rights reserved. 25262089Savg * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26168404Spjd */ 27168404Spjd 28168404Spjd#include <sys/zfs_context.h> 29168404Spjd#include <sys/spa.h> 30168404Spjd#include <sys/vdev_impl.h> 31262089Savg#ifdef illumos 32262089Savg#include <sys/vdev_disk.h> 33262089Savg#endif 34262089Savg#include <sys/vdev_file.h> 35262089Savg#include <sys/vdev_raidz.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/fs/zfs.h> 39168404Spjd#include <sys/fm/fs/zfs.h> 40262089Savg#include <sys/bio.h> 41168404Spjd 42168404Spjd/* 43168404Spjd * Virtual device vector for RAID-Z. 44168404Spjd * 45219089Spjd * This vdev supports single, double, and triple parity. For single parity, 46219089Spjd * we use a simple XOR of all the data columns. For double or triple parity, 47219089Spjd * we use a special case of Reed-Solomon coding. This extends the 48219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 49219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 50219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 51219089Spjd * former is also based. The latter is designed to provide higher performance 52219089Spjd * for writes. 53168404Spjd * 54219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then 55219089Spjd * amended six years later identifying a critical flaw that invalidates its 56219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to 57219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to 58219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 59219089Spjd * is viable, but the additional complexity means that write performance will 60219089Spjd * suffer. 61219089Spjd * 62219089Spjd * All of the methods above operate on a Galois field, defined over the 63219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 64219089Spjd * can be expressed with a single byte. Briefly, the operations on the 65219089Spjd * field are defined as follows: 66219089Spjd * 67168404Spjd * o addition (+) is represented by a bitwise XOR 68168404Spjd * o subtraction (-) is therefore identical to addition: A + B = A - B 69168404Spjd * o multiplication of A by 2 is defined by the following bitwise expression: 70252751Sdelphij * 71168404Spjd * (A * 2)_7 = A_6 72168404Spjd * (A * 2)_6 = A_5 73168404Spjd * (A * 2)_5 = A_4 74168404Spjd * (A * 2)_4 = A_3 + A_7 75168404Spjd * (A * 2)_3 = A_2 + A_7 76168404Spjd * (A * 2)_2 = A_1 + A_7 77168404Spjd * (A * 2)_1 = A_0 78168404Spjd * (A * 2)_0 = A_7 79168404Spjd * 80168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 81219089Spjd * As an aside, this multiplication is derived from the error correcting 82219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 83168404Spjd * 84168404Spjd * Observe that any number in the field (except for 0) can be expressed as a 85168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of 86168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 87168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 88219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore 89219089Spjd * A ^ (255 - 1) = A^254. 90168404Spjd * 91219089Spjd * The up-to-three parity columns, P, Q, R over several data columns, 92219089Spjd * D_0, ... D_n-1, can be expressed by field operations: 93168404Spjd * 94168404Spjd * P = D_0 + D_1 + ... + D_n-2 + D_n-1 95168404Spjd * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 96168404Spjd * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 97219089Spjd * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 98219089Spjd * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 99168404Spjd * 100219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 101219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 102219089Spjd * independent coefficients. (There are no additional coefficients that have 103219089Spjd * this property which is why the uncorrected Plank method breaks down.) 104219089Spjd * 105219089Spjd * See the reconstruction code below for how P, Q and R can used individually 106219089Spjd * or in concert to recover missing data columns. 107168404Spjd */ 108168404Spjd 109168404Spjdtypedef struct raidz_col { 110168404Spjd uint64_t rc_devidx; /* child device index for I/O */ 111168404Spjd uint64_t rc_offset; /* device offset */ 112168404Spjd uint64_t rc_size; /* I/O size */ 113168404Spjd void *rc_data; /* I/O data */ 114219089Spjd void *rc_gdata; /* used to store the "good" version */ 115168404Spjd int rc_error; /* I/O error for this device */ 116168404Spjd uint8_t rc_tried; /* Did we attempt this I/O column? */ 117168404Spjd uint8_t rc_skipped; /* Did we skip this I/O column? */ 118168404Spjd} raidz_col_t; 119168404Spjd 120168404Spjdtypedef struct raidz_map { 121219089Spjd uint64_t rm_cols; /* Regular column count */ 122219089Spjd uint64_t rm_scols; /* Count including skipped columns */ 123168404Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 124168404Spjd uint64_t rm_asize; /* Actual total I/O size */ 125168404Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 126168404Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 127168404Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 128219089Spjd uint64_t rm_nskip; /* Skipped sectors for padding */ 129252751Sdelphij uint64_t rm_skipstart; /* Column index of padding start */ 130219089Spjd void *rm_datacopy; /* rm_asize-buffer of copied data */ 131219089Spjd uintptr_t rm_reports; /* # of referencing checksum reports */ 132219089Spjd uint8_t rm_freed; /* map no longer has referencing ZIO */ 133219089Spjd uint8_t rm_ecksuminjected; /* checksum error was injected */ 134168404Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 135168404Spjd} raidz_map_t; 136168404Spjd 137168404Spjd#define VDEV_RAIDZ_P 0 138168404Spjd#define VDEV_RAIDZ_Q 1 139219089Spjd#define VDEV_RAIDZ_R 2 140168404Spjd 141219089Spjd#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 142219089Spjd#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 143168404Spjd 144219089Spjd/* 145219089Spjd * We provide a mechanism to perform the field multiplication operation on a 146219089Spjd * 64-bit value all at once rather than a byte at a time. This works by 147219089Spjd * creating a mask from the top bit in each byte and using that to 148219089Spjd * conditionally apply the XOR of 0x1d. 149219089Spjd */ 150219089Spjd#define VDEV_RAIDZ_64MUL_2(x, mask) \ 151219089Spjd{ \ 152219089Spjd (mask) = (x) & 0x8080808080808080ULL; \ 153219089Spjd (mask) = ((mask) << 1) - ((mask) >> 7); \ 154219089Spjd (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 155219089Spjd ((mask) & 0x1d1d1d1d1d1d1d1d); \ 156219089Spjd} 157168404Spjd 158219089Spjd#define VDEV_RAIDZ_64MUL_4(x, mask) \ 159219089Spjd{ \ 160219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 161219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 162219089Spjd} 163219089Spjd 164262089Savg#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 165262089Savg 166168404Spjd/* 167219089Spjd * Force reconstruction to use the general purpose method. 168219089Spjd */ 169219089Spjdint vdev_raidz_default_to_general; 170219089Spjd 171252751Sdelphij/* Powers of 2 in the Galois field defined above. */ 172168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = { 173168404Spjd 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 174168404Spjd 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 175168404Spjd 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 176168404Spjd 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 177168404Spjd 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 178168404Spjd 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 179168404Spjd 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 180168404Spjd 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 181168404Spjd 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 182168404Spjd 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 183168404Spjd 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 184168404Spjd 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 185168404Spjd 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 186168404Spjd 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 187168404Spjd 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 188168404Spjd 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 189168404Spjd 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 190168404Spjd 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 191168404Spjd 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 192168404Spjd 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 193168404Spjd 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 194168404Spjd 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 195168404Spjd 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 196168404Spjd 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 197168404Spjd 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 198168404Spjd 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 199168404Spjd 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 200168404Spjd 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 201168404Spjd 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 202168404Spjd 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 203168404Spjd 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 204168404Spjd 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 205168404Spjd}; 206252751Sdelphij/* Logs of 2 in the Galois field defined above. */ 207168404Spjdstatic const uint8_t vdev_raidz_log2[256] = { 208168404Spjd 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 209168404Spjd 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 210168404Spjd 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 211168404Spjd 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 212168404Spjd 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 213168404Spjd 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 214168404Spjd 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 215168404Spjd 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 216168404Spjd 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 217168404Spjd 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 218168404Spjd 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 219168404Spjd 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 220168404Spjd 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 221168404Spjd 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 222168404Spjd 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 223168404Spjd 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 224168404Spjd 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 225168404Spjd 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 226168404Spjd 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 227168404Spjd 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 228168404Spjd 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 229168404Spjd 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 230168404Spjd 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 231168404Spjd 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 232168404Spjd 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 233168404Spjd 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 234168404Spjd 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 235168404Spjd 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 236168404Spjd 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 237168404Spjd 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 238168404Spjd 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 239168404Spjd 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 240168404Spjd}; 241168404Spjd 242219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm); 243219089Spjd 244168404Spjd/* 245168404Spjd * Multiply a given number by 2 raised to the given power. 246168404Spjd */ 247168404Spjdstatic uint8_t 248168404Spjdvdev_raidz_exp2(uint_t a, int exp) 249168404Spjd{ 250168404Spjd if (a == 0) 251168404Spjd return (0); 252168404Spjd 253168404Spjd ASSERT(exp >= 0); 254168404Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 255168404Spjd 256168404Spjd exp += vdev_raidz_log2[a]; 257168404Spjd if (exp > 255) 258168404Spjd exp -= 255; 259168404Spjd 260168404Spjd return (vdev_raidz_pow2[exp]); 261168404Spjd} 262168404Spjd 263185029Spjdstatic void 264219089Spjdvdev_raidz_map_free(raidz_map_t *rm) 265185029Spjd{ 266185029Spjd int c; 267219089Spjd size_t size; 268185029Spjd 269219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 270251419Ssmh if (rm->rm_col[c].rc_data != NULL) 271251419Ssmh zio_buf_free(rm->rm_col[c].rc_data, 272251419Ssmh rm->rm_col[c].rc_size); 273185029Spjd 274219089Spjd if (rm->rm_col[c].rc_gdata != NULL) 275219089Spjd zio_buf_free(rm->rm_col[c].rc_gdata, 276219089Spjd rm->rm_col[c].rc_size); 277219089Spjd } 278219089Spjd 279219089Spjd size = 0; 280219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 281219089Spjd size += rm->rm_col[c].rc_size; 282219089Spjd 283219089Spjd if (rm->rm_datacopy != NULL) 284219089Spjd zio_buf_free(rm->rm_datacopy, size); 285219089Spjd 286219089Spjd kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 287185029Spjd} 288185029Spjd 289219089Spjdstatic void 290219089Spjdvdev_raidz_map_free_vsd(zio_t *zio) 291219089Spjd{ 292219089Spjd raidz_map_t *rm = zio->io_vsd; 293219089Spjd 294243674Smm ASSERT0(rm->rm_freed); 295219089Spjd rm->rm_freed = 1; 296219089Spjd 297219089Spjd if (rm->rm_reports == 0) 298219089Spjd vdev_raidz_map_free(rm); 299219089Spjd} 300219089Spjd 301219089Spjd/*ARGSUSED*/ 302219089Spjdstatic void 303219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored) 304219089Spjd{ 305219089Spjd raidz_map_t *rm = arg; 306219089Spjd 307219089Spjd ASSERT3U(rm->rm_reports, >, 0); 308219089Spjd 309219089Spjd if (--rm->rm_reports == 0 && rm->rm_freed != 0) 310219089Spjd vdev_raidz_map_free(rm); 311219089Spjd} 312219089Spjd 313219089Spjdstatic void 314219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 315219089Spjd{ 316219089Spjd raidz_map_t *rm = zcr->zcr_cbdata; 317219089Spjd size_t c = zcr->zcr_cbinfo; 318219089Spjd size_t x; 319219089Spjd 320219089Spjd const char *good = NULL; 321219089Spjd const char *bad = rm->rm_col[c].rc_data; 322219089Spjd 323219089Spjd if (good_data == NULL) { 324219089Spjd zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 325219089Spjd return; 326219089Spjd } 327219089Spjd 328219089Spjd if (c < rm->rm_firstdatacol) { 329219089Spjd /* 330219089Spjd * The first time through, calculate the parity blocks for 331219089Spjd * the good data (this relies on the fact that the good 332219089Spjd * data never changes for a given logical ZIO) 333219089Spjd */ 334219089Spjd if (rm->rm_col[0].rc_gdata == NULL) { 335219089Spjd char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 336219089Spjd char *buf; 337219089Spjd 338219089Spjd /* 339219089Spjd * Set up the rm_col[]s to generate the parity for 340219089Spjd * good_data, first saving the parity bufs and 341219089Spjd * replacing them with buffers to hold the result. 342219089Spjd */ 343219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) { 344219089Spjd bad_parity[x] = rm->rm_col[x].rc_data; 345219089Spjd rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 346219089Spjd zio_buf_alloc(rm->rm_col[x].rc_size); 347219089Spjd } 348219089Spjd 349219089Spjd /* fill in the data columns from good_data */ 350219089Spjd buf = (char *)good_data; 351219089Spjd for (; x < rm->rm_cols; x++) { 352219089Spjd rm->rm_col[x].rc_data = buf; 353219089Spjd buf += rm->rm_col[x].rc_size; 354219089Spjd } 355219089Spjd 356219089Spjd /* 357219089Spjd * Construct the parity from the good data. 358219089Spjd */ 359219089Spjd vdev_raidz_generate_parity(rm); 360219089Spjd 361219089Spjd /* restore everything back to its original state */ 362219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) 363219089Spjd rm->rm_col[x].rc_data = bad_parity[x]; 364219089Spjd 365219089Spjd buf = rm->rm_datacopy; 366219089Spjd for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 367219089Spjd rm->rm_col[x].rc_data = buf; 368219089Spjd buf += rm->rm_col[x].rc_size; 369219089Spjd } 370219089Spjd } 371219089Spjd 372219089Spjd ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 373219089Spjd good = rm->rm_col[c].rc_gdata; 374219089Spjd } else { 375219089Spjd /* adjust good_data to point at the start of our column */ 376219089Spjd good = good_data; 377219089Spjd 378219089Spjd for (x = rm->rm_firstdatacol; x < c; x++) 379219089Spjd good += rm->rm_col[x].rc_size; 380219089Spjd } 381219089Spjd 382219089Spjd /* we drop the ereport if it ends up that the data was good */ 383219089Spjd zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 384219089Spjd} 385219089Spjd 386219089Spjd/* 387219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called 388219089Spjd * below when our read operation fails completely. The main point 389219089Spjd * is to keep a copy of everything we read from disk, so that at 390219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data. 391219089Spjd */ 392219089Spjdstatic void 393219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 394219089Spjd{ 395219089Spjd size_t c = (size_t)(uintptr_t)arg; 396219089Spjd caddr_t buf; 397219089Spjd 398219089Spjd raidz_map_t *rm = zio->io_vsd; 399219089Spjd size_t size; 400219089Spjd 401219089Spjd /* set up the report and bump the refcount */ 402219089Spjd zcr->zcr_cbdata = rm; 403219089Spjd zcr->zcr_cbinfo = c; 404219089Spjd zcr->zcr_finish = vdev_raidz_cksum_finish; 405219089Spjd zcr->zcr_free = vdev_raidz_cksum_free; 406219089Spjd 407219089Spjd rm->rm_reports++; 408219089Spjd ASSERT3U(rm->rm_reports, >, 0); 409219089Spjd 410219089Spjd if (rm->rm_datacopy != NULL) 411219089Spjd return; 412219089Spjd 413219089Spjd /* 414219089Spjd * It's the first time we're called for this raidz_map_t, so we need 415219089Spjd * to copy the data aside; there's no guarantee that our zio's buffer 416219089Spjd * won't be re-used for something else. 417219089Spjd * 418219089Spjd * Our parity data is already in separate buffers, so there's no need 419219089Spjd * to copy them. 420219089Spjd */ 421219089Spjd 422219089Spjd size = 0; 423219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 424219089Spjd size += rm->rm_col[c].rc_size; 425219089Spjd 426219089Spjd buf = rm->rm_datacopy = zio_buf_alloc(size); 427219089Spjd 428219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 429219089Spjd raidz_col_t *col = &rm->rm_col[c]; 430219089Spjd 431219089Spjd bcopy(col->rc_data, buf, col->rc_size); 432219089Spjd col->rc_data = buf; 433219089Spjd 434219089Spjd buf += col->rc_size; 435219089Spjd } 436219089Spjd ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 437219089Spjd} 438219089Spjd 439219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = { 440219089Spjd vdev_raidz_map_free_vsd, 441219089Spjd vdev_raidz_cksum_report 442219089Spjd}; 443219089Spjd 444252749Sdelphij/* 445252749Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is 446252749Sdelphij * the number of children in the target vdev. 447252749Sdelphij */ 448168404Spjdstatic raidz_map_t * 449262089Savgvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, 450262089Savg uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 451168404Spjd{ 452168404Spjd raidz_map_t *rm; 453252749Sdelphij /* The starting RAIDZ (parent) vdev sector of the block. */ 454262089Savg uint64_t b = offset >> unit_shift; 455252749Sdelphij /* The zio's size in units of the vdev's minimum sector size. */ 456262089Savg uint64_t s = size >> unit_shift; 457252749Sdelphij /* The first column for this stripe. */ 458168404Spjd uint64_t f = b % dcols; 459252749Sdelphij /* The starting byte offset on each child vdev. */ 460168404Spjd uint64_t o = (b / dcols) << unit_shift; 461219089Spjd uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 462168404Spjd 463252749Sdelphij /* 464252749Sdelphij * "Quotient": The number of data sectors for this stripe on all but 465252749Sdelphij * the "big column" child vdevs that also contain "remainder" data. 466252749Sdelphij */ 467168404Spjd q = s / (dcols - nparity); 468252749Sdelphij 469252749Sdelphij /* 470252749Sdelphij * "Remainder": The number of partial stripe data sectors in this I/O. 471252749Sdelphij * This will add a sector to some, but not all, child vdevs. 472252749Sdelphij */ 473168404Spjd r = s - q * (dcols - nparity); 474252749Sdelphij 475252749Sdelphij /* The number of "big columns" - those which contain remainder data. */ 476168404Spjd bc = (r == 0 ? 0 : r + nparity); 477252749Sdelphij 478252749Sdelphij /* 479252749Sdelphij * The total number of data and parity sectors associated with 480252749Sdelphij * this I/O. 481252749Sdelphij */ 482219089Spjd tot = s + nparity * (q + (r == 0 ? 0 : 1)); 483168404Spjd 484252749Sdelphij /* acols: The columns that will be accessed. */ 485252749Sdelphij /* scols: The columns that will be accessed or skipped. */ 486219089Spjd if (q == 0) { 487252749Sdelphij /* Our I/O request doesn't span all child vdevs. */ 488219089Spjd acols = bc; 489219089Spjd scols = MIN(dcols, roundup(bc, nparity + 1)); 490219089Spjd } else { 491219089Spjd acols = dcols; 492219089Spjd scols = dcols; 493219089Spjd } 494168404Spjd 495219089Spjd ASSERT3U(acols, <=, scols); 496168404Spjd 497219089Spjd rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 498219089Spjd 499168404Spjd rm->rm_cols = acols; 500219089Spjd rm->rm_scols = scols; 501168404Spjd rm->rm_bigcols = bc; 502219089Spjd rm->rm_skipstart = bc; 503168404Spjd rm->rm_missingdata = 0; 504168404Spjd rm->rm_missingparity = 0; 505168404Spjd rm->rm_firstdatacol = nparity; 506219089Spjd rm->rm_datacopy = NULL; 507219089Spjd rm->rm_reports = 0; 508219089Spjd rm->rm_freed = 0; 509219089Spjd rm->rm_ecksuminjected = 0; 510168404Spjd 511219089Spjd asize = 0; 512219089Spjd 513219089Spjd for (c = 0; c < scols; c++) { 514168404Spjd col = f + c; 515168404Spjd coff = o; 516168404Spjd if (col >= dcols) { 517168404Spjd col -= dcols; 518168404Spjd coff += 1ULL << unit_shift; 519168404Spjd } 520168404Spjd rm->rm_col[c].rc_devidx = col; 521168404Spjd rm->rm_col[c].rc_offset = coff; 522168404Spjd rm->rm_col[c].rc_data = NULL; 523219089Spjd rm->rm_col[c].rc_gdata = NULL; 524168404Spjd rm->rm_col[c].rc_error = 0; 525168404Spjd rm->rm_col[c].rc_tried = 0; 526168404Spjd rm->rm_col[c].rc_skipped = 0; 527219089Spjd 528219089Spjd if (c >= acols) 529219089Spjd rm->rm_col[c].rc_size = 0; 530219089Spjd else if (c < bc) 531219089Spjd rm->rm_col[c].rc_size = (q + 1) << unit_shift; 532219089Spjd else 533219089Spjd rm->rm_col[c].rc_size = q << unit_shift; 534219089Spjd 535219089Spjd asize += rm->rm_col[c].rc_size; 536168404Spjd } 537168404Spjd 538219089Spjd ASSERT3U(asize, ==, tot << unit_shift); 539219089Spjd rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 540219089Spjd rm->rm_nskip = roundup(tot, nparity + 1) - tot; 541219089Spjd ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 542219089Spjd ASSERT3U(rm->rm_nskip, <=, nparity); 543168404Spjd 544262089Savg if (!dofree) { 545251419Ssmh for (c = 0; c < rm->rm_firstdatacol; c++) { 546251419Ssmh rm->rm_col[c].rc_data = 547251419Ssmh zio_buf_alloc(rm->rm_col[c].rc_size); 548251419Ssmh } 549168404Spjd 550262089Savg rm->rm_col[c].rc_data = data; 551168404Spjd 552251419Ssmh for (c = c + 1; c < acols; c++) { 553251419Ssmh rm->rm_col[c].rc_data = 554251419Ssmh (char *)rm->rm_col[c - 1].rc_data + 555251419Ssmh rm->rm_col[c - 1].rc_size; 556251419Ssmh } 557251419Ssmh } 558168404Spjd 559168404Spjd /* 560168404Spjd * If all data stored spans all columns, there's a danger that parity 561168404Spjd * will always be on the same device and, since parity isn't read 562168404Spjd * during normal operation, that that device's I/O bandwidth won't be 563168404Spjd * used effectively. We therefore switch the parity every 1MB. 564168404Spjd * 565168404Spjd * ... at least that was, ostensibly, the theory. As a practical 566168404Spjd * matter unless we juggle the parity between all devices evenly, we 567168404Spjd * won't see any benefit. Further, occasional writes that aren't a 568168404Spjd * multiple of the LCM of the number of children and the minimum 569168404Spjd * stripe width are sufficient to avoid pessimal behavior. 570168404Spjd * Unfortunately, this decision created an implicit on-disk format 571168404Spjd * requirement that we need to support for all eternity, but only 572168404Spjd * for single-parity RAID-Z. 573219089Spjd * 574219089Spjd * If we intend to skip a sector in the zeroth column for padding 575219089Spjd * we must make sure to note this swap. We will never intend to 576219089Spjd * skip the first column since at least one data and one parity 577219089Spjd * column must appear in each row. 578168404Spjd */ 579168404Spjd ASSERT(rm->rm_cols >= 2); 580168404Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 581168404Spjd 582262089Savg if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 583168404Spjd devidx = rm->rm_col[0].rc_devidx; 584168404Spjd o = rm->rm_col[0].rc_offset; 585168404Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 586168404Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 587168404Spjd rm->rm_col[1].rc_devidx = devidx; 588168404Spjd rm->rm_col[1].rc_offset = o; 589219089Spjd 590219089Spjd if (rm->rm_skipstart == 0) 591219089Spjd rm->rm_skipstart = 1; 592168404Spjd } 593168404Spjd 594168404Spjd return (rm); 595168404Spjd} 596168404Spjd 597168404Spjdstatic void 598168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 599168404Spjd{ 600168404Spjd uint64_t *p, *src, pcount, ccount, i; 601168404Spjd int c; 602168404Spjd 603168404Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 604168404Spjd 605168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 606168404Spjd src = rm->rm_col[c].rc_data; 607168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 608168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 609168404Spjd 610168404Spjd if (c == rm->rm_firstdatacol) { 611168404Spjd ASSERT(ccount == pcount); 612219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 613168404Spjd *p = *src; 614168404Spjd } 615168404Spjd } else { 616168404Spjd ASSERT(ccount <= pcount); 617219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 618168404Spjd *p ^= *src; 619168404Spjd } 620168404Spjd } 621168404Spjd } 622168404Spjd} 623168404Spjd 624168404Spjdstatic void 625168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 626168404Spjd{ 627219089Spjd uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 628168404Spjd int c; 629168404Spjd 630219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 631168404Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 632168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 633168404Spjd 634168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 635168404Spjd src = rm->rm_col[c].rc_data; 636168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 637168404Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 638168404Spjd 639219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 640219089Spjd 641168404Spjd if (c == rm->rm_firstdatacol) { 642219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 643219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 644219089Spjd *p = *src; 645168404Spjd *q = *src; 646219089Spjd } 647219089Spjd for (; i < pcnt; i++, src++, p++, q++) { 648219089Spjd *p = 0; 649219089Spjd *q = 0; 650219089Spjd } 651219089Spjd } else { 652219089Spjd ASSERT(ccnt <= pcnt); 653219089Spjd 654219089Spjd /* 655219089Spjd * Apply the algorithm described above by multiplying 656219089Spjd * the previous result and adding in the new value. 657219089Spjd */ 658219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 659219089Spjd *p ^= *src; 660219089Spjd 661219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 662219089Spjd *q ^= *src; 663219089Spjd } 664219089Spjd 665219089Spjd /* 666219089Spjd * Treat short columns as though they are full of 0s. 667219089Spjd * Note that there's therefore nothing needed for P. 668219089Spjd */ 669219089Spjd for (; i < pcnt; i++, q++) { 670219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 671219089Spjd } 672219089Spjd } 673219089Spjd } 674219089Spjd} 675219089Spjd 676219089Spjdstatic void 677219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm) 678219089Spjd{ 679219089Spjd uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 680219089Spjd int c; 681219089Spjd 682219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 683219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 684219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 685219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 686219089Spjd rm->rm_col[VDEV_RAIDZ_R].rc_size); 687219089Spjd 688219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 689219089Spjd src = rm->rm_col[c].rc_data; 690219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 691219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 692219089Spjd r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 693219089Spjd 694219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 695219089Spjd 696219089Spjd if (c == rm->rm_firstdatacol) { 697219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 698219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 699168404Spjd *p = *src; 700219089Spjd *q = *src; 701219089Spjd *r = *src; 702168404Spjd } 703219089Spjd for (; i < pcnt; i++, src++, p++, q++, r++) { 704219089Spjd *p = 0; 705168404Spjd *q = 0; 706219089Spjd *r = 0; 707168404Spjd } 708168404Spjd } else { 709219089Spjd ASSERT(ccnt <= pcnt); 710168404Spjd 711168404Spjd /* 712219089Spjd * Apply the algorithm described above by multiplying 713219089Spjd * the previous result and adding in the new value. 714168404Spjd */ 715219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 716219089Spjd *p ^= *src; 717219089Spjd 718219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 719168404Spjd *q ^= *src; 720219089Spjd 721219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 722219089Spjd *r ^= *src; 723168404Spjd } 724168404Spjd 725168404Spjd /* 726168404Spjd * Treat short columns as though they are full of 0s. 727219089Spjd * Note that there's therefore nothing needed for P. 728168404Spjd */ 729219089Spjd for (; i < pcnt; i++, q++, r++) { 730219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 731219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 732168404Spjd } 733168404Spjd } 734168404Spjd } 735168404Spjd} 736168404Spjd 737219089Spjd/* 738219089Spjd * Generate RAID parity in the first virtual columns according to the number of 739219089Spjd * parity columns available. 740219089Spjd */ 741168404Spjdstatic void 742219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm) 743168404Spjd{ 744219089Spjd switch (rm->rm_firstdatacol) { 745219089Spjd case 1: 746219089Spjd vdev_raidz_generate_parity_p(rm); 747219089Spjd break; 748219089Spjd case 2: 749219089Spjd vdev_raidz_generate_parity_pq(rm); 750219089Spjd break; 751219089Spjd case 3: 752219089Spjd vdev_raidz_generate_parity_pqr(rm); 753219089Spjd break; 754219089Spjd default: 755219089Spjd cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 756219089Spjd } 757219089Spjd} 758219089Spjd 759219089Spjdstatic int 760219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 761219089Spjd{ 762168404Spjd uint64_t *dst, *src, xcount, ccount, count, i; 763219089Spjd int x = tgts[0]; 764168404Spjd int c; 765168404Spjd 766219089Spjd ASSERT(ntgts == 1); 767219089Spjd ASSERT(x >= rm->rm_firstdatacol); 768219089Spjd ASSERT(x < rm->rm_cols); 769219089Spjd 770168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 771168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 772168404Spjd ASSERT(xcount > 0); 773168404Spjd 774168404Spjd src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 775168404Spjd dst = rm->rm_col[x].rc_data; 776168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 777168404Spjd *dst = *src; 778168404Spjd } 779168404Spjd 780168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 781168404Spjd src = rm->rm_col[c].rc_data; 782168404Spjd dst = rm->rm_col[x].rc_data; 783168404Spjd 784168404Spjd if (c == x) 785168404Spjd continue; 786168404Spjd 787168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 788168404Spjd count = MIN(ccount, xcount); 789168404Spjd 790168404Spjd for (i = 0; i < count; i++, dst++, src++) { 791168404Spjd *dst ^= *src; 792168404Spjd } 793168404Spjd } 794219089Spjd 795219089Spjd return (1 << VDEV_RAIDZ_P); 796168404Spjd} 797168404Spjd 798219089Spjdstatic int 799219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 800168404Spjd{ 801168404Spjd uint64_t *dst, *src, xcount, ccount, count, mask, i; 802168404Spjd uint8_t *b; 803219089Spjd int x = tgts[0]; 804168404Spjd int c, j, exp; 805168404Spjd 806219089Spjd ASSERT(ntgts == 1); 807219089Spjd 808168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 809168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 810168404Spjd 811168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 812168404Spjd src = rm->rm_col[c].rc_data; 813168404Spjd dst = rm->rm_col[x].rc_data; 814168404Spjd 815168404Spjd if (c == x) 816168404Spjd ccount = 0; 817168404Spjd else 818168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 819168404Spjd 820168404Spjd count = MIN(ccount, xcount); 821168404Spjd 822168404Spjd if (c == rm->rm_firstdatacol) { 823168404Spjd for (i = 0; i < count; i++, dst++, src++) { 824168404Spjd *dst = *src; 825168404Spjd } 826168404Spjd for (; i < xcount; i++, dst++) { 827168404Spjd *dst = 0; 828168404Spjd } 829168404Spjd 830168404Spjd } else { 831168404Spjd for (i = 0; i < count; i++, dst++, src++) { 832219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 833168404Spjd *dst ^= *src; 834168404Spjd } 835168404Spjd 836168404Spjd for (; i < xcount; i++, dst++) { 837219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 838168404Spjd } 839168404Spjd } 840168404Spjd } 841168404Spjd 842168404Spjd src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 843168404Spjd dst = rm->rm_col[x].rc_data; 844168404Spjd exp = 255 - (rm->rm_cols - 1 - x); 845168404Spjd 846168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 847168404Spjd *dst ^= *src; 848168404Spjd for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 849168404Spjd *b = vdev_raidz_exp2(*b, exp); 850168404Spjd } 851168404Spjd } 852219089Spjd 853219089Spjd return (1 << VDEV_RAIDZ_Q); 854168404Spjd} 855168404Spjd 856219089Spjdstatic int 857219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 858168404Spjd{ 859168404Spjd uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 860168404Spjd void *pdata, *qdata; 861168404Spjd uint64_t xsize, ysize, i; 862219089Spjd int x = tgts[0]; 863219089Spjd int y = tgts[1]; 864168404Spjd 865219089Spjd ASSERT(ntgts == 2); 866168404Spjd ASSERT(x < y); 867168404Spjd ASSERT(x >= rm->rm_firstdatacol); 868168404Spjd ASSERT(y < rm->rm_cols); 869168404Spjd 870168404Spjd ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 871168404Spjd 872168404Spjd /* 873168404Spjd * Move the parity data aside -- we're going to compute parity as 874168404Spjd * though columns x and y were full of zeros -- Pxy and Qxy. We want to 875168404Spjd * reuse the parity generation mechanism without trashing the actual 876168404Spjd * parity so we make those columns appear to be full of zeros by 877168404Spjd * setting their lengths to zero. 878168404Spjd */ 879168404Spjd pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 880168404Spjd qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 881168404Spjd xsize = rm->rm_col[x].rc_size; 882168404Spjd ysize = rm->rm_col[y].rc_size; 883168404Spjd 884168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = 885168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 886168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = 887168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 888168404Spjd rm->rm_col[x].rc_size = 0; 889168404Spjd rm->rm_col[y].rc_size = 0; 890168404Spjd 891168404Spjd vdev_raidz_generate_parity_pq(rm); 892168404Spjd 893168404Spjd rm->rm_col[x].rc_size = xsize; 894168404Spjd rm->rm_col[y].rc_size = ysize; 895168404Spjd 896168404Spjd p = pdata; 897168404Spjd q = qdata; 898168404Spjd pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 899168404Spjd qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 900168404Spjd xd = rm->rm_col[x].rc_data; 901168404Spjd yd = rm->rm_col[y].rc_data; 902168404Spjd 903168404Spjd /* 904168404Spjd * We now have: 905168404Spjd * Pxy = P + D_x + D_y 906168404Spjd * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 907168404Spjd * 908168404Spjd * We can then solve for D_x: 909168404Spjd * D_x = A * (P + Pxy) + B * (Q + Qxy) 910168404Spjd * where 911168404Spjd * A = 2^(x - y) * (2^(x - y) + 1)^-1 912168404Spjd * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 913168404Spjd * 914168404Spjd * With D_x in hand, we can easily solve for D_y: 915168404Spjd * D_y = P + Pxy + D_x 916168404Spjd */ 917168404Spjd 918168404Spjd a = vdev_raidz_pow2[255 + x - y]; 919168404Spjd b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 920168404Spjd tmp = 255 - vdev_raidz_log2[a ^ 1]; 921168404Spjd 922168404Spjd aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 923168404Spjd bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 924168404Spjd 925168404Spjd for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 926168404Spjd *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 927168404Spjd vdev_raidz_exp2(*q ^ *qxy, bexp); 928168404Spjd 929168404Spjd if (i < ysize) 930168404Spjd *yd = *p ^ *pxy ^ *xd; 931168404Spjd } 932168404Spjd 933168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 934168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_size); 935168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 936168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 937168404Spjd 938168404Spjd /* 939168404Spjd * Restore the saved parity data. 940168404Spjd */ 941168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 942168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 943219089Spjd 944219089Spjd return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 945168404Spjd} 946168404Spjd 947219089Spjd/* BEGIN CSTYLED */ 948219089Spjd/* 949219089Spjd * In the general case of reconstruction, we must solve the system of linear 950219089Spjd * equations defined by the coeffecients used to generate parity as well as 951219089Spjd * the contents of the data and parity disks. This can be expressed with 952219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p) 953219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 954219089Spjd * 955219089Spjd * __ __ __ __ 956219089Spjd * | | __ __ | p_0 | 957219089Spjd * | V | | D_0 | | p_m-1 | 958219089Spjd * | | x | : | = | d_0 | 959219089Spjd * | I | | D_n-1 | | : | 960219089Spjd * | | ~~ ~~ | d_n-1 | 961219089Spjd * ~~ ~~ ~~ ~~ 962219089Spjd * 963219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde 964219089Spjd * matrix defined by the coeffecients we chose for the various parity columns 965219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 966219089Spjd * computation as well as linear separability. 967219089Spjd * 968219089Spjd * __ __ __ __ 969219089Spjd * | 1 .. 1 1 1 | | p_0 | 970219089Spjd * | 2^n-1 .. 4 2 1 | __ __ | : | 971219089Spjd * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 972219089Spjd * | 1 .. 0 0 0 | | D_1 | | d_0 | 973219089Spjd * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 974219089Spjd * | : : : : | | : | | d_2 | 975219089Spjd * | 0 .. 1 0 0 | | D_n-1 | | : | 976219089Spjd * | 0 .. 0 1 0 | ~~ ~~ | : | 977219089Spjd * | 0 .. 0 0 1 | | d_n-1 | 978219089Spjd * ~~ ~~ ~~ ~~ 979219089Spjd * 980219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the 981219089Spjd * matrix and use the known data and parity values to reconstruct the unknown 982219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond 983219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p 984219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up 985219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 986219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity 987219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 988219089Spjd * __ __ 989219089Spjd * | 1 1 1 1 1 1 1 1 | 990219089Spjd * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 991219089Spjd * | 19 205 116 29 64 16 4 1 | / / 992219089Spjd * | 1 0 0 0 0 0 0 0 | / / 993219089Spjd * | 0 1 0 0 0 0 0 0 | <--' / 994219089Spjd * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 995219089Spjd * | 0 0 0 1 0 0 0 0 | 996219089Spjd * | 0 0 0 0 1 0 0 0 | 997219089Spjd * | 0 0 0 0 0 1 0 0 | 998219089Spjd * | 0 0 0 0 0 0 1 0 | 999219089Spjd * | 0 0 0 0 0 0 0 1 | 1000219089Spjd * ~~ ~~ 1001219089Spjd * __ __ 1002219089Spjd * | 1 1 1 1 1 1 1 1 | 1003219089Spjd * | 19 205 116 29 64 16 4 1 | 1004219089Spjd * | 1 0 0 0 0 0 0 0 | 1005262089Savg * (V|I)' = | 0 0 0 1 0 0 0 0 | 1006219089Spjd * | 0 0 0 0 1 0 0 0 | 1007219089Spjd * | 0 0 0 0 0 1 0 0 | 1008219089Spjd * | 0 0 0 0 0 0 1 0 | 1009219089Spjd * | 0 0 0 0 0 0 0 1 | 1010219089Spjd * ~~ ~~ 1011219089Spjd * 1012219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1013219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1014219089Spjd * matrix is not singular. 1015219089Spjd * __ __ 1016219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1017219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1018219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1019219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1020219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1021219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1022219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1023219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1024219089Spjd * ~~ ~~ 1025219089Spjd * __ __ 1026219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1027219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1028219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1029219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1030219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1031219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1032219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1033219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1034219089Spjd * ~~ ~~ 1035219089Spjd * __ __ 1036219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1037219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1038219089Spjd * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1039219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1040219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1041219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1042219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1043219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1044219089Spjd * ~~ ~~ 1045219089Spjd * __ __ 1046219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1047219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1048219089Spjd * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1049219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1050219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1051219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1052219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1053219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1054219089Spjd * ~~ ~~ 1055219089Spjd * __ __ 1056219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1057219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1058219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1059219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1060219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1061219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1062219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1063219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1064219089Spjd * ~~ ~~ 1065219089Spjd * __ __ 1066219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1067219089Spjd * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1068219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1069219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1070219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1071219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1072219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1073219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1074219089Spjd * ~~ ~~ 1075219089Spjd * __ __ 1076219089Spjd * | 0 0 1 0 0 0 0 0 | 1077219089Spjd * | 167 100 5 41 159 169 217 208 | 1078219089Spjd * | 166 100 4 40 158 168 216 209 | 1079219089Spjd * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1080219089Spjd * | 0 0 0 0 1 0 0 0 | 1081219089Spjd * | 0 0 0 0 0 1 0 0 | 1082219089Spjd * | 0 0 0 0 0 0 1 0 | 1083219089Spjd * | 0 0 0 0 0 0 0 1 | 1084219089Spjd * ~~ ~~ 1085219089Spjd * 1086219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1087219089Spjd * of the missing data. 1088219089Spjd * 1089219089Spjd * As is apparent from the example above, the only non-trivial rows in the 1090219089Spjd * inverse matrix correspond to the data disks that we're trying to 1091219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would 1092219089Spjd * only be useful for reconstructing data known or assumed to be valid. For 1093219089Spjd * that reason, we only build the coefficients in the rows that correspond to 1094219089Spjd * targeted columns. 1095219089Spjd */ 1096219089Spjd/* END CSTYLED */ 1097168404Spjd 1098219089Spjdstatic void 1099219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1100219089Spjd uint8_t **rows) 1101219089Spjd{ 1102219089Spjd int i, j; 1103219089Spjd int pow; 1104219089Spjd 1105219089Spjd ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1106219089Spjd 1107219089Spjd /* 1108219089Spjd * Fill in the missing rows of interest. 1109219089Spjd */ 1110219089Spjd for (i = 0; i < nmap; i++) { 1111219089Spjd ASSERT3S(0, <=, map[i]); 1112219089Spjd ASSERT3S(map[i], <=, 2); 1113219089Spjd 1114219089Spjd pow = map[i] * n; 1115219089Spjd if (pow > 255) 1116219089Spjd pow -= 255; 1117219089Spjd ASSERT(pow <= 255); 1118219089Spjd 1119219089Spjd for (j = 0; j < n; j++) { 1120219089Spjd pow -= map[i]; 1121219089Spjd if (pow < 0) 1122219089Spjd pow += 255; 1123219089Spjd rows[i][j] = vdev_raidz_pow2[pow]; 1124219089Spjd } 1125219089Spjd } 1126219089Spjd} 1127219089Spjd 1128219089Spjdstatic void 1129219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1130219089Spjd uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1131219089Spjd{ 1132219089Spjd int i, j, ii, jj; 1133219089Spjd uint8_t log; 1134219089Spjd 1135219089Spjd /* 1136219089Spjd * Assert that the first nmissing entries from the array of used 1137219089Spjd * columns correspond to parity columns and that subsequent entries 1138219089Spjd * correspond to data columns. 1139219089Spjd */ 1140219089Spjd for (i = 0; i < nmissing; i++) { 1141219089Spjd ASSERT3S(used[i], <, rm->rm_firstdatacol); 1142219089Spjd } 1143219089Spjd for (; i < n; i++) { 1144219089Spjd ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1145219089Spjd } 1146219089Spjd 1147219089Spjd /* 1148219089Spjd * First initialize the storage where we'll compute the inverse rows. 1149219089Spjd */ 1150219089Spjd for (i = 0; i < nmissing; i++) { 1151219089Spjd for (j = 0; j < n; j++) { 1152219089Spjd invrows[i][j] = (i == j) ? 1 : 0; 1153219089Spjd } 1154219089Spjd } 1155219089Spjd 1156219089Spjd /* 1157219089Spjd * Subtract all trivial rows from the rows of consequence. 1158219089Spjd */ 1159219089Spjd for (i = 0; i < nmissing; i++) { 1160219089Spjd for (j = nmissing; j < n; j++) { 1161219089Spjd ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1162219089Spjd jj = used[j] - rm->rm_firstdatacol; 1163219089Spjd ASSERT3S(jj, <, n); 1164219089Spjd invrows[i][j] = rows[i][jj]; 1165219089Spjd rows[i][jj] = 0; 1166219089Spjd } 1167219089Spjd } 1168219089Spjd 1169219089Spjd /* 1170219089Spjd * For each of the rows of interest, we must normalize it and subtract 1171219089Spjd * a multiple of it from the other rows. 1172219089Spjd */ 1173219089Spjd for (i = 0; i < nmissing; i++) { 1174219089Spjd for (j = 0; j < missing[i]; j++) { 1175243674Smm ASSERT0(rows[i][j]); 1176219089Spjd } 1177219089Spjd ASSERT3U(rows[i][missing[i]], !=, 0); 1178219089Spjd 1179219089Spjd /* 1180219089Spjd * Compute the inverse of the first element and multiply each 1181219089Spjd * element in the row by that value. 1182219089Spjd */ 1183219089Spjd log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1184219089Spjd 1185219089Spjd for (j = 0; j < n; j++) { 1186219089Spjd rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1187219089Spjd invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1188219089Spjd } 1189219089Spjd 1190219089Spjd for (ii = 0; ii < nmissing; ii++) { 1191219089Spjd if (i == ii) 1192219089Spjd continue; 1193219089Spjd 1194219089Spjd ASSERT3U(rows[ii][missing[i]], !=, 0); 1195219089Spjd 1196219089Spjd log = vdev_raidz_log2[rows[ii][missing[i]]]; 1197219089Spjd 1198219089Spjd for (j = 0; j < n; j++) { 1199219089Spjd rows[ii][j] ^= 1200219089Spjd vdev_raidz_exp2(rows[i][j], log); 1201219089Spjd invrows[ii][j] ^= 1202219089Spjd vdev_raidz_exp2(invrows[i][j], log); 1203219089Spjd } 1204219089Spjd } 1205219089Spjd } 1206219089Spjd 1207219089Spjd /* 1208219089Spjd * Verify that the data that is left in the rows are properly part of 1209219089Spjd * an identity matrix. 1210219089Spjd */ 1211219089Spjd for (i = 0; i < nmissing; i++) { 1212219089Spjd for (j = 0; j < n; j++) { 1213219089Spjd if (j == missing[i]) { 1214219089Spjd ASSERT3U(rows[i][j], ==, 1); 1215219089Spjd } else { 1216243674Smm ASSERT0(rows[i][j]); 1217219089Spjd } 1218219089Spjd } 1219219089Spjd } 1220219089Spjd} 1221219089Spjd 1222219089Spjdstatic void 1223219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1224219089Spjd int *missing, uint8_t **invrows, const uint8_t *used) 1225219089Spjd{ 1226219089Spjd int i, j, x, cc, c; 1227219089Spjd uint8_t *src; 1228219089Spjd uint64_t ccount; 1229219089Spjd uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1230219089Spjd uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1231248369Smm uint8_t log = 0; 1232248369Smm uint8_t val; 1233219089Spjd int ll; 1234219089Spjd uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1235219089Spjd uint8_t *p, *pp; 1236219089Spjd size_t psize; 1237219089Spjd 1238219089Spjd psize = sizeof (invlog[0][0]) * n * nmissing; 1239219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1240219089Spjd 1241219089Spjd for (pp = p, i = 0; i < nmissing; i++) { 1242219089Spjd invlog[i] = pp; 1243219089Spjd pp += n; 1244219089Spjd } 1245219089Spjd 1246219089Spjd for (i = 0; i < nmissing; i++) { 1247219089Spjd for (j = 0; j < n; j++) { 1248219089Spjd ASSERT3U(invrows[i][j], !=, 0); 1249219089Spjd invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1250219089Spjd } 1251219089Spjd } 1252219089Spjd 1253219089Spjd for (i = 0; i < n; i++) { 1254219089Spjd c = used[i]; 1255219089Spjd ASSERT3U(c, <, rm->rm_cols); 1256219089Spjd 1257219089Spjd src = rm->rm_col[c].rc_data; 1258219089Spjd ccount = rm->rm_col[c].rc_size; 1259219089Spjd for (j = 0; j < nmissing; j++) { 1260219089Spjd cc = missing[j] + rm->rm_firstdatacol; 1261219089Spjd ASSERT3U(cc, >=, rm->rm_firstdatacol); 1262219089Spjd ASSERT3U(cc, <, rm->rm_cols); 1263219089Spjd ASSERT3U(cc, !=, c); 1264219089Spjd 1265219089Spjd dst[j] = rm->rm_col[cc].rc_data; 1266219089Spjd dcount[j] = rm->rm_col[cc].rc_size; 1267219089Spjd } 1268219089Spjd 1269219089Spjd ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1270219089Spjd 1271219089Spjd for (x = 0; x < ccount; x++, src++) { 1272219089Spjd if (*src != 0) 1273219089Spjd log = vdev_raidz_log2[*src]; 1274219089Spjd 1275219089Spjd for (cc = 0; cc < nmissing; cc++) { 1276219089Spjd if (x >= dcount[cc]) 1277219089Spjd continue; 1278219089Spjd 1279219089Spjd if (*src == 0) { 1280219089Spjd val = 0; 1281219089Spjd } else { 1282219089Spjd if ((ll = log + invlog[cc][i]) >= 255) 1283219089Spjd ll -= 255; 1284219089Spjd val = vdev_raidz_pow2[ll]; 1285219089Spjd } 1286219089Spjd 1287219089Spjd if (i == 0) 1288219089Spjd dst[cc][x] = val; 1289219089Spjd else 1290219089Spjd dst[cc][x] ^= val; 1291219089Spjd } 1292219089Spjd } 1293219089Spjd } 1294219089Spjd 1295219089Spjd kmem_free(p, psize); 1296219089Spjd} 1297219089Spjd 1298168404Spjdstatic int 1299219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1300219089Spjd{ 1301219089Spjd int n, i, c, t, tt; 1302219089Spjd int nmissing_rows; 1303219089Spjd int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1304219089Spjd int parity_map[VDEV_RAIDZ_MAXPARITY]; 1305219089Spjd 1306219089Spjd uint8_t *p, *pp; 1307219089Spjd size_t psize; 1308219089Spjd 1309219089Spjd uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1310219089Spjd uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1311219089Spjd uint8_t *used; 1312219089Spjd 1313219089Spjd int code = 0; 1314219089Spjd 1315219089Spjd 1316219089Spjd n = rm->rm_cols - rm->rm_firstdatacol; 1317219089Spjd 1318219089Spjd /* 1319219089Spjd * Figure out which data columns are missing. 1320219089Spjd */ 1321219089Spjd nmissing_rows = 0; 1322219089Spjd for (t = 0; t < ntgts; t++) { 1323219089Spjd if (tgts[t] >= rm->rm_firstdatacol) { 1324219089Spjd missing_rows[nmissing_rows++] = 1325219089Spjd tgts[t] - rm->rm_firstdatacol; 1326219089Spjd } 1327219089Spjd } 1328219089Spjd 1329219089Spjd /* 1330219089Spjd * Figure out which parity columns to use to help generate the missing 1331219089Spjd * data columns. 1332219089Spjd */ 1333219089Spjd for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1334219089Spjd ASSERT(tt < ntgts); 1335219089Spjd ASSERT(c < rm->rm_firstdatacol); 1336219089Spjd 1337219089Spjd /* 1338219089Spjd * Skip any targeted parity columns. 1339219089Spjd */ 1340219089Spjd if (c == tgts[tt]) { 1341219089Spjd tt++; 1342219089Spjd continue; 1343219089Spjd } 1344219089Spjd 1345219089Spjd code |= 1 << c; 1346219089Spjd 1347219089Spjd parity_map[i] = c; 1348219089Spjd i++; 1349219089Spjd } 1350219089Spjd 1351219089Spjd ASSERT(code != 0); 1352219089Spjd ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1353219089Spjd 1354219089Spjd psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1355219089Spjd nmissing_rows * n + sizeof (used[0]) * n; 1356219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1357219089Spjd 1358219089Spjd for (pp = p, i = 0; i < nmissing_rows; i++) { 1359219089Spjd rows[i] = pp; 1360219089Spjd pp += n; 1361219089Spjd invrows[i] = pp; 1362219089Spjd pp += n; 1363219089Spjd } 1364219089Spjd used = pp; 1365219089Spjd 1366219089Spjd for (i = 0; i < nmissing_rows; i++) { 1367219089Spjd used[i] = parity_map[i]; 1368219089Spjd } 1369219089Spjd 1370219089Spjd for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1371219089Spjd if (tt < nmissing_rows && 1372219089Spjd c == missing_rows[tt] + rm->rm_firstdatacol) { 1373219089Spjd tt++; 1374219089Spjd continue; 1375219089Spjd } 1376219089Spjd 1377219089Spjd ASSERT3S(i, <, n); 1378219089Spjd used[i] = c; 1379219089Spjd i++; 1380219089Spjd } 1381219089Spjd 1382219089Spjd /* 1383219089Spjd * Initialize the interesting rows of the matrix. 1384219089Spjd */ 1385219089Spjd vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1386219089Spjd 1387219089Spjd /* 1388219089Spjd * Invert the matrix. 1389219089Spjd */ 1390219089Spjd vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1391219089Spjd invrows, used); 1392219089Spjd 1393219089Spjd /* 1394219089Spjd * Reconstruct the missing data using the generated matrix. 1395219089Spjd */ 1396219089Spjd vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1397219089Spjd invrows, used); 1398219089Spjd 1399219089Spjd kmem_free(p, psize); 1400219089Spjd 1401219089Spjd return (code); 1402219089Spjd} 1403219089Spjd 1404219089Spjdstatic int 1405219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1406219089Spjd{ 1407219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1408219089Spjd int ntgts; 1409219089Spjd int i, c; 1410219089Spjd int code; 1411219089Spjd int nbadparity, nbaddata; 1412219089Spjd int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1413219089Spjd 1414219089Spjd /* 1415219089Spjd * The tgts list must already be sorted. 1416219089Spjd */ 1417219089Spjd for (i = 1; i < nt; i++) { 1418219089Spjd ASSERT(t[i] > t[i - 1]); 1419219089Spjd } 1420219089Spjd 1421219089Spjd nbadparity = rm->rm_firstdatacol; 1422219089Spjd nbaddata = rm->rm_cols - nbadparity; 1423219089Spjd ntgts = 0; 1424219089Spjd for (i = 0, c = 0; c < rm->rm_cols; c++) { 1425219089Spjd if (c < rm->rm_firstdatacol) 1426219089Spjd parity_valid[c] = B_FALSE; 1427219089Spjd 1428219089Spjd if (i < nt && c == t[i]) { 1429219089Spjd tgts[ntgts++] = c; 1430219089Spjd i++; 1431219089Spjd } else if (rm->rm_col[c].rc_error != 0) { 1432219089Spjd tgts[ntgts++] = c; 1433219089Spjd } else if (c >= rm->rm_firstdatacol) { 1434219089Spjd nbaddata--; 1435219089Spjd } else { 1436219089Spjd parity_valid[c] = B_TRUE; 1437219089Spjd nbadparity--; 1438219089Spjd } 1439219089Spjd } 1440219089Spjd 1441219089Spjd ASSERT(ntgts >= nt); 1442219089Spjd ASSERT(nbaddata >= 0); 1443219089Spjd ASSERT(nbaddata + nbadparity == ntgts); 1444219089Spjd 1445219089Spjd dt = &tgts[nbadparity]; 1446219089Spjd 1447219089Spjd /* 1448219089Spjd * See if we can use any of our optimized reconstruction routines. 1449219089Spjd */ 1450219089Spjd if (!vdev_raidz_default_to_general) { 1451219089Spjd switch (nbaddata) { 1452219089Spjd case 1: 1453219089Spjd if (parity_valid[VDEV_RAIDZ_P]) 1454219089Spjd return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1455219089Spjd 1456219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1457219089Spjd 1458219089Spjd if (parity_valid[VDEV_RAIDZ_Q]) 1459219089Spjd return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1460219089Spjd 1461219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1462219089Spjd break; 1463219089Spjd 1464219089Spjd case 2: 1465219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1466219089Spjd 1467219089Spjd if (parity_valid[VDEV_RAIDZ_P] && 1468219089Spjd parity_valid[VDEV_RAIDZ_Q]) 1469219089Spjd return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1470219089Spjd 1471219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1472219089Spjd 1473219089Spjd break; 1474219089Spjd } 1475219089Spjd } 1476219089Spjd 1477219089Spjd code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1478219089Spjd ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1479219089Spjd ASSERT(code > 0); 1480219089Spjd return (code); 1481219089Spjd} 1482219089Spjd 1483219089Spjdstatic int 1484236839Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1485262081Savg uint64_t *logical_ashift, uint64_t *physical_ashift) 1486168404Spjd{ 1487168404Spjd vdev_t *cvd; 1488168404Spjd uint64_t nparity = vd->vdev_nparity; 1489219089Spjd int c; 1490168404Spjd int lasterror = 0; 1491168404Spjd int numerrors = 0; 1492168404Spjd 1493168404Spjd ASSERT(nparity > 0); 1494168404Spjd 1495168404Spjd if (nparity > VDEV_RAIDZ_MAXPARITY || 1496168404Spjd vd->vdev_children < nparity + 1) { 1497168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1498249643Smm return (SET_ERROR(EINVAL)); 1499168404Spjd } 1500168404Spjd 1501219089Spjd vdev_open_children(vd); 1502219089Spjd 1503168404Spjd for (c = 0; c < vd->vdev_children; c++) { 1504168404Spjd cvd = vd->vdev_child[c]; 1505168404Spjd 1506219089Spjd if (cvd->vdev_open_error != 0) { 1507219089Spjd lasterror = cvd->vdev_open_error; 1508168404Spjd numerrors++; 1509168404Spjd continue; 1510168404Spjd } 1511168404Spjd 1512168404Spjd *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1513236839Smm *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1514262081Savg *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1515262081Savg *physical_ashift = MAX(*physical_ashift, 1516262081Savg cvd->vdev_physical_ashift); 1517168404Spjd } 1518168404Spjd 1519168404Spjd *asize *= vd->vdev_children; 1520236839Smm *max_asize *= vd->vdev_children; 1521168404Spjd 1522168404Spjd if (numerrors > nparity) { 1523168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1524168404Spjd return (lasterror); 1525168404Spjd } 1526168404Spjd 1527168404Spjd return (0); 1528168404Spjd} 1529168404Spjd 1530168404Spjdstatic void 1531168404Spjdvdev_raidz_close(vdev_t *vd) 1532168404Spjd{ 1533168404Spjd int c; 1534168404Spjd 1535168404Spjd for (c = 0; c < vd->vdev_children; c++) 1536168404Spjd vdev_close(vd->vdev_child[c]); 1537168404Spjd} 1538168404Spjd 1539262089Savg#ifdef illumos 1540262089Savg/* 1541262089Savg * Handle a read or write I/O to a RAID-Z dump device. 1542262089Savg * 1543262089Savg * The dump device is in a unique situation compared to other ZFS datasets: 1544262089Savg * writing to this device should be as simple and fast as possible. In 1545262089Savg * addition, durability matters much less since the dump will be extracted 1546262089Savg * once the machine reboots. For that reason, this function eschews parity for 1547262089Savg * performance and simplicity. The dump device uses the checksum setting 1548262089Savg * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1549262089Savg * dataset. 1550262089Savg * 1551262089Savg * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1552262089Savg * 128 KB will not fill an entire block; in addition, they may not be properly 1553262089Savg * aligned. In that case, this function uses the preallocated 128 KB block and 1554262089Savg * omits reading or writing any "empty" portions of that block, as opposed to 1555262089Savg * allocating a fresh appropriately-sized block. 1556262089Savg * 1557262089Savg * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1558262089Savg * 1559262089Savg * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1560262089Savg * 1561262089Savg * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1562262089Savg * allocated which spans all five child vdevs. 8 KB of data would be written to 1563262089Savg * each of four vdevs, with the fifth containing the parity bits. 1564262089Savg * 1565262089Savg * parity data data data data 1566262089Savg * | PP | XX | XX | XX | XX | 1567262089Savg * ^ ^ ^ ^ ^ 1568262089Savg * | | | | | 1569262089Savg * 8 KB parity ------8 KB data blocks------ 1570262089Savg * 1571262089Savg * However, when writing to the dump device, the behavior is different: 1572262089Savg * 1573262089Savg * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1574262089Savg * 1575262089Savg * Unlike the normal RAID-Z case in which the block is allocated based on the 1576262089Savg * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1577262089Savg * I/O size is less than 128 KB, only the actual portions of data are written. 1578262089Savg * In this example the data is written to the third data vdev since that vdev 1579262089Savg * contains the offset [64 KB, 96 KB). 1580262089Savg * 1581262089Savg * parity data data data data 1582262089Savg * | | | | XX | | 1583262089Savg * ^ 1584262089Savg * | 1585262089Savg * 32 KB data block 1586262089Savg * 1587262089Savg * As a result, an individual I/O may not span all child vdevs; moreover, a 1588262089Savg * small I/O may only operate on a single child vdev. 1589262089Savg * 1590262089Savg * Note that since there are no parity bits calculated or written, this format 1591262089Savg * remains the same no matter how many parity bits are used in a normal RAID-Z 1592262089Savg * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1593262089Savg * would look like: 1594262089Savg * 1595262089Savg * parity parity parity data data data data 1596262089Savg * | | | | | | XX | | 1597262089Savg * ^ 1598262089Savg * | 1599262089Savg * 32 KB data block 1600262089Savg */ 1601262089Savgint 1602262089Savgvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1603262089Savg uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1604262089Savg{ 1605262089Savg vdev_t *tvd = vd->vdev_top; 1606262089Savg vdev_t *cvd; 1607262089Savg raidz_map_t *rm; 1608262089Savg raidz_col_t *rc; 1609262089Savg int c, err = 0; 1610262089Savg 1611262089Savg uint64_t start, end, colstart, colend; 1612262089Savg uint64_t coloffset, colsize, colskip; 1613262089Savg 1614262089Savg int flags = doread ? BIO_READ : BIO_WRITE; 1615262089Savg 1616262089Savg#ifdef _KERNEL 1617262089Savg 1618262089Savg /* 1619262089Savg * Don't write past the end of the block 1620262089Savg */ 1621262089Savg VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); 1622262089Savg 1623262089Savg start = offset; 1624262089Savg end = start + size; 1625262089Savg 1626262089Savg /* 1627262089Savg * Allocate a RAID-Z map for this block. Note that this block starts 1628262089Savg * from the "original" offset, this is, the offset of the extent which 1629262089Savg * contains the requisite offset of the data being read or written. 1630262089Savg * 1631262089Savg * Even if this I/O operation doesn't span the full block size, let's 1632262089Savg * treat the on-disk format as if the only blocks are the complete 128 1633262089Savg * KB size. 1634262089Savg */ 1635262089Savg rm = vdev_raidz_map_alloc(data - (offset - origoffset), 1636262089Savg SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children, 1637262089Savg vd->vdev_nparity); 1638262089Savg 1639262089Savg coloffset = origoffset; 1640262089Savg 1641262089Savg for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1642262089Savg c++, coloffset += rc->rc_size) { 1643262089Savg rc = &rm->rm_col[c]; 1644262089Savg cvd = vd->vdev_child[rc->rc_devidx]; 1645262089Savg 1646262089Savg /* 1647262089Savg * Find the start and end of this column in the RAID-Z map, 1648262089Savg * keeping in mind that the stated size and offset of the 1649262089Savg * operation may not fill the entire column for this vdev. 1650262089Savg * 1651262089Savg * If any portion of the data spans this column, issue the 1652262089Savg * appropriate operation to the vdev. 1653262089Savg */ 1654262089Savg if (coloffset + rc->rc_size <= start) 1655262089Savg continue; 1656262089Savg if (coloffset >= end) 1657262089Savg continue; 1658262089Savg 1659262089Savg colstart = MAX(coloffset, start); 1660262089Savg colend = MIN(end, coloffset + rc->rc_size); 1661262089Savg colsize = colend - colstart; 1662262089Savg colskip = colstart - coloffset; 1663262089Savg 1664262089Savg VERIFY3U(colsize, <=, rc->rc_size); 1665262089Savg VERIFY3U(colskip, <=, rc->rc_size); 1666262089Savg 1667262089Savg /* 1668262089Savg * Note that the child vdev will have a vdev label at the start 1669262089Savg * of its range of offsets, hence the need for 1670262089Savg * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1671262089Savg * example of why this calculation is needed. 1672262089Savg */ 1673262089Savg if ((err = vdev_disk_physio(cvd, 1674262089Savg ((char *)rc->rc_data) + colskip, colsize, 1675262089Savg VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1676262089Savg flags, isdump)) != 0) 1677262089Savg break; 1678262089Savg } 1679262089Savg 1680262089Savg vdev_raidz_map_free(rm); 1681262089Savg#endif /* KERNEL */ 1682262089Savg 1683262089Savg return (err); 1684262089Savg} 1685262089Savg#endif 1686262089Savg 1687168404Spjdstatic uint64_t 1688168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize) 1689168404Spjd{ 1690168404Spjd uint64_t asize; 1691168404Spjd uint64_t ashift = vd->vdev_top->vdev_ashift; 1692168404Spjd uint64_t cols = vd->vdev_children; 1693168404Spjd uint64_t nparity = vd->vdev_nparity; 1694168404Spjd 1695168404Spjd asize = ((psize - 1) >> ashift) + 1; 1696168404Spjd asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1697168404Spjd asize = roundup(asize, nparity + 1) << ashift; 1698168404Spjd 1699168404Spjd return (asize); 1700168404Spjd} 1701168404Spjd 1702168404Spjdstatic void 1703168404Spjdvdev_raidz_child_done(zio_t *zio) 1704168404Spjd{ 1705168404Spjd raidz_col_t *rc = zio->io_private; 1706168404Spjd 1707168404Spjd rc->rc_error = zio->io_error; 1708168404Spjd rc->rc_tried = 1; 1709168404Spjd rc->rc_skipped = 0; 1710168404Spjd} 1711168404Spjd 1712252749Sdelphij/* 1713252749Sdelphij * Start an IO operation on a RAIDZ VDev 1714252749Sdelphij * 1715252749Sdelphij * Outline: 1716252749Sdelphij * - For write operations: 1717252749Sdelphij * 1. Generate the parity data 1718252749Sdelphij * 2. Create child zio write operations to each column's vdev, for both 1719252749Sdelphij * data and parity. 1720252749Sdelphij * 3. If the column skips any sectors for padding, create optional dummy 1721252749Sdelphij * write zio children for those areas to improve aggregation continuity. 1722252749Sdelphij * - For read operations: 1723252749Sdelphij * 1. Create child zio read operations to each data column's vdev to read 1724252749Sdelphij * the range of data required for zio. 1725252749Sdelphij * 2. If this is a scrub or resilver operation, or if any of the data 1726252749Sdelphij * vdevs have had errors, then create zio read operations to the parity 1727252749Sdelphij * columns' VDevs as well. 1728252749Sdelphij */ 1729185029Spjdstatic int 1730168404Spjdvdev_raidz_io_start(zio_t *zio) 1731168404Spjd{ 1732168404Spjd vdev_t *vd = zio->io_vd; 1733168404Spjd vdev_t *tvd = vd->vdev_top; 1734168404Spjd vdev_t *cvd; 1735168404Spjd raidz_map_t *rm; 1736168404Spjd raidz_col_t *rc; 1737219089Spjd int c, i; 1738168404Spjd 1739262089Savg rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, 1740262089Savg zio->io_type == ZIO_TYPE_FREE, 1741262089Savg tvd->vdev_ashift, vd->vdev_children, 1742168404Spjd vd->vdev_nparity); 1743168404Spjd 1744262089Savg zio->io_vsd = rm; 1745262089Savg zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1746262089Savg 1747168404Spjd ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1748168404Spjd 1749251419Ssmh if (zio->io_type == ZIO_TYPE_FREE) { 1750251419Ssmh for (c = 0; c < rm->rm_cols; c++) { 1751251419Ssmh rc = &rm->rm_col[c]; 1752251419Ssmh cvd = vd->vdev_child[rc->rc_devidx]; 1753251419Ssmh zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1754251419Ssmh rc->rc_offset, rc->rc_data, rc->rc_size, 1755251419Ssmh zio->io_type, zio->io_priority, 0, 1756251419Ssmh vdev_raidz_child_done, rc)); 1757251419Ssmh } 1758251419Ssmh return (ZIO_PIPELINE_CONTINUE); 1759251419Ssmh } 1760251419Ssmh 1761168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 1762219089Spjd vdev_raidz_generate_parity(rm); 1763168404Spjd 1764168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1765168404Spjd rc = &rm->rm_col[c]; 1766168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1767168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1768168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1769185029Spjd zio->io_type, zio->io_priority, 0, 1770168404Spjd vdev_raidz_child_done, rc)); 1771168404Spjd } 1772185029Spjd 1773219089Spjd /* 1774219089Spjd * Generate optional I/Os for any skipped sectors to improve 1775219089Spjd * aggregation contiguity. 1776219089Spjd */ 1777219089Spjd for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1778219089Spjd ASSERT(c <= rm->rm_scols); 1779219089Spjd if (c == rm->rm_scols) 1780219089Spjd c = 0; 1781219089Spjd rc = &rm->rm_col[c]; 1782219089Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1783219089Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1784219089Spjd rc->rc_offset + rc->rc_size, NULL, 1785219089Spjd 1 << tvd->vdev_ashift, 1786219089Spjd zio->io_type, zio->io_priority, 1787219089Spjd ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1788219089Spjd } 1789219089Spjd 1790185029Spjd return (ZIO_PIPELINE_CONTINUE); 1791168404Spjd } 1792168404Spjd 1793168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 1794168404Spjd 1795168404Spjd /* 1796168404Spjd * Iterate over the columns in reverse order so that we hit the parity 1797219089Spjd * last -- any errors along the way will force us to read the parity. 1798168404Spjd */ 1799168404Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 1800168404Spjd rc = &rm->rm_col[c]; 1801168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1802185029Spjd if (!vdev_readable(cvd)) { 1803168404Spjd if (c >= rm->rm_firstdatacol) 1804168404Spjd rm->rm_missingdata++; 1805168404Spjd else 1806168404Spjd rm->rm_missingparity++; 1807249643Smm rc->rc_error = SET_ERROR(ENXIO); 1808168404Spjd rc->rc_tried = 1; /* don't even try */ 1809168404Spjd rc->rc_skipped = 1; 1810168404Spjd continue; 1811168404Spjd } 1812219089Spjd if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1813168404Spjd if (c >= rm->rm_firstdatacol) 1814168404Spjd rm->rm_missingdata++; 1815168404Spjd else 1816168404Spjd rm->rm_missingparity++; 1817249643Smm rc->rc_error = SET_ERROR(ESTALE); 1818168404Spjd rc->rc_skipped = 1; 1819168404Spjd continue; 1820168404Spjd } 1821168404Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1822209095Smm (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1823168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1824168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1825185029Spjd zio->io_type, zio->io_priority, 0, 1826168404Spjd vdev_raidz_child_done, rc)); 1827168404Spjd } 1828168404Spjd } 1829168404Spjd 1830185029Spjd return (ZIO_PIPELINE_CONTINUE); 1831168404Spjd} 1832168404Spjd 1833219089Spjd 1834168404Spjd/* 1835168404Spjd * Report a checksum error for a child of a RAID-Z device. 1836168404Spjd */ 1837168404Spjdstatic void 1838219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1839168404Spjd{ 1840168404Spjd vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1841168404Spjd 1842168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1843219089Spjd zio_bad_cksum_t zbc; 1844219089Spjd raidz_map_t *rm = zio->io_vsd; 1845219089Spjd 1846168404Spjd mutex_enter(&vd->vdev_stat_lock); 1847168404Spjd vd->vdev_stat.vs_checksum_errors++; 1848168404Spjd mutex_exit(&vd->vdev_stat_lock); 1849219089Spjd 1850219089Spjd zbc.zbc_has_cksum = 0; 1851219089Spjd zbc.zbc_injected = rm->rm_ecksuminjected; 1852219089Spjd 1853219089Spjd zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1854219089Spjd rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1855219089Spjd &zbc); 1856168404Spjd } 1857219089Spjd} 1858168404Spjd 1859219089Spjd/* 1860219089Spjd * We keep track of whether or not there were any injected errors, so that 1861219089Spjd * any ereports we generate can note it. 1862219089Spjd */ 1863219089Spjdstatic int 1864219089Spjdraidz_checksum_verify(zio_t *zio) 1865219089Spjd{ 1866219089Spjd zio_bad_cksum_t zbc; 1867219089Spjd raidz_map_t *rm = zio->io_vsd; 1868219089Spjd 1869219089Spjd int ret = zio_checksum_error(zio, &zbc); 1870219089Spjd if (ret != 0 && zbc.zbc_injected != 0) 1871219089Spjd rm->rm_ecksuminjected = 1; 1872219089Spjd 1873219089Spjd return (ret); 1874168404Spjd} 1875168404Spjd 1876168404Spjd/* 1877168404Spjd * Generate the parity from the data columns. If we tried and were able to 1878168404Spjd * read the parity without error, verify that the generated parity matches the 1879168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 1880168404Spjd * number such failures. 1881168404Spjd */ 1882168404Spjdstatic int 1883168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1884168404Spjd{ 1885168404Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1886168404Spjd int c, ret = 0; 1887168404Spjd raidz_col_t *rc; 1888168404Spjd 1889262089Savg blkptr_t *bp = zio->io_bp; 1890262089Savg enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1891262089Savg (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1892262089Savg 1893262089Savg if (checksum == ZIO_CHECKSUM_NOPARITY) 1894262089Savg return (ret); 1895262089Savg 1896168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1897168404Spjd rc = &rm->rm_col[c]; 1898168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1899168404Spjd continue; 1900168404Spjd orig[c] = zio_buf_alloc(rc->rc_size); 1901168404Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 1902168404Spjd } 1903168404Spjd 1904219089Spjd vdev_raidz_generate_parity(rm); 1905168404Spjd 1906168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1907168404Spjd rc = &rm->rm_col[c]; 1908168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1909168404Spjd continue; 1910168404Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1911219089Spjd raidz_checksum_error(zio, rc, orig[c]); 1912249643Smm rc->rc_error = SET_ERROR(ECKSUM); 1913168404Spjd ret++; 1914168404Spjd } 1915168404Spjd zio_buf_free(orig[c], rc->rc_size); 1916168404Spjd } 1917168404Spjd 1918168404Spjd return (ret); 1919168404Spjd} 1920168404Spjd 1921219089Spjd/* 1922219089Spjd * Keep statistics on all the ways that we used parity to correct data. 1923219089Spjd */ 1924219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1925168404Spjd 1926185029Spjdstatic int 1927185029Spjdvdev_raidz_worst_error(raidz_map_t *rm) 1928185029Spjd{ 1929185029Spjd int error = 0; 1930185029Spjd 1931185029Spjd for (int c = 0; c < rm->rm_cols; c++) 1932185029Spjd error = zio_worst_error(error, rm->rm_col[c].rc_error); 1933185029Spjd 1934185029Spjd return (error); 1935185029Spjd} 1936185029Spjd 1937219089Spjd/* 1938219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction. 1939219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into 1940219089Spjd * account how reconstruction is actually performed. For example, with 1941219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1942219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both 1943219089Spjd * cases we'd only use parity information in column 0. 1944219089Spjd */ 1945219089Spjdstatic int 1946219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1947219089Spjd{ 1948219089Spjd raidz_map_t *rm = zio->io_vsd; 1949219089Spjd raidz_col_t *rc; 1950219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1951219089Spjd int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1952219089Spjd int *tgts = &tstore[1]; 1953219089Spjd int current, next, i, c, n; 1954219089Spjd int code, ret = 0; 1955219089Spjd 1956219089Spjd ASSERT(total_errors < rm->rm_firstdatacol); 1957219089Spjd 1958219089Spjd /* 1959219089Spjd * This simplifies one edge condition. 1960219089Spjd */ 1961219089Spjd tgts[-1] = -1; 1962219089Spjd 1963219089Spjd for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1964219089Spjd /* 1965219089Spjd * Initialize the targets array by finding the first n columns 1966219089Spjd * that contain no error. 1967219089Spjd * 1968219089Spjd * If there were no data errors, we need to ensure that we're 1969219089Spjd * always explicitly attempting to reconstruct at least one 1970219089Spjd * data column. To do this, we simply push the highest target 1971219089Spjd * up into the data columns. 1972219089Spjd */ 1973219089Spjd for (c = 0, i = 0; i < n; i++) { 1974219089Spjd if (i == n - 1 && data_errors == 0 && 1975219089Spjd c < rm->rm_firstdatacol) { 1976219089Spjd c = rm->rm_firstdatacol; 1977219089Spjd } 1978219089Spjd 1979219089Spjd while (rm->rm_col[c].rc_error != 0) { 1980219089Spjd c++; 1981219089Spjd ASSERT3S(c, <, rm->rm_cols); 1982219089Spjd } 1983219089Spjd 1984219089Spjd tgts[i] = c++; 1985219089Spjd } 1986219089Spjd 1987219089Spjd /* 1988219089Spjd * Setting tgts[n] simplifies the other edge condition. 1989219089Spjd */ 1990219089Spjd tgts[n] = rm->rm_cols; 1991219089Spjd 1992219089Spjd /* 1993219089Spjd * These buffers were allocated in previous iterations. 1994219089Spjd */ 1995219089Spjd for (i = 0; i < n - 1; i++) { 1996219089Spjd ASSERT(orig[i] != NULL); 1997219089Spjd } 1998219089Spjd 1999219089Spjd orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2000219089Spjd 2001219089Spjd current = 0; 2002219089Spjd next = tgts[current]; 2003219089Spjd 2004219089Spjd while (current != n) { 2005219089Spjd tgts[current] = next; 2006219089Spjd current = 0; 2007219089Spjd 2008219089Spjd /* 2009219089Spjd * Save off the original data that we're going to 2010219089Spjd * attempt to reconstruct. 2011219089Spjd */ 2012219089Spjd for (i = 0; i < n; i++) { 2013219089Spjd ASSERT(orig[i] != NULL); 2014219089Spjd c = tgts[i]; 2015219089Spjd ASSERT3S(c, >=, 0); 2016219089Spjd ASSERT3S(c, <, rm->rm_cols); 2017219089Spjd rc = &rm->rm_col[c]; 2018219089Spjd bcopy(rc->rc_data, orig[i], rc->rc_size); 2019219089Spjd } 2020219089Spjd 2021219089Spjd /* 2022219089Spjd * Attempt a reconstruction and exit the outer loop on 2023219089Spjd * success. 2024219089Spjd */ 2025219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2026219089Spjd if (raidz_checksum_verify(zio) == 0) { 2027219089Spjd atomic_inc_64(&raidz_corrected[code]); 2028219089Spjd 2029219089Spjd for (i = 0; i < n; i++) { 2030219089Spjd c = tgts[i]; 2031219089Spjd rc = &rm->rm_col[c]; 2032219089Spjd ASSERT(rc->rc_error == 0); 2033219089Spjd if (rc->rc_tried) 2034219089Spjd raidz_checksum_error(zio, rc, 2035219089Spjd orig[i]); 2036249643Smm rc->rc_error = SET_ERROR(ECKSUM); 2037219089Spjd } 2038219089Spjd 2039219089Spjd ret = code; 2040219089Spjd goto done; 2041219089Spjd } 2042219089Spjd 2043219089Spjd /* 2044219089Spjd * Restore the original data. 2045219089Spjd */ 2046219089Spjd for (i = 0; i < n; i++) { 2047219089Spjd c = tgts[i]; 2048219089Spjd rc = &rm->rm_col[c]; 2049219089Spjd bcopy(orig[i], rc->rc_data, rc->rc_size); 2050219089Spjd } 2051219089Spjd 2052219089Spjd do { 2053219089Spjd /* 2054219089Spjd * Find the next valid column after the current 2055219089Spjd * position.. 2056219089Spjd */ 2057219089Spjd for (next = tgts[current] + 1; 2058219089Spjd next < rm->rm_cols && 2059219089Spjd rm->rm_col[next].rc_error != 0; next++) 2060219089Spjd continue; 2061219089Spjd 2062219089Spjd ASSERT(next <= tgts[current + 1]); 2063219089Spjd 2064219089Spjd /* 2065219089Spjd * If that spot is available, we're done here. 2066219089Spjd */ 2067219089Spjd if (next != tgts[current + 1]) 2068219089Spjd break; 2069219089Spjd 2070219089Spjd /* 2071219089Spjd * Otherwise, find the next valid column after 2072219089Spjd * the previous position. 2073219089Spjd */ 2074219089Spjd for (c = tgts[current - 1] + 1; 2075219089Spjd rm->rm_col[c].rc_error != 0; c++) 2076219089Spjd continue; 2077219089Spjd 2078219089Spjd tgts[current] = c; 2079219089Spjd current++; 2080219089Spjd 2081219089Spjd } while (current != n); 2082219089Spjd } 2083219089Spjd } 2084219089Spjd n--; 2085219089Spjddone: 2086219089Spjd for (i = 0; i < n; i++) { 2087219089Spjd zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2088219089Spjd } 2089219089Spjd 2090219089Spjd return (ret); 2091219089Spjd} 2092219089Spjd 2093252749Sdelphij/* 2094252749Sdelphij * Complete an IO operation on a RAIDZ VDev 2095252749Sdelphij * 2096252749Sdelphij * Outline: 2097252749Sdelphij * - For write operations: 2098252749Sdelphij * 1. Check for errors on the child IOs. 2099252749Sdelphij * 2. Return, setting an error code if too few child VDevs were written 2100252749Sdelphij * to reconstruct the data later. Note that partial writes are 2101252749Sdelphij * considered successful if they can be reconstructed at all. 2102252749Sdelphij * - For read operations: 2103252749Sdelphij * 1. Check for errors on the child IOs. 2104252749Sdelphij * 2. If data errors occurred: 2105252749Sdelphij * a. Try to reassemble the data from the parity available. 2106252749Sdelphij * b. If we haven't yet read the parity drives, read them now. 2107252749Sdelphij * c. If all parity drives have been read but the data still doesn't 2108252749Sdelphij * reassemble with a correct checksum, then try combinatorial 2109252749Sdelphij * reconstruction. 2110252749Sdelphij * d. If that doesn't work, return an error. 2111252749Sdelphij * 3. If there were unexpected errors or this is a resilver operation, 2112252749Sdelphij * rewrite the vdevs that had errors. 2113252749Sdelphij */ 2114168404Spjdstatic void 2115168404Spjdvdev_raidz_io_done(zio_t *zio) 2116168404Spjd{ 2117168404Spjd vdev_t *vd = zio->io_vd; 2118168404Spjd vdev_t *cvd; 2119168404Spjd raidz_map_t *rm = zio->io_vsd; 2120219089Spjd raidz_col_t *rc; 2121168404Spjd int unexpected_errors = 0; 2122168404Spjd int parity_errors = 0; 2123168404Spjd int parity_untried = 0; 2124168404Spjd int data_errors = 0; 2125185029Spjd int total_errors = 0; 2126219089Spjd int n, c; 2127219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 2128219089Spjd int code; 2129168404Spjd 2130168404Spjd ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2131168404Spjd 2132168404Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2133168404Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2134168404Spjd 2135168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2136168404Spjd rc = &rm->rm_col[c]; 2137168404Spjd 2138168404Spjd if (rc->rc_error) { 2139185029Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2140168404Spjd 2141168404Spjd if (c < rm->rm_firstdatacol) 2142168404Spjd parity_errors++; 2143168404Spjd else 2144168404Spjd data_errors++; 2145168404Spjd 2146168404Spjd if (!rc->rc_skipped) 2147168404Spjd unexpected_errors++; 2148168404Spjd 2149185029Spjd total_errors++; 2150168404Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2151168404Spjd parity_untried++; 2152168404Spjd } 2153168404Spjd } 2154168404Spjd 2155168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 2156168404Spjd /* 2157185029Spjd * XXX -- for now, treat partial writes as a success. 2158185029Spjd * (If we couldn't write enough columns to reconstruct 2159185029Spjd * the data, the I/O failed. Otherwise, good enough.) 2160185029Spjd * 2161185029Spjd * Now that we support write reallocation, it would be better 2162185029Spjd * to treat partial failure as real failure unless there are 2163185029Spjd * no non-degraded top-level vdevs left, and not update DTLs 2164185029Spjd * if we intend to reallocate. 2165168404Spjd */ 2166168404Spjd /* XXPOLICY */ 2167185029Spjd if (total_errors > rm->rm_firstdatacol) 2168185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2169168404Spjd 2170168404Spjd return; 2171251419Ssmh } else if (zio->io_type == ZIO_TYPE_FREE) { 2172251419Ssmh return; 2173168404Spjd } 2174168404Spjd 2175168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 2176168404Spjd /* 2177168404Spjd * There are three potential phases for a read: 2178168404Spjd * 1. produce valid data from the columns read 2179168404Spjd * 2. read all disks and try again 2180168404Spjd * 3. perform combinatorial reconstruction 2181168404Spjd * 2182168404Spjd * Each phase is progressively both more expensive and less likely to 2183168404Spjd * occur. If we encounter more errors than we can repair or all phases 2184168404Spjd * fail, we have no choice but to return an error. 2185168404Spjd */ 2186168404Spjd 2187168404Spjd /* 2188168404Spjd * If the number of errors we saw was correctable -- less than or equal 2189168404Spjd * to the number of parity disks read -- attempt to produce data that 2190168404Spjd * has a valid checksum. Naturally, this case applies in the absence of 2191168404Spjd * any errors. 2192168404Spjd */ 2193185029Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2194219089Spjd if (data_errors == 0) { 2195219089Spjd if (raidz_checksum_verify(zio) == 0) { 2196168738Spjd /* 2197168738Spjd * If we read parity information (unnecessarily 2198168738Spjd * as it happens since no reconstruction was 2199168738Spjd * needed) regenerate and verify the parity. 2200168738Spjd * We also regenerate parity when resilvering 2201168738Spjd * so we can write it out to the failed device 2202168738Spjd * later. 2203168738Spjd */ 2204168404Spjd if (parity_errors + parity_untried < 2205168738Spjd rm->rm_firstdatacol || 2206168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2207168404Spjd n = raidz_parity_verify(zio, rm); 2208168404Spjd unexpected_errors += n; 2209168404Spjd ASSERT(parity_errors + n <= 2210168404Spjd rm->rm_firstdatacol); 2211168404Spjd } 2212168404Spjd goto done; 2213168404Spjd } 2214219089Spjd } else { 2215168404Spjd /* 2216168404Spjd * We either attempt to read all the parity columns or 2217168404Spjd * none of them. If we didn't try to read parity, we 2218168404Spjd * wouldn't be here in the correctable case. There must 2219168404Spjd * also have been fewer parity errors than parity 2220168404Spjd * columns or, again, we wouldn't be in this code path. 2221168404Spjd */ 2222168404Spjd ASSERT(parity_untried == 0); 2223168404Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 2224168404Spjd 2225168404Spjd /* 2226219089Spjd * Identify the data columns that reported an error. 2227168404Spjd */ 2228219089Spjd n = 0; 2229168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2230168404Spjd rc = &rm->rm_col[c]; 2231219089Spjd if (rc->rc_error != 0) { 2232219089Spjd ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2233219089Spjd tgts[n++] = c; 2234219089Spjd } 2235168404Spjd } 2236168404Spjd 2237219089Spjd ASSERT(rm->rm_firstdatacol >= n); 2238168404Spjd 2239219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2240168404Spjd 2241219089Spjd if (raidz_checksum_verify(zio) == 0) { 2242219089Spjd atomic_inc_64(&raidz_corrected[code]); 2243219089Spjd 2244168404Spjd /* 2245219089Spjd * If we read more parity disks than were used 2246219089Spjd * for reconstruction, confirm that the other 2247219089Spjd * parity disks produced correct data. This 2248219089Spjd * routine is suboptimal in that it regenerates 2249219089Spjd * the parity that we already used in addition 2250219089Spjd * to the parity that we're attempting to 2251219089Spjd * verify, but this should be a relatively 2252219089Spjd * uncommon case, and can be optimized if it 2253219089Spjd * becomes a problem. Note that we regenerate 2254219089Spjd * parity when resilvering so we can write it 2255219089Spjd * out to failed devices later. 2256168404Spjd */ 2257219089Spjd if (parity_errors < rm->rm_firstdatacol - n || 2258168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2259168404Spjd n = raidz_parity_verify(zio, rm); 2260168404Spjd unexpected_errors += n; 2261168404Spjd ASSERT(parity_errors + n <= 2262168404Spjd rm->rm_firstdatacol); 2263168404Spjd } 2264168404Spjd 2265168404Spjd goto done; 2266168404Spjd } 2267168404Spjd } 2268168404Spjd } 2269168404Spjd 2270168404Spjd /* 2271168404Spjd * This isn't a typical situation -- either we got a read error or 2272168404Spjd * a child silently returned bad data. Read every block so we can 2273168404Spjd * try again with as much data and parity as we can track down. If 2274168404Spjd * we've already been through once before, all children will be marked 2275168404Spjd * as tried so we'll proceed to combinatorial reconstruction. 2276168404Spjd */ 2277168404Spjd unexpected_errors = 1; 2278168404Spjd rm->rm_missingdata = 0; 2279168404Spjd rm->rm_missingparity = 0; 2280168404Spjd 2281168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2282168404Spjd if (rm->rm_col[c].rc_tried) 2283168404Spjd continue; 2284168404Spjd 2285168404Spjd zio_vdev_io_redone(zio); 2286168404Spjd do { 2287168404Spjd rc = &rm->rm_col[c]; 2288168404Spjd if (rc->rc_tried) 2289168404Spjd continue; 2290168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, 2291168404Spjd vd->vdev_child[rc->rc_devidx], 2292168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2293185029Spjd zio->io_type, zio->io_priority, 0, 2294168404Spjd vdev_raidz_child_done, rc)); 2295168404Spjd } while (++c < rm->rm_cols); 2296185029Spjd 2297168404Spjd return; 2298168404Spjd } 2299168404Spjd 2300168404Spjd /* 2301168404Spjd * At this point we've attempted to reconstruct the data given the 2302168404Spjd * errors we detected, and we've attempted to read all columns. There 2303168404Spjd * must, therefore, be one or more additional problems -- silent errors 2304168404Spjd * resulting in invalid data rather than explicit I/O errors resulting 2305219089Spjd * in absent data. We check if there is enough additional data to 2306219089Spjd * possibly reconstruct the data and then perform combinatorial 2307219089Spjd * reconstruction over all possible combinations. If that fails, 2308219089Spjd * we're cooked. 2309168404Spjd */ 2310219089Spjd if (total_errors > rm->rm_firstdatacol) { 2311185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2312168404Spjd 2313219089Spjd } else if (total_errors < rm->rm_firstdatacol && 2314219089Spjd (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2315168404Spjd /* 2316219089Spjd * If we didn't use all the available parity for the 2317219089Spjd * combinatorial reconstruction, verify that the remaining 2318219089Spjd * parity is correct. 2319168404Spjd */ 2320219089Spjd if (code != (1 << rm->rm_firstdatacol) - 1) 2321219089Spjd (void) raidz_parity_verify(zio, rm); 2322219089Spjd } else { 2323168404Spjd /* 2324219089Spjd * We're here because either: 2325219089Spjd * 2326219089Spjd * total_errors == rm_first_datacol, or 2327219089Spjd * vdev_raidz_combrec() failed 2328219089Spjd * 2329219089Spjd * In either case, there is enough bad data to prevent 2330219089Spjd * reconstruction. 2331219089Spjd * 2332219089Spjd * Start checksum ereports for all children which haven't 2333219089Spjd * failed, and the IO wasn't speculative. 2334168404Spjd */ 2335249643Smm zio->io_error = SET_ERROR(ECKSUM); 2336168404Spjd 2337219089Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2338219089Spjd for (c = 0; c < rm->rm_cols; c++) { 2339219089Spjd rc = &rm->rm_col[c]; 2340219089Spjd if (rc->rc_error == 0) { 2341219089Spjd zio_bad_cksum_t zbc; 2342219089Spjd zbc.zbc_has_cksum = 0; 2343219089Spjd zbc.zbc_injected = 2344219089Spjd rm->rm_ecksuminjected; 2345168404Spjd 2346219089Spjd zfs_ereport_start_checksum( 2347219089Spjd zio->io_spa, 2348219089Spjd vd->vdev_child[rc->rc_devidx], 2349219089Spjd zio, rc->rc_offset, rc->rc_size, 2350219089Spjd (void *)(uintptr_t)c, &zbc); 2351168404Spjd } 2352168404Spjd } 2353168404Spjd } 2354168404Spjd } 2355168404Spjd 2356168404Spjddone: 2357168404Spjd zio_checksum_verified(zio); 2358168404Spjd 2359209962Smm if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2360168404Spjd (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2361168404Spjd /* 2362168404Spjd * Use the good data we have in hand to repair damaged children. 2363168404Spjd */ 2364168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2365168404Spjd rc = &rm->rm_col[c]; 2366168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 2367168404Spjd 2368168404Spjd if (rc->rc_error == 0) 2369168404Spjd continue; 2370168404Spjd 2371185029Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2372168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2373260764Savg ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2374209962Smm ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2375209962Smm ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2376168404Spjd } 2377168404Spjd } 2378168404Spjd} 2379168404Spjd 2380168404Spjdstatic void 2381168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2382168404Spjd{ 2383168404Spjd if (faulted > vd->vdev_nparity) 2384168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2385168404Spjd VDEV_AUX_NO_REPLICAS); 2386168404Spjd else if (degraded + faulted != 0) 2387168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2388168404Spjd else 2389168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2390168404Spjd} 2391168404Spjd 2392168404Spjdvdev_ops_t vdev_raidz_ops = { 2393168404Spjd vdev_raidz_open, 2394168404Spjd vdev_raidz_close, 2395168404Spjd vdev_raidz_asize, 2396168404Spjd vdev_raidz_io_start, 2397168404Spjd vdev_raidz_io_done, 2398168404Spjd vdev_raidz_state_change, 2399219089Spjd NULL, 2400219089Spjd NULL, 2401168404Spjd VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2402168404Spjd B_FALSE /* not a leaf vdev */ 2403168404Spjd}; 2404