1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 25255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26168404Spjd */ 27168404Spjd 28168404Spjd#include <sys/zfs_context.h> 29168404Spjd#include <sys/spa.h> 30168404Spjd#include <sys/vdev_impl.h> 31255750Sdelphij#ifdef illumos 32255750Sdelphij#include <sys/vdev_disk.h> 33255750Sdelphij#endif 34255750Sdelphij#include <sys/vdev_file.h> 35255750Sdelphij#include <sys/vdev_raidz.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/fs/zfs.h> 39168404Spjd#include <sys/fm/fs/zfs.h> 40255750Sdelphij#include <sys/bio.h> 41168404Spjd 42168404Spjd/* 43168404Spjd * Virtual device vector for RAID-Z. 44168404Spjd * 45219089Spjd * This vdev supports single, double, and triple parity. For single parity, 46219089Spjd * we use a simple XOR of all the data columns. For double or triple parity, 47219089Spjd * we use a special case of Reed-Solomon coding. This extends the 48219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 49219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 50219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 51219089Spjd * former is also based. The latter is designed to provide higher performance 52219089Spjd * for writes. 53168404Spjd * 54219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then 55219089Spjd * amended six years later identifying a critical flaw that invalidates its 56219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to 57219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to 58219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 59219089Spjd * is viable, but the additional complexity means that write performance will 60219089Spjd * suffer. 61219089Spjd * 62219089Spjd * All of the methods above operate on a Galois field, defined over the 63219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 64219089Spjd * can be expressed with a single byte. Briefly, the operations on the 65219089Spjd * field are defined as follows: 66219089Spjd * 67168404Spjd * o addition (+) is represented by a bitwise XOR 68168404Spjd * o subtraction (-) is therefore identical to addition: A + B = A - B 69168404Spjd * o multiplication of A by 2 is defined by the following bitwise expression: 70251631Sdelphij * 71168404Spjd * (A * 2)_7 = A_6 72168404Spjd * (A * 2)_6 = A_5 73168404Spjd * (A * 2)_5 = A_4 74168404Spjd * (A * 2)_4 = A_3 + A_7 75168404Spjd * (A * 2)_3 = A_2 + A_7 76168404Spjd * (A * 2)_2 = A_1 + A_7 77168404Spjd * (A * 2)_1 = A_0 78168404Spjd * (A * 2)_0 = A_7 79168404Spjd * 80168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 81219089Spjd * As an aside, this multiplication is derived from the error correcting 82219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 83168404Spjd * 84168404Spjd * Observe that any number in the field (except for 0) can be expressed as a 85168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of 86168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 87168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 88219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore 89219089Spjd * A ^ (255 - 1) = A^254. 90168404Spjd * 91219089Spjd * The up-to-three parity columns, P, Q, R over several data columns, 92219089Spjd * D_0, ... D_n-1, can be expressed by field operations: 93168404Spjd * 94168404Spjd * P = D_0 + D_1 + ... + D_n-2 + D_n-1 95168404Spjd * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 96168404Spjd * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 97219089Spjd * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 98219089Spjd * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 99168404Spjd * 100219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 101219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 102219089Spjd * independent coefficients. (There are no additional coefficients that have 103219089Spjd * this property which is why the uncorrected Plank method breaks down.) 104219089Spjd * 105219089Spjd * See the reconstruction code below for how P, Q and R can used individually 106219089Spjd * or in concert to recover missing data columns. 107168404Spjd */ 108168404Spjd 109168404Spjdtypedef struct raidz_col { 110168404Spjd uint64_t rc_devidx; /* child device index for I/O */ 111168404Spjd uint64_t rc_offset; /* device offset */ 112168404Spjd uint64_t rc_size; /* I/O size */ 113168404Spjd void *rc_data; /* I/O data */ 114219089Spjd void *rc_gdata; /* used to store the "good" version */ 115168404Spjd int rc_error; /* I/O error for this device */ 116168404Spjd uint8_t rc_tried; /* Did we attempt this I/O column? */ 117168404Spjd uint8_t rc_skipped; /* Did we skip this I/O column? */ 118168404Spjd} raidz_col_t; 119168404Spjd 120168404Spjdtypedef struct raidz_map { 121219089Spjd uint64_t rm_cols; /* Regular column count */ 122219089Spjd uint64_t rm_scols; /* Count including skipped columns */ 123168404Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 124168404Spjd uint64_t rm_asize; /* Actual total I/O size */ 125168404Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 126168404Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 127168404Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 128219089Spjd uint64_t rm_nskip; /* Skipped sectors for padding */ 129251631Sdelphij uint64_t rm_skipstart; /* Column index of padding start */ 130219089Spjd void *rm_datacopy; /* rm_asize-buffer of copied data */ 131219089Spjd uintptr_t rm_reports; /* # of referencing checksum reports */ 132219089Spjd uint8_t rm_freed; /* map no longer has referencing ZIO */ 133219089Spjd uint8_t rm_ecksuminjected; /* checksum error was injected */ 134168404Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 135168404Spjd} raidz_map_t; 136168404Spjd 137168404Spjd#define VDEV_RAIDZ_P 0 138168404Spjd#define VDEV_RAIDZ_Q 1 139219089Spjd#define VDEV_RAIDZ_R 2 140168404Spjd 141219089Spjd#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 142219089Spjd#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 143168404Spjd 144219089Spjd/* 145219089Spjd * We provide a mechanism to perform the field multiplication operation on a 146219089Spjd * 64-bit value all at once rather than a byte at a time. This works by 147219089Spjd * creating a mask from the top bit in each byte and using that to 148219089Spjd * conditionally apply the XOR of 0x1d. 149219089Spjd */ 150219089Spjd#define VDEV_RAIDZ_64MUL_2(x, mask) \ 151219089Spjd{ \ 152219089Spjd (mask) = (x) & 0x8080808080808080ULL; \ 153219089Spjd (mask) = ((mask) << 1) - ((mask) >> 7); \ 154219089Spjd (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 155219089Spjd ((mask) & 0x1d1d1d1d1d1d1d1d); \ 156219089Spjd} 157168404Spjd 158219089Spjd#define VDEV_RAIDZ_64MUL_4(x, mask) \ 159219089Spjd{ \ 160219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 161219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 162219089Spjd} 163219089Spjd 164255750Sdelphij#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 165255750Sdelphij 166168404Spjd/* 167219089Spjd * Force reconstruction to use the general purpose method. 168219089Spjd */ 169219089Spjdint vdev_raidz_default_to_general; 170219089Spjd 171251631Sdelphij/* Powers of 2 in the Galois field defined above. */ 172168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = { 173168404Spjd 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 174168404Spjd 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 175168404Spjd 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 176168404Spjd 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 177168404Spjd 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 178168404Spjd 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 179168404Spjd 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 180168404Spjd 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 181168404Spjd 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 182168404Spjd 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 183168404Spjd 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 184168404Spjd 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 185168404Spjd 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 186168404Spjd 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 187168404Spjd 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 188168404Spjd 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 189168404Spjd 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 190168404Spjd 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 191168404Spjd 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 192168404Spjd 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 193168404Spjd 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 194168404Spjd 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 195168404Spjd 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 196168404Spjd 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 197168404Spjd 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 198168404Spjd 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 199168404Spjd 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 200168404Spjd 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 201168404Spjd 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 202168404Spjd 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 203168404Spjd 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 204168404Spjd 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 205168404Spjd}; 206251631Sdelphij/* Logs of 2 in the Galois field defined above. */ 207168404Spjdstatic const uint8_t vdev_raidz_log2[256] = { 208168404Spjd 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 209168404Spjd 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 210168404Spjd 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 211168404Spjd 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 212168404Spjd 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 213168404Spjd 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 214168404Spjd 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 215168404Spjd 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 216168404Spjd 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 217168404Spjd 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 218168404Spjd 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 219168404Spjd 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 220168404Spjd 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 221168404Spjd 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 222168404Spjd 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 223168404Spjd 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 224168404Spjd 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 225168404Spjd 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 226168404Spjd 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 227168404Spjd 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 228168404Spjd 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 229168404Spjd 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 230168404Spjd 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 231168404Spjd 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 232168404Spjd 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 233168404Spjd 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 234168404Spjd 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 235168404Spjd 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 236168404Spjd 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 237168404Spjd 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 238168404Spjd 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 239168404Spjd 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 240168404Spjd}; 241168404Spjd 242219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm); 243219089Spjd 244168404Spjd/* 245168404Spjd * Multiply a given number by 2 raised to the given power. 246168404Spjd */ 247168404Spjdstatic uint8_t 248168404Spjdvdev_raidz_exp2(uint_t a, int exp) 249168404Spjd{ 250168404Spjd if (a == 0) 251168404Spjd return (0); 252168404Spjd 253168404Spjd ASSERT(exp >= 0); 254168404Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 255168404Spjd 256168404Spjd exp += vdev_raidz_log2[a]; 257168404Spjd if (exp > 255) 258168404Spjd exp -= 255; 259168404Spjd 260168404Spjd return (vdev_raidz_pow2[exp]); 261168404Spjd} 262168404Spjd 263185029Spjdstatic void 264219089Spjdvdev_raidz_map_free(raidz_map_t *rm) 265185029Spjd{ 266185029Spjd int c; 267219089Spjd size_t size; 268185029Spjd 269219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 270240868Spjd if (rm->rm_col[c].rc_data != NULL) 271240868Spjd zio_buf_free(rm->rm_col[c].rc_data, 272240868Spjd rm->rm_col[c].rc_size); 273185029Spjd 274219089Spjd if (rm->rm_col[c].rc_gdata != NULL) 275219089Spjd zio_buf_free(rm->rm_col[c].rc_gdata, 276219089Spjd rm->rm_col[c].rc_size); 277219089Spjd } 278219089Spjd 279219089Spjd size = 0; 280219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 281219089Spjd size += rm->rm_col[c].rc_size; 282219089Spjd 283219089Spjd if (rm->rm_datacopy != NULL) 284219089Spjd zio_buf_free(rm->rm_datacopy, size); 285219089Spjd 286219089Spjd kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 287185029Spjd} 288185029Spjd 289219089Spjdstatic void 290219089Spjdvdev_raidz_map_free_vsd(zio_t *zio) 291219089Spjd{ 292219089Spjd raidz_map_t *rm = zio->io_vsd; 293219089Spjd 294240415Smm ASSERT0(rm->rm_freed); 295219089Spjd rm->rm_freed = 1; 296219089Spjd 297219089Spjd if (rm->rm_reports == 0) 298219089Spjd vdev_raidz_map_free(rm); 299219089Spjd} 300219089Spjd 301219089Spjd/*ARGSUSED*/ 302219089Spjdstatic void 303219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored) 304219089Spjd{ 305219089Spjd raidz_map_t *rm = arg; 306219089Spjd 307219089Spjd ASSERT3U(rm->rm_reports, >, 0); 308219089Spjd 309219089Spjd if (--rm->rm_reports == 0 && rm->rm_freed != 0) 310219089Spjd vdev_raidz_map_free(rm); 311219089Spjd} 312219089Spjd 313219089Spjdstatic void 314219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 315219089Spjd{ 316219089Spjd raidz_map_t *rm = zcr->zcr_cbdata; 317219089Spjd size_t c = zcr->zcr_cbinfo; 318219089Spjd size_t x; 319219089Spjd 320219089Spjd const char *good = NULL; 321219089Spjd const char *bad = rm->rm_col[c].rc_data; 322219089Spjd 323219089Spjd if (good_data == NULL) { 324219089Spjd zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 325219089Spjd return; 326219089Spjd } 327219089Spjd 328219089Spjd if (c < rm->rm_firstdatacol) { 329219089Spjd /* 330219089Spjd * The first time through, calculate the parity blocks for 331219089Spjd * the good data (this relies on the fact that the good 332219089Spjd * data never changes for a given logical ZIO) 333219089Spjd */ 334219089Spjd if (rm->rm_col[0].rc_gdata == NULL) { 335219089Spjd char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 336219089Spjd char *buf; 337219089Spjd 338219089Spjd /* 339219089Spjd * Set up the rm_col[]s to generate the parity for 340219089Spjd * good_data, first saving the parity bufs and 341219089Spjd * replacing them with buffers to hold the result. 342219089Spjd */ 343219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) { 344219089Spjd bad_parity[x] = rm->rm_col[x].rc_data; 345219089Spjd rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 346219089Spjd zio_buf_alloc(rm->rm_col[x].rc_size); 347219089Spjd } 348219089Spjd 349219089Spjd /* fill in the data columns from good_data */ 350219089Spjd buf = (char *)good_data; 351219089Spjd for (; x < rm->rm_cols; x++) { 352219089Spjd rm->rm_col[x].rc_data = buf; 353219089Spjd buf += rm->rm_col[x].rc_size; 354219089Spjd } 355219089Spjd 356219089Spjd /* 357219089Spjd * Construct the parity from the good data. 358219089Spjd */ 359219089Spjd vdev_raidz_generate_parity(rm); 360219089Spjd 361219089Spjd /* restore everything back to its original state */ 362219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) 363219089Spjd rm->rm_col[x].rc_data = bad_parity[x]; 364219089Spjd 365219089Spjd buf = rm->rm_datacopy; 366219089Spjd for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 367219089Spjd rm->rm_col[x].rc_data = buf; 368219089Spjd buf += rm->rm_col[x].rc_size; 369219089Spjd } 370219089Spjd } 371219089Spjd 372219089Spjd ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 373219089Spjd good = rm->rm_col[c].rc_gdata; 374219089Spjd } else { 375219089Spjd /* adjust good_data to point at the start of our column */ 376219089Spjd good = good_data; 377219089Spjd 378219089Spjd for (x = rm->rm_firstdatacol; x < c; x++) 379219089Spjd good += rm->rm_col[x].rc_size; 380219089Spjd } 381219089Spjd 382219089Spjd /* we drop the ereport if it ends up that the data was good */ 383219089Spjd zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 384219089Spjd} 385219089Spjd 386219089Spjd/* 387219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called 388219089Spjd * below when our read operation fails completely. The main point 389219089Spjd * is to keep a copy of everything we read from disk, so that at 390219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data. 391219089Spjd */ 392219089Spjdstatic void 393219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 394219089Spjd{ 395219089Spjd size_t c = (size_t)(uintptr_t)arg; 396219089Spjd caddr_t buf; 397219089Spjd 398219089Spjd raidz_map_t *rm = zio->io_vsd; 399219089Spjd size_t size; 400219089Spjd 401219089Spjd /* set up the report and bump the refcount */ 402219089Spjd zcr->zcr_cbdata = rm; 403219089Spjd zcr->zcr_cbinfo = c; 404219089Spjd zcr->zcr_finish = vdev_raidz_cksum_finish; 405219089Spjd zcr->zcr_free = vdev_raidz_cksum_free; 406219089Spjd 407219089Spjd rm->rm_reports++; 408219089Spjd ASSERT3U(rm->rm_reports, >, 0); 409219089Spjd 410219089Spjd if (rm->rm_datacopy != NULL) 411219089Spjd return; 412219089Spjd 413219089Spjd /* 414219089Spjd * It's the first time we're called for this raidz_map_t, so we need 415219089Spjd * to copy the data aside; there's no guarantee that our zio's buffer 416219089Spjd * won't be re-used for something else. 417219089Spjd * 418219089Spjd * Our parity data is already in separate buffers, so there's no need 419219089Spjd * to copy them. 420219089Spjd */ 421219089Spjd 422219089Spjd size = 0; 423219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 424219089Spjd size += rm->rm_col[c].rc_size; 425219089Spjd 426219089Spjd buf = rm->rm_datacopy = zio_buf_alloc(size); 427219089Spjd 428219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 429219089Spjd raidz_col_t *col = &rm->rm_col[c]; 430219089Spjd 431219089Spjd bcopy(col->rc_data, buf, col->rc_size); 432219089Spjd col->rc_data = buf; 433219089Spjd 434219089Spjd buf += col->rc_size; 435219089Spjd } 436219089Spjd ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 437219089Spjd} 438219089Spjd 439219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = { 440219089Spjd vdev_raidz_map_free_vsd, 441219089Spjd vdev_raidz_cksum_report 442219089Spjd}; 443219089Spjd 444251629Sdelphij/* 445251629Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is 446251629Sdelphij * the number of children in the target vdev. 447251629Sdelphij */ 448168404Spjdstatic raidz_map_t * 449255750Sdelphijvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, 450255750Sdelphij uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 451168404Spjd{ 452168404Spjd raidz_map_t *rm; 453251629Sdelphij /* The starting RAIDZ (parent) vdev sector of the block. */ 454255750Sdelphij uint64_t b = offset >> unit_shift; 455251629Sdelphij /* The zio's size in units of the vdev's minimum sector size. */ 456255750Sdelphij uint64_t s = size >> unit_shift; 457251629Sdelphij /* The first column for this stripe. */ 458168404Spjd uint64_t f = b % dcols; 459251629Sdelphij /* The starting byte offset on each child vdev. */ 460168404Spjd uint64_t o = (b / dcols) << unit_shift; 461219089Spjd uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 462168404Spjd 463251629Sdelphij /* 464251629Sdelphij * "Quotient": The number of data sectors for this stripe on all but 465251629Sdelphij * the "big column" child vdevs that also contain "remainder" data. 466251629Sdelphij */ 467168404Spjd q = s / (dcols - nparity); 468251629Sdelphij 469251629Sdelphij /* 470251629Sdelphij * "Remainder": The number of partial stripe data sectors in this I/O. 471251629Sdelphij * This will add a sector to some, but not all, child vdevs. 472251629Sdelphij */ 473168404Spjd r = s - q * (dcols - nparity); 474251629Sdelphij 475251629Sdelphij /* The number of "big columns" - those which contain remainder data. */ 476168404Spjd bc = (r == 0 ? 0 : r + nparity); 477251629Sdelphij 478251629Sdelphij /* 479251629Sdelphij * The total number of data and parity sectors associated with 480251629Sdelphij * this I/O. 481251629Sdelphij */ 482219089Spjd tot = s + nparity * (q + (r == 0 ? 0 : 1)); 483168404Spjd 484251629Sdelphij /* acols: The columns that will be accessed. */ 485251629Sdelphij /* scols: The columns that will be accessed or skipped. */ 486219089Spjd if (q == 0) { 487251629Sdelphij /* Our I/O request doesn't span all child vdevs. */ 488219089Spjd acols = bc; 489219089Spjd scols = MIN(dcols, roundup(bc, nparity + 1)); 490219089Spjd } else { 491219089Spjd acols = dcols; 492219089Spjd scols = dcols; 493219089Spjd } 494168404Spjd 495219089Spjd ASSERT3U(acols, <=, scols); 496168404Spjd 497219089Spjd rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 498219089Spjd 499168404Spjd rm->rm_cols = acols; 500219089Spjd rm->rm_scols = scols; 501168404Spjd rm->rm_bigcols = bc; 502219089Spjd rm->rm_skipstart = bc; 503168404Spjd rm->rm_missingdata = 0; 504168404Spjd rm->rm_missingparity = 0; 505168404Spjd rm->rm_firstdatacol = nparity; 506219089Spjd rm->rm_datacopy = NULL; 507219089Spjd rm->rm_reports = 0; 508219089Spjd rm->rm_freed = 0; 509219089Spjd rm->rm_ecksuminjected = 0; 510168404Spjd 511219089Spjd asize = 0; 512219089Spjd 513219089Spjd for (c = 0; c < scols; c++) { 514168404Spjd col = f + c; 515168404Spjd coff = o; 516168404Spjd if (col >= dcols) { 517168404Spjd col -= dcols; 518168404Spjd coff += 1ULL << unit_shift; 519168404Spjd } 520168404Spjd rm->rm_col[c].rc_devidx = col; 521168404Spjd rm->rm_col[c].rc_offset = coff; 522168404Spjd rm->rm_col[c].rc_data = NULL; 523219089Spjd rm->rm_col[c].rc_gdata = NULL; 524168404Spjd rm->rm_col[c].rc_error = 0; 525168404Spjd rm->rm_col[c].rc_tried = 0; 526168404Spjd rm->rm_col[c].rc_skipped = 0; 527219089Spjd 528219089Spjd if (c >= acols) 529219089Spjd rm->rm_col[c].rc_size = 0; 530219089Spjd else if (c < bc) 531219089Spjd rm->rm_col[c].rc_size = (q + 1) << unit_shift; 532219089Spjd else 533219089Spjd rm->rm_col[c].rc_size = q << unit_shift; 534219089Spjd 535219089Spjd asize += rm->rm_col[c].rc_size; 536168404Spjd } 537168404Spjd 538219089Spjd ASSERT3U(asize, ==, tot << unit_shift); 539219089Spjd rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 540219089Spjd rm->rm_nskip = roundup(tot, nparity + 1) - tot; 541219089Spjd ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 542219089Spjd ASSERT3U(rm->rm_nskip, <=, nparity); 543168404Spjd 544255750Sdelphij if (!dofree) { 545240868Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 546240868Spjd rm->rm_col[c].rc_data = 547240868Spjd zio_buf_alloc(rm->rm_col[c].rc_size); 548240868Spjd } 549168404Spjd 550255750Sdelphij rm->rm_col[c].rc_data = data; 551168404Spjd 552240868Spjd for (c = c + 1; c < acols; c++) { 553240868Spjd rm->rm_col[c].rc_data = 554240868Spjd (char *)rm->rm_col[c - 1].rc_data + 555240868Spjd rm->rm_col[c - 1].rc_size; 556240868Spjd } 557240868Spjd } 558168404Spjd 559168404Spjd /* 560168404Spjd * If all data stored spans all columns, there's a danger that parity 561168404Spjd * will always be on the same device and, since parity isn't read 562168404Spjd * during normal operation, that that device's I/O bandwidth won't be 563168404Spjd * used effectively. We therefore switch the parity every 1MB. 564168404Spjd * 565168404Spjd * ... at least that was, ostensibly, the theory. As a practical 566168404Spjd * matter unless we juggle the parity between all devices evenly, we 567168404Spjd * won't see any benefit. Further, occasional writes that aren't a 568168404Spjd * multiple of the LCM of the number of children and the minimum 569168404Spjd * stripe width are sufficient to avoid pessimal behavior. 570168404Spjd * Unfortunately, this decision created an implicit on-disk format 571168404Spjd * requirement that we need to support for all eternity, but only 572168404Spjd * for single-parity RAID-Z. 573219089Spjd * 574219089Spjd * If we intend to skip a sector in the zeroth column for padding 575219089Spjd * we must make sure to note this swap. We will never intend to 576219089Spjd * skip the first column since at least one data and one parity 577219089Spjd * column must appear in each row. 578168404Spjd */ 579168404Spjd ASSERT(rm->rm_cols >= 2); 580168404Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 581168404Spjd 582255750Sdelphij if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 583168404Spjd devidx = rm->rm_col[0].rc_devidx; 584168404Spjd o = rm->rm_col[0].rc_offset; 585168404Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 586168404Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 587168404Spjd rm->rm_col[1].rc_devidx = devidx; 588168404Spjd rm->rm_col[1].rc_offset = o; 589219089Spjd 590219089Spjd if (rm->rm_skipstart == 0) 591219089Spjd rm->rm_skipstart = 1; 592168404Spjd } 593168404Spjd 594168404Spjd return (rm); 595168404Spjd} 596168404Spjd 597168404Spjdstatic void 598168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 599168404Spjd{ 600168404Spjd uint64_t *p, *src, pcount, ccount, i; 601168404Spjd int c; 602168404Spjd 603168404Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 604168404Spjd 605168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 606168404Spjd src = rm->rm_col[c].rc_data; 607168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 608168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 609168404Spjd 610168404Spjd if (c == rm->rm_firstdatacol) { 611168404Spjd ASSERT(ccount == pcount); 612219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 613168404Spjd *p = *src; 614168404Spjd } 615168404Spjd } else { 616168404Spjd ASSERT(ccount <= pcount); 617219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 618168404Spjd *p ^= *src; 619168404Spjd } 620168404Spjd } 621168404Spjd } 622168404Spjd} 623168404Spjd 624168404Spjdstatic void 625168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 626168404Spjd{ 627219089Spjd uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 628168404Spjd int c; 629168404Spjd 630219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 631168404Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 632168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 633168404Spjd 634168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 635168404Spjd src = rm->rm_col[c].rc_data; 636168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 637168404Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 638168404Spjd 639219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 640219089Spjd 641168404Spjd if (c == rm->rm_firstdatacol) { 642219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 643219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 644219089Spjd *p = *src; 645168404Spjd *q = *src; 646219089Spjd } 647219089Spjd for (; i < pcnt; i++, src++, p++, q++) { 648219089Spjd *p = 0; 649219089Spjd *q = 0; 650219089Spjd } 651219089Spjd } else { 652219089Spjd ASSERT(ccnt <= pcnt); 653219089Spjd 654219089Spjd /* 655219089Spjd * Apply the algorithm described above by multiplying 656219089Spjd * the previous result and adding in the new value. 657219089Spjd */ 658219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 659219089Spjd *p ^= *src; 660219089Spjd 661219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 662219089Spjd *q ^= *src; 663219089Spjd } 664219089Spjd 665219089Spjd /* 666219089Spjd * Treat short columns as though they are full of 0s. 667219089Spjd * Note that there's therefore nothing needed for P. 668219089Spjd */ 669219089Spjd for (; i < pcnt; i++, q++) { 670219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 671219089Spjd } 672219089Spjd } 673219089Spjd } 674219089Spjd} 675219089Spjd 676219089Spjdstatic void 677219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm) 678219089Spjd{ 679219089Spjd uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 680219089Spjd int c; 681219089Spjd 682219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 683219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 684219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 685219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 686219089Spjd rm->rm_col[VDEV_RAIDZ_R].rc_size); 687219089Spjd 688219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 689219089Spjd src = rm->rm_col[c].rc_data; 690219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 691219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 692219089Spjd r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 693219089Spjd 694219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 695219089Spjd 696219089Spjd if (c == rm->rm_firstdatacol) { 697219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 698219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 699168404Spjd *p = *src; 700219089Spjd *q = *src; 701219089Spjd *r = *src; 702168404Spjd } 703219089Spjd for (; i < pcnt; i++, src++, p++, q++, r++) { 704219089Spjd *p = 0; 705168404Spjd *q = 0; 706219089Spjd *r = 0; 707168404Spjd } 708168404Spjd } else { 709219089Spjd ASSERT(ccnt <= pcnt); 710168404Spjd 711168404Spjd /* 712219089Spjd * Apply the algorithm described above by multiplying 713219089Spjd * the previous result and adding in the new value. 714168404Spjd */ 715219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 716219089Spjd *p ^= *src; 717219089Spjd 718219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 719168404Spjd *q ^= *src; 720219089Spjd 721219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 722219089Spjd *r ^= *src; 723168404Spjd } 724168404Spjd 725168404Spjd /* 726168404Spjd * Treat short columns as though they are full of 0s. 727219089Spjd * Note that there's therefore nothing needed for P. 728168404Spjd */ 729219089Spjd for (; i < pcnt; i++, q++, r++) { 730219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 731219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 732168404Spjd } 733168404Spjd } 734168404Spjd } 735168404Spjd} 736168404Spjd 737219089Spjd/* 738219089Spjd * Generate RAID parity in the first virtual columns according to the number of 739219089Spjd * parity columns available. 740219089Spjd */ 741168404Spjdstatic void 742219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm) 743168404Spjd{ 744219089Spjd switch (rm->rm_firstdatacol) { 745219089Spjd case 1: 746219089Spjd vdev_raidz_generate_parity_p(rm); 747219089Spjd break; 748219089Spjd case 2: 749219089Spjd vdev_raidz_generate_parity_pq(rm); 750219089Spjd break; 751219089Spjd case 3: 752219089Spjd vdev_raidz_generate_parity_pqr(rm); 753219089Spjd break; 754219089Spjd default: 755219089Spjd cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 756219089Spjd } 757219089Spjd} 758219089Spjd 759219089Spjdstatic int 760219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 761219089Spjd{ 762168404Spjd uint64_t *dst, *src, xcount, ccount, count, i; 763219089Spjd int x = tgts[0]; 764168404Spjd int c; 765168404Spjd 766219089Spjd ASSERT(ntgts == 1); 767219089Spjd ASSERT(x >= rm->rm_firstdatacol); 768219089Spjd ASSERT(x < rm->rm_cols); 769219089Spjd 770168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 771168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 772168404Spjd ASSERT(xcount > 0); 773168404Spjd 774168404Spjd src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 775168404Spjd dst = rm->rm_col[x].rc_data; 776168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 777168404Spjd *dst = *src; 778168404Spjd } 779168404Spjd 780168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 781168404Spjd src = rm->rm_col[c].rc_data; 782168404Spjd dst = rm->rm_col[x].rc_data; 783168404Spjd 784168404Spjd if (c == x) 785168404Spjd continue; 786168404Spjd 787168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 788168404Spjd count = MIN(ccount, xcount); 789168404Spjd 790168404Spjd for (i = 0; i < count; i++, dst++, src++) { 791168404Spjd *dst ^= *src; 792168404Spjd } 793168404Spjd } 794219089Spjd 795219089Spjd return (1 << VDEV_RAIDZ_P); 796168404Spjd} 797168404Spjd 798219089Spjdstatic int 799219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 800168404Spjd{ 801168404Spjd uint64_t *dst, *src, xcount, ccount, count, mask, i; 802168404Spjd uint8_t *b; 803219089Spjd int x = tgts[0]; 804168404Spjd int c, j, exp; 805168404Spjd 806219089Spjd ASSERT(ntgts == 1); 807219089Spjd 808168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 809168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 810168404Spjd 811168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 812168404Spjd src = rm->rm_col[c].rc_data; 813168404Spjd dst = rm->rm_col[x].rc_data; 814168404Spjd 815168404Spjd if (c == x) 816168404Spjd ccount = 0; 817168404Spjd else 818168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 819168404Spjd 820168404Spjd count = MIN(ccount, xcount); 821168404Spjd 822168404Spjd if (c == rm->rm_firstdatacol) { 823168404Spjd for (i = 0; i < count; i++, dst++, src++) { 824168404Spjd *dst = *src; 825168404Spjd } 826168404Spjd for (; i < xcount; i++, dst++) { 827168404Spjd *dst = 0; 828168404Spjd } 829168404Spjd 830168404Spjd } else { 831168404Spjd for (i = 0; i < count; i++, dst++, src++) { 832219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 833168404Spjd *dst ^= *src; 834168404Spjd } 835168404Spjd 836168404Spjd for (; i < xcount; i++, dst++) { 837219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 838168404Spjd } 839168404Spjd } 840168404Spjd } 841168404Spjd 842168404Spjd src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 843168404Spjd dst = rm->rm_col[x].rc_data; 844168404Spjd exp = 255 - (rm->rm_cols - 1 - x); 845168404Spjd 846168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 847168404Spjd *dst ^= *src; 848168404Spjd for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 849168404Spjd *b = vdev_raidz_exp2(*b, exp); 850168404Spjd } 851168404Spjd } 852219089Spjd 853219089Spjd return (1 << VDEV_RAIDZ_Q); 854168404Spjd} 855168404Spjd 856219089Spjdstatic int 857219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 858168404Spjd{ 859168404Spjd uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 860168404Spjd void *pdata, *qdata; 861168404Spjd uint64_t xsize, ysize, i; 862219089Spjd int x = tgts[0]; 863219089Spjd int y = tgts[1]; 864168404Spjd 865219089Spjd ASSERT(ntgts == 2); 866168404Spjd ASSERT(x < y); 867168404Spjd ASSERT(x >= rm->rm_firstdatacol); 868168404Spjd ASSERT(y < rm->rm_cols); 869168404Spjd 870168404Spjd ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 871168404Spjd 872168404Spjd /* 873168404Spjd * Move the parity data aside -- we're going to compute parity as 874168404Spjd * though columns x and y were full of zeros -- Pxy and Qxy. We want to 875168404Spjd * reuse the parity generation mechanism without trashing the actual 876168404Spjd * parity so we make those columns appear to be full of zeros by 877168404Spjd * setting their lengths to zero. 878168404Spjd */ 879168404Spjd pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 880168404Spjd qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 881168404Spjd xsize = rm->rm_col[x].rc_size; 882168404Spjd ysize = rm->rm_col[y].rc_size; 883168404Spjd 884168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = 885168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 886168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = 887168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 888168404Spjd rm->rm_col[x].rc_size = 0; 889168404Spjd rm->rm_col[y].rc_size = 0; 890168404Spjd 891168404Spjd vdev_raidz_generate_parity_pq(rm); 892168404Spjd 893168404Spjd rm->rm_col[x].rc_size = xsize; 894168404Spjd rm->rm_col[y].rc_size = ysize; 895168404Spjd 896168404Spjd p = pdata; 897168404Spjd q = qdata; 898168404Spjd pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 899168404Spjd qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 900168404Spjd xd = rm->rm_col[x].rc_data; 901168404Spjd yd = rm->rm_col[y].rc_data; 902168404Spjd 903168404Spjd /* 904168404Spjd * We now have: 905168404Spjd * Pxy = P + D_x + D_y 906168404Spjd * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 907168404Spjd * 908168404Spjd * We can then solve for D_x: 909168404Spjd * D_x = A * (P + Pxy) + B * (Q + Qxy) 910168404Spjd * where 911168404Spjd * A = 2^(x - y) * (2^(x - y) + 1)^-1 912168404Spjd * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 913168404Spjd * 914168404Spjd * With D_x in hand, we can easily solve for D_y: 915168404Spjd * D_y = P + Pxy + D_x 916168404Spjd */ 917168404Spjd 918168404Spjd a = vdev_raidz_pow2[255 + x - y]; 919168404Spjd b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 920168404Spjd tmp = 255 - vdev_raidz_log2[a ^ 1]; 921168404Spjd 922168404Spjd aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 923168404Spjd bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 924168404Spjd 925168404Spjd for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 926168404Spjd *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 927168404Spjd vdev_raidz_exp2(*q ^ *qxy, bexp); 928168404Spjd 929168404Spjd if (i < ysize) 930168404Spjd *yd = *p ^ *pxy ^ *xd; 931168404Spjd } 932168404Spjd 933168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 934168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_size); 935168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 936168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 937168404Spjd 938168404Spjd /* 939168404Spjd * Restore the saved parity data. 940168404Spjd */ 941168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 942168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 943219089Spjd 944219089Spjd return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 945168404Spjd} 946168404Spjd 947219089Spjd/* BEGIN CSTYLED */ 948219089Spjd/* 949219089Spjd * In the general case of reconstruction, we must solve the system of linear 950219089Spjd * equations defined by the coeffecients used to generate parity as well as 951219089Spjd * the contents of the data and parity disks. This can be expressed with 952219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p) 953219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 954219089Spjd * 955219089Spjd * __ __ __ __ 956219089Spjd * | | __ __ | p_0 | 957219089Spjd * | V | | D_0 | | p_m-1 | 958219089Spjd * | | x | : | = | d_0 | 959219089Spjd * | I | | D_n-1 | | : | 960219089Spjd * | | ~~ ~~ | d_n-1 | 961219089Spjd * ~~ ~~ ~~ ~~ 962219089Spjd * 963219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde 964219089Spjd * matrix defined by the coeffecients we chose for the various parity columns 965219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 966219089Spjd * computation as well as linear separability. 967219089Spjd * 968219089Spjd * __ __ __ __ 969219089Spjd * | 1 .. 1 1 1 | | p_0 | 970219089Spjd * | 2^n-1 .. 4 2 1 | __ __ | : | 971219089Spjd * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 972219089Spjd * | 1 .. 0 0 0 | | D_1 | | d_0 | 973219089Spjd * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 974219089Spjd * | : : : : | | : | | d_2 | 975219089Spjd * | 0 .. 1 0 0 | | D_n-1 | | : | 976219089Spjd * | 0 .. 0 1 0 | ~~ ~~ | : | 977219089Spjd * | 0 .. 0 0 1 | | d_n-1 | 978219089Spjd * ~~ ~~ ~~ ~~ 979219089Spjd * 980219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the 981219089Spjd * matrix and use the known data and parity values to reconstruct the unknown 982219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond 983219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p 984219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up 985219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 986219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity 987219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 988219089Spjd * __ __ 989219089Spjd * | 1 1 1 1 1 1 1 1 | 990219089Spjd * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 991219089Spjd * | 19 205 116 29 64 16 4 1 | / / 992219089Spjd * | 1 0 0 0 0 0 0 0 | / / 993219089Spjd * | 0 1 0 0 0 0 0 0 | <--' / 994219089Spjd * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 995219089Spjd * | 0 0 0 1 0 0 0 0 | 996219089Spjd * | 0 0 0 0 1 0 0 0 | 997219089Spjd * | 0 0 0 0 0 1 0 0 | 998219089Spjd * | 0 0 0 0 0 0 1 0 | 999219089Spjd * | 0 0 0 0 0 0 0 1 | 1000219089Spjd * ~~ ~~ 1001219089Spjd * __ __ 1002219089Spjd * | 1 1 1 1 1 1 1 1 | 1003219089Spjd * | 19 205 116 29 64 16 4 1 | 1004219089Spjd * | 1 0 0 0 0 0 0 0 | 1005255750Sdelphij * (V|I)' = | 0 0 0 1 0 0 0 0 | 1006219089Spjd * | 0 0 0 0 1 0 0 0 | 1007219089Spjd * | 0 0 0 0 0 1 0 0 | 1008219089Spjd * | 0 0 0 0 0 0 1 0 | 1009219089Spjd * | 0 0 0 0 0 0 0 1 | 1010219089Spjd * ~~ ~~ 1011219089Spjd * 1012219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1013219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1014219089Spjd * matrix is not singular. 1015219089Spjd * __ __ 1016219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1017219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1018219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1019219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1020219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1021219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1022219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1023219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1024219089Spjd * ~~ ~~ 1025219089Spjd * __ __ 1026219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1027219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1028219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1029219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1030219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1031219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1032219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1033219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1034219089Spjd * ~~ ~~ 1035219089Spjd * __ __ 1036219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1037219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1038219089Spjd * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1039219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1040219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1041219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1042219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1043219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1044219089Spjd * ~~ ~~ 1045219089Spjd * __ __ 1046219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1047219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1048219089Spjd * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1049219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1050219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1051219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1052219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1053219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1054219089Spjd * ~~ ~~ 1055219089Spjd * __ __ 1056219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1057219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1058219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1059219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1060219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1061219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1062219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1063219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1064219089Spjd * ~~ ~~ 1065219089Spjd * __ __ 1066219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1067219089Spjd * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1068219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1069219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1070219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1071219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1072219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1073219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1074219089Spjd * ~~ ~~ 1075219089Spjd * __ __ 1076219089Spjd * | 0 0 1 0 0 0 0 0 | 1077219089Spjd * | 167 100 5 41 159 169 217 208 | 1078219089Spjd * | 166 100 4 40 158 168 216 209 | 1079219089Spjd * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1080219089Spjd * | 0 0 0 0 1 0 0 0 | 1081219089Spjd * | 0 0 0 0 0 1 0 0 | 1082219089Spjd * | 0 0 0 0 0 0 1 0 | 1083219089Spjd * | 0 0 0 0 0 0 0 1 | 1084219089Spjd * ~~ ~~ 1085219089Spjd * 1086219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1087219089Spjd * of the missing data. 1088219089Spjd * 1089219089Spjd * As is apparent from the example above, the only non-trivial rows in the 1090219089Spjd * inverse matrix correspond to the data disks that we're trying to 1091219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would 1092219089Spjd * only be useful for reconstructing data known or assumed to be valid. For 1093219089Spjd * that reason, we only build the coefficients in the rows that correspond to 1094219089Spjd * targeted columns. 1095219089Spjd */ 1096219089Spjd/* END CSTYLED */ 1097168404Spjd 1098219089Spjdstatic void 1099219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1100219089Spjd uint8_t **rows) 1101219089Spjd{ 1102219089Spjd int i, j; 1103219089Spjd int pow; 1104219089Spjd 1105219089Spjd ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1106219089Spjd 1107219089Spjd /* 1108219089Spjd * Fill in the missing rows of interest. 1109219089Spjd */ 1110219089Spjd for (i = 0; i < nmap; i++) { 1111219089Spjd ASSERT3S(0, <=, map[i]); 1112219089Spjd ASSERT3S(map[i], <=, 2); 1113219089Spjd 1114219089Spjd pow = map[i] * n; 1115219089Spjd if (pow > 255) 1116219089Spjd pow -= 255; 1117219089Spjd ASSERT(pow <= 255); 1118219089Spjd 1119219089Spjd for (j = 0; j < n; j++) { 1120219089Spjd pow -= map[i]; 1121219089Spjd if (pow < 0) 1122219089Spjd pow += 255; 1123219089Spjd rows[i][j] = vdev_raidz_pow2[pow]; 1124219089Spjd } 1125219089Spjd } 1126219089Spjd} 1127219089Spjd 1128219089Spjdstatic void 1129219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1130219089Spjd uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1131219089Spjd{ 1132219089Spjd int i, j, ii, jj; 1133219089Spjd uint8_t log; 1134219089Spjd 1135219089Spjd /* 1136219089Spjd * Assert that the first nmissing entries from the array of used 1137219089Spjd * columns correspond to parity columns and that subsequent entries 1138219089Spjd * correspond to data columns. 1139219089Spjd */ 1140219089Spjd for (i = 0; i < nmissing; i++) { 1141219089Spjd ASSERT3S(used[i], <, rm->rm_firstdatacol); 1142219089Spjd } 1143219089Spjd for (; i < n; i++) { 1144219089Spjd ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1145219089Spjd } 1146219089Spjd 1147219089Spjd /* 1148219089Spjd * First initialize the storage where we'll compute the inverse rows. 1149219089Spjd */ 1150219089Spjd for (i = 0; i < nmissing; i++) { 1151219089Spjd for (j = 0; j < n; j++) { 1152219089Spjd invrows[i][j] = (i == j) ? 1 : 0; 1153219089Spjd } 1154219089Spjd } 1155219089Spjd 1156219089Spjd /* 1157219089Spjd * Subtract all trivial rows from the rows of consequence. 1158219089Spjd */ 1159219089Spjd for (i = 0; i < nmissing; i++) { 1160219089Spjd for (j = nmissing; j < n; j++) { 1161219089Spjd ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1162219089Spjd jj = used[j] - rm->rm_firstdatacol; 1163219089Spjd ASSERT3S(jj, <, n); 1164219089Spjd invrows[i][j] = rows[i][jj]; 1165219089Spjd rows[i][jj] = 0; 1166219089Spjd } 1167219089Spjd } 1168219089Spjd 1169219089Spjd /* 1170219089Spjd * For each of the rows of interest, we must normalize it and subtract 1171219089Spjd * a multiple of it from the other rows. 1172219089Spjd */ 1173219089Spjd for (i = 0; i < nmissing; i++) { 1174219089Spjd for (j = 0; j < missing[i]; j++) { 1175240415Smm ASSERT0(rows[i][j]); 1176219089Spjd } 1177219089Spjd ASSERT3U(rows[i][missing[i]], !=, 0); 1178219089Spjd 1179219089Spjd /* 1180219089Spjd * Compute the inverse of the first element and multiply each 1181219089Spjd * element in the row by that value. 1182219089Spjd */ 1183219089Spjd log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1184219089Spjd 1185219089Spjd for (j = 0; j < n; j++) { 1186219089Spjd rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1187219089Spjd invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1188219089Spjd } 1189219089Spjd 1190219089Spjd for (ii = 0; ii < nmissing; ii++) { 1191219089Spjd if (i == ii) 1192219089Spjd continue; 1193219089Spjd 1194219089Spjd ASSERT3U(rows[ii][missing[i]], !=, 0); 1195219089Spjd 1196219089Spjd log = vdev_raidz_log2[rows[ii][missing[i]]]; 1197219089Spjd 1198219089Spjd for (j = 0; j < n; j++) { 1199219089Spjd rows[ii][j] ^= 1200219089Spjd vdev_raidz_exp2(rows[i][j], log); 1201219089Spjd invrows[ii][j] ^= 1202219089Spjd vdev_raidz_exp2(invrows[i][j], log); 1203219089Spjd } 1204219089Spjd } 1205219089Spjd } 1206219089Spjd 1207219089Spjd /* 1208219089Spjd * Verify that the data that is left in the rows are properly part of 1209219089Spjd * an identity matrix. 1210219089Spjd */ 1211219089Spjd for (i = 0; i < nmissing; i++) { 1212219089Spjd for (j = 0; j < n; j++) { 1213219089Spjd if (j == missing[i]) { 1214219089Spjd ASSERT3U(rows[i][j], ==, 1); 1215219089Spjd } else { 1216240415Smm ASSERT0(rows[i][j]); 1217219089Spjd } 1218219089Spjd } 1219219089Spjd } 1220219089Spjd} 1221219089Spjd 1222219089Spjdstatic void 1223219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1224219089Spjd int *missing, uint8_t **invrows, const uint8_t *used) 1225219089Spjd{ 1226219089Spjd int i, j, x, cc, c; 1227219089Spjd uint8_t *src; 1228219089Spjd uint64_t ccount; 1229219089Spjd uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1230219089Spjd uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1231247187Smm uint8_t log = 0; 1232247187Smm uint8_t val; 1233219089Spjd int ll; 1234219089Spjd uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1235219089Spjd uint8_t *p, *pp; 1236219089Spjd size_t psize; 1237219089Spjd 1238219089Spjd psize = sizeof (invlog[0][0]) * n * nmissing; 1239219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1240219089Spjd 1241219089Spjd for (pp = p, i = 0; i < nmissing; i++) { 1242219089Spjd invlog[i] = pp; 1243219089Spjd pp += n; 1244219089Spjd } 1245219089Spjd 1246219089Spjd for (i = 0; i < nmissing; i++) { 1247219089Spjd for (j = 0; j < n; j++) { 1248219089Spjd ASSERT3U(invrows[i][j], !=, 0); 1249219089Spjd invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1250219089Spjd } 1251219089Spjd } 1252219089Spjd 1253219089Spjd for (i = 0; i < n; i++) { 1254219089Spjd c = used[i]; 1255219089Spjd ASSERT3U(c, <, rm->rm_cols); 1256219089Spjd 1257219089Spjd src = rm->rm_col[c].rc_data; 1258219089Spjd ccount = rm->rm_col[c].rc_size; 1259219089Spjd for (j = 0; j < nmissing; j++) { 1260219089Spjd cc = missing[j] + rm->rm_firstdatacol; 1261219089Spjd ASSERT3U(cc, >=, rm->rm_firstdatacol); 1262219089Spjd ASSERT3U(cc, <, rm->rm_cols); 1263219089Spjd ASSERT3U(cc, !=, c); 1264219089Spjd 1265219089Spjd dst[j] = rm->rm_col[cc].rc_data; 1266219089Spjd dcount[j] = rm->rm_col[cc].rc_size; 1267219089Spjd } 1268219089Spjd 1269219089Spjd ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1270219089Spjd 1271219089Spjd for (x = 0; x < ccount; x++, src++) { 1272219089Spjd if (*src != 0) 1273219089Spjd log = vdev_raidz_log2[*src]; 1274219089Spjd 1275219089Spjd for (cc = 0; cc < nmissing; cc++) { 1276219089Spjd if (x >= dcount[cc]) 1277219089Spjd continue; 1278219089Spjd 1279219089Spjd if (*src == 0) { 1280219089Spjd val = 0; 1281219089Spjd } else { 1282219089Spjd if ((ll = log + invlog[cc][i]) >= 255) 1283219089Spjd ll -= 255; 1284219089Spjd val = vdev_raidz_pow2[ll]; 1285219089Spjd } 1286219089Spjd 1287219089Spjd if (i == 0) 1288219089Spjd dst[cc][x] = val; 1289219089Spjd else 1290219089Spjd dst[cc][x] ^= val; 1291219089Spjd } 1292219089Spjd } 1293219089Spjd } 1294219089Spjd 1295219089Spjd kmem_free(p, psize); 1296219089Spjd} 1297219089Spjd 1298168404Spjdstatic int 1299219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1300219089Spjd{ 1301219089Spjd int n, i, c, t, tt; 1302219089Spjd int nmissing_rows; 1303219089Spjd int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1304219089Spjd int parity_map[VDEV_RAIDZ_MAXPARITY]; 1305219089Spjd 1306219089Spjd uint8_t *p, *pp; 1307219089Spjd size_t psize; 1308219089Spjd 1309219089Spjd uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1310219089Spjd uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1311219089Spjd uint8_t *used; 1312219089Spjd 1313219089Spjd int code = 0; 1314219089Spjd 1315219089Spjd 1316219089Spjd n = rm->rm_cols - rm->rm_firstdatacol; 1317219089Spjd 1318219089Spjd /* 1319219089Spjd * Figure out which data columns are missing. 1320219089Spjd */ 1321219089Spjd nmissing_rows = 0; 1322219089Spjd for (t = 0; t < ntgts; t++) { 1323219089Spjd if (tgts[t] >= rm->rm_firstdatacol) { 1324219089Spjd missing_rows[nmissing_rows++] = 1325219089Spjd tgts[t] - rm->rm_firstdatacol; 1326219089Spjd } 1327219089Spjd } 1328219089Spjd 1329219089Spjd /* 1330219089Spjd * Figure out which parity columns to use to help generate the missing 1331219089Spjd * data columns. 1332219089Spjd */ 1333219089Spjd for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1334219089Spjd ASSERT(tt < ntgts); 1335219089Spjd ASSERT(c < rm->rm_firstdatacol); 1336219089Spjd 1337219089Spjd /* 1338219089Spjd * Skip any targeted parity columns. 1339219089Spjd */ 1340219089Spjd if (c == tgts[tt]) { 1341219089Spjd tt++; 1342219089Spjd continue; 1343219089Spjd } 1344219089Spjd 1345219089Spjd code |= 1 << c; 1346219089Spjd 1347219089Spjd parity_map[i] = c; 1348219089Spjd i++; 1349219089Spjd } 1350219089Spjd 1351219089Spjd ASSERT(code != 0); 1352219089Spjd ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1353219089Spjd 1354219089Spjd psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1355219089Spjd nmissing_rows * n + sizeof (used[0]) * n; 1356219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1357219089Spjd 1358219089Spjd for (pp = p, i = 0; i < nmissing_rows; i++) { 1359219089Spjd rows[i] = pp; 1360219089Spjd pp += n; 1361219089Spjd invrows[i] = pp; 1362219089Spjd pp += n; 1363219089Spjd } 1364219089Spjd used = pp; 1365219089Spjd 1366219089Spjd for (i = 0; i < nmissing_rows; i++) { 1367219089Spjd used[i] = parity_map[i]; 1368219089Spjd } 1369219089Spjd 1370219089Spjd for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1371219089Spjd if (tt < nmissing_rows && 1372219089Spjd c == missing_rows[tt] + rm->rm_firstdatacol) { 1373219089Spjd tt++; 1374219089Spjd continue; 1375219089Spjd } 1376219089Spjd 1377219089Spjd ASSERT3S(i, <, n); 1378219089Spjd used[i] = c; 1379219089Spjd i++; 1380219089Spjd } 1381219089Spjd 1382219089Spjd /* 1383219089Spjd * Initialize the interesting rows of the matrix. 1384219089Spjd */ 1385219089Spjd vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1386219089Spjd 1387219089Spjd /* 1388219089Spjd * Invert the matrix. 1389219089Spjd */ 1390219089Spjd vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1391219089Spjd invrows, used); 1392219089Spjd 1393219089Spjd /* 1394219089Spjd * Reconstruct the missing data using the generated matrix. 1395219089Spjd */ 1396219089Spjd vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1397219089Spjd invrows, used); 1398219089Spjd 1399219089Spjd kmem_free(p, psize); 1400219089Spjd 1401219089Spjd return (code); 1402219089Spjd} 1403219089Spjd 1404219089Spjdstatic int 1405219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1406219089Spjd{ 1407219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1408219089Spjd int ntgts; 1409219089Spjd int i, c; 1410219089Spjd int code; 1411219089Spjd int nbadparity, nbaddata; 1412219089Spjd int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1413219089Spjd 1414219089Spjd /* 1415219089Spjd * The tgts list must already be sorted. 1416219089Spjd */ 1417219089Spjd for (i = 1; i < nt; i++) { 1418219089Spjd ASSERT(t[i] > t[i - 1]); 1419219089Spjd } 1420219089Spjd 1421219089Spjd nbadparity = rm->rm_firstdatacol; 1422219089Spjd nbaddata = rm->rm_cols - nbadparity; 1423219089Spjd ntgts = 0; 1424219089Spjd for (i = 0, c = 0; c < rm->rm_cols; c++) { 1425219089Spjd if (c < rm->rm_firstdatacol) 1426219089Spjd parity_valid[c] = B_FALSE; 1427219089Spjd 1428219089Spjd if (i < nt && c == t[i]) { 1429219089Spjd tgts[ntgts++] = c; 1430219089Spjd i++; 1431219089Spjd } else if (rm->rm_col[c].rc_error != 0) { 1432219089Spjd tgts[ntgts++] = c; 1433219089Spjd } else if (c >= rm->rm_firstdatacol) { 1434219089Spjd nbaddata--; 1435219089Spjd } else { 1436219089Spjd parity_valid[c] = B_TRUE; 1437219089Spjd nbadparity--; 1438219089Spjd } 1439219089Spjd } 1440219089Spjd 1441219089Spjd ASSERT(ntgts >= nt); 1442219089Spjd ASSERT(nbaddata >= 0); 1443219089Spjd ASSERT(nbaddata + nbadparity == ntgts); 1444219089Spjd 1445219089Spjd dt = &tgts[nbadparity]; 1446219089Spjd 1447219089Spjd /* 1448219089Spjd * See if we can use any of our optimized reconstruction routines. 1449219089Spjd */ 1450219089Spjd if (!vdev_raidz_default_to_general) { 1451219089Spjd switch (nbaddata) { 1452219089Spjd case 1: 1453219089Spjd if (parity_valid[VDEV_RAIDZ_P]) 1454219089Spjd return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1455219089Spjd 1456219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1457219089Spjd 1458219089Spjd if (parity_valid[VDEV_RAIDZ_Q]) 1459219089Spjd return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1460219089Spjd 1461219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1462219089Spjd break; 1463219089Spjd 1464219089Spjd case 2: 1465219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1466219089Spjd 1467219089Spjd if (parity_valid[VDEV_RAIDZ_P] && 1468219089Spjd parity_valid[VDEV_RAIDZ_Q]) 1469219089Spjd return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1470219089Spjd 1471219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1472219089Spjd 1473219089Spjd break; 1474219089Spjd } 1475219089Spjd } 1476219089Spjd 1477219089Spjd code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1478219089Spjd ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1479219089Spjd ASSERT(code > 0); 1480219089Spjd return (code); 1481219089Spjd} 1482219089Spjd 1483219089Spjdstatic int 1484236155Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1485254591Sgibbs uint64_t *logical_ashift, uint64_t *physical_ashift) 1486168404Spjd{ 1487168404Spjd vdev_t *cvd; 1488168404Spjd uint64_t nparity = vd->vdev_nparity; 1489219089Spjd int c; 1490168404Spjd int lasterror = 0; 1491168404Spjd int numerrors = 0; 1492168404Spjd 1493168404Spjd ASSERT(nparity > 0); 1494168404Spjd 1495168404Spjd if (nparity > VDEV_RAIDZ_MAXPARITY || 1496168404Spjd vd->vdev_children < nparity + 1) { 1497168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1498249195Smm return (SET_ERROR(EINVAL)); 1499168404Spjd } 1500168404Spjd 1501219089Spjd vdev_open_children(vd); 1502219089Spjd 1503168404Spjd for (c = 0; c < vd->vdev_children; c++) { 1504168404Spjd cvd = vd->vdev_child[c]; 1505168404Spjd 1506219089Spjd if (cvd->vdev_open_error != 0) { 1507219089Spjd lasterror = cvd->vdev_open_error; 1508168404Spjd numerrors++; 1509168404Spjd continue; 1510168404Spjd } 1511168404Spjd 1512168404Spjd *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1513236155Smm *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1514254591Sgibbs *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1515254591Sgibbs *physical_ashift = MAX(*physical_ashift, 1516254591Sgibbs cvd->vdev_physical_ashift); 1517168404Spjd } 1518168404Spjd 1519168404Spjd *asize *= vd->vdev_children; 1520236155Smm *max_asize *= vd->vdev_children; 1521168404Spjd 1522168404Spjd if (numerrors > nparity) { 1523168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1524168404Spjd return (lasterror); 1525168404Spjd } 1526168404Spjd 1527168404Spjd return (0); 1528168404Spjd} 1529168404Spjd 1530168404Spjdstatic void 1531168404Spjdvdev_raidz_close(vdev_t *vd) 1532168404Spjd{ 1533168404Spjd int c; 1534168404Spjd 1535168404Spjd for (c = 0; c < vd->vdev_children; c++) 1536168404Spjd vdev_close(vd->vdev_child[c]); 1537168404Spjd} 1538168404Spjd 1539255750Sdelphij#ifdef illumos 1540255750Sdelphij/* 1541255750Sdelphij * Handle a read or write I/O to a RAID-Z dump device. 1542255750Sdelphij * 1543255750Sdelphij * The dump device is in a unique situation compared to other ZFS datasets: 1544255750Sdelphij * writing to this device should be as simple and fast as possible. In 1545255750Sdelphij * addition, durability matters much less since the dump will be extracted 1546255750Sdelphij * once the machine reboots. For that reason, this function eschews parity for 1547255750Sdelphij * performance and simplicity. The dump device uses the checksum setting 1548255750Sdelphij * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1549255750Sdelphij * dataset. 1550255750Sdelphij * 1551255750Sdelphij * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1552255750Sdelphij * 128 KB will not fill an entire block; in addition, they may not be properly 1553255750Sdelphij * aligned. In that case, this function uses the preallocated 128 KB block and 1554255750Sdelphij * omits reading or writing any "empty" portions of that block, as opposed to 1555255750Sdelphij * allocating a fresh appropriately-sized block. 1556255750Sdelphij * 1557255750Sdelphij * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1558255750Sdelphij * 1559255750Sdelphij * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1560255750Sdelphij * 1561255750Sdelphij * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1562255750Sdelphij * allocated which spans all five child vdevs. 8 KB of data would be written to 1563255750Sdelphij * each of four vdevs, with the fifth containing the parity bits. 1564255750Sdelphij * 1565255750Sdelphij * parity data data data data 1566255750Sdelphij * | PP | XX | XX | XX | XX | 1567255750Sdelphij * ^ ^ ^ ^ ^ 1568255750Sdelphij * | | | | | 1569255750Sdelphij * 8 KB parity ------8 KB data blocks------ 1570255750Sdelphij * 1571255750Sdelphij * However, when writing to the dump device, the behavior is different: 1572255750Sdelphij * 1573255750Sdelphij * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1574255750Sdelphij * 1575255750Sdelphij * Unlike the normal RAID-Z case in which the block is allocated based on the 1576255750Sdelphij * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1577255750Sdelphij * I/O size is less than 128 KB, only the actual portions of data are written. 1578255750Sdelphij * In this example the data is written to the third data vdev since that vdev 1579255750Sdelphij * contains the offset [64 KB, 96 KB). 1580255750Sdelphij * 1581255750Sdelphij * parity data data data data 1582255750Sdelphij * | | | | XX | | 1583255750Sdelphij * ^ 1584255750Sdelphij * | 1585255750Sdelphij * 32 KB data block 1586255750Sdelphij * 1587255750Sdelphij * As a result, an individual I/O may not span all child vdevs; moreover, a 1588255750Sdelphij * small I/O may only operate on a single child vdev. 1589255750Sdelphij * 1590255750Sdelphij * Note that since there are no parity bits calculated or written, this format 1591255750Sdelphij * remains the same no matter how many parity bits are used in a normal RAID-Z 1592255750Sdelphij * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1593255750Sdelphij * would look like: 1594255750Sdelphij * 1595255750Sdelphij * parity parity parity data data data data 1596255750Sdelphij * | | | | | | XX | | 1597255750Sdelphij * ^ 1598255750Sdelphij * | 1599255750Sdelphij * 32 KB data block 1600255750Sdelphij */ 1601255750Sdelphijint 1602255750Sdelphijvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1603255750Sdelphij uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1604255750Sdelphij{ 1605255750Sdelphij vdev_t *tvd = vd->vdev_top; 1606255750Sdelphij vdev_t *cvd; 1607255750Sdelphij raidz_map_t *rm; 1608255750Sdelphij raidz_col_t *rc; 1609255750Sdelphij int c, err = 0; 1610255750Sdelphij 1611255750Sdelphij uint64_t start, end, colstart, colend; 1612255750Sdelphij uint64_t coloffset, colsize, colskip; 1613255750Sdelphij 1614255750Sdelphij int flags = doread ? BIO_READ : BIO_WRITE; 1615255750Sdelphij 1616255750Sdelphij#ifdef _KERNEL 1617255750Sdelphij 1618255750Sdelphij /* 1619255750Sdelphij * Don't write past the end of the block 1620255750Sdelphij */ 1621255750Sdelphij VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE); 1622255750Sdelphij 1623255750Sdelphij start = offset; 1624255750Sdelphij end = start + size; 1625255750Sdelphij 1626255750Sdelphij /* 1627255750Sdelphij * Allocate a RAID-Z map for this block. Note that this block starts 1628255750Sdelphij * from the "original" offset, this is, the offset of the extent which 1629255750Sdelphij * contains the requisite offset of the data being read or written. 1630255750Sdelphij * 1631255750Sdelphij * Even if this I/O operation doesn't span the full block size, let's 1632255750Sdelphij * treat the on-disk format as if the only blocks are the complete 128 1633255750Sdelphij * KB size. 1634255750Sdelphij */ 1635255750Sdelphij rm = vdev_raidz_map_alloc(data - (offset - origoffset), 1636255750Sdelphij SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children, 1637255750Sdelphij vd->vdev_nparity); 1638255750Sdelphij 1639255750Sdelphij coloffset = origoffset; 1640255750Sdelphij 1641255750Sdelphij for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1642255750Sdelphij c++, coloffset += rc->rc_size) { 1643255750Sdelphij rc = &rm->rm_col[c]; 1644255750Sdelphij cvd = vd->vdev_child[rc->rc_devidx]; 1645255750Sdelphij 1646255750Sdelphij /* 1647255750Sdelphij * Find the start and end of this column in the RAID-Z map, 1648255750Sdelphij * keeping in mind that the stated size and offset of the 1649255750Sdelphij * operation may not fill the entire column for this vdev. 1650255750Sdelphij * 1651255750Sdelphij * If any portion of the data spans this column, issue the 1652255750Sdelphij * appropriate operation to the vdev. 1653255750Sdelphij */ 1654255750Sdelphij if (coloffset + rc->rc_size <= start) 1655255750Sdelphij continue; 1656255750Sdelphij if (coloffset >= end) 1657255750Sdelphij continue; 1658255750Sdelphij 1659255750Sdelphij colstart = MAX(coloffset, start); 1660255750Sdelphij colend = MIN(end, coloffset + rc->rc_size); 1661255750Sdelphij colsize = colend - colstart; 1662255750Sdelphij colskip = colstart - coloffset; 1663255750Sdelphij 1664255750Sdelphij VERIFY3U(colsize, <=, rc->rc_size); 1665255750Sdelphij VERIFY3U(colskip, <=, rc->rc_size); 1666255750Sdelphij 1667255750Sdelphij /* 1668255750Sdelphij * Note that the child vdev will have a vdev label at the start 1669255750Sdelphij * of its range of offsets, hence the need for 1670255750Sdelphij * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1671255750Sdelphij * example of why this calculation is needed. 1672255750Sdelphij */ 1673255750Sdelphij if ((err = vdev_disk_physio(cvd, 1674255750Sdelphij ((char *)rc->rc_data) + colskip, colsize, 1675255750Sdelphij VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1676255750Sdelphij flags, isdump)) != 0) 1677255750Sdelphij break; 1678255750Sdelphij } 1679255750Sdelphij 1680255750Sdelphij vdev_raidz_map_free(rm); 1681255750Sdelphij#endif /* KERNEL */ 1682255750Sdelphij 1683255750Sdelphij return (err); 1684255750Sdelphij} 1685255750Sdelphij#endif 1686255750Sdelphij 1687168404Spjdstatic uint64_t 1688168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize) 1689168404Spjd{ 1690168404Spjd uint64_t asize; 1691168404Spjd uint64_t ashift = vd->vdev_top->vdev_ashift; 1692168404Spjd uint64_t cols = vd->vdev_children; 1693168404Spjd uint64_t nparity = vd->vdev_nparity; 1694168404Spjd 1695168404Spjd asize = ((psize - 1) >> ashift) + 1; 1696168404Spjd asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1697168404Spjd asize = roundup(asize, nparity + 1) << ashift; 1698168404Spjd 1699168404Spjd return (asize); 1700168404Spjd} 1701168404Spjd 1702168404Spjdstatic void 1703168404Spjdvdev_raidz_child_done(zio_t *zio) 1704168404Spjd{ 1705168404Spjd raidz_col_t *rc = zio->io_private; 1706168404Spjd 1707168404Spjd rc->rc_error = zio->io_error; 1708168404Spjd rc->rc_tried = 1; 1709168404Spjd rc->rc_skipped = 0; 1710168404Spjd} 1711168404Spjd 1712251629Sdelphij/* 1713251629Sdelphij * Start an IO operation on a RAIDZ VDev 1714251629Sdelphij * 1715251629Sdelphij * Outline: 1716251629Sdelphij * - For write operations: 1717251629Sdelphij * 1. Generate the parity data 1718251629Sdelphij * 2. Create child zio write operations to each column's vdev, for both 1719251629Sdelphij * data and parity. 1720251629Sdelphij * 3. If the column skips any sectors for padding, create optional dummy 1721251629Sdelphij * write zio children for those areas to improve aggregation continuity. 1722251629Sdelphij * - For read operations: 1723251629Sdelphij * 1. Create child zio read operations to each data column's vdev to read 1724251629Sdelphij * the range of data required for zio. 1725251629Sdelphij * 2. If this is a scrub or resilver operation, or if any of the data 1726251629Sdelphij * vdevs have had errors, then create zio read operations to the parity 1727251629Sdelphij * columns' VDevs as well. 1728251629Sdelphij */ 1729185029Spjdstatic int 1730168404Spjdvdev_raidz_io_start(zio_t *zio) 1731168404Spjd{ 1732168404Spjd vdev_t *vd = zio->io_vd; 1733168404Spjd vdev_t *tvd = vd->vdev_top; 1734168404Spjd vdev_t *cvd; 1735168404Spjd raidz_map_t *rm; 1736168404Spjd raidz_col_t *rc; 1737219089Spjd int c, i; 1738168404Spjd 1739255750Sdelphij rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, 1740255750Sdelphij zio->io_type == ZIO_TYPE_FREE, 1741255750Sdelphij tvd->vdev_ashift, vd->vdev_children, 1742168404Spjd vd->vdev_nparity); 1743168404Spjd 1744255750Sdelphij zio->io_vsd = rm; 1745255750Sdelphij zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1746255750Sdelphij 1747168404Spjd ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1748168404Spjd 1749240868Spjd if (zio->io_type == ZIO_TYPE_FREE) { 1750240868Spjd for (c = 0; c < rm->rm_cols; c++) { 1751240868Spjd rc = &rm->rm_col[c]; 1752240868Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1753240868Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1754240868Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1755240868Spjd zio->io_type, zio->io_priority, 0, 1756240868Spjd vdev_raidz_child_done, rc)); 1757240868Spjd } 1758270312Ssmh 1759270312Ssmh zio_interrupt(zio); 1760270312Ssmh return (ZIO_PIPELINE_STOP); 1761240868Spjd } 1762240868Spjd 1763168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 1764219089Spjd vdev_raidz_generate_parity(rm); 1765168404Spjd 1766168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1767168404Spjd rc = &rm->rm_col[c]; 1768168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1769168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1770168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1771185029Spjd zio->io_type, zio->io_priority, 0, 1772168404Spjd vdev_raidz_child_done, rc)); 1773168404Spjd } 1774185029Spjd 1775219089Spjd /* 1776219089Spjd * Generate optional I/Os for any skipped sectors to improve 1777219089Spjd * aggregation contiguity. 1778219089Spjd */ 1779219089Spjd for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1780219089Spjd ASSERT(c <= rm->rm_scols); 1781219089Spjd if (c == rm->rm_scols) 1782219089Spjd c = 0; 1783219089Spjd rc = &rm->rm_col[c]; 1784219089Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1785219089Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1786219089Spjd rc->rc_offset + rc->rc_size, NULL, 1787219089Spjd 1 << tvd->vdev_ashift, 1788219089Spjd zio->io_type, zio->io_priority, 1789219089Spjd ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1790219089Spjd } 1791219089Spjd 1792270312Ssmh zio_interrupt(zio); 1793270312Ssmh return (ZIO_PIPELINE_STOP); 1794168404Spjd } 1795168404Spjd 1796168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 1797168404Spjd 1798168404Spjd /* 1799168404Spjd * Iterate over the columns in reverse order so that we hit the parity 1800219089Spjd * last -- any errors along the way will force us to read the parity. 1801168404Spjd */ 1802168404Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 1803168404Spjd rc = &rm->rm_col[c]; 1804168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1805185029Spjd if (!vdev_readable(cvd)) { 1806168404Spjd if (c >= rm->rm_firstdatacol) 1807168404Spjd rm->rm_missingdata++; 1808168404Spjd else 1809168404Spjd rm->rm_missingparity++; 1810249195Smm rc->rc_error = SET_ERROR(ENXIO); 1811168404Spjd rc->rc_tried = 1; /* don't even try */ 1812168404Spjd rc->rc_skipped = 1; 1813168404Spjd continue; 1814168404Spjd } 1815219089Spjd if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1816168404Spjd if (c >= rm->rm_firstdatacol) 1817168404Spjd rm->rm_missingdata++; 1818168404Spjd else 1819168404Spjd rm->rm_missingparity++; 1820249195Smm rc->rc_error = SET_ERROR(ESTALE); 1821168404Spjd rc->rc_skipped = 1; 1822168404Spjd continue; 1823168404Spjd } 1824168404Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1825209095Smm (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1826168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1827168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1828185029Spjd zio->io_type, zio->io_priority, 0, 1829168404Spjd vdev_raidz_child_done, rc)); 1830168404Spjd } 1831168404Spjd } 1832168404Spjd 1833270312Ssmh zio_interrupt(zio); 1834270312Ssmh return (ZIO_PIPELINE_STOP); 1835168404Spjd} 1836168404Spjd 1837219089Spjd 1838168404Spjd/* 1839168404Spjd * Report a checksum error for a child of a RAID-Z device. 1840168404Spjd */ 1841168404Spjdstatic void 1842219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1843168404Spjd{ 1844168404Spjd vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1845168404Spjd 1846168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1847219089Spjd zio_bad_cksum_t zbc; 1848219089Spjd raidz_map_t *rm = zio->io_vsd; 1849219089Spjd 1850168404Spjd mutex_enter(&vd->vdev_stat_lock); 1851168404Spjd vd->vdev_stat.vs_checksum_errors++; 1852168404Spjd mutex_exit(&vd->vdev_stat_lock); 1853219089Spjd 1854219089Spjd zbc.zbc_has_cksum = 0; 1855219089Spjd zbc.zbc_injected = rm->rm_ecksuminjected; 1856219089Spjd 1857219089Spjd zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1858219089Spjd rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1859219089Spjd &zbc); 1860168404Spjd } 1861219089Spjd} 1862168404Spjd 1863219089Spjd/* 1864219089Spjd * We keep track of whether or not there were any injected errors, so that 1865219089Spjd * any ereports we generate can note it. 1866219089Spjd */ 1867219089Spjdstatic int 1868219089Spjdraidz_checksum_verify(zio_t *zio) 1869219089Spjd{ 1870219089Spjd zio_bad_cksum_t zbc; 1871219089Spjd raidz_map_t *rm = zio->io_vsd; 1872219089Spjd 1873219089Spjd int ret = zio_checksum_error(zio, &zbc); 1874219089Spjd if (ret != 0 && zbc.zbc_injected != 0) 1875219089Spjd rm->rm_ecksuminjected = 1; 1876219089Spjd 1877219089Spjd return (ret); 1878168404Spjd} 1879168404Spjd 1880168404Spjd/* 1881168404Spjd * Generate the parity from the data columns. If we tried and were able to 1882168404Spjd * read the parity without error, verify that the generated parity matches the 1883168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 1884168404Spjd * number such failures. 1885168404Spjd */ 1886168404Spjdstatic int 1887168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1888168404Spjd{ 1889168404Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1890168404Spjd int c, ret = 0; 1891168404Spjd raidz_col_t *rc; 1892168404Spjd 1893255750Sdelphij blkptr_t *bp = zio->io_bp; 1894255750Sdelphij enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1895255750Sdelphij (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1896255750Sdelphij 1897255750Sdelphij if (checksum == ZIO_CHECKSUM_NOPARITY) 1898255750Sdelphij return (ret); 1899255750Sdelphij 1900168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1901168404Spjd rc = &rm->rm_col[c]; 1902168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1903168404Spjd continue; 1904168404Spjd orig[c] = zio_buf_alloc(rc->rc_size); 1905168404Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 1906168404Spjd } 1907168404Spjd 1908219089Spjd vdev_raidz_generate_parity(rm); 1909168404Spjd 1910168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1911168404Spjd rc = &rm->rm_col[c]; 1912168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1913168404Spjd continue; 1914168404Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1915219089Spjd raidz_checksum_error(zio, rc, orig[c]); 1916249195Smm rc->rc_error = SET_ERROR(ECKSUM); 1917168404Spjd ret++; 1918168404Spjd } 1919168404Spjd zio_buf_free(orig[c], rc->rc_size); 1920168404Spjd } 1921168404Spjd 1922168404Spjd return (ret); 1923168404Spjd} 1924168404Spjd 1925219089Spjd/* 1926219089Spjd * Keep statistics on all the ways that we used parity to correct data. 1927219089Spjd */ 1928219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1929168404Spjd 1930185029Spjdstatic int 1931185029Spjdvdev_raidz_worst_error(raidz_map_t *rm) 1932185029Spjd{ 1933185029Spjd int error = 0; 1934185029Spjd 1935185029Spjd for (int c = 0; c < rm->rm_cols; c++) 1936185029Spjd error = zio_worst_error(error, rm->rm_col[c].rc_error); 1937185029Spjd 1938185029Spjd return (error); 1939185029Spjd} 1940185029Spjd 1941219089Spjd/* 1942219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction. 1943219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into 1944219089Spjd * account how reconstruction is actually performed. For example, with 1945219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1946219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both 1947219089Spjd * cases we'd only use parity information in column 0. 1948219089Spjd */ 1949219089Spjdstatic int 1950219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1951219089Spjd{ 1952219089Spjd raidz_map_t *rm = zio->io_vsd; 1953219089Spjd raidz_col_t *rc; 1954219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1955219089Spjd int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1956219089Spjd int *tgts = &tstore[1]; 1957219089Spjd int current, next, i, c, n; 1958219089Spjd int code, ret = 0; 1959219089Spjd 1960219089Spjd ASSERT(total_errors < rm->rm_firstdatacol); 1961219089Spjd 1962219089Spjd /* 1963219089Spjd * This simplifies one edge condition. 1964219089Spjd */ 1965219089Spjd tgts[-1] = -1; 1966219089Spjd 1967219089Spjd for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1968219089Spjd /* 1969219089Spjd * Initialize the targets array by finding the first n columns 1970219089Spjd * that contain no error. 1971219089Spjd * 1972219089Spjd * If there were no data errors, we need to ensure that we're 1973219089Spjd * always explicitly attempting to reconstruct at least one 1974219089Spjd * data column. To do this, we simply push the highest target 1975219089Spjd * up into the data columns. 1976219089Spjd */ 1977219089Spjd for (c = 0, i = 0; i < n; i++) { 1978219089Spjd if (i == n - 1 && data_errors == 0 && 1979219089Spjd c < rm->rm_firstdatacol) { 1980219089Spjd c = rm->rm_firstdatacol; 1981219089Spjd } 1982219089Spjd 1983219089Spjd while (rm->rm_col[c].rc_error != 0) { 1984219089Spjd c++; 1985219089Spjd ASSERT3S(c, <, rm->rm_cols); 1986219089Spjd } 1987219089Spjd 1988219089Spjd tgts[i] = c++; 1989219089Spjd } 1990219089Spjd 1991219089Spjd /* 1992219089Spjd * Setting tgts[n] simplifies the other edge condition. 1993219089Spjd */ 1994219089Spjd tgts[n] = rm->rm_cols; 1995219089Spjd 1996219089Spjd /* 1997219089Spjd * These buffers were allocated in previous iterations. 1998219089Spjd */ 1999219089Spjd for (i = 0; i < n - 1; i++) { 2000219089Spjd ASSERT(orig[i] != NULL); 2001219089Spjd } 2002219089Spjd 2003219089Spjd orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2004219089Spjd 2005219089Spjd current = 0; 2006219089Spjd next = tgts[current]; 2007219089Spjd 2008219089Spjd while (current != n) { 2009219089Spjd tgts[current] = next; 2010219089Spjd current = 0; 2011219089Spjd 2012219089Spjd /* 2013219089Spjd * Save off the original data that we're going to 2014219089Spjd * attempt to reconstruct. 2015219089Spjd */ 2016219089Spjd for (i = 0; i < n; i++) { 2017219089Spjd ASSERT(orig[i] != NULL); 2018219089Spjd c = tgts[i]; 2019219089Spjd ASSERT3S(c, >=, 0); 2020219089Spjd ASSERT3S(c, <, rm->rm_cols); 2021219089Spjd rc = &rm->rm_col[c]; 2022219089Spjd bcopy(rc->rc_data, orig[i], rc->rc_size); 2023219089Spjd } 2024219089Spjd 2025219089Spjd /* 2026219089Spjd * Attempt a reconstruction and exit the outer loop on 2027219089Spjd * success. 2028219089Spjd */ 2029219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2030219089Spjd if (raidz_checksum_verify(zio) == 0) { 2031219089Spjd atomic_inc_64(&raidz_corrected[code]); 2032219089Spjd 2033219089Spjd for (i = 0; i < n; i++) { 2034219089Spjd c = tgts[i]; 2035219089Spjd rc = &rm->rm_col[c]; 2036219089Spjd ASSERT(rc->rc_error == 0); 2037219089Spjd if (rc->rc_tried) 2038219089Spjd raidz_checksum_error(zio, rc, 2039219089Spjd orig[i]); 2040249195Smm rc->rc_error = SET_ERROR(ECKSUM); 2041219089Spjd } 2042219089Spjd 2043219089Spjd ret = code; 2044219089Spjd goto done; 2045219089Spjd } 2046219089Spjd 2047219089Spjd /* 2048219089Spjd * Restore the original data. 2049219089Spjd */ 2050219089Spjd for (i = 0; i < n; i++) { 2051219089Spjd c = tgts[i]; 2052219089Spjd rc = &rm->rm_col[c]; 2053219089Spjd bcopy(orig[i], rc->rc_data, rc->rc_size); 2054219089Spjd } 2055219089Spjd 2056219089Spjd do { 2057219089Spjd /* 2058219089Spjd * Find the next valid column after the current 2059219089Spjd * position.. 2060219089Spjd */ 2061219089Spjd for (next = tgts[current] + 1; 2062219089Spjd next < rm->rm_cols && 2063219089Spjd rm->rm_col[next].rc_error != 0; next++) 2064219089Spjd continue; 2065219089Spjd 2066219089Spjd ASSERT(next <= tgts[current + 1]); 2067219089Spjd 2068219089Spjd /* 2069219089Spjd * If that spot is available, we're done here. 2070219089Spjd */ 2071219089Spjd if (next != tgts[current + 1]) 2072219089Spjd break; 2073219089Spjd 2074219089Spjd /* 2075219089Spjd * Otherwise, find the next valid column after 2076219089Spjd * the previous position. 2077219089Spjd */ 2078219089Spjd for (c = tgts[current - 1] + 1; 2079219089Spjd rm->rm_col[c].rc_error != 0; c++) 2080219089Spjd continue; 2081219089Spjd 2082219089Spjd tgts[current] = c; 2083219089Spjd current++; 2084219089Spjd 2085219089Spjd } while (current != n); 2086219089Spjd } 2087219089Spjd } 2088219089Spjd n--; 2089219089Spjddone: 2090219089Spjd for (i = 0; i < n; i++) { 2091219089Spjd zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2092219089Spjd } 2093219089Spjd 2094219089Spjd return (ret); 2095219089Spjd} 2096219089Spjd 2097251629Sdelphij/* 2098251629Sdelphij * Complete an IO operation on a RAIDZ VDev 2099251629Sdelphij * 2100251629Sdelphij * Outline: 2101251629Sdelphij * - For write operations: 2102251629Sdelphij * 1. Check for errors on the child IOs. 2103251629Sdelphij * 2. Return, setting an error code if too few child VDevs were written 2104251629Sdelphij * to reconstruct the data later. Note that partial writes are 2105251629Sdelphij * considered successful if they can be reconstructed at all. 2106251629Sdelphij * - For read operations: 2107251629Sdelphij * 1. Check for errors on the child IOs. 2108251629Sdelphij * 2. If data errors occurred: 2109251629Sdelphij * a. Try to reassemble the data from the parity available. 2110251629Sdelphij * b. If we haven't yet read the parity drives, read them now. 2111251629Sdelphij * c. If all parity drives have been read but the data still doesn't 2112251629Sdelphij * reassemble with a correct checksum, then try combinatorial 2113251629Sdelphij * reconstruction. 2114251629Sdelphij * d. If that doesn't work, return an error. 2115251629Sdelphij * 3. If there were unexpected errors or this is a resilver operation, 2116251629Sdelphij * rewrite the vdevs that had errors. 2117251629Sdelphij */ 2118168404Spjdstatic void 2119168404Spjdvdev_raidz_io_done(zio_t *zio) 2120168404Spjd{ 2121168404Spjd vdev_t *vd = zio->io_vd; 2122168404Spjd vdev_t *cvd; 2123168404Spjd raidz_map_t *rm = zio->io_vsd; 2124219089Spjd raidz_col_t *rc; 2125168404Spjd int unexpected_errors = 0; 2126168404Spjd int parity_errors = 0; 2127168404Spjd int parity_untried = 0; 2128168404Spjd int data_errors = 0; 2129185029Spjd int total_errors = 0; 2130219089Spjd int n, c; 2131219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 2132219089Spjd int code; 2133168404Spjd 2134168404Spjd ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2135168404Spjd 2136168404Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2137168404Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2138168404Spjd 2139168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2140168404Spjd rc = &rm->rm_col[c]; 2141168404Spjd 2142168404Spjd if (rc->rc_error) { 2143185029Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2144168404Spjd 2145168404Spjd if (c < rm->rm_firstdatacol) 2146168404Spjd parity_errors++; 2147168404Spjd else 2148168404Spjd data_errors++; 2149168404Spjd 2150168404Spjd if (!rc->rc_skipped) 2151168404Spjd unexpected_errors++; 2152168404Spjd 2153185029Spjd total_errors++; 2154168404Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2155168404Spjd parity_untried++; 2156168404Spjd } 2157168404Spjd } 2158168404Spjd 2159168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 2160168404Spjd /* 2161185029Spjd * XXX -- for now, treat partial writes as a success. 2162185029Spjd * (If we couldn't write enough columns to reconstruct 2163185029Spjd * the data, the I/O failed. Otherwise, good enough.) 2164185029Spjd * 2165185029Spjd * Now that we support write reallocation, it would be better 2166185029Spjd * to treat partial failure as real failure unless there are 2167185029Spjd * no non-degraded top-level vdevs left, and not update DTLs 2168185029Spjd * if we intend to reallocate. 2169168404Spjd */ 2170168404Spjd /* XXPOLICY */ 2171185029Spjd if (total_errors > rm->rm_firstdatacol) 2172185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2173168404Spjd 2174168404Spjd return; 2175240868Spjd } else if (zio->io_type == ZIO_TYPE_FREE) { 2176240868Spjd return; 2177168404Spjd } 2178168404Spjd 2179168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 2180168404Spjd /* 2181168404Spjd * There are three potential phases for a read: 2182168404Spjd * 1. produce valid data from the columns read 2183168404Spjd * 2. read all disks and try again 2184168404Spjd * 3. perform combinatorial reconstruction 2185168404Spjd * 2186168404Spjd * Each phase is progressively both more expensive and less likely to 2187168404Spjd * occur. If we encounter more errors than we can repair or all phases 2188168404Spjd * fail, we have no choice but to return an error. 2189168404Spjd */ 2190168404Spjd 2191168404Spjd /* 2192168404Spjd * If the number of errors we saw was correctable -- less than or equal 2193168404Spjd * to the number of parity disks read -- attempt to produce data that 2194168404Spjd * has a valid checksum. Naturally, this case applies in the absence of 2195168404Spjd * any errors. 2196168404Spjd */ 2197185029Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2198219089Spjd if (data_errors == 0) { 2199219089Spjd if (raidz_checksum_verify(zio) == 0) { 2200168738Spjd /* 2201168738Spjd * If we read parity information (unnecessarily 2202168738Spjd * as it happens since no reconstruction was 2203168738Spjd * needed) regenerate and verify the parity. 2204168738Spjd * We also regenerate parity when resilvering 2205168738Spjd * so we can write it out to the failed device 2206168738Spjd * later. 2207168738Spjd */ 2208168404Spjd if (parity_errors + parity_untried < 2209168738Spjd rm->rm_firstdatacol || 2210168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2211168404Spjd n = raidz_parity_verify(zio, rm); 2212168404Spjd unexpected_errors += n; 2213168404Spjd ASSERT(parity_errors + n <= 2214168404Spjd rm->rm_firstdatacol); 2215168404Spjd } 2216168404Spjd goto done; 2217168404Spjd } 2218219089Spjd } else { 2219168404Spjd /* 2220168404Spjd * We either attempt to read all the parity columns or 2221168404Spjd * none of them. If we didn't try to read parity, we 2222168404Spjd * wouldn't be here in the correctable case. There must 2223168404Spjd * also have been fewer parity errors than parity 2224168404Spjd * columns or, again, we wouldn't be in this code path. 2225168404Spjd */ 2226168404Spjd ASSERT(parity_untried == 0); 2227168404Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 2228168404Spjd 2229168404Spjd /* 2230219089Spjd * Identify the data columns that reported an error. 2231168404Spjd */ 2232219089Spjd n = 0; 2233168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2234168404Spjd rc = &rm->rm_col[c]; 2235219089Spjd if (rc->rc_error != 0) { 2236219089Spjd ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2237219089Spjd tgts[n++] = c; 2238219089Spjd } 2239168404Spjd } 2240168404Spjd 2241219089Spjd ASSERT(rm->rm_firstdatacol >= n); 2242168404Spjd 2243219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2244168404Spjd 2245219089Spjd if (raidz_checksum_verify(zio) == 0) { 2246219089Spjd atomic_inc_64(&raidz_corrected[code]); 2247219089Spjd 2248168404Spjd /* 2249219089Spjd * If we read more parity disks than were used 2250219089Spjd * for reconstruction, confirm that the other 2251219089Spjd * parity disks produced correct data. This 2252219089Spjd * routine is suboptimal in that it regenerates 2253219089Spjd * the parity that we already used in addition 2254219089Spjd * to the parity that we're attempting to 2255219089Spjd * verify, but this should be a relatively 2256219089Spjd * uncommon case, and can be optimized if it 2257219089Spjd * becomes a problem. Note that we regenerate 2258219089Spjd * parity when resilvering so we can write it 2259219089Spjd * out to failed devices later. 2260168404Spjd */ 2261219089Spjd if (parity_errors < rm->rm_firstdatacol - n || 2262168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2263168404Spjd n = raidz_parity_verify(zio, rm); 2264168404Spjd unexpected_errors += n; 2265168404Spjd ASSERT(parity_errors + n <= 2266168404Spjd rm->rm_firstdatacol); 2267168404Spjd } 2268168404Spjd 2269168404Spjd goto done; 2270168404Spjd } 2271168404Spjd } 2272168404Spjd } 2273168404Spjd 2274168404Spjd /* 2275168404Spjd * This isn't a typical situation -- either we got a read error or 2276168404Spjd * a child silently returned bad data. Read every block so we can 2277168404Spjd * try again with as much data and parity as we can track down. If 2278168404Spjd * we've already been through once before, all children will be marked 2279168404Spjd * as tried so we'll proceed to combinatorial reconstruction. 2280168404Spjd */ 2281168404Spjd unexpected_errors = 1; 2282168404Spjd rm->rm_missingdata = 0; 2283168404Spjd rm->rm_missingparity = 0; 2284168404Spjd 2285168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2286168404Spjd if (rm->rm_col[c].rc_tried) 2287168404Spjd continue; 2288168404Spjd 2289168404Spjd zio_vdev_io_redone(zio); 2290168404Spjd do { 2291168404Spjd rc = &rm->rm_col[c]; 2292168404Spjd if (rc->rc_tried) 2293168404Spjd continue; 2294168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, 2295168404Spjd vd->vdev_child[rc->rc_devidx], 2296168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2297185029Spjd zio->io_type, zio->io_priority, 0, 2298168404Spjd vdev_raidz_child_done, rc)); 2299168404Spjd } while (++c < rm->rm_cols); 2300185029Spjd 2301168404Spjd return; 2302168404Spjd } 2303168404Spjd 2304168404Spjd /* 2305168404Spjd * At this point we've attempted to reconstruct the data given the 2306168404Spjd * errors we detected, and we've attempted to read all columns. There 2307168404Spjd * must, therefore, be one or more additional problems -- silent errors 2308168404Spjd * resulting in invalid data rather than explicit I/O errors resulting 2309219089Spjd * in absent data. We check if there is enough additional data to 2310219089Spjd * possibly reconstruct the data and then perform combinatorial 2311219089Spjd * reconstruction over all possible combinations. If that fails, 2312219089Spjd * we're cooked. 2313168404Spjd */ 2314219089Spjd if (total_errors > rm->rm_firstdatacol) { 2315185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2316168404Spjd 2317219089Spjd } else if (total_errors < rm->rm_firstdatacol && 2318219089Spjd (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2319168404Spjd /* 2320219089Spjd * If we didn't use all the available parity for the 2321219089Spjd * combinatorial reconstruction, verify that the remaining 2322219089Spjd * parity is correct. 2323168404Spjd */ 2324219089Spjd if (code != (1 << rm->rm_firstdatacol) - 1) 2325219089Spjd (void) raidz_parity_verify(zio, rm); 2326219089Spjd } else { 2327168404Spjd /* 2328219089Spjd * We're here because either: 2329219089Spjd * 2330219089Spjd * total_errors == rm_first_datacol, or 2331219089Spjd * vdev_raidz_combrec() failed 2332219089Spjd * 2333219089Spjd * In either case, there is enough bad data to prevent 2334219089Spjd * reconstruction. 2335219089Spjd * 2336219089Spjd * Start checksum ereports for all children which haven't 2337219089Spjd * failed, and the IO wasn't speculative. 2338168404Spjd */ 2339249195Smm zio->io_error = SET_ERROR(ECKSUM); 2340168404Spjd 2341219089Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2342219089Spjd for (c = 0; c < rm->rm_cols; c++) { 2343219089Spjd rc = &rm->rm_col[c]; 2344219089Spjd if (rc->rc_error == 0) { 2345219089Spjd zio_bad_cksum_t zbc; 2346219089Spjd zbc.zbc_has_cksum = 0; 2347219089Spjd zbc.zbc_injected = 2348219089Spjd rm->rm_ecksuminjected; 2349168404Spjd 2350219089Spjd zfs_ereport_start_checksum( 2351219089Spjd zio->io_spa, 2352219089Spjd vd->vdev_child[rc->rc_devidx], 2353219089Spjd zio, rc->rc_offset, rc->rc_size, 2354219089Spjd (void *)(uintptr_t)c, &zbc); 2355168404Spjd } 2356168404Spjd } 2357168404Spjd } 2358168404Spjd } 2359168404Spjd 2360168404Spjddone: 2361168404Spjd zio_checksum_verified(zio); 2362168404Spjd 2363209962Smm if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2364168404Spjd (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2365168404Spjd /* 2366168404Spjd * Use the good data we have in hand to repair damaged children. 2367168404Spjd */ 2368168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2369168404Spjd rc = &rm->rm_col[c]; 2370168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 2371168404Spjd 2372168404Spjd if (rc->rc_error == 0) 2373168404Spjd continue; 2374168404Spjd 2375185029Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2376168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2377260763Savg ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2378209962Smm ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2379209962Smm ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2380168404Spjd } 2381168404Spjd } 2382168404Spjd} 2383168404Spjd 2384168404Spjdstatic void 2385168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2386168404Spjd{ 2387168404Spjd if (faulted > vd->vdev_nparity) 2388168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2389168404Spjd VDEV_AUX_NO_REPLICAS); 2390168404Spjd else if (degraded + faulted != 0) 2391168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2392168404Spjd else 2393168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2394168404Spjd} 2395168404Spjd 2396168404Spjdvdev_ops_t vdev_raidz_ops = { 2397168404Spjd vdev_raidz_open, 2398168404Spjd vdev_raidz_close, 2399168404Spjd vdev_raidz_asize, 2400168404Spjd vdev_raidz_io_start, 2401168404Spjd vdev_raidz_io_done, 2402168404Spjd vdev_raidz_state_change, 2403219089Spjd NULL, 2404219089Spjd NULL, 2405168404Spjd VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2406168404Spjd B_FALSE /* not a leaf vdev */ 2407168404Spjd}; 2408