1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24297078Smav * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved. 26297112Smav * Copyright (c) 2014 Integros [integros.com] 27168404Spjd */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/spa.h> 31168404Spjd#include <sys/vdev_impl.h> 32255750Sdelphij#ifdef illumos 33255750Sdelphij#include <sys/vdev_disk.h> 34255750Sdelphij#endif 35255750Sdelphij#include <sys/vdev_file.h> 36255750Sdelphij#include <sys/vdev_raidz.h> 37168404Spjd#include <sys/zio.h> 38168404Spjd#include <sys/zio_checksum.h> 39168404Spjd#include <sys/fs/zfs.h> 40168404Spjd#include <sys/fm/fs/zfs.h> 41255750Sdelphij#include <sys/bio.h> 42168404Spjd 43168404Spjd/* 44168404Spjd * Virtual device vector for RAID-Z. 45168404Spjd * 46219089Spjd * This vdev supports single, double, and triple parity. For single parity, 47219089Spjd * we use a simple XOR of all the data columns. For double or triple parity, 48219089Spjd * we use a special case of Reed-Solomon coding. This extends the 49219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by 50219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for 51219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the 52219089Spjd * former is also based. The latter is designed to provide higher performance 53219089Spjd * for writes. 54168404Spjd * 55219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then 56219089Spjd * amended six years later identifying a critical flaw that invalidates its 57219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to 58219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to 59219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding 60219089Spjd * is viable, but the additional complexity means that write performance will 61219089Spjd * suffer. 62219089Spjd * 63219089Spjd * All of the methods above operate on a Galois field, defined over the 64219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements 65219089Spjd * can be expressed with a single byte. Briefly, the operations on the 66219089Spjd * field are defined as follows: 67219089Spjd * 68168404Spjd * o addition (+) is represented by a bitwise XOR 69168404Spjd * o subtraction (-) is therefore identical to addition: A + B = A - B 70168404Spjd * o multiplication of A by 2 is defined by the following bitwise expression: 71251631Sdelphij * 72168404Spjd * (A * 2)_7 = A_6 73168404Spjd * (A * 2)_6 = A_5 74168404Spjd * (A * 2)_5 = A_4 75168404Spjd * (A * 2)_4 = A_3 + A_7 76168404Spjd * (A * 2)_3 = A_2 + A_7 77168404Spjd * (A * 2)_2 = A_1 + A_7 78168404Spjd * (A * 2)_1 = A_0 79168404Spjd * (A * 2)_0 = A_7 80168404Spjd * 81168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). 82219089Spjd * As an aside, this multiplication is derived from the error correcting 83219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. 84168404Spjd * 85168404Spjd * Observe that any number in the field (except for 0) can be expressed as a 86168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of 87168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can 88168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather 89219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore 90219089Spjd * A ^ (255 - 1) = A^254. 91168404Spjd * 92219089Spjd * The up-to-three parity columns, P, Q, R over several data columns, 93219089Spjd * D_0, ... D_n-1, can be expressed by field operations: 94168404Spjd * 95168404Spjd * P = D_0 + D_1 + ... + D_n-2 + D_n-1 96168404Spjd * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 97168404Spjd * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 98219089Spjd * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 99219089Spjd * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 100168404Spjd * 101219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival 102219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly- 103219089Spjd * independent coefficients. (There are no additional coefficients that have 104219089Spjd * this property which is why the uncorrected Plank method breaks down.) 105219089Spjd * 106219089Spjd * See the reconstruction code below for how P, Q and R can used individually 107219089Spjd * or in concert to recover missing data columns. 108168404Spjd */ 109168404Spjd 110168404Spjdtypedef struct raidz_col { 111168404Spjd uint64_t rc_devidx; /* child device index for I/O */ 112168404Spjd uint64_t rc_offset; /* device offset */ 113168404Spjd uint64_t rc_size; /* I/O size */ 114168404Spjd void *rc_data; /* I/O data */ 115219089Spjd void *rc_gdata; /* used to store the "good" version */ 116168404Spjd int rc_error; /* I/O error for this device */ 117168404Spjd uint8_t rc_tried; /* Did we attempt this I/O column? */ 118168404Spjd uint8_t rc_skipped; /* Did we skip this I/O column? */ 119168404Spjd} raidz_col_t; 120168404Spjd 121168404Spjdtypedef struct raidz_map { 122219089Spjd uint64_t rm_cols; /* Regular column count */ 123219089Spjd uint64_t rm_scols; /* Count including skipped columns */ 124168404Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 125168404Spjd uint64_t rm_asize; /* Actual total I/O size */ 126168404Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 127168404Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 128168404Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 129219089Spjd uint64_t rm_nskip; /* Skipped sectors for padding */ 130251631Sdelphij uint64_t rm_skipstart; /* Column index of padding start */ 131219089Spjd void *rm_datacopy; /* rm_asize-buffer of copied data */ 132219089Spjd uintptr_t rm_reports; /* # of referencing checksum reports */ 133219089Spjd uint8_t rm_freed; /* map no longer has referencing ZIO */ 134219089Spjd uint8_t rm_ecksuminjected; /* checksum error was injected */ 135168404Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 136168404Spjd} raidz_map_t; 137168404Spjd 138168404Spjd#define VDEV_RAIDZ_P 0 139168404Spjd#define VDEV_RAIDZ_Q 1 140219089Spjd#define VDEV_RAIDZ_R 2 141168404Spjd 142219089Spjd#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 143219089Spjd#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 144168404Spjd 145219089Spjd/* 146219089Spjd * We provide a mechanism to perform the field multiplication operation on a 147219089Spjd * 64-bit value all at once rather than a byte at a time. This works by 148219089Spjd * creating a mask from the top bit in each byte and using that to 149219089Spjd * conditionally apply the XOR of 0x1d. 150219089Spjd */ 151219089Spjd#define VDEV_RAIDZ_64MUL_2(x, mask) \ 152219089Spjd{ \ 153219089Spjd (mask) = (x) & 0x8080808080808080ULL; \ 154219089Spjd (mask) = ((mask) << 1) - ((mask) >> 7); \ 155219089Spjd (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 156219089Spjd ((mask) & 0x1d1d1d1d1d1d1d1d); \ 157219089Spjd} 158168404Spjd 159219089Spjd#define VDEV_RAIDZ_64MUL_4(x, mask) \ 160219089Spjd{ \ 161219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 162219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 163219089Spjd} 164219089Spjd 165255750Sdelphij#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) 166255750Sdelphij 167168404Spjd/* 168219089Spjd * Force reconstruction to use the general purpose method. 169219089Spjd */ 170219089Spjdint vdev_raidz_default_to_general; 171219089Spjd 172251631Sdelphij/* Powers of 2 in the Galois field defined above. */ 173168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = { 174168404Spjd 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 175168404Spjd 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 176168404Spjd 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 177168404Spjd 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 178168404Spjd 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 179168404Spjd 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 180168404Spjd 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 181168404Spjd 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 182168404Spjd 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 183168404Spjd 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 184168404Spjd 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 185168404Spjd 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 186168404Spjd 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 187168404Spjd 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 188168404Spjd 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 189168404Spjd 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 190168404Spjd 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 191168404Spjd 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 192168404Spjd 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 193168404Spjd 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 194168404Spjd 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 195168404Spjd 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 196168404Spjd 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 197168404Spjd 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 198168404Spjd 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 199168404Spjd 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 200168404Spjd 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 201168404Spjd 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 202168404Spjd 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 203168404Spjd 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 204168404Spjd 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 205168404Spjd 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 206168404Spjd}; 207251631Sdelphij/* Logs of 2 in the Galois field defined above. */ 208168404Spjdstatic const uint8_t vdev_raidz_log2[256] = { 209168404Spjd 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 210168404Spjd 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 211168404Spjd 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 212168404Spjd 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 213168404Spjd 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 214168404Spjd 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 215168404Spjd 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 216168404Spjd 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 217168404Spjd 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 218168404Spjd 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 219168404Spjd 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 220168404Spjd 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 221168404Spjd 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 222168404Spjd 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 223168404Spjd 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 224168404Spjd 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 225168404Spjd 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 226168404Spjd 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 227168404Spjd 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 228168404Spjd 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 229168404Spjd 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 230168404Spjd 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 231168404Spjd 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 232168404Spjd 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 233168404Spjd 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 234168404Spjd 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 235168404Spjd 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 236168404Spjd 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 237168404Spjd 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 238168404Spjd 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 239168404Spjd 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 240168404Spjd 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 241168404Spjd}; 242168404Spjd 243219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm); 244219089Spjd 245168404Spjd/* 246168404Spjd * Multiply a given number by 2 raised to the given power. 247168404Spjd */ 248168404Spjdstatic uint8_t 249168404Spjdvdev_raidz_exp2(uint_t a, int exp) 250168404Spjd{ 251168404Spjd if (a == 0) 252168404Spjd return (0); 253168404Spjd 254168404Spjd ASSERT(exp >= 0); 255168404Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 256168404Spjd 257168404Spjd exp += vdev_raidz_log2[a]; 258168404Spjd if (exp > 255) 259168404Spjd exp -= 255; 260168404Spjd 261168404Spjd return (vdev_raidz_pow2[exp]); 262168404Spjd} 263168404Spjd 264185029Spjdstatic void 265219089Spjdvdev_raidz_map_free(raidz_map_t *rm) 266185029Spjd{ 267185029Spjd int c; 268219089Spjd size_t size; 269185029Spjd 270219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 271240868Spjd if (rm->rm_col[c].rc_data != NULL) 272240868Spjd zio_buf_free(rm->rm_col[c].rc_data, 273240868Spjd rm->rm_col[c].rc_size); 274185029Spjd 275219089Spjd if (rm->rm_col[c].rc_gdata != NULL) 276219089Spjd zio_buf_free(rm->rm_col[c].rc_gdata, 277219089Spjd rm->rm_col[c].rc_size); 278219089Spjd } 279219089Spjd 280219089Spjd size = 0; 281219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 282219089Spjd size += rm->rm_col[c].rc_size; 283219089Spjd 284219089Spjd if (rm->rm_datacopy != NULL) 285219089Spjd zio_buf_free(rm->rm_datacopy, size); 286219089Spjd 287219089Spjd kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 288185029Spjd} 289185029Spjd 290219089Spjdstatic void 291219089Spjdvdev_raidz_map_free_vsd(zio_t *zio) 292219089Spjd{ 293219089Spjd raidz_map_t *rm = zio->io_vsd; 294219089Spjd 295240415Smm ASSERT0(rm->rm_freed); 296219089Spjd rm->rm_freed = 1; 297219089Spjd 298219089Spjd if (rm->rm_reports == 0) 299219089Spjd vdev_raidz_map_free(rm); 300219089Spjd} 301219089Spjd 302219089Spjd/*ARGSUSED*/ 303219089Spjdstatic void 304219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored) 305219089Spjd{ 306219089Spjd raidz_map_t *rm = arg; 307219089Spjd 308219089Spjd ASSERT3U(rm->rm_reports, >, 0); 309219089Spjd 310219089Spjd if (--rm->rm_reports == 0 && rm->rm_freed != 0) 311219089Spjd vdev_raidz_map_free(rm); 312219089Spjd} 313219089Spjd 314219089Spjdstatic void 315219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) 316219089Spjd{ 317219089Spjd raidz_map_t *rm = zcr->zcr_cbdata; 318219089Spjd size_t c = zcr->zcr_cbinfo; 319219089Spjd size_t x; 320219089Spjd 321219089Spjd const char *good = NULL; 322219089Spjd const char *bad = rm->rm_col[c].rc_data; 323219089Spjd 324219089Spjd if (good_data == NULL) { 325219089Spjd zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); 326219089Spjd return; 327219089Spjd } 328219089Spjd 329219089Spjd if (c < rm->rm_firstdatacol) { 330219089Spjd /* 331219089Spjd * The first time through, calculate the parity blocks for 332219089Spjd * the good data (this relies on the fact that the good 333219089Spjd * data never changes for a given logical ZIO) 334219089Spjd */ 335219089Spjd if (rm->rm_col[0].rc_gdata == NULL) { 336219089Spjd char *bad_parity[VDEV_RAIDZ_MAXPARITY]; 337219089Spjd char *buf; 338219089Spjd 339219089Spjd /* 340219089Spjd * Set up the rm_col[]s to generate the parity for 341219089Spjd * good_data, first saving the parity bufs and 342219089Spjd * replacing them with buffers to hold the result. 343219089Spjd */ 344219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) { 345219089Spjd bad_parity[x] = rm->rm_col[x].rc_data; 346219089Spjd rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = 347219089Spjd zio_buf_alloc(rm->rm_col[x].rc_size); 348219089Spjd } 349219089Spjd 350219089Spjd /* fill in the data columns from good_data */ 351219089Spjd buf = (char *)good_data; 352219089Spjd for (; x < rm->rm_cols; x++) { 353219089Spjd rm->rm_col[x].rc_data = buf; 354219089Spjd buf += rm->rm_col[x].rc_size; 355219089Spjd } 356219089Spjd 357219089Spjd /* 358219089Spjd * Construct the parity from the good data. 359219089Spjd */ 360219089Spjd vdev_raidz_generate_parity(rm); 361219089Spjd 362219089Spjd /* restore everything back to its original state */ 363219089Spjd for (x = 0; x < rm->rm_firstdatacol; x++) 364219089Spjd rm->rm_col[x].rc_data = bad_parity[x]; 365219089Spjd 366219089Spjd buf = rm->rm_datacopy; 367219089Spjd for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { 368219089Spjd rm->rm_col[x].rc_data = buf; 369219089Spjd buf += rm->rm_col[x].rc_size; 370219089Spjd } 371219089Spjd } 372219089Spjd 373219089Spjd ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); 374219089Spjd good = rm->rm_col[c].rc_gdata; 375219089Spjd } else { 376219089Spjd /* adjust good_data to point at the start of our column */ 377219089Spjd good = good_data; 378219089Spjd 379219089Spjd for (x = rm->rm_firstdatacol; x < c; x++) 380219089Spjd good += rm->rm_col[x].rc_size; 381219089Spjd } 382219089Spjd 383219089Spjd /* we drop the ereport if it ends up that the data was good */ 384219089Spjd zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); 385219089Spjd} 386219089Spjd 387219089Spjd/* 388219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called 389219089Spjd * below when our read operation fails completely. The main point 390219089Spjd * is to keep a copy of everything we read from disk, so that at 391219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data. 392219089Spjd */ 393219089Spjdstatic void 394219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) 395219089Spjd{ 396219089Spjd size_t c = (size_t)(uintptr_t)arg; 397219089Spjd caddr_t buf; 398219089Spjd 399219089Spjd raidz_map_t *rm = zio->io_vsd; 400219089Spjd size_t size; 401219089Spjd 402219089Spjd /* set up the report and bump the refcount */ 403219089Spjd zcr->zcr_cbdata = rm; 404219089Spjd zcr->zcr_cbinfo = c; 405219089Spjd zcr->zcr_finish = vdev_raidz_cksum_finish; 406219089Spjd zcr->zcr_free = vdev_raidz_cksum_free; 407219089Spjd 408219089Spjd rm->rm_reports++; 409219089Spjd ASSERT3U(rm->rm_reports, >, 0); 410219089Spjd 411219089Spjd if (rm->rm_datacopy != NULL) 412219089Spjd return; 413219089Spjd 414219089Spjd /* 415219089Spjd * It's the first time we're called for this raidz_map_t, so we need 416219089Spjd * to copy the data aside; there's no guarantee that our zio's buffer 417219089Spjd * won't be re-used for something else. 418219089Spjd * 419219089Spjd * Our parity data is already in separate buffers, so there's no need 420219089Spjd * to copy them. 421219089Spjd */ 422219089Spjd 423219089Spjd size = 0; 424219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) 425219089Spjd size += rm->rm_col[c].rc_size; 426219089Spjd 427219089Spjd buf = rm->rm_datacopy = zio_buf_alloc(size); 428219089Spjd 429219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 430219089Spjd raidz_col_t *col = &rm->rm_col[c]; 431219089Spjd 432219089Spjd bcopy(col->rc_data, buf, col->rc_size); 433219089Spjd col->rc_data = buf; 434219089Spjd 435219089Spjd buf += col->rc_size; 436219089Spjd } 437219089Spjd ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); 438219089Spjd} 439219089Spjd 440219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = { 441219089Spjd vdev_raidz_map_free_vsd, 442219089Spjd vdev_raidz_cksum_report 443219089Spjd}; 444219089Spjd 445251629Sdelphij/* 446251629Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is 447251629Sdelphij * the number of children in the target vdev. 448251629Sdelphij */ 449168404Spjdstatic raidz_map_t * 450255750Sdelphijvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, 451255750Sdelphij uint64_t unit_shift, uint64_t dcols, uint64_t nparity) 452168404Spjd{ 453168404Spjd raidz_map_t *rm; 454251629Sdelphij /* The starting RAIDZ (parent) vdev sector of the block. */ 455255750Sdelphij uint64_t b = offset >> unit_shift; 456251629Sdelphij /* The zio's size in units of the vdev's minimum sector size. */ 457255750Sdelphij uint64_t s = size >> unit_shift; 458251629Sdelphij /* The first column for this stripe. */ 459168404Spjd uint64_t f = b % dcols; 460251629Sdelphij /* The starting byte offset on each child vdev. */ 461168404Spjd uint64_t o = (b / dcols) << unit_shift; 462219089Spjd uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 463168404Spjd 464251629Sdelphij /* 465251629Sdelphij * "Quotient": The number of data sectors for this stripe on all but 466251629Sdelphij * the "big column" child vdevs that also contain "remainder" data. 467251629Sdelphij */ 468168404Spjd q = s / (dcols - nparity); 469251629Sdelphij 470251629Sdelphij /* 471251629Sdelphij * "Remainder": The number of partial stripe data sectors in this I/O. 472251629Sdelphij * This will add a sector to some, but not all, child vdevs. 473251629Sdelphij */ 474168404Spjd r = s - q * (dcols - nparity); 475251629Sdelphij 476251629Sdelphij /* The number of "big columns" - those which contain remainder data. */ 477168404Spjd bc = (r == 0 ? 0 : r + nparity); 478251629Sdelphij 479251629Sdelphij /* 480251629Sdelphij * The total number of data and parity sectors associated with 481251629Sdelphij * this I/O. 482251629Sdelphij */ 483219089Spjd tot = s + nparity * (q + (r == 0 ? 0 : 1)); 484168404Spjd 485251629Sdelphij /* acols: The columns that will be accessed. */ 486251629Sdelphij /* scols: The columns that will be accessed or skipped. */ 487219089Spjd if (q == 0) { 488251629Sdelphij /* Our I/O request doesn't span all child vdevs. */ 489219089Spjd acols = bc; 490219089Spjd scols = MIN(dcols, roundup(bc, nparity + 1)); 491219089Spjd } else { 492219089Spjd acols = dcols; 493219089Spjd scols = dcols; 494219089Spjd } 495168404Spjd 496219089Spjd ASSERT3U(acols, <=, scols); 497168404Spjd 498219089Spjd rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); 499219089Spjd 500168404Spjd rm->rm_cols = acols; 501219089Spjd rm->rm_scols = scols; 502168404Spjd rm->rm_bigcols = bc; 503219089Spjd rm->rm_skipstart = bc; 504168404Spjd rm->rm_missingdata = 0; 505168404Spjd rm->rm_missingparity = 0; 506168404Spjd rm->rm_firstdatacol = nparity; 507219089Spjd rm->rm_datacopy = NULL; 508219089Spjd rm->rm_reports = 0; 509219089Spjd rm->rm_freed = 0; 510219089Spjd rm->rm_ecksuminjected = 0; 511168404Spjd 512219089Spjd asize = 0; 513219089Spjd 514219089Spjd for (c = 0; c < scols; c++) { 515168404Spjd col = f + c; 516168404Spjd coff = o; 517168404Spjd if (col >= dcols) { 518168404Spjd col -= dcols; 519168404Spjd coff += 1ULL << unit_shift; 520168404Spjd } 521168404Spjd rm->rm_col[c].rc_devidx = col; 522168404Spjd rm->rm_col[c].rc_offset = coff; 523168404Spjd rm->rm_col[c].rc_data = NULL; 524219089Spjd rm->rm_col[c].rc_gdata = NULL; 525168404Spjd rm->rm_col[c].rc_error = 0; 526168404Spjd rm->rm_col[c].rc_tried = 0; 527168404Spjd rm->rm_col[c].rc_skipped = 0; 528219089Spjd 529219089Spjd if (c >= acols) 530219089Spjd rm->rm_col[c].rc_size = 0; 531219089Spjd else if (c < bc) 532219089Spjd rm->rm_col[c].rc_size = (q + 1) << unit_shift; 533219089Spjd else 534219089Spjd rm->rm_col[c].rc_size = q << unit_shift; 535219089Spjd 536219089Spjd asize += rm->rm_col[c].rc_size; 537168404Spjd } 538168404Spjd 539219089Spjd ASSERT3U(asize, ==, tot << unit_shift); 540219089Spjd rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 541219089Spjd rm->rm_nskip = roundup(tot, nparity + 1) - tot; 542219089Spjd ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 543219089Spjd ASSERT3U(rm->rm_nskip, <=, nparity); 544168404Spjd 545255750Sdelphij if (!dofree) { 546240868Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 547240868Spjd rm->rm_col[c].rc_data = 548240868Spjd zio_buf_alloc(rm->rm_col[c].rc_size); 549240868Spjd } 550168404Spjd 551255750Sdelphij rm->rm_col[c].rc_data = data; 552168404Spjd 553240868Spjd for (c = c + 1; c < acols; c++) { 554240868Spjd rm->rm_col[c].rc_data = 555240868Spjd (char *)rm->rm_col[c - 1].rc_data + 556240868Spjd rm->rm_col[c - 1].rc_size; 557240868Spjd } 558240868Spjd } 559168404Spjd 560168404Spjd /* 561168404Spjd * If all data stored spans all columns, there's a danger that parity 562168404Spjd * will always be on the same device and, since parity isn't read 563168404Spjd * during normal operation, that that device's I/O bandwidth won't be 564168404Spjd * used effectively. We therefore switch the parity every 1MB. 565168404Spjd * 566168404Spjd * ... at least that was, ostensibly, the theory. As a practical 567168404Spjd * matter unless we juggle the parity between all devices evenly, we 568168404Spjd * won't see any benefit. Further, occasional writes that aren't a 569168404Spjd * multiple of the LCM of the number of children and the minimum 570168404Spjd * stripe width are sufficient to avoid pessimal behavior. 571168404Spjd * Unfortunately, this decision created an implicit on-disk format 572168404Spjd * requirement that we need to support for all eternity, but only 573168404Spjd * for single-parity RAID-Z. 574219089Spjd * 575219089Spjd * If we intend to skip a sector in the zeroth column for padding 576219089Spjd * we must make sure to note this swap. We will never intend to 577219089Spjd * skip the first column since at least one data and one parity 578219089Spjd * column must appear in each row. 579168404Spjd */ 580168404Spjd ASSERT(rm->rm_cols >= 2); 581168404Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 582168404Spjd 583255750Sdelphij if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 584168404Spjd devidx = rm->rm_col[0].rc_devidx; 585168404Spjd o = rm->rm_col[0].rc_offset; 586168404Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 587168404Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 588168404Spjd rm->rm_col[1].rc_devidx = devidx; 589168404Spjd rm->rm_col[1].rc_offset = o; 590219089Spjd 591219089Spjd if (rm->rm_skipstart == 0) 592219089Spjd rm->rm_skipstart = 1; 593168404Spjd } 594168404Spjd 595168404Spjd return (rm); 596168404Spjd} 597168404Spjd 598168404Spjdstatic void 599168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 600168404Spjd{ 601168404Spjd uint64_t *p, *src, pcount, ccount, i; 602168404Spjd int c; 603168404Spjd 604168404Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 605168404Spjd 606168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 607168404Spjd src = rm->rm_col[c].rc_data; 608168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 609168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 610168404Spjd 611168404Spjd if (c == rm->rm_firstdatacol) { 612168404Spjd ASSERT(ccount == pcount); 613219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 614168404Spjd *p = *src; 615168404Spjd } 616168404Spjd } else { 617168404Spjd ASSERT(ccount <= pcount); 618219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 619168404Spjd *p ^= *src; 620168404Spjd } 621168404Spjd } 622168404Spjd } 623168404Spjd} 624168404Spjd 625168404Spjdstatic void 626168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 627168404Spjd{ 628219089Spjd uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 629168404Spjd int c; 630168404Spjd 631219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 632168404Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 633168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 634168404Spjd 635168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 636168404Spjd src = rm->rm_col[c].rc_data; 637168404Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 638168404Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 639168404Spjd 640219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 641219089Spjd 642168404Spjd if (c == rm->rm_firstdatacol) { 643219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 644219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 645219089Spjd *p = *src; 646168404Spjd *q = *src; 647219089Spjd } 648219089Spjd for (; i < pcnt; i++, src++, p++, q++) { 649219089Spjd *p = 0; 650219089Spjd *q = 0; 651219089Spjd } 652219089Spjd } else { 653219089Spjd ASSERT(ccnt <= pcnt); 654219089Spjd 655219089Spjd /* 656219089Spjd * Apply the algorithm described above by multiplying 657219089Spjd * the previous result and adding in the new value. 658219089Spjd */ 659219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 660219089Spjd *p ^= *src; 661219089Spjd 662219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 663219089Spjd *q ^= *src; 664219089Spjd } 665219089Spjd 666219089Spjd /* 667219089Spjd * Treat short columns as though they are full of 0s. 668219089Spjd * Note that there's therefore nothing needed for P. 669219089Spjd */ 670219089Spjd for (; i < pcnt; i++, q++) { 671219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 672219089Spjd } 673219089Spjd } 674219089Spjd } 675219089Spjd} 676219089Spjd 677219089Spjdstatic void 678219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm) 679219089Spjd{ 680219089Spjd uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 681219089Spjd int c; 682219089Spjd 683219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 684219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 685219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 686219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 687219089Spjd rm->rm_col[VDEV_RAIDZ_R].rc_size); 688219089Spjd 689219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 690219089Spjd src = rm->rm_col[c].rc_data; 691219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 692219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 693219089Spjd r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 694219089Spjd 695219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 696219089Spjd 697219089Spjd if (c == rm->rm_firstdatacol) { 698219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 699219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 700168404Spjd *p = *src; 701219089Spjd *q = *src; 702219089Spjd *r = *src; 703168404Spjd } 704219089Spjd for (; i < pcnt; i++, src++, p++, q++, r++) { 705219089Spjd *p = 0; 706168404Spjd *q = 0; 707219089Spjd *r = 0; 708168404Spjd } 709168404Spjd } else { 710219089Spjd ASSERT(ccnt <= pcnt); 711168404Spjd 712168404Spjd /* 713219089Spjd * Apply the algorithm described above by multiplying 714219089Spjd * the previous result and adding in the new value. 715168404Spjd */ 716219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 717219089Spjd *p ^= *src; 718219089Spjd 719219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 720168404Spjd *q ^= *src; 721219089Spjd 722219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 723219089Spjd *r ^= *src; 724168404Spjd } 725168404Spjd 726168404Spjd /* 727168404Spjd * Treat short columns as though they are full of 0s. 728219089Spjd * Note that there's therefore nothing needed for P. 729168404Spjd */ 730219089Spjd for (; i < pcnt; i++, q++, r++) { 731219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 732219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 733168404Spjd } 734168404Spjd } 735168404Spjd } 736168404Spjd} 737168404Spjd 738219089Spjd/* 739219089Spjd * Generate RAID parity in the first virtual columns according to the number of 740219089Spjd * parity columns available. 741219089Spjd */ 742168404Spjdstatic void 743219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm) 744168404Spjd{ 745219089Spjd switch (rm->rm_firstdatacol) { 746219089Spjd case 1: 747219089Spjd vdev_raidz_generate_parity_p(rm); 748219089Spjd break; 749219089Spjd case 2: 750219089Spjd vdev_raidz_generate_parity_pq(rm); 751219089Spjd break; 752219089Spjd case 3: 753219089Spjd vdev_raidz_generate_parity_pqr(rm); 754219089Spjd break; 755219089Spjd default: 756219089Spjd cmn_err(CE_PANIC, "invalid RAID-Z configuration"); 757219089Spjd } 758219089Spjd} 759219089Spjd 760219089Spjdstatic int 761219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) 762219089Spjd{ 763168404Spjd uint64_t *dst, *src, xcount, ccount, count, i; 764219089Spjd int x = tgts[0]; 765168404Spjd int c; 766168404Spjd 767219089Spjd ASSERT(ntgts == 1); 768219089Spjd ASSERT(x >= rm->rm_firstdatacol); 769219089Spjd ASSERT(x < rm->rm_cols); 770219089Spjd 771168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 772168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); 773168404Spjd ASSERT(xcount > 0); 774168404Spjd 775168404Spjd src = rm->rm_col[VDEV_RAIDZ_P].rc_data; 776168404Spjd dst = rm->rm_col[x].rc_data; 777168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 778168404Spjd *dst = *src; 779168404Spjd } 780168404Spjd 781168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 782168404Spjd src = rm->rm_col[c].rc_data; 783168404Spjd dst = rm->rm_col[x].rc_data; 784168404Spjd 785168404Spjd if (c == x) 786168404Spjd continue; 787168404Spjd 788168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 789168404Spjd count = MIN(ccount, xcount); 790168404Spjd 791168404Spjd for (i = 0; i < count; i++, dst++, src++) { 792168404Spjd *dst ^= *src; 793168404Spjd } 794168404Spjd } 795219089Spjd 796219089Spjd return (1 << VDEV_RAIDZ_P); 797168404Spjd} 798168404Spjd 799219089Spjdstatic int 800219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) 801168404Spjd{ 802168404Spjd uint64_t *dst, *src, xcount, ccount, count, mask, i; 803168404Spjd uint8_t *b; 804219089Spjd int x = tgts[0]; 805168404Spjd int c, j, exp; 806168404Spjd 807219089Spjd ASSERT(ntgts == 1); 808219089Spjd 809168404Spjd xcount = rm->rm_col[x].rc_size / sizeof (src[0]); 810168404Spjd ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); 811168404Spjd 812168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 813168404Spjd src = rm->rm_col[c].rc_data; 814168404Spjd dst = rm->rm_col[x].rc_data; 815168404Spjd 816168404Spjd if (c == x) 817168404Spjd ccount = 0; 818168404Spjd else 819168404Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 820168404Spjd 821168404Spjd count = MIN(ccount, xcount); 822168404Spjd 823168404Spjd if (c == rm->rm_firstdatacol) { 824168404Spjd for (i = 0; i < count; i++, dst++, src++) { 825168404Spjd *dst = *src; 826168404Spjd } 827168404Spjd for (; i < xcount; i++, dst++) { 828168404Spjd *dst = 0; 829168404Spjd } 830168404Spjd 831168404Spjd } else { 832168404Spjd for (i = 0; i < count; i++, dst++, src++) { 833219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 834168404Spjd *dst ^= *src; 835168404Spjd } 836168404Spjd 837168404Spjd for (; i < xcount; i++, dst++) { 838219089Spjd VDEV_RAIDZ_64MUL_2(*dst, mask); 839168404Spjd } 840168404Spjd } 841168404Spjd } 842168404Spjd 843168404Spjd src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 844168404Spjd dst = rm->rm_col[x].rc_data; 845168404Spjd exp = 255 - (rm->rm_cols - 1 - x); 846168404Spjd 847168404Spjd for (i = 0; i < xcount; i++, dst++, src++) { 848168404Spjd *dst ^= *src; 849168404Spjd for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { 850168404Spjd *b = vdev_raidz_exp2(*b, exp); 851168404Spjd } 852168404Spjd } 853219089Spjd 854219089Spjd return (1 << VDEV_RAIDZ_Q); 855168404Spjd} 856168404Spjd 857219089Spjdstatic int 858219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) 859168404Spjd{ 860168404Spjd uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; 861168404Spjd void *pdata, *qdata; 862168404Spjd uint64_t xsize, ysize, i; 863219089Spjd int x = tgts[0]; 864219089Spjd int y = tgts[1]; 865168404Spjd 866219089Spjd ASSERT(ntgts == 2); 867168404Spjd ASSERT(x < y); 868168404Spjd ASSERT(x >= rm->rm_firstdatacol); 869168404Spjd ASSERT(y < rm->rm_cols); 870168404Spjd 871168404Spjd ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); 872168404Spjd 873168404Spjd /* 874168404Spjd * Move the parity data aside -- we're going to compute parity as 875168404Spjd * though columns x and y were full of zeros -- Pxy and Qxy. We want to 876168404Spjd * reuse the parity generation mechanism without trashing the actual 877168404Spjd * parity so we make those columns appear to be full of zeros by 878168404Spjd * setting their lengths to zero. 879168404Spjd */ 880168404Spjd pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; 881168404Spjd qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 882168404Spjd xsize = rm->rm_col[x].rc_size; 883168404Spjd ysize = rm->rm_col[y].rc_size; 884168404Spjd 885168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = 886168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); 887168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = 888168404Spjd zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); 889168404Spjd rm->rm_col[x].rc_size = 0; 890168404Spjd rm->rm_col[y].rc_size = 0; 891168404Spjd 892168404Spjd vdev_raidz_generate_parity_pq(rm); 893168404Spjd 894168404Spjd rm->rm_col[x].rc_size = xsize; 895168404Spjd rm->rm_col[y].rc_size = ysize; 896168404Spjd 897168404Spjd p = pdata; 898168404Spjd q = qdata; 899168404Spjd pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; 900168404Spjd qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 901168404Spjd xd = rm->rm_col[x].rc_data; 902168404Spjd yd = rm->rm_col[y].rc_data; 903168404Spjd 904168404Spjd /* 905168404Spjd * We now have: 906168404Spjd * Pxy = P + D_x + D_y 907168404Spjd * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y 908168404Spjd * 909168404Spjd * We can then solve for D_x: 910168404Spjd * D_x = A * (P + Pxy) + B * (Q + Qxy) 911168404Spjd * where 912168404Spjd * A = 2^(x - y) * (2^(x - y) + 1)^-1 913168404Spjd * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 914168404Spjd * 915168404Spjd * With D_x in hand, we can easily solve for D_y: 916168404Spjd * D_y = P + Pxy + D_x 917168404Spjd */ 918168404Spjd 919168404Spjd a = vdev_raidz_pow2[255 + x - y]; 920168404Spjd b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; 921168404Spjd tmp = 255 - vdev_raidz_log2[a ^ 1]; 922168404Spjd 923168404Spjd aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; 924168404Spjd bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; 925168404Spjd 926168404Spjd for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { 927168404Spjd *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ 928168404Spjd vdev_raidz_exp2(*q ^ *qxy, bexp); 929168404Spjd 930168404Spjd if (i < ysize) 931168404Spjd *yd = *p ^ *pxy ^ *xd; 932168404Spjd } 933168404Spjd 934168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, 935168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_size); 936168404Spjd zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, 937168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 938168404Spjd 939168404Spjd /* 940168404Spjd * Restore the saved parity data. 941168404Spjd */ 942168404Spjd rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; 943168404Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; 944219089Spjd 945219089Spjd return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); 946168404Spjd} 947168404Spjd 948219089Spjd/* BEGIN CSTYLED */ 949219089Spjd/* 950219089Spjd * In the general case of reconstruction, we must solve the system of linear 951219089Spjd * equations defined by the coeffecients used to generate parity as well as 952219089Spjd * the contents of the data and parity disks. This can be expressed with 953219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p) 954219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 955219089Spjd * 956219089Spjd * __ __ __ __ 957219089Spjd * | | __ __ | p_0 | 958219089Spjd * | V | | D_0 | | p_m-1 | 959219089Spjd * | | x | : | = | d_0 | 960219089Spjd * | I | | D_n-1 | | : | 961219089Spjd * | | ~~ ~~ | d_n-1 | 962219089Spjd * ~~ ~~ ~~ ~~ 963219089Spjd * 964219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde 965219089Spjd * matrix defined by the coeffecients we chose for the various parity columns 966219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 967219089Spjd * computation as well as linear separability. 968219089Spjd * 969219089Spjd * __ __ __ __ 970219089Spjd * | 1 .. 1 1 1 | | p_0 | 971219089Spjd * | 2^n-1 .. 4 2 1 | __ __ | : | 972219089Spjd * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 973219089Spjd * | 1 .. 0 0 0 | | D_1 | | d_0 | 974219089Spjd * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 975219089Spjd * | : : : : | | : | | d_2 | 976219089Spjd * | 0 .. 1 0 0 | | D_n-1 | | : | 977219089Spjd * | 0 .. 0 1 0 | ~~ ~~ | : | 978219089Spjd * | 0 .. 0 0 1 | | d_n-1 | 979219089Spjd * ~~ ~~ ~~ ~~ 980219089Spjd * 981219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the 982219089Spjd * matrix and use the known data and parity values to reconstruct the unknown 983219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond 984219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p 985219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up 986219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 987219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity 988219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 989219089Spjd * __ __ 990219089Spjd * | 1 1 1 1 1 1 1 1 | 991219089Spjd * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 992219089Spjd * | 19 205 116 29 64 16 4 1 | / / 993219089Spjd * | 1 0 0 0 0 0 0 0 | / / 994219089Spjd * | 0 1 0 0 0 0 0 0 | <--' / 995219089Spjd * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 996219089Spjd * | 0 0 0 1 0 0 0 0 | 997219089Spjd * | 0 0 0 0 1 0 0 0 | 998219089Spjd * | 0 0 0 0 0 1 0 0 | 999219089Spjd * | 0 0 0 0 0 0 1 0 | 1000219089Spjd * | 0 0 0 0 0 0 0 1 | 1001219089Spjd * ~~ ~~ 1002219089Spjd * __ __ 1003219089Spjd * | 1 1 1 1 1 1 1 1 | 1004219089Spjd * | 19 205 116 29 64 16 4 1 | 1005219089Spjd * | 1 0 0 0 0 0 0 0 | 1006255750Sdelphij * (V|I)' = | 0 0 0 1 0 0 0 0 | 1007219089Spjd * | 0 0 0 0 1 0 0 0 | 1008219089Spjd * | 0 0 0 0 0 1 0 0 | 1009219089Spjd * | 0 0 0 0 0 0 1 0 | 1010219089Spjd * | 0 0 0 0 0 0 0 1 | 1011219089Spjd * ~~ ~~ 1012219089Spjd * 1013219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 1014219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this 1015219089Spjd * matrix is not singular. 1016219089Spjd * __ __ 1017219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1018219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1019219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1020219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1021219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1022219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1023219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1024219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1025219089Spjd * ~~ ~~ 1026219089Spjd * __ __ 1027219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1028219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 1029219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 1030219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1031219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1032219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1033219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1034219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1035219089Spjd * ~~ ~~ 1036219089Spjd * __ __ 1037219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1038219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1039219089Spjd * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 1040219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1041219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1042219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1043219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1044219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1045219089Spjd * ~~ ~~ 1046219089Spjd * __ __ 1047219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1048219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1049219089Spjd * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 1050219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1051219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1052219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1053219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1054219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1055219089Spjd * ~~ ~~ 1056219089Spjd * __ __ 1057219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1058219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 1059219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1060219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1061219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1062219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1063219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1064219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1065219089Spjd * ~~ ~~ 1066219089Spjd * __ __ 1067219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 1068219089Spjd * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 1069219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 1070219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 1071219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 1072219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 1073219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 1074219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 1075219089Spjd * ~~ ~~ 1076219089Spjd * __ __ 1077219089Spjd * | 0 0 1 0 0 0 0 0 | 1078219089Spjd * | 167 100 5 41 159 169 217 208 | 1079219089Spjd * | 166 100 4 40 158 168 216 209 | 1080219089Spjd * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 1081219089Spjd * | 0 0 0 0 1 0 0 0 | 1082219089Spjd * | 0 0 0 0 0 1 0 0 | 1083219089Spjd * | 0 0 0 0 0 0 1 0 | 1084219089Spjd * | 0 0 0 0 0 0 0 1 | 1085219089Spjd * ~~ ~~ 1086219089Spjd * 1087219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 1088219089Spjd * of the missing data. 1089219089Spjd * 1090219089Spjd * As is apparent from the example above, the only non-trivial rows in the 1091219089Spjd * inverse matrix correspond to the data disks that we're trying to 1092219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would 1093219089Spjd * only be useful for reconstructing data known or assumed to be valid. For 1094219089Spjd * that reason, we only build the coefficients in the rows that correspond to 1095219089Spjd * targeted columns. 1096219089Spjd */ 1097219089Spjd/* END CSTYLED */ 1098168404Spjd 1099219089Spjdstatic void 1100219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 1101219089Spjd uint8_t **rows) 1102219089Spjd{ 1103219089Spjd int i, j; 1104219089Spjd int pow; 1105219089Spjd 1106219089Spjd ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 1107219089Spjd 1108219089Spjd /* 1109219089Spjd * Fill in the missing rows of interest. 1110219089Spjd */ 1111219089Spjd for (i = 0; i < nmap; i++) { 1112219089Spjd ASSERT3S(0, <=, map[i]); 1113219089Spjd ASSERT3S(map[i], <=, 2); 1114219089Spjd 1115219089Spjd pow = map[i] * n; 1116219089Spjd if (pow > 255) 1117219089Spjd pow -= 255; 1118219089Spjd ASSERT(pow <= 255); 1119219089Spjd 1120219089Spjd for (j = 0; j < n; j++) { 1121219089Spjd pow -= map[i]; 1122219089Spjd if (pow < 0) 1123219089Spjd pow += 255; 1124219089Spjd rows[i][j] = vdev_raidz_pow2[pow]; 1125219089Spjd } 1126219089Spjd } 1127219089Spjd} 1128219089Spjd 1129219089Spjdstatic void 1130219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 1131219089Spjd uint8_t **rows, uint8_t **invrows, const uint8_t *used) 1132219089Spjd{ 1133219089Spjd int i, j, ii, jj; 1134219089Spjd uint8_t log; 1135219089Spjd 1136219089Spjd /* 1137219089Spjd * Assert that the first nmissing entries from the array of used 1138219089Spjd * columns correspond to parity columns and that subsequent entries 1139219089Spjd * correspond to data columns. 1140219089Spjd */ 1141219089Spjd for (i = 0; i < nmissing; i++) { 1142219089Spjd ASSERT3S(used[i], <, rm->rm_firstdatacol); 1143219089Spjd } 1144219089Spjd for (; i < n; i++) { 1145219089Spjd ASSERT3S(used[i], >=, rm->rm_firstdatacol); 1146219089Spjd } 1147219089Spjd 1148219089Spjd /* 1149219089Spjd * First initialize the storage where we'll compute the inverse rows. 1150219089Spjd */ 1151219089Spjd for (i = 0; i < nmissing; i++) { 1152219089Spjd for (j = 0; j < n; j++) { 1153219089Spjd invrows[i][j] = (i == j) ? 1 : 0; 1154219089Spjd } 1155219089Spjd } 1156219089Spjd 1157219089Spjd /* 1158219089Spjd * Subtract all trivial rows from the rows of consequence. 1159219089Spjd */ 1160219089Spjd for (i = 0; i < nmissing; i++) { 1161219089Spjd for (j = nmissing; j < n; j++) { 1162219089Spjd ASSERT3U(used[j], >=, rm->rm_firstdatacol); 1163219089Spjd jj = used[j] - rm->rm_firstdatacol; 1164219089Spjd ASSERT3S(jj, <, n); 1165219089Spjd invrows[i][j] = rows[i][jj]; 1166219089Spjd rows[i][jj] = 0; 1167219089Spjd } 1168219089Spjd } 1169219089Spjd 1170219089Spjd /* 1171219089Spjd * For each of the rows of interest, we must normalize it and subtract 1172219089Spjd * a multiple of it from the other rows. 1173219089Spjd */ 1174219089Spjd for (i = 0; i < nmissing; i++) { 1175219089Spjd for (j = 0; j < missing[i]; j++) { 1176240415Smm ASSERT0(rows[i][j]); 1177219089Spjd } 1178219089Spjd ASSERT3U(rows[i][missing[i]], !=, 0); 1179219089Spjd 1180219089Spjd /* 1181219089Spjd * Compute the inverse of the first element and multiply each 1182219089Spjd * element in the row by that value. 1183219089Spjd */ 1184219089Spjd log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 1185219089Spjd 1186219089Spjd for (j = 0; j < n; j++) { 1187219089Spjd rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 1188219089Spjd invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 1189219089Spjd } 1190219089Spjd 1191219089Spjd for (ii = 0; ii < nmissing; ii++) { 1192219089Spjd if (i == ii) 1193219089Spjd continue; 1194219089Spjd 1195219089Spjd ASSERT3U(rows[ii][missing[i]], !=, 0); 1196219089Spjd 1197219089Spjd log = vdev_raidz_log2[rows[ii][missing[i]]]; 1198219089Spjd 1199219089Spjd for (j = 0; j < n; j++) { 1200219089Spjd rows[ii][j] ^= 1201219089Spjd vdev_raidz_exp2(rows[i][j], log); 1202219089Spjd invrows[ii][j] ^= 1203219089Spjd vdev_raidz_exp2(invrows[i][j], log); 1204219089Spjd } 1205219089Spjd } 1206219089Spjd } 1207219089Spjd 1208219089Spjd /* 1209219089Spjd * Verify that the data that is left in the rows are properly part of 1210219089Spjd * an identity matrix. 1211219089Spjd */ 1212219089Spjd for (i = 0; i < nmissing; i++) { 1213219089Spjd for (j = 0; j < n; j++) { 1214219089Spjd if (j == missing[i]) { 1215219089Spjd ASSERT3U(rows[i][j], ==, 1); 1216219089Spjd } else { 1217240415Smm ASSERT0(rows[i][j]); 1218219089Spjd } 1219219089Spjd } 1220219089Spjd } 1221219089Spjd} 1222219089Spjd 1223219089Spjdstatic void 1224219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 1225219089Spjd int *missing, uint8_t **invrows, const uint8_t *used) 1226219089Spjd{ 1227219089Spjd int i, j, x, cc, c; 1228219089Spjd uint8_t *src; 1229219089Spjd uint64_t ccount; 1230219089Spjd uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 1231219089Spjd uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 1232247187Smm uint8_t log = 0; 1233247187Smm uint8_t val; 1234219089Spjd int ll; 1235219089Spjd uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 1236219089Spjd uint8_t *p, *pp; 1237219089Spjd size_t psize; 1238219089Spjd 1239219089Spjd psize = sizeof (invlog[0][0]) * n * nmissing; 1240219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1241219089Spjd 1242219089Spjd for (pp = p, i = 0; i < nmissing; i++) { 1243219089Spjd invlog[i] = pp; 1244219089Spjd pp += n; 1245219089Spjd } 1246219089Spjd 1247219089Spjd for (i = 0; i < nmissing; i++) { 1248219089Spjd for (j = 0; j < n; j++) { 1249219089Spjd ASSERT3U(invrows[i][j], !=, 0); 1250219089Spjd invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 1251219089Spjd } 1252219089Spjd } 1253219089Spjd 1254219089Spjd for (i = 0; i < n; i++) { 1255219089Spjd c = used[i]; 1256219089Spjd ASSERT3U(c, <, rm->rm_cols); 1257219089Spjd 1258219089Spjd src = rm->rm_col[c].rc_data; 1259219089Spjd ccount = rm->rm_col[c].rc_size; 1260219089Spjd for (j = 0; j < nmissing; j++) { 1261219089Spjd cc = missing[j] + rm->rm_firstdatacol; 1262219089Spjd ASSERT3U(cc, >=, rm->rm_firstdatacol); 1263219089Spjd ASSERT3U(cc, <, rm->rm_cols); 1264219089Spjd ASSERT3U(cc, !=, c); 1265219089Spjd 1266219089Spjd dst[j] = rm->rm_col[cc].rc_data; 1267219089Spjd dcount[j] = rm->rm_col[cc].rc_size; 1268219089Spjd } 1269219089Spjd 1270219089Spjd ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1271219089Spjd 1272219089Spjd for (x = 0; x < ccount; x++, src++) { 1273219089Spjd if (*src != 0) 1274219089Spjd log = vdev_raidz_log2[*src]; 1275219089Spjd 1276219089Spjd for (cc = 0; cc < nmissing; cc++) { 1277219089Spjd if (x >= dcount[cc]) 1278219089Spjd continue; 1279219089Spjd 1280219089Spjd if (*src == 0) { 1281219089Spjd val = 0; 1282219089Spjd } else { 1283219089Spjd if ((ll = log + invlog[cc][i]) >= 255) 1284219089Spjd ll -= 255; 1285219089Spjd val = vdev_raidz_pow2[ll]; 1286219089Spjd } 1287219089Spjd 1288219089Spjd if (i == 0) 1289219089Spjd dst[cc][x] = val; 1290219089Spjd else 1291219089Spjd dst[cc][x] ^= val; 1292219089Spjd } 1293219089Spjd } 1294219089Spjd } 1295219089Spjd 1296219089Spjd kmem_free(p, psize); 1297219089Spjd} 1298219089Spjd 1299168404Spjdstatic int 1300219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1301219089Spjd{ 1302219089Spjd int n, i, c, t, tt; 1303219089Spjd int nmissing_rows; 1304219089Spjd int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1305219089Spjd int parity_map[VDEV_RAIDZ_MAXPARITY]; 1306219089Spjd 1307219089Spjd uint8_t *p, *pp; 1308219089Spjd size_t psize; 1309219089Spjd 1310219089Spjd uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1311219089Spjd uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1312219089Spjd uint8_t *used; 1313219089Spjd 1314219089Spjd int code = 0; 1315219089Spjd 1316219089Spjd 1317219089Spjd n = rm->rm_cols - rm->rm_firstdatacol; 1318219089Spjd 1319219089Spjd /* 1320219089Spjd * Figure out which data columns are missing. 1321219089Spjd */ 1322219089Spjd nmissing_rows = 0; 1323219089Spjd for (t = 0; t < ntgts; t++) { 1324219089Spjd if (tgts[t] >= rm->rm_firstdatacol) { 1325219089Spjd missing_rows[nmissing_rows++] = 1326219089Spjd tgts[t] - rm->rm_firstdatacol; 1327219089Spjd } 1328219089Spjd } 1329219089Spjd 1330219089Spjd /* 1331219089Spjd * Figure out which parity columns to use to help generate the missing 1332219089Spjd * data columns. 1333219089Spjd */ 1334219089Spjd for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1335219089Spjd ASSERT(tt < ntgts); 1336219089Spjd ASSERT(c < rm->rm_firstdatacol); 1337219089Spjd 1338219089Spjd /* 1339219089Spjd * Skip any targeted parity columns. 1340219089Spjd */ 1341219089Spjd if (c == tgts[tt]) { 1342219089Spjd tt++; 1343219089Spjd continue; 1344219089Spjd } 1345219089Spjd 1346219089Spjd code |= 1 << c; 1347219089Spjd 1348219089Spjd parity_map[i] = c; 1349219089Spjd i++; 1350219089Spjd } 1351219089Spjd 1352219089Spjd ASSERT(code != 0); 1353219089Spjd ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1354219089Spjd 1355219089Spjd psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1356219089Spjd nmissing_rows * n + sizeof (used[0]) * n; 1357219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1358219089Spjd 1359219089Spjd for (pp = p, i = 0; i < nmissing_rows; i++) { 1360219089Spjd rows[i] = pp; 1361219089Spjd pp += n; 1362219089Spjd invrows[i] = pp; 1363219089Spjd pp += n; 1364219089Spjd } 1365219089Spjd used = pp; 1366219089Spjd 1367219089Spjd for (i = 0; i < nmissing_rows; i++) { 1368219089Spjd used[i] = parity_map[i]; 1369219089Spjd } 1370219089Spjd 1371219089Spjd for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1372219089Spjd if (tt < nmissing_rows && 1373219089Spjd c == missing_rows[tt] + rm->rm_firstdatacol) { 1374219089Spjd tt++; 1375219089Spjd continue; 1376219089Spjd } 1377219089Spjd 1378219089Spjd ASSERT3S(i, <, n); 1379219089Spjd used[i] = c; 1380219089Spjd i++; 1381219089Spjd } 1382219089Spjd 1383219089Spjd /* 1384219089Spjd * Initialize the interesting rows of the matrix. 1385219089Spjd */ 1386219089Spjd vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1387219089Spjd 1388219089Spjd /* 1389219089Spjd * Invert the matrix. 1390219089Spjd */ 1391219089Spjd vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1392219089Spjd invrows, used); 1393219089Spjd 1394219089Spjd /* 1395219089Spjd * Reconstruct the missing data using the generated matrix. 1396219089Spjd */ 1397219089Spjd vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1398219089Spjd invrows, used); 1399219089Spjd 1400219089Spjd kmem_free(p, psize); 1401219089Spjd 1402219089Spjd return (code); 1403219089Spjd} 1404219089Spjd 1405219089Spjdstatic int 1406219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1407219089Spjd{ 1408219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY], *dt; 1409219089Spjd int ntgts; 1410219089Spjd int i, c; 1411219089Spjd int code; 1412219089Spjd int nbadparity, nbaddata; 1413219089Spjd int parity_valid[VDEV_RAIDZ_MAXPARITY]; 1414219089Spjd 1415219089Spjd /* 1416219089Spjd * The tgts list must already be sorted. 1417219089Spjd */ 1418219089Spjd for (i = 1; i < nt; i++) { 1419219089Spjd ASSERT(t[i] > t[i - 1]); 1420219089Spjd } 1421219089Spjd 1422219089Spjd nbadparity = rm->rm_firstdatacol; 1423219089Spjd nbaddata = rm->rm_cols - nbadparity; 1424219089Spjd ntgts = 0; 1425219089Spjd for (i = 0, c = 0; c < rm->rm_cols; c++) { 1426219089Spjd if (c < rm->rm_firstdatacol) 1427219089Spjd parity_valid[c] = B_FALSE; 1428219089Spjd 1429219089Spjd if (i < nt && c == t[i]) { 1430219089Spjd tgts[ntgts++] = c; 1431219089Spjd i++; 1432219089Spjd } else if (rm->rm_col[c].rc_error != 0) { 1433219089Spjd tgts[ntgts++] = c; 1434219089Spjd } else if (c >= rm->rm_firstdatacol) { 1435219089Spjd nbaddata--; 1436219089Spjd } else { 1437219089Spjd parity_valid[c] = B_TRUE; 1438219089Spjd nbadparity--; 1439219089Spjd } 1440219089Spjd } 1441219089Spjd 1442219089Spjd ASSERT(ntgts >= nt); 1443219089Spjd ASSERT(nbaddata >= 0); 1444219089Spjd ASSERT(nbaddata + nbadparity == ntgts); 1445219089Spjd 1446219089Spjd dt = &tgts[nbadparity]; 1447219089Spjd 1448219089Spjd /* 1449219089Spjd * See if we can use any of our optimized reconstruction routines. 1450219089Spjd */ 1451219089Spjd if (!vdev_raidz_default_to_general) { 1452219089Spjd switch (nbaddata) { 1453219089Spjd case 1: 1454219089Spjd if (parity_valid[VDEV_RAIDZ_P]) 1455219089Spjd return (vdev_raidz_reconstruct_p(rm, dt, 1)); 1456219089Spjd 1457219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1458219089Spjd 1459219089Spjd if (parity_valid[VDEV_RAIDZ_Q]) 1460219089Spjd return (vdev_raidz_reconstruct_q(rm, dt, 1)); 1461219089Spjd 1462219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1463219089Spjd break; 1464219089Spjd 1465219089Spjd case 2: 1466219089Spjd ASSERT(rm->rm_firstdatacol > 1); 1467219089Spjd 1468219089Spjd if (parity_valid[VDEV_RAIDZ_P] && 1469219089Spjd parity_valid[VDEV_RAIDZ_Q]) 1470219089Spjd return (vdev_raidz_reconstruct_pq(rm, dt, 2)); 1471219089Spjd 1472219089Spjd ASSERT(rm->rm_firstdatacol > 2); 1473219089Spjd 1474219089Spjd break; 1475219089Spjd } 1476219089Spjd } 1477219089Spjd 1478219089Spjd code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1479219089Spjd ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1480219089Spjd ASSERT(code > 0); 1481219089Spjd return (code); 1482219089Spjd} 1483219089Spjd 1484219089Spjdstatic int 1485236155Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, 1486254591Sgibbs uint64_t *logical_ashift, uint64_t *physical_ashift) 1487168404Spjd{ 1488168404Spjd vdev_t *cvd; 1489168404Spjd uint64_t nparity = vd->vdev_nparity; 1490219089Spjd int c; 1491168404Spjd int lasterror = 0; 1492168404Spjd int numerrors = 0; 1493168404Spjd 1494168404Spjd ASSERT(nparity > 0); 1495168404Spjd 1496168404Spjd if (nparity > VDEV_RAIDZ_MAXPARITY || 1497168404Spjd vd->vdev_children < nparity + 1) { 1498168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 1499249195Smm return (SET_ERROR(EINVAL)); 1500168404Spjd } 1501168404Spjd 1502219089Spjd vdev_open_children(vd); 1503219089Spjd 1504168404Spjd for (c = 0; c < vd->vdev_children; c++) { 1505168404Spjd cvd = vd->vdev_child[c]; 1506168404Spjd 1507219089Spjd if (cvd->vdev_open_error != 0) { 1508219089Spjd lasterror = cvd->vdev_open_error; 1509168404Spjd numerrors++; 1510168404Spjd continue; 1511168404Spjd } 1512168404Spjd 1513168404Spjd *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 1514236155Smm *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; 1515254591Sgibbs *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); 1516254591Sgibbs *physical_ashift = MAX(*physical_ashift, 1517254591Sgibbs cvd->vdev_physical_ashift); 1518168404Spjd } 1519168404Spjd 1520168404Spjd *asize *= vd->vdev_children; 1521236155Smm *max_asize *= vd->vdev_children; 1522168404Spjd 1523168404Spjd if (numerrors > nparity) { 1524168404Spjd vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 1525168404Spjd return (lasterror); 1526168404Spjd } 1527168404Spjd 1528168404Spjd return (0); 1529168404Spjd} 1530168404Spjd 1531168404Spjdstatic void 1532168404Spjdvdev_raidz_close(vdev_t *vd) 1533168404Spjd{ 1534168404Spjd int c; 1535168404Spjd 1536168404Spjd for (c = 0; c < vd->vdev_children; c++) 1537168404Spjd vdev_close(vd->vdev_child[c]); 1538168404Spjd} 1539168404Spjd 1540255750Sdelphij#ifdef illumos 1541255750Sdelphij/* 1542255750Sdelphij * Handle a read or write I/O to a RAID-Z dump device. 1543255750Sdelphij * 1544255750Sdelphij * The dump device is in a unique situation compared to other ZFS datasets: 1545255750Sdelphij * writing to this device should be as simple and fast as possible. In 1546255750Sdelphij * addition, durability matters much less since the dump will be extracted 1547255750Sdelphij * once the machine reboots. For that reason, this function eschews parity for 1548255750Sdelphij * performance and simplicity. The dump device uses the checksum setting 1549255750Sdelphij * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this 1550255750Sdelphij * dataset. 1551255750Sdelphij * 1552255750Sdelphij * Blocks of size 128 KB have been preallocated for this volume. I/Os less than 1553255750Sdelphij * 128 KB will not fill an entire block; in addition, they may not be properly 1554255750Sdelphij * aligned. In that case, this function uses the preallocated 128 KB block and 1555255750Sdelphij * omits reading or writing any "empty" portions of that block, as opposed to 1556255750Sdelphij * allocating a fresh appropriately-sized block. 1557255750Sdelphij * 1558255750Sdelphij * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: 1559255750Sdelphij * 1560255750Sdelphij * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) 1561255750Sdelphij * 1562255750Sdelphij * If this were a standard RAID-Z dataset, a block of at least 40 KB would be 1563255750Sdelphij * allocated which spans all five child vdevs. 8 KB of data would be written to 1564255750Sdelphij * each of four vdevs, with the fifth containing the parity bits. 1565255750Sdelphij * 1566255750Sdelphij * parity data data data data 1567255750Sdelphij * | PP | XX | XX | XX | XX | 1568255750Sdelphij * ^ ^ ^ ^ ^ 1569255750Sdelphij * | | | | | 1570255750Sdelphij * 8 KB parity ------8 KB data blocks------ 1571255750Sdelphij * 1572255750Sdelphij * However, when writing to the dump device, the behavior is different: 1573255750Sdelphij * 1574255750Sdelphij * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) 1575255750Sdelphij * 1576255750Sdelphij * Unlike the normal RAID-Z case in which the block is allocated based on the 1577255750Sdelphij * I/O size, reads and writes here always use a 128 KB logical I/O size. If the 1578255750Sdelphij * I/O size is less than 128 KB, only the actual portions of data are written. 1579255750Sdelphij * In this example the data is written to the third data vdev since that vdev 1580255750Sdelphij * contains the offset [64 KB, 96 KB). 1581255750Sdelphij * 1582255750Sdelphij * parity data data data data 1583255750Sdelphij * | | | | XX | | 1584255750Sdelphij * ^ 1585255750Sdelphij * | 1586255750Sdelphij * 32 KB data block 1587255750Sdelphij * 1588255750Sdelphij * As a result, an individual I/O may not span all child vdevs; moreover, a 1589255750Sdelphij * small I/O may only operate on a single child vdev. 1590255750Sdelphij * 1591255750Sdelphij * Note that since there are no parity bits calculated or written, this format 1592255750Sdelphij * remains the same no matter how many parity bits are used in a normal RAID-Z 1593255750Sdelphij * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above 1594255750Sdelphij * would look like: 1595255750Sdelphij * 1596255750Sdelphij * parity parity parity data data data data 1597255750Sdelphij * | | | | | | XX | | 1598255750Sdelphij * ^ 1599255750Sdelphij * | 1600255750Sdelphij * 32 KB data block 1601255750Sdelphij */ 1602255750Sdelphijint 1603255750Sdelphijvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, 1604255750Sdelphij uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) 1605255750Sdelphij{ 1606255750Sdelphij vdev_t *tvd = vd->vdev_top; 1607255750Sdelphij vdev_t *cvd; 1608255750Sdelphij raidz_map_t *rm; 1609255750Sdelphij raidz_col_t *rc; 1610255750Sdelphij int c, err = 0; 1611255750Sdelphij 1612255750Sdelphij uint64_t start, end, colstart, colend; 1613255750Sdelphij uint64_t coloffset, colsize, colskip; 1614255750Sdelphij 1615255750Sdelphij int flags = doread ? BIO_READ : BIO_WRITE; 1616255750Sdelphij 1617255750Sdelphij#ifdef _KERNEL 1618255750Sdelphij 1619255750Sdelphij /* 1620255750Sdelphij * Don't write past the end of the block 1621255750Sdelphij */ 1622276081Sdelphij VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); 1623255750Sdelphij 1624255750Sdelphij start = offset; 1625255750Sdelphij end = start + size; 1626255750Sdelphij 1627255750Sdelphij /* 1628255750Sdelphij * Allocate a RAID-Z map for this block. Note that this block starts 1629255750Sdelphij * from the "original" offset, this is, the offset of the extent which 1630255750Sdelphij * contains the requisite offset of the data being read or written. 1631255750Sdelphij * 1632255750Sdelphij * Even if this I/O operation doesn't span the full block size, let's 1633255750Sdelphij * treat the on-disk format as if the only blocks are the complete 128 1634255750Sdelphij * KB size. 1635255750Sdelphij */ 1636255750Sdelphij rm = vdev_raidz_map_alloc(data - (offset - origoffset), 1637276081Sdelphij SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, 1638276081Sdelphij vd->vdev_children, vd->vdev_nparity); 1639255750Sdelphij 1640255750Sdelphij coloffset = origoffset; 1641255750Sdelphij 1642255750Sdelphij for (c = rm->rm_firstdatacol; c < rm->rm_cols; 1643255750Sdelphij c++, coloffset += rc->rc_size) { 1644255750Sdelphij rc = &rm->rm_col[c]; 1645255750Sdelphij cvd = vd->vdev_child[rc->rc_devidx]; 1646255750Sdelphij 1647255750Sdelphij /* 1648255750Sdelphij * Find the start and end of this column in the RAID-Z map, 1649255750Sdelphij * keeping in mind that the stated size and offset of the 1650255750Sdelphij * operation may not fill the entire column for this vdev. 1651255750Sdelphij * 1652255750Sdelphij * If any portion of the data spans this column, issue the 1653255750Sdelphij * appropriate operation to the vdev. 1654255750Sdelphij */ 1655255750Sdelphij if (coloffset + rc->rc_size <= start) 1656255750Sdelphij continue; 1657255750Sdelphij if (coloffset >= end) 1658255750Sdelphij continue; 1659255750Sdelphij 1660255750Sdelphij colstart = MAX(coloffset, start); 1661255750Sdelphij colend = MIN(end, coloffset + rc->rc_size); 1662255750Sdelphij colsize = colend - colstart; 1663255750Sdelphij colskip = colstart - coloffset; 1664255750Sdelphij 1665255750Sdelphij VERIFY3U(colsize, <=, rc->rc_size); 1666255750Sdelphij VERIFY3U(colskip, <=, rc->rc_size); 1667255750Sdelphij 1668255750Sdelphij /* 1669255750Sdelphij * Note that the child vdev will have a vdev label at the start 1670255750Sdelphij * of its range of offsets, hence the need for 1671255750Sdelphij * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another 1672255750Sdelphij * example of why this calculation is needed. 1673255750Sdelphij */ 1674255750Sdelphij if ((err = vdev_disk_physio(cvd, 1675255750Sdelphij ((char *)rc->rc_data) + colskip, colsize, 1676255750Sdelphij VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 1677255750Sdelphij flags, isdump)) != 0) 1678255750Sdelphij break; 1679255750Sdelphij } 1680255750Sdelphij 1681255750Sdelphij vdev_raidz_map_free(rm); 1682255750Sdelphij#endif /* KERNEL */ 1683255750Sdelphij 1684255750Sdelphij return (err); 1685255750Sdelphij} 1686255750Sdelphij#endif 1687255750Sdelphij 1688168404Spjdstatic uint64_t 1689168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize) 1690168404Spjd{ 1691168404Spjd uint64_t asize; 1692168404Spjd uint64_t ashift = vd->vdev_top->vdev_ashift; 1693168404Spjd uint64_t cols = vd->vdev_children; 1694168404Spjd uint64_t nparity = vd->vdev_nparity; 1695168404Spjd 1696168404Spjd asize = ((psize - 1) >> ashift) + 1; 1697168404Spjd asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); 1698168404Spjd asize = roundup(asize, nparity + 1) << ashift; 1699168404Spjd 1700168404Spjd return (asize); 1701168404Spjd} 1702168404Spjd 1703168404Spjdstatic void 1704168404Spjdvdev_raidz_child_done(zio_t *zio) 1705168404Spjd{ 1706168404Spjd raidz_col_t *rc = zio->io_private; 1707168404Spjd 1708168404Spjd rc->rc_error = zio->io_error; 1709168404Spjd rc->rc_tried = 1; 1710168404Spjd rc->rc_skipped = 0; 1711168404Spjd} 1712168404Spjd 1713251629Sdelphij/* 1714251629Sdelphij * Start an IO operation on a RAIDZ VDev 1715251629Sdelphij * 1716251629Sdelphij * Outline: 1717251629Sdelphij * - For write operations: 1718251629Sdelphij * 1. Generate the parity data 1719251629Sdelphij * 2. Create child zio write operations to each column's vdev, for both 1720251629Sdelphij * data and parity. 1721251629Sdelphij * 3. If the column skips any sectors for padding, create optional dummy 1722251629Sdelphij * write zio children for those areas to improve aggregation continuity. 1723251629Sdelphij * - For read operations: 1724251629Sdelphij * 1. Create child zio read operations to each data column's vdev to read 1725251629Sdelphij * the range of data required for zio. 1726251629Sdelphij * 2. If this is a scrub or resilver operation, or if any of the data 1727251629Sdelphij * vdevs have had errors, then create zio read operations to the parity 1728251629Sdelphij * columns' VDevs as well. 1729251629Sdelphij */ 1730297078Smavstatic void 1731168404Spjdvdev_raidz_io_start(zio_t *zio) 1732168404Spjd{ 1733168404Spjd vdev_t *vd = zio->io_vd; 1734168404Spjd vdev_t *tvd = vd->vdev_top; 1735168404Spjd vdev_t *cvd; 1736168404Spjd raidz_map_t *rm; 1737168404Spjd raidz_col_t *rc; 1738219089Spjd int c, i; 1739168404Spjd 1740255750Sdelphij rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, 1741255750Sdelphij zio->io_type == ZIO_TYPE_FREE, 1742255750Sdelphij tvd->vdev_ashift, vd->vdev_children, 1743168404Spjd vd->vdev_nparity); 1744168404Spjd 1745255750Sdelphij zio->io_vsd = rm; 1746255750Sdelphij zio->io_vsd_ops = &vdev_raidz_vsd_ops; 1747255750Sdelphij 1748168404Spjd ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 1749168404Spjd 1750240868Spjd if (zio->io_type == ZIO_TYPE_FREE) { 1751240868Spjd for (c = 0; c < rm->rm_cols; c++) { 1752240868Spjd rc = &rm->rm_col[c]; 1753240868Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1754240868Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1755240868Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1756240868Spjd zio->io_type, zio->io_priority, 0, 1757240868Spjd vdev_raidz_child_done, rc)); 1758240868Spjd } 1759270312Ssmh 1760297078Smav zio_execute(zio); 1761297078Smav return; 1762240868Spjd } 1763240868Spjd 1764168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 1765219089Spjd vdev_raidz_generate_parity(rm); 1766168404Spjd 1767168404Spjd for (c = 0; c < rm->rm_cols; c++) { 1768168404Spjd rc = &rm->rm_col[c]; 1769168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1770168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1771168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1772185029Spjd zio->io_type, zio->io_priority, 0, 1773168404Spjd vdev_raidz_child_done, rc)); 1774168404Spjd } 1775185029Spjd 1776219089Spjd /* 1777219089Spjd * Generate optional I/Os for any skipped sectors to improve 1778219089Spjd * aggregation contiguity. 1779219089Spjd */ 1780219089Spjd for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { 1781219089Spjd ASSERT(c <= rm->rm_scols); 1782219089Spjd if (c == rm->rm_scols) 1783219089Spjd c = 0; 1784219089Spjd rc = &rm->rm_col[c]; 1785219089Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1786219089Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1787219089Spjd rc->rc_offset + rc->rc_size, NULL, 1788219089Spjd 1 << tvd->vdev_ashift, 1789219089Spjd zio->io_type, zio->io_priority, 1790219089Spjd ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); 1791219089Spjd } 1792219089Spjd 1793297078Smav zio_execute(zio); 1794297078Smav return; 1795168404Spjd } 1796168404Spjd 1797168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 1798168404Spjd 1799168404Spjd /* 1800168404Spjd * Iterate over the columns in reverse order so that we hit the parity 1801219089Spjd * last -- any errors along the way will force us to read the parity. 1802168404Spjd */ 1803168404Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 1804168404Spjd rc = &rm->rm_col[c]; 1805168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 1806185029Spjd if (!vdev_readable(cvd)) { 1807168404Spjd if (c >= rm->rm_firstdatacol) 1808168404Spjd rm->rm_missingdata++; 1809168404Spjd else 1810168404Spjd rm->rm_missingparity++; 1811249195Smm rc->rc_error = SET_ERROR(ENXIO); 1812168404Spjd rc->rc_tried = 1; /* don't even try */ 1813168404Spjd rc->rc_skipped = 1; 1814168404Spjd continue; 1815168404Spjd } 1816219089Spjd if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1817168404Spjd if (c >= rm->rm_firstdatacol) 1818168404Spjd rm->rm_missingdata++; 1819168404Spjd else 1820168404Spjd rm->rm_missingparity++; 1821249195Smm rc->rc_error = SET_ERROR(ESTALE); 1822168404Spjd rc->rc_skipped = 1; 1823168404Spjd continue; 1824168404Spjd } 1825168404Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || 1826209095Smm (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { 1827168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 1828168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 1829185029Spjd zio->io_type, zio->io_priority, 0, 1830168404Spjd vdev_raidz_child_done, rc)); 1831168404Spjd } 1832168404Spjd } 1833168404Spjd 1834297078Smav zio_execute(zio); 1835168404Spjd} 1836168404Spjd 1837219089Spjd 1838168404Spjd/* 1839168404Spjd * Report a checksum error for a child of a RAID-Z device. 1840168404Spjd */ 1841168404Spjdstatic void 1842219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) 1843168404Spjd{ 1844168404Spjd vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; 1845168404Spjd 1846168404Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1847219089Spjd zio_bad_cksum_t zbc; 1848219089Spjd raidz_map_t *rm = zio->io_vsd; 1849219089Spjd 1850168404Spjd mutex_enter(&vd->vdev_stat_lock); 1851168404Spjd vd->vdev_stat.vs_checksum_errors++; 1852168404Spjd mutex_exit(&vd->vdev_stat_lock); 1853219089Spjd 1854219089Spjd zbc.zbc_has_cksum = 0; 1855219089Spjd zbc.zbc_injected = rm->rm_ecksuminjected; 1856219089Spjd 1857219089Spjd zfs_ereport_post_checksum(zio->io_spa, vd, zio, 1858219089Spjd rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, 1859219089Spjd &zbc); 1860168404Spjd } 1861219089Spjd} 1862168404Spjd 1863219089Spjd/* 1864219089Spjd * We keep track of whether or not there were any injected errors, so that 1865219089Spjd * any ereports we generate can note it. 1866219089Spjd */ 1867219089Spjdstatic int 1868219089Spjdraidz_checksum_verify(zio_t *zio) 1869219089Spjd{ 1870219089Spjd zio_bad_cksum_t zbc; 1871219089Spjd raidz_map_t *rm = zio->io_vsd; 1872219089Spjd 1873219089Spjd int ret = zio_checksum_error(zio, &zbc); 1874219089Spjd if (ret != 0 && zbc.zbc_injected != 0) 1875219089Spjd rm->rm_ecksuminjected = 1; 1876219089Spjd 1877219089Spjd return (ret); 1878168404Spjd} 1879168404Spjd 1880168404Spjd/* 1881168404Spjd * Generate the parity from the data columns. If we tried and were able to 1882168404Spjd * read the parity without error, verify that the generated parity matches the 1883168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 1884168404Spjd * number such failures. 1885168404Spjd */ 1886168404Spjdstatic int 1887168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm) 1888168404Spjd{ 1889168404Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1890168404Spjd int c, ret = 0; 1891168404Spjd raidz_col_t *rc; 1892168404Spjd 1893255750Sdelphij blkptr_t *bp = zio->io_bp; 1894255750Sdelphij enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : 1895255750Sdelphij (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); 1896255750Sdelphij 1897255750Sdelphij if (checksum == ZIO_CHECKSUM_NOPARITY) 1898255750Sdelphij return (ret); 1899255750Sdelphij 1900168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1901168404Spjd rc = &rm->rm_col[c]; 1902168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1903168404Spjd continue; 1904168404Spjd orig[c] = zio_buf_alloc(rc->rc_size); 1905168404Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 1906168404Spjd } 1907168404Spjd 1908219089Spjd vdev_raidz_generate_parity(rm); 1909168404Spjd 1910168404Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1911168404Spjd rc = &rm->rm_col[c]; 1912168404Spjd if (!rc->rc_tried || rc->rc_error != 0) 1913168404Spjd continue; 1914168404Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1915219089Spjd raidz_checksum_error(zio, rc, orig[c]); 1916249195Smm rc->rc_error = SET_ERROR(ECKSUM); 1917168404Spjd ret++; 1918168404Spjd } 1919168404Spjd zio_buf_free(orig[c], rc->rc_size); 1920168404Spjd } 1921168404Spjd 1922168404Spjd return (ret); 1923168404Spjd} 1924168404Spjd 1925219089Spjd/* 1926219089Spjd * Keep statistics on all the ways that we used parity to correct data. 1927219089Spjd */ 1928219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; 1929168404Spjd 1930185029Spjdstatic int 1931185029Spjdvdev_raidz_worst_error(raidz_map_t *rm) 1932185029Spjd{ 1933185029Spjd int error = 0; 1934185029Spjd 1935185029Spjd for (int c = 0; c < rm->rm_cols; c++) 1936185029Spjd error = zio_worst_error(error, rm->rm_col[c].rc_error); 1937185029Spjd 1938185029Spjd return (error); 1939185029Spjd} 1940185029Spjd 1941219089Spjd/* 1942219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction. 1943219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into 1944219089Spjd * account how reconstruction is actually performed. For example, with 1945219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1946219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both 1947219089Spjd * cases we'd only use parity information in column 0. 1948219089Spjd */ 1949219089Spjdstatic int 1950219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) 1951219089Spjd{ 1952219089Spjd raidz_map_t *rm = zio->io_vsd; 1953219089Spjd raidz_col_t *rc; 1954219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1955219089Spjd int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1956219089Spjd int *tgts = &tstore[1]; 1957219089Spjd int current, next, i, c, n; 1958219089Spjd int code, ret = 0; 1959219089Spjd 1960219089Spjd ASSERT(total_errors < rm->rm_firstdatacol); 1961219089Spjd 1962219089Spjd /* 1963219089Spjd * This simplifies one edge condition. 1964219089Spjd */ 1965219089Spjd tgts[-1] = -1; 1966219089Spjd 1967219089Spjd for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1968219089Spjd /* 1969219089Spjd * Initialize the targets array by finding the first n columns 1970219089Spjd * that contain no error. 1971219089Spjd * 1972219089Spjd * If there were no data errors, we need to ensure that we're 1973219089Spjd * always explicitly attempting to reconstruct at least one 1974219089Spjd * data column. To do this, we simply push the highest target 1975219089Spjd * up into the data columns. 1976219089Spjd */ 1977219089Spjd for (c = 0, i = 0; i < n; i++) { 1978219089Spjd if (i == n - 1 && data_errors == 0 && 1979219089Spjd c < rm->rm_firstdatacol) { 1980219089Spjd c = rm->rm_firstdatacol; 1981219089Spjd } 1982219089Spjd 1983219089Spjd while (rm->rm_col[c].rc_error != 0) { 1984219089Spjd c++; 1985219089Spjd ASSERT3S(c, <, rm->rm_cols); 1986219089Spjd } 1987219089Spjd 1988219089Spjd tgts[i] = c++; 1989219089Spjd } 1990219089Spjd 1991219089Spjd /* 1992219089Spjd * Setting tgts[n] simplifies the other edge condition. 1993219089Spjd */ 1994219089Spjd tgts[n] = rm->rm_cols; 1995219089Spjd 1996219089Spjd /* 1997219089Spjd * These buffers were allocated in previous iterations. 1998219089Spjd */ 1999219089Spjd for (i = 0; i < n - 1; i++) { 2000219089Spjd ASSERT(orig[i] != NULL); 2001219089Spjd } 2002219089Spjd 2003219089Spjd orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); 2004219089Spjd 2005219089Spjd current = 0; 2006219089Spjd next = tgts[current]; 2007219089Spjd 2008219089Spjd while (current != n) { 2009219089Spjd tgts[current] = next; 2010219089Spjd current = 0; 2011219089Spjd 2012219089Spjd /* 2013219089Spjd * Save off the original data that we're going to 2014219089Spjd * attempt to reconstruct. 2015219089Spjd */ 2016219089Spjd for (i = 0; i < n; i++) { 2017219089Spjd ASSERT(orig[i] != NULL); 2018219089Spjd c = tgts[i]; 2019219089Spjd ASSERT3S(c, >=, 0); 2020219089Spjd ASSERT3S(c, <, rm->rm_cols); 2021219089Spjd rc = &rm->rm_col[c]; 2022219089Spjd bcopy(rc->rc_data, orig[i], rc->rc_size); 2023219089Spjd } 2024219089Spjd 2025219089Spjd /* 2026219089Spjd * Attempt a reconstruction and exit the outer loop on 2027219089Spjd * success. 2028219089Spjd */ 2029219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2030219089Spjd if (raidz_checksum_verify(zio) == 0) { 2031219089Spjd atomic_inc_64(&raidz_corrected[code]); 2032219089Spjd 2033219089Spjd for (i = 0; i < n; i++) { 2034219089Spjd c = tgts[i]; 2035219089Spjd rc = &rm->rm_col[c]; 2036219089Spjd ASSERT(rc->rc_error == 0); 2037219089Spjd if (rc->rc_tried) 2038219089Spjd raidz_checksum_error(zio, rc, 2039219089Spjd orig[i]); 2040249195Smm rc->rc_error = SET_ERROR(ECKSUM); 2041219089Spjd } 2042219089Spjd 2043219089Spjd ret = code; 2044219089Spjd goto done; 2045219089Spjd } 2046219089Spjd 2047219089Spjd /* 2048219089Spjd * Restore the original data. 2049219089Spjd */ 2050219089Spjd for (i = 0; i < n; i++) { 2051219089Spjd c = tgts[i]; 2052219089Spjd rc = &rm->rm_col[c]; 2053219089Spjd bcopy(orig[i], rc->rc_data, rc->rc_size); 2054219089Spjd } 2055219089Spjd 2056219089Spjd do { 2057219089Spjd /* 2058219089Spjd * Find the next valid column after the current 2059219089Spjd * position.. 2060219089Spjd */ 2061219089Spjd for (next = tgts[current] + 1; 2062219089Spjd next < rm->rm_cols && 2063219089Spjd rm->rm_col[next].rc_error != 0; next++) 2064219089Spjd continue; 2065219089Spjd 2066219089Spjd ASSERT(next <= tgts[current + 1]); 2067219089Spjd 2068219089Spjd /* 2069219089Spjd * If that spot is available, we're done here. 2070219089Spjd */ 2071219089Spjd if (next != tgts[current + 1]) 2072219089Spjd break; 2073219089Spjd 2074219089Spjd /* 2075219089Spjd * Otherwise, find the next valid column after 2076219089Spjd * the previous position. 2077219089Spjd */ 2078219089Spjd for (c = tgts[current - 1] + 1; 2079219089Spjd rm->rm_col[c].rc_error != 0; c++) 2080219089Spjd continue; 2081219089Spjd 2082219089Spjd tgts[current] = c; 2083219089Spjd current++; 2084219089Spjd 2085219089Spjd } while (current != n); 2086219089Spjd } 2087219089Spjd } 2088219089Spjd n--; 2089219089Spjddone: 2090219089Spjd for (i = 0; i < n; i++) { 2091219089Spjd zio_buf_free(orig[i], rm->rm_col[0].rc_size); 2092219089Spjd } 2093219089Spjd 2094219089Spjd return (ret); 2095219089Spjd} 2096219089Spjd 2097251629Sdelphij/* 2098251629Sdelphij * Complete an IO operation on a RAIDZ VDev 2099251629Sdelphij * 2100251629Sdelphij * Outline: 2101251629Sdelphij * - For write operations: 2102251629Sdelphij * 1. Check for errors on the child IOs. 2103251629Sdelphij * 2. Return, setting an error code if too few child VDevs were written 2104251629Sdelphij * to reconstruct the data later. Note that partial writes are 2105251629Sdelphij * considered successful if they can be reconstructed at all. 2106251629Sdelphij * - For read operations: 2107251629Sdelphij * 1. Check for errors on the child IOs. 2108251629Sdelphij * 2. If data errors occurred: 2109251629Sdelphij * a. Try to reassemble the data from the parity available. 2110251629Sdelphij * b. If we haven't yet read the parity drives, read them now. 2111251629Sdelphij * c. If all parity drives have been read but the data still doesn't 2112251629Sdelphij * reassemble with a correct checksum, then try combinatorial 2113251629Sdelphij * reconstruction. 2114251629Sdelphij * d. If that doesn't work, return an error. 2115251629Sdelphij * 3. If there were unexpected errors or this is a resilver operation, 2116251629Sdelphij * rewrite the vdevs that had errors. 2117251629Sdelphij */ 2118168404Spjdstatic void 2119168404Spjdvdev_raidz_io_done(zio_t *zio) 2120168404Spjd{ 2121168404Spjd vdev_t *vd = zio->io_vd; 2122168404Spjd vdev_t *cvd; 2123168404Spjd raidz_map_t *rm = zio->io_vsd; 2124219089Spjd raidz_col_t *rc; 2125168404Spjd int unexpected_errors = 0; 2126168404Spjd int parity_errors = 0; 2127168404Spjd int parity_untried = 0; 2128168404Spjd int data_errors = 0; 2129185029Spjd int total_errors = 0; 2130219089Spjd int n, c; 2131219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 2132219089Spjd int code; 2133168404Spjd 2134168404Spjd ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ 2135168404Spjd 2136168404Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 2137168404Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 2138168404Spjd 2139168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2140168404Spjd rc = &rm->rm_col[c]; 2141168404Spjd 2142168404Spjd if (rc->rc_error) { 2143185029Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 2144168404Spjd 2145168404Spjd if (c < rm->rm_firstdatacol) 2146168404Spjd parity_errors++; 2147168404Spjd else 2148168404Spjd data_errors++; 2149168404Spjd 2150168404Spjd if (!rc->rc_skipped) 2151168404Spjd unexpected_errors++; 2152168404Spjd 2153185029Spjd total_errors++; 2154168404Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 2155168404Spjd parity_untried++; 2156168404Spjd } 2157168404Spjd } 2158168404Spjd 2159168404Spjd if (zio->io_type == ZIO_TYPE_WRITE) { 2160168404Spjd /* 2161185029Spjd * XXX -- for now, treat partial writes as a success. 2162185029Spjd * (If we couldn't write enough columns to reconstruct 2163185029Spjd * the data, the I/O failed. Otherwise, good enough.) 2164185029Spjd * 2165185029Spjd * Now that we support write reallocation, it would be better 2166185029Spjd * to treat partial failure as real failure unless there are 2167185029Spjd * no non-degraded top-level vdevs left, and not update DTLs 2168185029Spjd * if we intend to reallocate. 2169168404Spjd */ 2170168404Spjd /* XXPOLICY */ 2171185029Spjd if (total_errors > rm->rm_firstdatacol) 2172185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2173168404Spjd 2174168404Spjd return; 2175240868Spjd } else if (zio->io_type == ZIO_TYPE_FREE) { 2176240868Spjd return; 2177168404Spjd } 2178168404Spjd 2179168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 2180168404Spjd /* 2181168404Spjd * There are three potential phases for a read: 2182168404Spjd * 1. produce valid data from the columns read 2183168404Spjd * 2. read all disks and try again 2184168404Spjd * 3. perform combinatorial reconstruction 2185168404Spjd * 2186168404Spjd * Each phase is progressively both more expensive and less likely to 2187168404Spjd * occur. If we encounter more errors than we can repair or all phases 2188168404Spjd * fail, we have no choice but to return an error. 2189168404Spjd */ 2190168404Spjd 2191168404Spjd /* 2192168404Spjd * If the number of errors we saw was correctable -- less than or equal 2193168404Spjd * to the number of parity disks read -- attempt to produce data that 2194168404Spjd * has a valid checksum. Naturally, this case applies in the absence of 2195168404Spjd * any errors. 2196168404Spjd */ 2197185029Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 2198219089Spjd if (data_errors == 0) { 2199219089Spjd if (raidz_checksum_verify(zio) == 0) { 2200168738Spjd /* 2201168738Spjd * If we read parity information (unnecessarily 2202168738Spjd * as it happens since no reconstruction was 2203168738Spjd * needed) regenerate and verify the parity. 2204168738Spjd * We also regenerate parity when resilvering 2205168738Spjd * so we can write it out to the failed device 2206168738Spjd * later. 2207168738Spjd */ 2208168404Spjd if (parity_errors + parity_untried < 2209168738Spjd rm->rm_firstdatacol || 2210168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2211168404Spjd n = raidz_parity_verify(zio, rm); 2212168404Spjd unexpected_errors += n; 2213168404Spjd ASSERT(parity_errors + n <= 2214168404Spjd rm->rm_firstdatacol); 2215168404Spjd } 2216168404Spjd goto done; 2217168404Spjd } 2218219089Spjd } else { 2219168404Spjd /* 2220168404Spjd * We either attempt to read all the parity columns or 2221168404Spjd * none of them. If we didn't try to read parity, we 2222168404Spjd * wouldn't be here in the correctable case. There must 2223168404Spjd * also have been fewer parity errors than parity 2224168404Spjd * columns or, again, we wouldn't be in this code path. 2225168404Spjd */ 2226168404Spjd ASSERT(parity_untried == 0); 2227168404Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 2228168404Spjd 2229168404Spjd /* 2230219089Spjd * Identify the data columns that reported an error. 2231168404Spjd */ 2232219089Spjd n = 0; 2233168404Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 2234168404Spjd rc = &rm->rm_col[c]; 2235219089Spjd if (rc->rc_error != 0) { 2236219089Spjd ASSERT(n < VDEV_RAIDZ_MAXPARITY); 2237219089Spjd tgts[n++] = c; 2238219089Spjd } 2239168404Spjd } 2240168404Spjd 2241219089Spjd ASSERT(rm->rm_firstdatacol >= n); 2242168404Spjd 2243219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 2244168404Spjd 2245219089Spjd if (raidz_checksum_verify(zio) == 0) { 2246219089Spjd atomic_inc_64(&raidz_corrected[code]); 2247219089Spjd 2248168404Spjd /* 2249219089Spjd * If we read more parity disks than were used 2250219089Spjd * for reconstruction, confirm that the other 2251219089Spjd * parity disks produced correct data. This 2252219089Spjd * routine is suboptimal in that it regenerates 2253219089Spjd * the parity that we already used in addition 2254219089Spjd * to the parity that we're attempting to 2255219089Spjd * verify, but this should be a relatively 2256219089Spjd * uncommon case, and can be optimized if it 2257219089Spjd * becomes a problem. Note that we regenerate 2258219089Spjd * parity when resilvering so we can write it 2259219089Spjd * out to failed devices later. 2260168404Spjd */ 2261219089Spjd if (parity_errors < rm->rm_firstdatacol - n || 2262168738Spjd (zio->io_flags & ZIO_FLAG_RESILVER)) { 2263168404Spjd n = raidz_parity_verify(zio, rm); 2264168404Spjd unexpected_errors += n; 2265168404Spjd ASSERT(parity_errors + n <= 2266168404Spjd rm->rm_firstdatacol); 2267168404Spjd } 2268168404Spjd 2269168404Spjd goto done; 2270168404Spjd } 2271168404Spjd } 2272168404Spjd } 2273168404Spjd 2274168404Spjd /* 2275168404Spjd * This isn't a typical situation -- either we got a read error or 2276168404Spjd * a child silently returned bad data. Read every block so we can 2277168404Spjd * try again with as much data and parity as we can track down. If 2278168404Spjd * we've already been through once before, all children will be marked 2279168404Spjd * as tried so we'll proceed to combinatorial reconstruction. 2280168404Spjd */ 2281168404Spjd unexpected_errors = 1; 2282168404Spjd rm->rm_missingdata = 0; 2283168404Spjd rm->rm_missingparity = 0; 2284168404Spjd 2285168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2286168404Spjd if (rm->rm_col[c].rc_tried) 2287168404Spjd continue; 2288168404Spjd 2289168404Spjd zio_vdev_io_redone(zio); 2290168404Spjd do { 2291168404Spjd rc = &rm->rm_col[c]; 2292168404Spjd if (rc->rc_tried) 2293168404Spjd continue; 2294168404Spjd zio_nowait(zio_vdev_child_io(zio, NULL, 2295168404Spjd vd->vdev_child[rc->rc_devidx], 2296168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2297185029Spjd zio->io_type, zio->io_priority, 0, 2298168404Spjd vdev_raidz_child_done, rc)); 2299168404Spjd } while (++c < rm->rm_cols); 2300185029Spjd 2301168404Spjd return; 2302168404Spjd } 2303168404Spjd 2304168404Spjd /* 2305168404Spjd * At this point we've attempted to reconstruct the data given the 2306168404Spjd * errors we detected, and we've attempted to read all columns. There 2307168404Spjd * must, therefore, be one or more additional problems -- silent errors 2308168404Spjd * resulting in invalid data rather than explicit I/O errors resulting 2309219089Spjd * in absent data. We check if there is enough additional data to 2310219089Spjd * possibly reconstruct the data and then perform combinatorial 2311219089Spjd * reconstruction over all possible combinations. If that fails, 2312219089Spjd * we're cooked. 2313168404Spjd */ 2314219089Spjd if (total_errors > rm->rm_firstdatacol) { 2315185029Spjd zio->io_error = vdev_raidz_worst_error(rm); 2316168404Spjd 2317219089Spjd } else if (total_errors < rm->rm_firstdatacol && 2318219089Spjd (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { 2319168404Spjd /* 2320219089Spjd * If we didn't use all the available parity for the 2321219089Spjd * combinatorial reconstruction, verify that the remaining 2322219089Spjd * parity is correct. 2323168404Spjd */ 2324219089Spjd if (code != (1 << rm->rm_firstdatacol) - 1) 2325219089Spjd (void) raidz_parity_verify(zio, rm); 2326219089Spjd } else { 2327168404Spjd /* 2328219089Spjd * We're here because either: 2329219089Spjd * 2330219089Spjd * total_errors == rm_first_datacol, or 2331219089Spjd * vdev_raidz_combrec() failed 2332219089Spjd * 2333219089Spjd * In either case, there is enough bad data to prevent 2334219089Spjd * reconstruction. 2335219089Spjd * 2336219089Spjd * Start checksum ereports for all children which haven't 2337219089Spjd * failed, and the IO wasn't speculative. 2338168404Spjd */ 2339249195Smm zio->io_error = SET_ERROR(ECKSUM); 2340168404Spjd 2341219089Spjd if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2342219089Spjd for (c = 0; c < rm->rm_cols; c++) { 2343219089Spjd rc = &rm->rm_col[c]; 2344219089Spjd if (rc->rc_error == 0) { 2345219089Spjd zio_bad_cksum_t zbc; 2346219089Spjd zbc.zbc_has_cksum = 0; 2347219089Spjd zbc.zbc_injected = 2348219089Spjd rm->rm_ecksuminjected; 2349168404Spjd 2350219089Spjd zfs_ereport_start_checksum( 2351219089Spjd zio->io_spa, 2352219089Spjd vd->vdev_child[rc->rc_devidx], 2353219089Spjd zio, rc->rc_offset, rc->rc_size, 2354219089Spjd (void *)(uintptr_t)c, &zbc); 2355168404Spjd } 2356168404Spjd } 2357168404Spjd } 2358168404Spjd } 2359168404Spjd 2360168404Spjddone: 2361168404Spjd zio_checksum_verified(zio); 2362168404Spjd 2363209962Smm if (zio->io_error == 0 && spa_writeable(zio->io_spa) && 2364168404Spjd (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 2365168404Spjd /* 2366168404Spjd * Use the good data we have in hand to repair damaged children. 2367168404Spjd */ 2368168404Spjd for (c = 0; c < rm->rm_cols; c++) { 2369168404Spjd rc = &rm->rm_col[c]; 2370168404Spjd cvd = vd->vdev_child[rc->rc_devidx]; 2371168404Spjd 2372168404Spjd if (rc->rc_error == 0) 2373168404Spjd continue; 2374168404Spjd 2375185029Spjd zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 2376168404Spjd rc->rc_offset, rc->rc_data, rc->rc_size, 2377260763Savg ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, 2378209962Smm ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 2379209962Smm ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); 2380168404Spjd } 2381168404Spjd } 2382168404Spjd} 2383168404Spjd 2384168404Spjdstatic void 2385168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 2386168404Spjd{ 2387168404Spjd if (faulted > vd->vdev_nparity) 2388168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2389168404Spjd VDEV_AUX_NO_REPLICAS); 2390168404Spjd else if (degraded + faulted != 0) 2391168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 2392168404Spjd else 2393168404Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 2394168404Spjd} 2395168404Spjd 2396168404Spjdvdev_ops_t vdev_raidz_ops = { 2397168404Spjd vdev_raidz_open, 2398168404Spjd vdev_raidz_close, 2399168404Spjd vdev_raidz_asize, 2400168404Spjd vdev_raidz_io_start, 2401168404Spjd vdev_raidz_io_done, 2402168404Spjd vdev_raidz_state_change, 2403219089Spjd NULL, 2404219089Spjd NULL, 2405168404Spjd VDEV_TYPE_RAIDZ, /* name of this vdev type */ 2406168404Spjd B_FALSE /* not a leaf vdev */ 2407168404Spjd}; 2408