1185029Spjd/* 2185029Spjd * CDDL HEADER START 3185029Spjd * 4185029Spjd * The contents of this file are subject to the terms of the 5185029Spjd * Common Development and Distribution License (the "License"). 6185029Spjd * You may not use this file except in compliance with the License. 7185029Spjd * 8185029Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9185029Spjd * or http://www.opensolaris.org/os/licensing. 10185029Spjd * See the License for the specific language governing permissions 11185029Spjd * and limitations under the License. 12185029Spjd * 13185029Spjd * When distributing Covered Code, include this CDDL HEADER in each 14185029Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15185029Spjd * If applicable, add the following below this CDDL HEADER, with the 16185029Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17185029Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18185029Spjd * 19185029Spjd * CDDL HEADER END 20185029Spjd */ 21185029Spjd/* 22185029Spjd * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23185029Spjd * Use is subject to license terms. 24185029Spjd */ 25185029Spjd 26185029Spjd#include <sys/cdefs.h> 27192640Sdes__FBSDID("$FreeBSD: releng/10.2/sys/cddl/boot/zfs/zfssubr.c 268649 2014-07-15 04:53:34Z delphij $"); 28185029Spjd 29185029Spjdstatic uint64_t zfs_crc64_table[256]; 30185029Spjd 31219089Spjd#define ECKSUM 666 32219089Spjd 33268649Sdelphij#define ASSERT3S(x, y, z) ((void)0) 34268649Sdelphij#define ASSERT3U(x, y, z) ((void)0) 35268649Sdelphij#define ASSERT3P(x, y, z) ((void)0) 36268649Sdelphij#define ASSERT0(x) ((void)0) 37268649Sdelphij#define ASSERT(x) ((void)0) 38219089Spjd 39219089Spjd#define panic(...) do { \ 40219089Spjd printf(__VA_ARGS__); \ 41219089Spjd for (;;) ; \ 42219089Spjd} while (0) 43219089Spjd 44219089Spjd#define kmem_alloc(size, flag) zfs_alloc((size)) 45219089Spjd#define kmem_free(ptr, size) zfs_free((ptr), (size)) 46219089Spjd 47185029Spjdstatic void 48185029Spjdzfs_init_crc(void) 49185029Spjd{ 50185029Spjd int i, j; 51185029Spjd uint64_t *ct; 52185029Spjd 53185029Spjd /* 54185029Spjd * Calculate the crc64 table (used for the zap hash 55185029Spjd * function). 56185029Spjd */ 57185029Spjd if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 58185029Spjd memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 59185029Spjd for (i = 0; i < 256; i++) 60185029Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 61185029Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 62185029Spjd } 63185029Spjd} 64185029Spjd 65185029Spjdstatic void 66185029Spjdzio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) 67185029Spjd{ 68185029Spjd ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 69185029Spjd} 70185029Spjd 71185029Spjd/* 72185029Spjd * Signature for checksum functions. 73185029Spjd */ 74185029Spjdtypedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); 75185029Spjd 76185029Spjd/* 77185029Spjd * Information about each checksum function. 78185029Spjd */ 79185029Spjdtypedef struct zio_checksum_info { 80185029Spjd zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ 81185029Spjd int ci_correctable; /* number of correctable bits */ 82219089Spjd int ci_eck; /* uses zio embedded checksum? */ 83219089Spjd int ci_dedup; /* strong enough for dedup? */ 84185029Spjd const char *ci_name; /* descriptive name */ 85185029Spjd} zio_checksum_info_t; 86185029Spjd 87268649Sdelphij#include "blkptr.c" 88268649Sdelphij 89185029Spjd#include "fletcher.c" 90185029Spjd#include "sha256.c" 91185029Spjd 92185029Spjdstatic zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 93219089Spjd {{NULL, NULL}, 0, 0, 0, "inherit"}, 94219089Spjd {{NULL, NULL}, 0, 0, 0, "on"}, 95219089Spjd {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, 96219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, 97219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, 98219089Spjd {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, 99219089Spjd {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, 100219089Spjd {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, 101219089Spjd {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"}, 102219089Spjd {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"}, 103185029Spjd}; 104185029Spjd 105219089Spjd 106185029Spjd/* 107185029Spjd * Common signature for all zio compress/decompress functions. 108185029Spjd */ 109185029Spjdtypedef size_t zio_compress_func_t(void *src, void *dst, 110185029Spjd size_t s_len, size_t d_len, int); 111185029Spjdtypedef int zio_decompress_func_t(void *src, void *dst, 112185029Spjd size_t s_len, size_t d_len, int); 113185029Spjd 114185029Spjd/* 115185029Spjd * Information about each compression function. 116185029Spjd */ 117185029Spjdtypedef struct zio_compress_info { 118185029Spjd zio_compress_func_t *ci_compress; /* compression function */ 119185029Spjd zio_decompress_func_t *ci_decompress; /* decompression function */ 120185029Spjd int ci_level; /* level parameter */ 121185029Spjd const char *ci_name; /* algorithm name */ 122185029Spjd} zio_compress_info_t; 123185029Spjd 124185029Spjd#include "lzjb.c" 125219089Spjd#include "zle.c" 126246586Sdelphij#include "lz4.c" 127185029Spjd 128185029Spjd/* 129185029Spjd * Compression vectors. 130185029Spjd */ 131185029Spjdstatic zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 132185029Spjd {NULL, NULL, 0, "inherit"}, 133185029Spjd {NULL, NULL, 0, "on"}, 134185029Spjd {NULL, NULL, 0, "uncompressed"}, 135185029Spjd {NULL, lzjb_decompress, 0, "lzjb"}, 136185029Spjd {NULL, NULL, 0, "empty"}, 137185029Spjd {NULL, NULL, 1, "gzip-1"}, 138185029Spjd {NULL, NULL, 2, "gzip-2"}, 139185029Spjd {NULL, NULL, 3, "gzip-3"}, 140185029Spjd {NULL, NULL, 4, "gzip-4"}, 141185029Spjd {NULL, NULL, 5, "gzip-5"}, 142185029Spjd {NULL, NULL, 6, "gzip-6"}, 143185029Spjd {NULL, NULL, 7, "gzip-7"}, 144185029Spjd {NULL, NULL, 8, "gzip-8"}, 145185029Spjd {NULL, NULL, 9, "gzip-9"}, 146219089Spjd {NULL, zle_decompress, 64, "zle"}, 147246586Sdelphij {NULL, lz4_decompress, 0, "lz4"}, 148185029Spjd}; 149185029Spjd 150219089Spjdstatic void 151219089Spjdbyteswap_uint64_array(void *vbuf, size_t size) 152219089Spjd{ 153219089Spjd uint64_t *buf = vbuf; 154219089Spjd size_t count = size >> 3; 155219089Spjd int i; 156219089Spjd 157219089Spjd ASSERT((size & 7) == 0); 158219089Spjd 159219089Spjd for (i = 0; i < count; i++) 160219089Spjd buf[i] = BSWAP_64(buf[i]); 161219089Spjd} 162219089Spjd 163219089Spjd/* 164219089Spjd * Set the external verifier for a gang block based on <vdev, offset, txg>, 165219089Spjd * a tuple which is guaranteed to be unique for the life of the pool. 166219089Spjd */ 167219089Spjdstatic void 168219089Spjdzio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 169219089Spjd{ 170219089Spjd const dva_t *dva = BP_IDENTITY(bp); 171219089Spjd uint64_t txg = BP_PHYSICAL_BIRTH(bp); 172219089Spjd 173219089Spjd ASSERT(BP_IS_GANG(bp)); 174219089Spjd 175219089Spjd ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 176219089Spjd} 177219089Spjd 178219089Spjd/* 179219089Spjd * Set the external verifier for a label block based on its offset. 180219089Spjd * The vdev is implicit, and the txg is unknowable at pool open time -- 181219089Spjd * hence the logic in vdev_uberblock_load() to find the most recent copy. 182219089Spjd */ 183219089Spjdstatic void 184219089Spjdzio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 185219089Spjd{ 186219089Spjd ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 187219089Spjd} 188219089Spjd 189185029Spjdstatic int 190226568Spjdzio_checksum_verify(const blkptr_t *bp, void *data) 191185029Spjd{ 192226568Spjd uint64_t size; 193226568Spjd unsigned int checksum; 194219089Spjd zio_checksum_info_t *ci; 195219089Spjd zio_cksum_t actual_cksum, expected_cksum, verifier; 196219089Spjd int byteswap; 197185029Spjd 198226568Spjd checksum = BP_GET_CHECKSUM(bp); 199226568Spjd size = BP_GET_PSIZE(bp); 200226568Spjd 201219089Spjd if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 202185029Spjd return (EINVAL); 203219089Spjd ci = &zio_checksum_table[checksum]; 204219089Spjd if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 205219089Spjd return (EINVAL); 206185029Spjd 207219089Spjd if (ci->ci_eck) { 208219089Spjd zio_eck_t *eck; 209219089Spjd 210219089Spjd ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 211219089Spjd checksum == ZIO_CHECKSUM_LABEL); 212219089Spjd 213219089Spjd eck = (zio_eck_t *)((char *)data + size) - 1; 214219089Spjd 215219089Spjd if (checksum == ZIO_CHECKSUM_GANG_HEADER) 216219089Spjd zio_checksum_gang_verifier(&verifier, bp); 217219089Spjd else if (checksum == ZIO_CHECKSUM_LABEL) 218226568Spjd zio_checksum_label_verifier(&verifier, 219226568Spjd DVA_GET_OFFSET(BP_IDENTITY(bp))); 220219089Spjd else 221219089Spjd verifier = bp->blk_cksum; 222219089Spjd 223219089Spjd byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 224219089Spjd 225219089Spjd if (byteswap) 226219089Spjd byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 227219089Spjd 228219089Spjd expected_cksum = eck->zec_cksum; 229219089Spjd eck->zec_cksum = verifier; 230219089Spjd ci->ci_func[byteswap](data, size, &actual_cksum); 231219089Spjd eck->zec_cksum = expected_cksum; 232219089Spjd 233219089Spjd if (byteswap) 234219089Spjd byteswap_uint64_array(&expected_cksum, 235219089Spjd sizeof (zio_cksum_t)); 236185029Spjd } else { 237219089Spjd expected_cksum = bp->blk_cksum; 238185029Spjd ci->ci_func[0](data, size, &actual_cksum); 239185029Spjd } 240185029Spjd 241219089Spjd if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 242185029Spjd /*printf("ZFS: read checksum failed\n");*/ 243185029Spjd return (EIO); 244185029Spjd } 245185029Spjd 246185029Spjd return (0); 247185029Spjd} 248185029Spjd 249185029Spjdstatic int 250185029Spjdzio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 251185029Spjd void *dest, uint64_t destsize) 252185029Spjd{ 253219089Spjd zio_compress_info_t *ci; 254185029Spjd 255219089Spjd if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 256185097Sdfr printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 257185029Spjd return (EIO); 258185029Spjd } 259185029Spjd 260219089Spjd ci = &zio_compress_table[cpfunc]; 261219089Spjd if (!ci->ci_decompress) { 262219089Spjd printf("ZFS: unsupported compression algorithm %s\n", 263219089Spjd ci->ci_name); 264219089Spjd return (EIO); 265219089Spjd } 266219089Spjd 267185029Spjd return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 268185029Spjd} 269185029Spjd 270185029Spjdstatic uint64_t 271185029Spjdzap_hash(uint64_t salt, const char *name) 272185029Spjd{ 273185029Spjd const uint8_t *cp; 274185029Spjd uint8_t c; 275185029Spjd uint64_t crc = salt; 276185029Spjd 277219089Spjd ASSERT(crc != 0); 278219089Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 279185029Spjd for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 280185029Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 281185029Spjd 282185029Spjd /* 283185029Spjd * Only use 28 bits, since we need 4 bits in the cookie for the 284185029Spjd * collision differentiator. We MUST use the high bits, since 285185029Spjd * those are the onces that we first pay attention to when 286185029Spjd * chosing the bucket. 287185029Spjd */ 288185029Spjd crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 289185029Spjd 290185029Spjd return (crc); 291185029Spjd} 292192194Sdfr 293219089Spjdstatic void *zfs_alloc(size_t size); 294219089Spjdstatic void zfs_free(void *ptr, size_t size); 295192194Sdfr 296192194Sdfrtypedef struct raidz_col { 297192194Sdfr uint64_t rc_devidx; /* child device index for I/O */ 298192194Sdfr uint64_t rc_offset; /* device offset */ 299192194Sdfr uint64_t rc_size; /* I/O size */ 300192194Sdfr void *rc_data; /* I/O data */ 301192194Sdfr int rc_error; /* I/O error for this device */ 302192194Sdfr uint8_t rc_tried; /* Did we attempt this I/O column? */ 303192194Sdfr uint8_t rc_skipped; /* Did we skip this I/O column? */ 304192194Sdfr} raidz_col_t; 305192194Sdfr 306219089Spjdtypedef struct raidz_map { 307219089Spjd uint64_t rm_cols; /* Regular column count */ 308219089Spjd uint64_t rm_scols; /* Count including skipped columns */ 309219089Spjd uint64_t rm_bigcols; /* Number of oversized columns */ 310219089Spjd uint64_t rm_asize; /* Actual total I/O size */ 311219089Spjd uint64_t rm_missingdata; /* Count of missing data devices */ 312219089Spjd uint64_t rm_missingparity; /* Count of missing parity devices */ 313219089Spjd uint64_t rm_firstdatacol; /* First data column/parity count */ 314219089Spjd uint64_t rm_nskip; /* Skipped sectors for padding */ 315219089Spjd uint64_t rm_skipstart; /* Column index of padding start */ 316219089Spjd uintptr_t rm_reports; /* # of referencing checksum reports */ 317219089Spjd uint8_t rm_freed; /* map no longer has referencing ZIO */ 318219089Spjd uint8_t rm_ecksuminjected; /* checksum error was injected */ 319219089Spjd raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 320219089Spjd} raidz_map_t; 321219089Spjd 322192194Sdfr#define VDEV_RAIDZ_P 0 323192194Sdfr#define VDEV_RAIDZ_Q 1 324219089Spjd#define VDEV_RAIDZ_R 2 325192194Sdfr 326219089Spjd#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 327219089Spjd#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 328192194Sdfr 329219089Spjd/* 330219089Spjd * We provide a mechanism to perform the field multiplication operation on a 331219089Spjd * 64-bit value all at once rather than a byte at a time. This works by 332219089Spjd * creating a mask from the top bit in each byte and using that to 333219089Spjd * conditionally apply the XOR of 0x1d. 334219089Spjd */ 335219089Spjd#define VDEV_RAIDZ_64MUL_2(x, mask) \ 336219089Spjd{ \ 337219089Spjd (mask) = (x) & 0x8080808080808080ULL; \ 338219089Spjd (mask) = ((mask) << 1) - ((mask) >> 7); \ 339219089Spjd (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 340225531Savg ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 341219089Spjd} 342192194Sdfr 343219089Spjd#define VDEV_RAIDZ_64MUL_4(x, mask) \ 344219089Spjd{ \ 345219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 346219089Spjd VDEV_RAIDZ_64MUL_2((x), mask); \ 347192194Sdfr} 348192194Sdfr 349192194Sdfr/* 350192194Sdfr * These two tables represent powers and logs of 2 in the Galois field defined 351192194Sdfr * above. These values were computed by repeatedly multiplying by 2 as above. 352192194Sdfr */ 353192194Sdfrstatic const uint8_t vdev_raidz_pow2[256] = { 354192194Sdfr 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 355192194Sdfr 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 356192194Sdfr 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 357192194Sdfr 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 358192194Sdfr 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 359192194Sdfr 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 360192194Sdfr 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 361192194Sdfr 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 362192194Sdfr 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 363192194Sdfr 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 364192194Sdfr 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 365192194Sdfr 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 366192194Sdfr 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 367192194Sdfr 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 368192194Sdfr 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 369192194Sdfr 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 370192194Sdfr 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 371192194Sdfr 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 372192194Sdfr 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 373192194Sdfr 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 374192194Sdfr 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 375192194Sdfr 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 376192194Sdfr 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 377192194Sdfr 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 378192194Sdfr 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 379192194Sdfr 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 380192194Sdfr 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 381192194Sdfr 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 382192194Sdfr 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 383192194Sdfr 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 384192194Sdfr 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 385192194Sdfr 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 386192194Sdfr}; 387192194Sdfrstatic const uint8_t vdev_raidz_log2[256] = { 388192194Sdfr 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 389192194Sdfr 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 390192194Sdfr 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 391192194Sdfr 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 392192194Sdfr 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 393192194Sdfr 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 394192194Sdfr 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 395192194Sdfr 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 396192194Sdfr 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 397192194Sdfr 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 398192194Sdfr 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 399192194Sdfr 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 400192194Sdfr 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 401192194Sdfr 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 402192194Sdfr 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 403192194Sdfr 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 404192194Sdfr 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 405192194Sdfr 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 406192194Sdfr 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 407192194Sdfr 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 408192194Sdfr 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 409192194Sdfr 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 410192194Sdfr 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 411192194Sdfr 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 412192194Sdfr 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 413192194Sdfr 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 414192194Sdfr 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 415192194Sdfr 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 416192194Sdfr 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 417192194Sdfr 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 418192194Sdfr 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 419192194Sdfr 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 420192194Sdfr}; 421192194Sdfr 422192194Sdfr/* 423192194Sdfr * Multiply a given number by 2 raised to the given power. 424192194Sdfr */ 425192194Sdfrstatic uint8_t 426192194Sdfrvdev_raidz_exp2(uint8_t a, int exp) 427192194Sdfr{ 428192194Sdfr if (a == 0) 429192194Sdfr return (0); 430192194Sdfr 431219089Spjd ASSERT(exp >= 0); 432219089Spjd ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 433192194Sdfr 434192194Sdfr exp += vdev_raidz_log2[a]; 435192194Sdfr if (exp > 255) 436192194Sdfr exp -= 255; 437192194Sdfr 438192194Sdfr return (vdev_raidz_pow2[exp]); 439192194Sdfr} 440192194Sdfr 441192194Sdfrstatic void 442219089Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm) 443192194Sdfr{ 444219089Spjd uint64_t *p, *src, pcount, ccount, i; 445192194Sdfr int c; 446192194Sdfr 447219089Spjd pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 448192194Sdfr 449219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 450219089Spjd src = rm->rm_col[c].rc_data; 451219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 452219089Spjd ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 453192194Sdfr 454219089Spjd if (c == rm->rm_firstdatacol) { 455219089Spjd ASSERT(ccount == pcount); 456219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 457192194Sdfr *p = *src; 458192194Sdfr } 459219089Spjd } else { 460219089Spjd ASSERT(ccount <= pcount); 461219089Spjd for (i = 0; i < ccount; i++, src++, p++) { 462219089Spjd *p ^= *src; 463219089Spjd } 464219089Spjd } 465219089Spjd } 466219089Spjd} 467219089Spjd 468219089Spjdstatic void 469219089Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm) 470219089Spjd{ 471219089Spjd uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 472219089Spjd int c; 473219089Spjd 474219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 475219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 476219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 477219089Spjd 478219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 479219089Spjd src = rm->rm_col[c].rc_data; 480219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 481219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 482219089Spjd 483219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 484219089Spjd 485219089Spjd if (c == rm->rm_firstdatacol) { 486219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 487219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 488219089Spjd *p = *src; 489219089Spjd *q = *src; 490219089Spjd } 491219089Spjd for (; i < pcnt; i++, src++, p++, q++) { 492219089Spjd *p = 0; 493192194Sdfr *q = 0; 494192194Sdfr } 495192194Sdfr } else { 496219089Spjd ASSERT(ccnt <= pcnt); 497192194Sdfr 498192194Sdfr /* 499219089Spjd * Apply the algorithm described above by multiplying 500219089Spjd * the previous result and adding in the new value. 501192194Sdfr */ 502219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++) { 503219089Spjd *p ^= *src; 504219089Spjd 505219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 506192194Sdfr *q ^= *src; 507192194Sdfr } 508192194Sdfr 509192194Sdfr /* 510192194Sdfr * Treat short columns as though they are full of 0s. 511219089Spjd * Note that there's therefore nothing needed for P. 512192194Sdfr */ 513219089Spjd for (; i < pcnt; i++, q++) { 514219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 515192194Sdfr } 516192194Sdfr } 517192194Sdfr } 518192194Sdfr} 519192194Sdfr 520192194Sdfrstatic void 521219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm) 522192194Sdfr{ 523219089Spjd uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 524219089Spjd int c; 525192194Sdfr 526219089Spjd pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 527219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 528219089Spjd rm->rm_col[VDEV_RAIDZ_Q].rc_size); 529219089Spjd ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 530219089Spjd rm->rm_col[VDEV_RAIDZ_R].rc_size); 531192194Sdfr 532219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 533219089Spjd src = rm->rm_col[c].rc_data; 534219089Spjd p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 535219089Spjd q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 536219089Spjd r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 537192194Sdfr 538219089Spjd ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 539192194Sdfr 540219089Spjd if (c == rm->rm_firstdatacol) { 541219089Spjd ASSERT(ccnt == pcnt || ccnt == 0); 542219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 543219089Spjd *p = *src; 544219089Spjd *q = *src; 545219089Spjd *r = *src; 546192194Sdfr } 547219089Spjd for (; i < pcnt; i++, src++, p++, q++, r++) { 548219089Spjd *p = 0; 549219089Spjd *q = 0; 550219089Spjd *r = 0; 551192194Sdfr } 552219089Spjd } else { 553219089Spjd ASSERT(ccnt <= pcnt); 554192194Sdfr 555192194Sdfr /* 556219089Spjd * Apply the algorithm described above by multiplying 557219089Spjd * the previous result and adding in the new value. 558192194Sdfr */ 559219089Spjd for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 560219089Spjd *p ^= *src; 561219089Spjd 562219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 563219089Spjd *q ^= *src; 564219089Spjd 565219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 566219089Spjd *r ^= *src; 567192194Sdfr } 568192194Sdfr 569219089Spjd /* 570219089Spjd * Treat short columns as though they are full of 0s. 571219089Spjd * Note that there's therefore nothing needed for P. 572219089Spjd */ 573219089Spjd for (; i < pcnt; i++, q++, r++) { 574219089Spjd VDEV_RAIDZ_64MUL_2(*q, mask); 575219089Spjd VDEV_RAIDZ_64MUL_4(*r, mask); 576192194Sdfr } 577192194Sdfr } 578192194Sdfr } 579219089Spjd} 580192194Sdfr 581219089Spjd/* 582219089Spjd * Generate RAID parity in the first virtual columns according to the number of 583219089Spjd * parity columns available. 584219089Spjd */ 585219089Spjdstatic void 586219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm) 587219089Spjd{ 588219089Spjd switch (rm->rm_firstdatacol) { 589219089Spjd case 1: 590219089Spjd vdev_raidz_generate_parity_p(rm); 591219089Spjd break; 592219089Spjd case 2: 593219089Spjd vdev_raidz_generate_parity_pq(rm); 594219089Spjd break; 595219089Spjd case 3: 596219089Spjd vdev_raidz_generate_parity_pqr(rm); 597219089Spjd break; 598219089Spjd default: 599219089Spjd panic("invalid RAID-Z configuration"); 600219089Spjd } 601219089Spjd} 602192194Sdfr 603219089Spjd/* BEGIN CSTYLED */ 604219089Spjd/* 605219089Spjd * In the general case of reconstruction, we must solve the system of linear 606219089Spjd * equations defined by the coeffecients used to generate parity as well as 607219089Spjd * the contents of the data and parity disks. This can be expressed with 608219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p) 609219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 610219089Spjd * 611219089Spjd * __ __ __ __ 612219089Spjd * | | __ __ | p_0 | 613219089Spjd * | V | | D_0 | | p_m-1 | 614219089Spjd * | | x | : | = | d_0 | 615219089Spjd * | I | | D_n-1 | | : | 616219089Spjd * | | ~~ ~~ | d_n-1 | 617219089Spjd * ~~ ~~ ~~ ~~ 618219089Spjd * 619219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde 620219089Spjd * matrix defined by the coeffecients we chose for the various parity columns 621219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 622219089Spjd * computation as well as linear separability. 623219089Spjd * 624219089Spjd * __ __ __ __ 625219089Spjd * | 1 .. 1 1 1 | | p_0 | 626219089Spjd * | 2^n-1 .. 4 2 1 | __ __ | : | 627219089Spjd * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 628219089Spjd * | 1 .. 0 0 0 | | D_1 | | d_0 | 629219089Spjd * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 630219089Spjd * | : : : : | | : | | d_2 | 631219089Spjd * | 0 .. 1 0 0 | | D_n-1 | | : | 632219089Spjd * | 0 .. 0 1 0 | ~~ ~~ | : | 633219089Spjd * | 0 .. 0 0 1 | | d_n-1 | 634219089Spjd * ~~ ~~ ~~ ~~ 635219089Spjd * 636219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the 637219089Spjd * matrix and use the known data and parity values to reconstruct the unknown 638219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond 639219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p 640219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up 641219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 642219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity 643219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 644219089Spjd * __ __ 645219089Spjd * | 1 1 1 1 1 1 1 1 | 646219089Spjd * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 647219089Spjd * | 19 205 116 29 64 16 4 1 | / / 648219089Spjd * | 1 0 0 0 0 0 0 0 | / / 649219089Spjd * | 0 1 0 0 0 0 0 0 | <--' / 650219089Spjd * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 651219089Spjd * | 0 0 0 1 0 0 0 0 | 652219089Spjd * | 0 0 0 0 1 0 0 0 | 653219089Spjd * | 0 0 0 0 0 1 0 0 | 654219089Spjd * | 0 0 0 0 0 0 1 0 | 655219089Spjd * | 0 0 0 0 0 0 0 1 | 656219089Spjd * ~~ ~~ 657219089Spjd * __ __ 658219089Spjd * | 1 1 1 1 1 1 1 1 | 659219089Spjd * | 128 64 32 16 8 4 2 1 | 660219089Spjd * | 19 205 116 29 64 16 4 1 | 661219089Spjd * | 1 0 0 0 0 0 0 0 | 662219089Spjd * | 0 1 0 0 0 0 0 0 | 663219089Spjd * (V|I)' = | 0 0 1 0 0 0 0 0 | 664219089Spjd * | 0 0 0 1 0 0 0 0 | 665219089Spjd * | 0 0 0 0 1 0 0 0 | 666219089Spjd * | 0 0 0 0 0 1 0 0 | 667219089Spjd * | 0 0 0 0 0 0 1 0 | 668219089Spjd * | 0 0 0 0 0 0 0 1 | 669219089Spjd * ~~ ~~ 670219089Spjd * 671219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 672219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this 673219089Spjd * matrix is not singular. 674219089Spjd * __ __ 675219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 676219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 677219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 678219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 679219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 680219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 681219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 682219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 683219089Spjd * ~~ ~~ 684219089Spjd * __ __ 685219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 686219089Spjd * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 687219089Spjd * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 688219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 689219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 690219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 691219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 692219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 693219089Spjd * ~~ ~~ 694219089Spjd * __ __ 695219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 696219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 697219089Spjd * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 698219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 699219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 700219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 701219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 702219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 703219089Spjd * ~~ ~~ 704219089Spjd * __ __ 705219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 706219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 707219089Spjd * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 708219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 709219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 710219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 711219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 712219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 713219089Spjd * ~~ ~~ 714219089Spjd * __ __ 715219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 716219089Spjd * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 717219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 718219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 719219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 720219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 721219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 722219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 723219089Spjd * ~~ ~~ 724219089Spjd * __ __ 725219089Spjd * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 726219089Spjd * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 727219089Spjd * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 728219089Spjd * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 729219089Spjd * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 730219089Spjd * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 731219089Spjd * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 732219089Spjd * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 733219089Spjd * ~~ ~~ 734219089Spjd * __ __ 735219089Spjd * | 0 0 1 0 0 0 0 0 | 736219089Spjd * | 167 100 5 41 159 169 217 208 | 737219089Spjd * | 166 100 4 40 158 168 216 209 | 738219089Spjd * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 739219089Spjd * | 0 0 0 0 1 0 0 0 | 740219089Spjd * | 0 0 0 0 0 1 0 0 | 741219089Spjd * | 0 0 0 0 0 0 1 0 | 742219089Spjd * | 0 0 0 0 0 0 0 1 | 743219089Spjd * ~~ ~~ 744219089Spjd * 745219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 746219089Spjd * of the missing data. 747219089Spjd * 748219089Spjd * As is apparent from the example above, the only non-trivial rows in the 749219089Spjd * inverse matrix correspond to the data disks that we're trying to 750219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would 751219089Spjd * only be useful for reconstructing data known or assumed to be valid. For 752219089Spjd * that reason, we only build the coefficients in the rows that correspond to 753219089Spjd * targeted columns. 754219089Spjd */ 755219089Spjd/* END CSTYLED */ 756219089Spjd 757219089Spjdstatic void 758219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 759219089Spjd uint8_t **rows) 760219089Spjd{ 761219089Spjd int i, j; 762219089Spjd int pow; 763219089Spjd 764219089Spjd ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 765219089Spjd 766219089Spjd /* 767219089Spjd * Fill in the missing rows of interest. 768219089Spjd */ 769219089Spjd for (i = 0; i < nmap; i++) { 770219089Spjd ASSERT3S(0, <=, map[i]); 771219089Spjd ASSERT3S(map[i], <=, 2); 772219089Spjd 773219089Spjd pow = map[i] * n; 774219089Spjd if (pow > 255) 775219089Spjd pow -= 255; 776219089Spjd ASSERT(pow <= 255); 777219089Spjd 778219089Spjd for (j = 0; j < n; j++) { 779219089Spjd pow -= map[i]; 780219089Spjd if (pow < 0) 781219089Spjd pow += 255; 782219089Spjd rows[i][j] = vdev_raidz_pow2[pow]; 783192194Sdfr } 784192194Sdfr } 785192194Sdfr} 786192194Sdfr 787192194Sdfrstatic void 788219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 789219089Spjd uint8_t **rows, uint8_t **invrows, const uint8_t *used) 790192194Sdfr{ 791219089Spjd int i, j, ii, jj; 792219089Spjd uint8_t log; 793192194Sdfr 794219089Spjd /* 795219089Spjd * Assert that the first nmissing entries from the array of used 796219089Spjd * columns correspond to parity columns and that subsequent entries 797219089Spjd * correspond to data columns. 798219089Spjd */ 799219089Spjd for (i = 0; i < nmissing; i++) { 800219089Spjd ASSERT3S(used[i], <, rm->rm_firstdatacol); 801219089Spjd } 802219089Spjd for (; i < n; i++) { 803219089Spjd ASSERT3S(used[i], >=, rm->rm_firstdatacol); 804219089Spjd } 805192194Sdfr 806219089Spjd /* 807219089Spjd * First initialize the storage where we'll compute the inverse rows. 808219089Spjd */ 809219089Spjd for (i = 0; i < nmissing; i++) { 810219089Spjd for (j = 0; j < n; j++) { 811219089Spjd invrows[i][j] = (i == j) ? 1 : 0; 812219089Spjd } 813219089Spjd } 814192194Sdfr 815192194Sdfr /* 816219089Spjd * Subtract all trivial rows from the rows of consequence. 817192194Sdfr */ 818219089Spjd for (i = 0; i < nmissing; i++) { 819219089Spjd for (j = nmissing; j < n; j++) { 820219089Spjd ASSERT3U(used[j], >=, rm->rm_firstdatacol); 821219089Spjd jj = used[j] - rm->rm_firstdatacol; 822219089Spjd ASSERT3S(jj, <, n); 823219089Spjd invrows[i][j] = rows[i][jj]; 824219089Spjd rows[i][jj] = 0; 825219089Spjd } 826219089Spjd } 827192194Sdfr 828219089Spjd /* 829219089Spjd * For each of the rows of interest, we must normalize it and subtract 830219089Spjd * a multiple of it from the other rows. 831219089Spjd */ 832219089Spjd for (i = 0; i < nmissing; i++) { 833219089Spjd for (j = 0; j < missing[i]; j++) { 834219089Spjd ASSERT3U(rows[i][j], ==, 0); 835219089Spjd } 836219089Spjd ASSERT3U(rows[i][missing[i]], !=, 0); 837192194Sdfr 838219089Spjd /* 839219089Spjd * Compute the inverse of the first element and multiply each 840219089Spjd * element in the row by that value. 841219089Spjd */ 842219089Spjd log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 843192194Sdfr 844219089Spjd for (j = 0; j < n; j++) { 845219089Spjd rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 846219089Spjd invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 847219089Spjd } 848192194Sdfr 849219089Spjd for (ii = 0; ii < nmissing; ii++) { 850219089Spjd if (i == ii) 851219089Spjd continue; 852192194Sdfr 853219089Spjd ASSERT3U(rows[ii][missing[i]], !=, 0); 854219089Spjd 855219089Spjd log = vdev_raidz_log2[rows[ii][missing[i]]]; 856219089Spjd 857219089Spjd for (j = 0; j < n; j++) { 858219089Spjd rows[ii][j] ^= 859219089Spjd vdev_raidz_exp2(rows[i][j], log); 860219089Spjd invrows[ii][j] ^= 861219089Spjd vdev_raidz_exp2(invrows[i][j], log); 862219089Spjd } 863219089Spjd } 864219089Spjd } 865219089Spjd 866192194Sdfr /* 867219089Spjd * Verify that the data that is left in the rows are properly part of 868219089Spjd * an identity matrix. 869192194Sdfr */ 870219089Spjd for (i = 0; i < nmissing; i++) { 871219089Spjd for (j = 0; j < n; j++) { 872219089Spjd if (j == missing[i]) { 873219089Spjd ASSERT3U(rows[i][j], ==, 1); 874219089Spjd } else { 875219089Spjd ASSERT3U(rows[i][j], ==, 0); 876219089Spjd } 877219089Spjd } 878219089Spjd } 879219089Spjd} 880192194Sdfr 881219089Spjdstatic void 882219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 883219089Spjd int *missing, uint8_t **invrows, const uint8_t *used) 884219089Spjd{ 885219089Spjd int i, j, x, cc, c; 886219089Spjd uint8_t *src; 887219089Spjd uint64_t ccount; 888219089Spjd uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 889219089Spjd uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 890219089Spjd uint8_t log, val; 891219089Spjd int ll; 892219089Spjd uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 893219089Spjd uint8_t *p, *pp; 894219089Spjd size_t psize; 895192194Sdfr 896219089Spjd log = 0; /* gcc */ 897219089Spjd psize = sizeof (invlog[0][0]) * n * nmissing; 898219089Spjd p = zfs_alloc(psize); 899192194Sdfr 900219089Spjd for (pp = p, i = 0; i < nmissing; i++) { 901219089Spjd invlog[i] = pp; 902219089Spjd pp += n; 903219089Spjd } 904192194Sdfr 905219089Spjd for (i = 0; i < nmissing; i++) { 906219089Spjd for (j = 0; j < n; j++) { 907219089Spjd ASSERT3U(invrows[i][j], !=, 0); 908219089Spjd invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 909219089Spjd } 910192194Sdfr } 911192194Sdfr 912219089Spjd for (i = 0; i < n; i++) { 913219089Spjd c = used[i]; 914219089Spjd ASSERT3U(c, <, rm->rm_cols); 915219089Spjd 916219089Spjd src = rm->rm_col[c].rc_data; 917219089Spjd ccount = rm->rm_col[c].rc_size; 918219089Spjd for (j = 0; j < nmissing; j++) { 919219089Spjd cc = missing[j] + rm->rm_firstdatacol; 920219089Spjd ASSERT3U(cc, >=, rm->rm_firstdatacol); 921219089Spjd ASSERT3U(cc, <, rm->rm_cols); 922219089Spjd ASSERT3U(cc, !=, c); 923219089Spjd 924219089Spjd dst[j] = rm->rm_col[cc].rc_data; 925219089Spjd dcount[j] = rm->rm_col[cc].rc_size; 926219089Spjd } 927219089Spjd 928219089Spjd ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 929219089Spjd 930219089Spjd for (x = 0; x < ccount; x++, src++) { 931219089Spjd if (*src != 0) 932219089Spjd log = vdev_raidz_log2[*src]; 933219089Spjd 934219089Spjd for (cc = 0; cc < nmissing; cc++) { 935219089Spjd if (x >= dcount[cc]) 936219089Spjd continue; 937219089Spjd 938219089Spjd if (*src == 0) { 939219089Spjd val = 0; 940219089Spjd } else { 941219089Spjd if ((ll = log + invlog[cc][i]) >= 255) 942219089Spjd ll -= 255; 943219089Spjd val = vdev_raidz_pow2[ll]; 944219089Spjd } 945219089Spjd 946219089Spjd if (i == 0) 947219089Spjd dst[cc][x] = val; 948219089Spjd else 949219089Spjd dst[cc][x] ^= val; 950219089Spjd } 951219089Spjd } 952219089Spjd } 953219089Spjd 954219089Spjd zfs_free(p, psize); 955219089Spjd} 956219089Spjd 957219089Spjdstatic int 958219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 959219089Spjd{ 960219089Spjd int n, i, c, t, tt; 961219089Spjd int nmissing_rows; 962219089Spjd int missing_rows[VDEV_RAIDZ_MAXPARITY]; 963219089Spjd int parity_map[VDEV_RAIDZ_MAXPARITY]; 964219089Spjd 965219089Spjd uint8_t *p, *pp; 966219089Spjd size_t psize; 967219089Spjd 968219089Spjd uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 969219089Spjd uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 970219089Spjd uint8_t *used; 971219089Spjd 972219089Spjd int code = 0; 973219089Spjd 974219089Spjd 975219089Spjd n = rm->rm_cols - rm->rm_firstdatacol; 976219089Spjd 977192194Sdfr /* 978219089Spjd * Figure out which data columns are missing. 979192194Sdfr */ 980219089Spjd nmissing_rows = 0; 981219089Spjd for (t = 0; t < ntgts; t++) { 982219089Spjd if (tgts[t] >= rm->rm_firstdatacol) { 983219089Spjd missing_rows[nmissing_rows++] = 984219089Spjd tgts[t] - rm->rm_firstdatacol; 985219089Spjd } 986219089Spjd } 987219089Spjd 988219089Spjd /* 989219089Spjd * Figure out which parity columns to use to help generate the missing 990219089Spjd * data columns. 991219089Spjd */ 992219089Spjd for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 993219089Spjd ASSERT(tt < ntgts); 994219089Spjd ASSERT(c < rm->rm_firstdatacol); 995219089Spjd 996219089Spjd /* 997219089Spjd * Skip any targeted parity columns. 998219089Spjd */ 999219089Spjd if (c == tgts[tt]) { 1000219089Spjd tt++; 1001219089Spjd continue; 1002219089Spjd } 1003219089Spjd 1004219089Spjd code |= 1 << c; 1005219089Spjd 1006219089Spjd parity_map[i] = c; 1007219089Spjd i++; 1008219089Spjd } 1009219089Spjd 1010219089Spjd ASSERT(code != 0); 1011219089Spjd ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1012219089Spjd 1013219089Spjd psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1014219089Spjd nmissing_rows * n + sizeof (used[0]) * n; 1015219089Spjd p = kmem_alloc(psize, KM_SLEEP); 1016219089Spjd 1017219089Spjd for (pp = p, i = 0; i < nmissing_rows; i++) { 1018219089Spjd rows[i] = pp; 1019219089Spjd pp += n; 1020219089Spjd invrows[i] = pp; 1021219089Spjd pp += n; 1022219089Spjd } 1023219089Spjd used = pp; 1024219089Spjd 1025219089Spjd for (i = 0; i < nmissing_rows; i++) { 1026219089Spjd used[i] = parity_map[i]; 1027219089Spjd } 1028219089Spjd 1029219089Spjd for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1030219089Spjd if (tt < nmissing_rows && 1031219089Spjd c == missing_rows[tt] + rm->rm_firstdatacol) { 1032219089Spjd tt++; 1033219089Spjd continue; 1034219089Spjd } 1035219089Spjd 1036219089Spjd ASSERT3S(i, <, n); 1037219089Spjd used[i] = c; 1038219089Spjd i++; 1039219089Spjd } 1040219089Spjd 1041219089Spjd /* 1042219089Spjd * Initialize the interesting rows of the matrix. 1043219089Spjd */ 1044219089Spjd vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1045219089Spjd 1046219089Spjd /* 1047219089Spjd * Invert the matrix. 1048219089Spjd */ 1049219089Spjd vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1050219089Spjd invrows, used); 1051219089Spjd 1052219089Spjd /* 1053219089Spjd * Reconstruct the missing data using the generated matrix. 1054219089Spjd */ 1055219089Spjd vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1056219089Spjd invrows, used); 1057219089Spjd 1058219089Spjd kmem_free(p, psize); 1059219089Spjd 1060219089Spjd return (code); 1061192194Sdfr} 1062192194Sdfr 1063192194Sdfrstatic int 1064219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1065192194Sdfr{ 1066219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 1067219089Spjd int ntgts; 1068219089Spjd int i, c; 1069219089Spjd int code; 1070219089Spjd int nbadparity, nbaddata; 1071219089Spjd 1072219089Spjd /* 1073219089Spjd * The tgts list must already be sorted. 1074219089Spjd */ 1075219089Spjd for (i = 1; i < nt; i++) { 1076219089Spjd ASSERT(t[i] > t[i - 1]); 1077219089Spjd } 1078219089Spjd 1079219089Spjd nbadparity = rm->rm_firstdatacol; 1080219089Spjd nbaddata = rm->rm_cols - nbadparity; 1081219089Spjd ntgts = 0; 1082219089Spjd for (i = 0, c = 0; c < rm->rm_cols; c++) { 1083219089Spjd if (i < nt && c == t[i]) { 1084219089Spjd tgts[ntgts++] = c; 1085219089Spjd i++; 1086219089Spjd } else if (rm->rm_col[c].rc_error != 0) { 1087219089Spjd tgts[ntgts++] = c; 1088219089Spjd } else if (c >= rm->rm_firstdatacol) { 1089219089Spjd nbaddata--; 1090219089Spjd } else { 1091219089Spjd nbadparity--; 1092219089Spjd } 1093219089Spjd } 1094219089Spjd 1095219089Spjd ASSERT(ntgts >= nt); 1096219089Spjd ASSERT(nbaddata >= 0); 1097219089Spjd ASSERT(nbaddata + nbadparity == ntgts); 1098219089Spjd 1099219089Spjd code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1100219089Spjd ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1101219089Spjd ASSERT(code > 0); 1102219089Spjd return (code); 1103219089Spjd} 1104219089Spjd 1105219089Spjdstatic raidz_map_t * 1106219089Spjdvdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1107219089Spjd uint64_t dcols, uint64_t nparity) 1108219089Spjd{ 1109219089Spjd raidz_map_t *rm; 1110192194Sdfr uint64_t b = offset >> unit_shift; 1111219089Spjd uint64_t s = size >> unit_shift; 1112192194Sdfr uint64_t f = b % dcols; 1113192194Sdfr uint64_t o = (b / dcols) << unit_shift; 1114219089Spjd uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1115192194Sdfr 1116192194Sdfr q = s / (dcols - nparity); 1117192194Sdfr r = s - q * (dcols - nparity); 1118192194Sdfr bc = (r == 0 ? 0 : r + nparity); 1119219089Spjd tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1120192194Sdfr 1121219089Spjd if (q == 0) { 1122219089Spjd acols = bc; 1123219089Spjd scols = MIN(dcols, roundup(bc, nparity + 1)); 1124219089Spjd } else { 1125219089Spjd acols = dcols; 1126219089Spjd scols = dcols; 1127219089Spjd } 1128219089Spjd 1129219089Spjd ASSERT3U(acols, <=, scols); 1130219089Spjd 1131219089Spjd rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1132219089Spjd 1133219089Spjd rm->rm_cols = acols; 1134219089Spjd rm->rm_scols = scols; 1135219089Spjd rm->rm_bigcols = bc; 1136219089Spjd rm->rm_skipstart = bc; 1137219089Spjd rm->rm_missingdata = 0; 1138219089Spjd rm->rm_missingparity = 0; 1139219089Spjd rm->rm_firstdatacol = nparity; 1140219089Spjd rm->rm_reports = 0; 1141219089Spjd rm->rm_freed = 0; 1142219089Spjd rm->rm_ecksuminjected = 0; 1143219089Spjd 1144192194Sdfr asize = 0; 1145219089Spjd 1146219089Spjd for (c = 0; c < scols; c++) { 1147192194Sdfr col = f + c; 1148192194Sdfr coff = o; 1149192194Sdfr if (col >= dcols) { 1150192194Sdfr col -= dcols; 1151192194Sdfr coff += 1ULL << unit_shift; 1152192194Sdfr } 1153219089Spjd rm->rm_col[c].rc_devidx = col; 1154219089Spjd rm->rm_col[c].rc_offset = coff; 1155219089Spjd rm->rm_col[c].rc_data = NULL; 1156219089Spjd rm->rm_col[c].rc_error = 0; 1157219089Spjd rm->rm_col[c].rc_tried = 0; 1158219089Spjd rm->rm_col[c].rc_skipped = 0; 1159192194Sdfr 1160219089Spjd if (c >= acols) 1161219089Spjd rm->rm_col[c].rc_size = 0; 1162219089Spjd else if (c < bc) 1163219089Spjd rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1164219089Spjd else 1165219089Spjd rm->rm_col[c].rc_size = q << unit_shift; 1166192194Sdfr 1167219089Spjd asize += rm->rm_col[c].rc_size; 1168192194Sdfr } 1169192194Sdfr 1170219089Spjd ASSERT3U(asize, ==, tot << unit_shift); 1171219089Spjd rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1172219089Spjd rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1173219089Spjd ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1174219089Spjd ASSERT3U(rm->rm_nskip, <=, nparity); 1175192194Sdfr 1176219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) 1177219089Spjd rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1178219089Spjd 1179219089Spjd rm->rm_col[c].rc_data = data; 1180219089Spjd 1181192194Sdfr for (c = c + 1; c < acols; c++) 1182219089Spjd rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1183219089Spjd rm->rm_col[c - 1].rc_size; 1184192194Sdfr 1185192194Sdfr /* 1186219089Spjd * If all data stored spans all columns, there's a danger that parity 1187219089Spjd * will always be on the same device and, since parity isn't read 1188219089Spjd * during normal operation, that that device's I/O bandwidth won't be 1189219089Spjd * used effectively. We therefore switch the parity every 1MB. 1190192194Sdfr * 1191219089Spjd * ... at least that was, ostensibly, the theory. As a practical 1192219089Spjd * matter unless we juggle the parity between all devices evenly, we 1193219089Spjd * won't see any benefit. Further, occasional writes that aren't a 1194219089Spjd * multiple of the LCM of the number of children and the minimum 1195219089Spjd * stripe width are sufficient to avoid pessimal behavior. 1196219089Spjd * Unfortunately, this decision created an implicit on-disk format 1197219089Spjd * requirement that we need to support for all eternity, but only 1198219089Spjd * for single-parity RAID-Z. 1199219089Spjd * 1200219089Spjd * If we intend to skip a sector in the zeroth column for padding 1201219089Spjd * we must make sure to note this swap. We will never intend to 1202219089Spjd * skip the first column since at least one data and one parity 1203219089Spjd * column must appear in each row. 1204192194Sdfr */ 1205219089Spjd ASSERT(rm->rm_cols >= 2); 1206219089Spjd ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1207192194Sdfr 1208219089Spjd if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1209219089Spjd devidx = rm->rm_col[0].rc_devidx; 1210219089Spjd o = rm->rm_col[0].rc_offset; 1211219089Spjd rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1212219089Spjd rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1213219089Spjd rm->rm_col[1].rc_devidx = devidx; 1214219089Spjd rm->rm_col[1].rc_offset = o; 1215219089Spjd 1216219089Spjd if (rm->rm_skipstart == 0) 1217219089Spjd rm->rm_skipstart = 1; 1218192194Sdfr } 1219192194Sdfr 1220219089Spjd return (rm); 1221219089Spjd} 1222219089Spjd 1223219089Spjdstatic void 1224219089Spjdvdev_raidz_map_free(raidz_map_t *rm) 1225219089Spjd{ 1226219089Spjd int c; 1227219089Spjd 1228219089Spjd for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1229219089Spjd zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1230219089Spjd 1231219089Spjd zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1232219089Spjd} 1233219089Spjd 1234219089Spjdstatic vdev_t * 1235219089Spjdvdev_child(vdev_t *pvd, uint64_t devidx) 1236219089Spjd{ 1237219089Spjd vdev_t *cvd; 1238219089Spjd 1239219089Spjd STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1240219089Spjd if (cvd->v_id == devidx) 1241219089Spjd break; 1242219089Spjd } 1243219089Spjd 1244219089Spjd return (cvd); 1245219089Spjd} 1246219089Spjd 1247219089Spjd/* 1248219089Spjd * We keep track of whether or not there were any injected errors, so that 1249219089Spjd * any ereports we generate can note it. 1250219089Spjd */ 1251219089Spjdstatic int 1252226553Spjdraidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) 1253219089Spjd{ 1254219089Spjd 1255226568Spjd return (zio_checksum_verify(bp, data)); 1256219089Spjd} 1257219089Spjd 1258219089Spjd/* 1259219089Spjd * Generate the parity from the data columns. If we tried and were able to 1260219089Spjd * read the parity without error, verify that the generated parity matches the 1261219089Spjd * data we read. If it doesn't, we fire off a checksum error. Return the 1262219089Spjd * number such failures. 1263219089Spjd */ 1264219089Spjdstatic int 1265219089Spjdraidz_parity_verify(raidz_map_t *rm) 1266219089Spjd{ 1267219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1268219089Spjd int c, ret = 0; 1269219089Spjd raidz_col_t *rc; 1270219089Spjd 1271219089Spjd for (c = 0; c < rm->rm_firstdatacol; c++) { 1272219089Spjd rc = &rm->rm_col[c]; 1273219089Spjd if (!rc->rc_tried || rc->rc_error != 0) 1274219089Spjd continue; 1275219089Spjd orig[c] = zfs_alloc(rc->rc_size); 1276219089Spjd bcopy(rc->rc_data, orig[c], rc->rc_size); 1277219089Spjd } 1278219089Spjd 1279219089Spjd vdev_raidz_generate_parity(rm); 1280219089Spjd 1281219089Spjd for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1282219089Spjd rc = &rm->rm_col[c]; 1283219089Spjd if (!rc->rc_tried || rc->rc_error != 0) 1284219089Spjd continue; 1285219089Spjd if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1286219089Spjd rc->rc_error = ECKSUM; 1287219089Spjd ret++; 1288219089Spjd } 1289219089Spjd zfs_free(orig[c], rc->rc_size); 1290219089Spjd } 1291219089Spjd 1292219089Spjd return (ret); 1293219089Spjd} 1294219089Spjd 1295219089Spjd/* 1296219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction. 1297219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into 1298219089Spjd * account how reconstruction is actually performed. For example, with 1299219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1300219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both 1301219089Spjd * cases we'd only use parity information in column 0. 1302219089Spjd */ 1303219089Spjdstatic int 1304219089Spjdvdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, 1305226553Spjd off_t offset, uint64_t bytes, int total_errors, int data_errors) 1306219089Spjd{ 1307219089Spjd raidz_col_t *rc; 1308219089Spjd void *orig[VDEV_RAIDZ_MAXPARITY]; 1309219089Spjd int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1310219089Spjd int *tgts = &tstore[1]; 1311219089Spjd int current, next, i, c, n; 1312219089Spjd int code, ret = 0; 1313219089Spjd 1314219089Spjd ASSERT(total_errors < rm->rm_firstdatacol); 1315219089Spjd 1316192194Sdfr /* 1317219089Spjd * This simplifies one edge condition. 1318192194Sdfr */ 1319219089Spjd tgts[-1] = -1; 1320219089Spjd 1321219089Spjd for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1322219089Spjd /* 1323219089Spjd * Initialize the targets array by finding the first n columns 1324219089Spjd * that contain no error. 1325219089Spjd * 1326219089Spjd * If there were no data errors, we need to ensure that we're 1327219089Spjd * always explicitly attempting to reconstruct at least one 1328219089Spjd * data column. To do this, we simply push the highest target 1329219089Spjd * up into the data columns. 1330219089Spjd */ 1331219089Spjd for (c = 0, i = 0; i < n; i++) { 1332219089Spjd if (i == n - 1 && data_errors == 0 && 1333219089Spjd c < rm->rm_firstdatacol) { 1334219089Spjd c = rm->rm_firstdatacol; 1335219089Spjd } 1336219089Spjd 1337219089Spjd while (rm->rm_col[c].rc_error != 0) { 1338219089Spjd c++; 1339219089Spjd ASSERT3S(c, <, rm->rm_cols); 1340219089Spjd } 1341219089Spjd 1342219089Spjd tgts[i] = c++; 1343219089Spjd } 1344219089Spjd 1345219089Spjd /* 1346219089Spjd * Setting tgts[n] simplifies the other edge condition. 1347219089Spjd */ 1348219089Spjd tgts[n] = rm->rm_cols; 1349219089Spjd 1350219089Spjd /* 1351219089Spjd * These buffers were allocated in previous iterations. 1352219089Spjd */ 1353219089Spjd for (i = 0; i < n - 1; i++) { 1354219089Spjd ASSERT(orig[i] != NULL); 1355219089Spjd } 1356219089Spjd 1357219089Spjd orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); 1358219089Spjd 1359219089Spjd current = 0; 1360219089Spjd next = tgts[current]; 1361219089Spjd 1362219089Spjd while (current != n) { 1363219089Spjd tgts[current] = next; 1364219089Spjd current = 0; 1365219089Spjd 1366219089Spjd /* 1367219089Spjd * Save off the original data that we're going to 1368219089Spjd * attempt to reconstruct. 1369219089Spjd */ 1370219089Spjd for (i = 0; i < n; i++) { 1371219089Spjd ASSERT(orig[i] != NULL); 1372219089Spjd c = tgts[i]; 1373219089Spjd ASSERT3S(c, >=, 0); 1374219089Spjd ASSERT3S(c, <, rm->rm_cols); 1375219089Spjd rc = &rm->rm_col[c]; 1376219089Spjd bcopy(rc->rc_data, orig[i], rc->rc_size); 1377219089Spjd } 1378219089Spjd 1379219089Spjd /* 1380219089Spjd * Attempt a reconstruction and exit the outer loop on 1381219089Spjd * success. 1382219089Spjd */ 1383219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 1384226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1385219089Spjd for (i = 0; i < n; i++) { 1386219089Spjd c = tgts[i]; 1387219089Spjd rc = &rm->rm_col[c]; 1388219089Spjd ASSERT(rc->rc_error == 0); 1389219089Spjd rc->rc_error = ECKSUM; 1390219089Spjd } 1391219089Spjd 1392219089Spjd ret = code; 1393219089Spjd goto done; 1394219089Spjd } 1395219089Spjd 1396219089Spjd /* 1397219089Spjd * Restore the original data. 1398219089Spjd */ 1399219089Spjd for (i = 0; i < n; i++) { 1400219089Spjd c = tgts[i]; 1401219089Spjd rc = &rm->rm_col[c]; 1402219089Spjd bcopy(orig[i], rc->rc_data, rc->rc_size); 1403219089Spjd } 1404219089Spjd 1405219089Spjd do { 1406219089Spjd /* 1407219089Spjd * Find the next valid column after the current 1408219089Spjd * position.. 1409219089Spjd */ 1410219089Spjd for (next = tgts[current] + 1; 1411219089Spjd next < rm->rm_cols && 1412219089Spjd rm->rm_col[next].rc_error != 0; next++) 1413219089Spjd continue; 1414219089Spjd 1415219089Spjd ASSERT(next <= tgts[current + 1]); 1416219089Spjd 1417219089Spjd /* 1418219089Spjd * If that spot is available, we're done here. 1419219089Spjd */ 1420219089Spjd if (next != tgts[current + 1]) 1421219089Spjd break; 1422219089Spjd 1423219089Spjd /* 1424219089Spjd * Otherwise, find the next valid column after 1425219089Spjd * the previous position. 1426219089Spjd */ 1427219089Spjd for (c = tgts[current - 1] + 1; 1428219089Spjd rm->rm_col[c].rc_error != 0; c++) 1429219089Spjd continue; 1430219089Spjd 1431219089Spjd tgts[current] = c; 1432219089Spjd current++; 1433219089Spjd 1434219089Spjd } while (current != n); 1435219089Spjd } 1436219089Spjd } 1437219089Spjd n--; 1438219089Spjddone: 1439219089Spjd for (i = n - 1; i >= 0; i--) { 1440219089Spjd zfs_free(orig[i], rm->rm_col[0].rc_size); 1441219089Spjd } 1442219089Spjd 1443219089Spjd return (ret); 1444219089Spjd} 1445219089Spjd 1446219089Spjdstatic int 1447219089Spjdvdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1448219089Spjd off_t offset, size_t bytes) 1449219089Spjd{ 1450219089Spjd vdev_t *tvd = vd->v_top; 1451219089Spjd vdev_t *cvd; 1452219089Spjd raidz_map_t *rm; 1453219089Spjd raidz_col_t *rc; 1454219089Spjd int c, error; 1455219089Spjd int unexpected_errors; 1456219089Spjd int parity_errors; 1457219089Spjd int parity_untried; 1458219089Spjd int data_errors; 1459219089Spjd int total_errors; 1460219089Spjd int n; 1461219089Spjd int tgts[VDEV_RAIDZ_MAXPARITY]; 1462219089Spjd int code; 1463219089Spjd 1464219089Spjd rc = NULL; /* gcc */ 1465219089Spjd error = 0; 1466219089Spjd 1467219089Spjd rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1468219089Spjd vd->v_nchildren, vd->v_nparity); 1469219089Spjd 1470219089Spjd /* 1471219089Spjd * Iterate over the columns in reverse order so that we hit the parity 1472219089Spjd * last -- any errors along the way will force us to read the parity. 1473219089Spjd */ 1474219089Spjd for (c = rm->rm_cols - 1; c >= 0; c--) { 1475219089Spjd rc = &rm->rm_col[c]; 1476219089Spjd cvd = vdev_child(vd, rc->rc_devidx); 1477219089Spjd if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1478219089Spjd if (c >= rm->rm_firstdatacol) 1479219089Spjd rm->rm_missingdata++; 1480192194Sdfr else 1481219089Spjd rm->rm_missingparity++; 1482192194Sdfr rc->rc_error = ENXIO; 1483192194Sdfr rc->rc_tried = 1; /* don't even try */ 1484192194Sdfr rc->rc_skipped = 1; 1485192194Sdfr continue; 1486192194Sdfr } 1487219089Spjd#if 0 /* XXX: Too hard for the boot code. */ 1488219089Spjd if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1489219089Spjd if (c >= rm->rm_firstdatacol) 1490192194Sdfr rm->rm_missingdata++; 1491192194Sdfr else 1492192194Sdfr rm->rm_missingparity++; 1493192194Sdfr rc->rc_error = ESTALE; 1494192194Sdfr rc->rc_skipped = 1; 1495192194Sdfr continue; 1496192194Sdfr } 1497192194Sdfr#endif 1498219089Spjd if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1499219089Spjd rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1500219089Spjd rc->rc_offset, rc->rc_size); 1501192194Sdfr rc->rc_tried = 1; 1502192194Sdfr rc->rc_skipped = 0; 1503192194Sdfr } 1504192194Sdfr } 1505192194Sdfr 1506192194Sdfrreconstruct: 1507219089Spjd unexpected_errors = 0; 1508192194Sdfr parity_errors = 0; 1509219089Spjd parity_untried = 0; 1510192194Sdfr data_errors = 0; 1511192194Sdfr total_errors = 0; 1512192194Sdfr 1513219089Spjd ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1514219089Spjd ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1515219089Spjd 1516219089Spjd for (c = 0; c < rm->rm_cols; c++) { 1517219089Spjd rc = &rm->rm_col[c]; 1518219089Spjd 1519192194Sdfr if (rc->rc_error) { 1520219089Spjd ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1521219089Spjd 1522219089Spjd if (c < rm->rm_firstdatacol) 1523192194Sdfr parity_errors++; 1524192194Sdfr else 1525192194Sdfr data_errors++; 1526192194Sdfr 1527192194Sdfr if (!rc->rc_skipped) 1528192194Sdfr unexpected_errors++; 1529192194Sdfr 1530192194Sdfr total_errors++; 1531219089Spjd } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1532192194Sdfr parity_untried++; 1533192194Sdfr } 1534192194Sdfr } 1535192194Sdfr 1536192194Sdfr /* 1537192194Sdfr * There are three potential phases for a read: 1538192194Sdfr * 1. produce valid data from the columns read 1539192194Sdfr * 2. read all disks and try again 1540192194Sdfr * 3. perform combinatorial reconstruction 1541192194Sdfr * 1542219089Spjd * Each phase is progressively both more expensive and less likely to 1543219089Spjd * occur. If we encounter more errors than we can repair or all phases 1544219089Spjd * fail, we have no choice but to return an error. 1545192194Sdfr */ 1546192194Sdfr 1547192194Sdfr /* 1548219089Spjd * If the number of errors we saw was correctable -- less than or equal 1549219089Spjd * to the number of parity disks read -- attempt to produce data that 1550219089Spjd * has a valid checksum. Naturally, this case applies in the absence of 1551219089Spjd * any errors. 1552192194Sdfr */ 1553219089Spjd if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1554219089Spjd if (data_errors == 0) { 1555226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1556219089Spjd /* 1557219089Spjd * If we read parity information (unnecessarily 1558219089Spjd * as it happens since no reconstruction was 1559219089Spjd * needed) regenerate and verify the parity. 1560219089Spjd * We also regenerate parity when resilvering 1561219089Spjd * so we can write it out to the failed device 1562219089Spjd * later. 1563219089Spjd */ 1564219089Spjd if (parity_errors + parity_untried < 1565219089Spjd rm->rm_firstdatacol) { 1566219089Spjd n = raidz_parity_verify(rm); 1567219089Spjd unexpected_errors += n; 1568219089Spjd ASSERT(parity_errors + n <= 1569219089Spjd rm->rm_firstdatacol); 1570219089Spjd } 1571219089Spjd goto done; 1572219089Spjd } 1573219089Spjd } else { 1574192194Sdfr /* 1575192194Sdfr * We either attempt to read all the parity columns or 1576192194Sdfr * none of them. If we didn't try to read parity, we 1577192194Sdfr * wouldn't be here in the correctable case. There must 1578192194Sdfr * also have been fewer parity errors than parity 1579192194Sdfr * columns or, again, we wouldn't be in this code path. 1580192194Sdfr */ 1581219089Spjd ASSERT(parity_untried == 0); 1582219089Spjd ASSERT(parity_errors < rm->rm_firstdatacol); 1583192194Sdfr 1584192194Sdfr /* 1585219089Spjd * Identify the data columns that reported an error. 1586192194Sdfr */ 1587219089Spjd n = 0; 1588219089Spjd for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1589219089Spjd rc = &rm->rm_col[c]; 1590219089Spjd if (rc->rc_error != 0) { 1591219089Spjd ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1592219089Spjd tgts[n++] = c; 1593219089Spjd } 1594192194Sdfr } 1595192194Sdfr 1596219089Spjd ASSERT(rm->rm_firstdatacol >= n); 1597192194Sdfr 1598219089Spjd code = vdev_raidz_reconstruct(rm, tgts, n); 1599192194Sdfr 1600226553Spjd if (raidz_checksum_verify(bp, data, bytes) == 0) { 1601219089Spjd /* 1602219089Spjd * If we read more parity disks than were used 1603219089Spjd * for reconstruction, confirm that the other 1604219089Spjd * parity disks produced correct data. This 1605219089Spjd * routine is suboptimal in that it regenerates 1606219089Spjd * the parity that we already used in addition 1607219089Spjd * to the parity that we're attempting to 1608219089Spjd * verify, but this should be a relatively 1609219089Spjd * uncommon case, and can be optimized if it 1610219089Spjd * becomes a problem. Note that we regenerate 1611219089Spjd * parity when resilvering so we can write it 1612219089Spjd * out to failed devices later. 1613219089Spjd */ 1614219089Spjd if (parity_errors < rm->rm_firstdatacol - n) { 1615219089Spjd n = raidz_parity_verify(rm); 1616219089Spjd unexpected_errors += n; 1617219089Spjd ASSERT(parity_errors + n <= 1618219089Spjd rm->rm_firstdatacol); 1619219089Spjd } 1620192194Sdfr 1621219089Spjd goto done; 1622192194Sdfr } 1623192194Sdfr } 1624192194Sdfr } 1625192194Sdfr 1626192194Sdfr /* 1627192194Sdfr * This isn't a typical situation -- either we got a read 1628192194Sdfr * error or a child silently returned bad data. Read every 1629192194Sdfr * block so we can try again with as much data and parity as 1630192194Sdfr * we can track down. If we've already been through once 1631192194Sdfr * before, all children will be marked as tried so we'll 1632192194Sdfr * proceed to combinatorial reconstruction. 1633192194Sdfr */ 1634219089Spjd unexpected_errors = 1; 1635219089Spjd rm->rm_missingdata = 0; 1636219089Spjd rm->rm_missingparity = 0; 1637219089Spjd 1638192194Sdfr n = 0; 1639219089Spjd for (c = 0; c < rm->rm_cols; c++) { 1640226550Spjd rc = &rm->rm_col[c]; 1641226550Spjd 1642226550Spjd if (rc->rc_tried) 1643192194Sdfr continue; 1644192194Sdfr 1645219089Spjd cvd = vdev_child(vd, rc->rc_devidx); 1646219089Spjd ASSERT(cvd != NULL); 1647219089Spjd rc->rc_error = cvd->v_read(cvd, NULL, 1648219089Spjd rc->rc_data, rc->rc_offset, rc->rc_size); 1649192194Sdfr if (rc->rc_error == 0) 1650192194Sdfr n++; 1651192194Sdfr rc->rc_tried = 1; 1652192194Sdfr rc->rc_skipped = 0; 1653192194Sdfr } 1654192194Sdfr /* 1655192194Sdfr * If we managed to read anything more, retry the 1656192194Sdfr * reconstruction. 1657192194Sdfr */ 1658219089Spjd if (n > 0) 1659192194Sdfr goto reconstruct; 1660192194Sdfr 1661192194Sdfr /* 1662192194Sdfr * At this point we've attempted to reconstruct the data given the 1663192194Sdfr * errors we detected, and we've attempted to read all columns. There 1664192194Sdfr * must, therefore, be one or more additional problems -- silent errors 1665192194Sdfr * resulting in invalid data rather than explicit I/O errors resulting 1666219089Spjd * in absent data. We check if there is enough additional data to 1667219089Spjd * possibly reconstruct the data and then perform combinatorial 1668219089Spjd * reconstruction over all possible combinations. If that fails, 1669219089Spjd * we're cooked. 1670192194Sdfr */ 1671219089Spjd if (total_errors > rm->rm_firstdatacol) { 1672219089Spjd error = EIO; 1673219089Spjd } else if (total_errors < rm->rm_firstdatacol && 1674226553Spjd (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, 1675226553Spjd total_errors, data_errors)) != 0) { 1676192194Sdfr /* 1677219089Spjd * If we didn't use all the available parity for the 1678219089Spjd * combinatorial reconstruction, verify that the remaining 1679219089Spjd * parity is correct. 1680192194Sdfr */ 1681219089Spjd if (code != (1 << rm->rm_firstdatacol) - 1) 1682219089Spjd (void) raidz_parity_verify(rm); 1683219089Spjd } else { 1684192194Sdfr /* 1685219089Spjd * We're here because either: 1686219089Spjd * 1687219089Spjd * total_errors == rm_first_datacol, or 1688219089Spjd * vdev_raidz_combrec() failed 1689219089Spjd * 1690219089Spjd * In either case, there is enough bad data to prevent 1691219089Spjd * reconstruction. 1692219089Spjd * 1693219089Spjd * Start checksum ereports for all children which haven't 1694219089Spjd * failed, and the IO wasn't speculative. 1695192194Sdfr */ 1696219089Spjd error = ECKSUM; 1697192194Sdfr } 1698192194Sdfr 1699219089Spjddone: 1700219089Spjd vdev_raidz_map_free(rm); 1701192194Sdfr 1702219089Spjd return (error); 1703192194Sdfr} 1704