1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26230514Smm/* 27230514Smm * Copyright (c) 2012 by Delphix. All rights reserved. 28230514Smm */ 29230514Smm 30168404Spjd#include <sys/spa.h> 31168404Spjd#include <sys/spa_impl.h> 32168404Spjd#include <sys/vdev.h> 33168404Spjd#include <sys/vdev_impl.h> 34168404Spjd#include <sys/zio.h> 35219089Spjd#include <sys/zio_checksum.h> 36168404Spjd 37168404Spjd#include <sys/fm/fs/zfs.h> 38168404Spjd#include <sys/fm/protocol.h> 39168404Spjd#include <sys/fm/util.h> 40219089Spjd#include <sys/sysevent.h> 41168404Spjd 42168404Spjd/* 43168404Spjd * This general routine is responsible for generating all the different ZFS 44168404Spjd * ereports. 
The payload is dependent on the class, and which arguments are 45168404Spjd * supplied to the function: 46168404Spjd * 47168404Spjd * EREPORT POOL VDEV IO 48168404Spjd * block X X X 49168404Spjd * data X X 50168404Spjd * device X X 51168404Spjd * pool X 52168404Spjd * 53168404Spjd * If we are in a loading state, all errors are chained together by the same 54185029Spjd * SPA-wide ENA (Error Numeric Association). 55168404Spjd * 56168404Spjd * For isolated I/O requests, we get the ENA from the zio_t. The propagation 57168404Spjd * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want 58168404Spjd * to chain together all ereports associated with a logical piece of data. For 59168404Spjd * read I/Os, there are basically three 'types' of I/O, which form a roughly 60168404Spjd * layered diagram: 61168404Spjd * 62168404Spjd * +---------------+ 63168404Spjd * | Aggregate I/O | No associated logical data or device 64168404Spjd * +---------------+ 65168404Spjd * | 66168404Spjd * V 67168404Spjd * +---------------+ Reads associated with a piece of logical data. 68168404Spjd * | Read I/O | This includes reads on behalf of RAID-Z, 69168404Spjd * +---------------+ mirrors, gang blocks, retries, etc. 70168404Spjd * | 71168404Spjd * V 72168404Spjd * +---------------+ Reads associated with a particular device, but 73168404Spjd * | Physical I/O | no logical data. Issued as part of vdev caching 74168404Spjd * +---------------+ and I/O aggregation. 75168404Spjd * 76168404Spjd * Note that 'physical I/O' here is not the same terminology as used in the rest 77168404Spjd * of ZIO. Typically, 'physical I/O' simply means that there is no attached 78168404Spjd * blockpointer. But I/O with no associated block pointer can still be related 79168404Spjd * to a logical piece of data (i.e. RAID-Z requests). 80168404Spjd * 81168404Spjd * Purely physical I/O always have unique ENAs. 
They are not related to a 82168404Spjd * particular piece of logical data, and therefore cannot be chained together. 83168404Spjd * We still generate an ereport, but the DE doesn't correlate it with any 84168404Spjd * logical piece of data. When such an I/O fails, the delegated I/O requests 85168404Spjd * will issue a retry, which will trigger the 'real' ereport with the correct 86168404Spjd * ENA. 87168404Spjd * 88168404Spjd * We keep track of the ENA for a ZIO chain through the 'io_logical' member. 89168404Spjd * When a new logical I/O is issued, we set this to point to itself. Child I/Os 90168404Spjd * then inherit this pointer, so that when it is first set subsequent failures 91185029Spjd * will use the same ENA. For vdev cache fill and queue aggregation I/O, 92185029Spjd * this pointer is set to NULL, and no ereport will be generated (since it 93185029Spjd * doesn't actually correspond to any particular device or piece of data, 94185029Spjd * and the caller will always retry without caching or queueing anyway). 95219089Spjd * 96219089Spjd * For checksum errors, we want to include more information about the actual 97219089Spjd * error which occurs. Accordingly, we build an ereport when the error is 98219089Spjd * noticed, but instead of sending it in immediately, we hang it off of the 99219089Spjd * io_cksum_report field of the logical IO. When the logical IO completes 100219089Spjd * (successfully or not), zfs_ereport_finish_checksum() is called with the 101219089Spjd * good and bad versions of the buffer (if available), and we annotate the 102219089Spjd * ereport with information about the differences. 
103168404Spjd */ 104219089Spjd#ifdef _KERNEL 105219089Spjdstatic void 106219089Spjdzfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, 107219089Spjd const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, 108168404Spjd uint64_t stateoroffset, uint64_t size) 109168404Spjd{ 110219089Spjd nvlist_t *ereport, *detector; 111168404Spjd 112219089Spjd uint64_t ena; 113219089Spjd char class[64]; 114219089Spjd 115168404Spjd /* 116219089Spjd * If we are doing a spa_tryimport() or in recovery mode, 117219089Spjd * ignore errors. 118168404Spjd */ 119219089Spjd if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || 120219089Spjd spa_load_state(spa) == SPA_LOAD_RECOVER) 121168404Spjd return; 122168404Spjd 123168404Spjd /* 124168404Spjd * If we are in the middle of opening a pool, and the previous attempt 125168404Spjd * failed, don't bother logging any new ereports - we're just going to 126168404Spjd * get the same diagnosis anyway. 127168404Spjd */ 128219089Spjd if (spa_load_state(spa) != SPA_LOAD_NONE && 129168404Spjd spa->spa_last_open_failed) 130168404Spjd return; 131168404Spjd 132185029Spjd if (zio != NULL) { 133185029Spjd /* 134185029Spjd * If this is not a read or write zio, ignore the error. This 135185029Spjd * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 136185029Spjd */ 137185029Spjd if (zio->io_type != ZIO_TYPE_READ && 138185029Spjd zio->io_type != ZIO_TYPE_WRITE) 139185029Spjd return; 140168404Spjd 141185029Spjd /* 142185029Spjd * Ignore any errors from speculative I/Os, as failure is an 143185029Spjd * expected result. 144185029Spjd */ 145185029Spjd if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 146185029Spjd return; 147168404Spjd 148213198Smm /* 149213198Smm * If this I/O is not a retry I/O, don't post an ereport. 150213198Smm * Otherwise, we risk making bad diagnoses based on B_FAILFAST 151213198Smm * I/Os. 
152213198Smm */ 153213198Smm if (zio->io_error == EIO && 154213198Smm !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 155213198Smm return; 156213198Smm 157209962Smm if (vd != NULL) { 158209962Smm /* 159209962Smm * If the vdev has already been marked as failing due 160209962Smm * to a failed probe, then ignore any subsequent I/O 161209962Smm * errors, as the DE will automatically fault the vdev 162209962Smm * on the first such failure. This also catches cases 163209962Smm * where vdev_remove_wanted is set and the device has 164209962Smm * not yet been asynchronously placed into the REMOVED 165209962Smm * state. 166209962Smm */ 167219089Spjd if (zio->io_vd == vd && !vdev_accessible(vd, zio)) 168209962Smm return; 169209962Smm 170209962Smm /* 171209962Smm * Ignore checksum errors for reads from DTL regions of 172209962Smm * leaf vdevs. 173209962Smm */ 174209962Smm if (zio->io_type == ZIO_TYPE_READ && 175209962Smm zio->io_error == ECKSUM && 176209962Smm vd->vdev_ops->vdev_op_leaf && 177209962Smm vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) 178209962Smm return; 179209962Smm } 180185029Spjd } 181168404Spjd 182219089Spjd /* 183219089Spjd * For probe failure, we want to avoid posting ereports if we've 184219089Spjd * already removed the device in the meantime. 185219089Spjd */ 186219089Spjd if (vd != NULL && 187219089Spjd strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && 188219089Spjd (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) 189219089Spjd return; 190168404Spjd 191219089Spjd if ((ereport = fm_nvlist_create(NULL)) == NULL) 192219089Spjd return; 193219089Spjd 194219089Spjd if ((detector = fm_nvlist_create(NULL)) == NULL) { 195219089Spjd fm_nvlist_destroy(ereport, FM_NVA_FREE); 196219089Spjd return; 197219089Spjd } 198219089Spjd 199168404Spjd /* 200168404Spjd * Serialize ereport generation 201168404Spjd */ 202168404Spjd mutex_enter(&spa->spa_errlist_lock); 203168404Spjd 204168404Spjd /* 205168404Spjd * Determine the ENA to use for this event. 
If we are in a loading 206168404Spjd * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use 207168404Spjd * a root zio-wide ENA. Otherwise, simply use a unique ENA. 208168404Spjd */ 209219089Spjd if (spa_load_state(spa) != SPA_LOAD_NONE) { 210168404Spjd if (spa->spa_ena == 0) 211168404Spjd spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); 212168404Spjd ena = spa->spa_ena; 213168404Spjd } else if (zio != NULL && zio->io_logical != NULL) { 214168404Spjd if (zio->io_logical->io_ena == 0) 215168404Spjd zio->io_logical->io_ena = 216168404Spjd fm_ena_generate(0, FM_ENA_FMT1); 217168404Spjd ena = zio->io_logical->io_ena; 218168404Spjd } else { 219168404Spjd ena = fm_ena_generate(0, FM_ENA_FMT1); 220168404Spjd } 221168404Spjd 222168404Spjd /* 223168404Spjd * Construct the full class, detector, and other standard FMA fields. 224168404Spjd */ 225219089Spjd (void) snprintf(class, sizeof (class), "%s.%s", 226219089Spjd ZFS_ERROR_CLASS, subclass); 227168404Spjd 228219089Spjd fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), 229219089Spjd vd != NULL ? vd->vdev_guid : 0); 230168404Spjd 231219089Spjd fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); 232219089Spjd 233168404Spjd /* 234168404Spjd * Construct the per-ereport payload, depending on which parameters are 235168404Spjd * passed in. 236168404Spjd */ 237168404Spjd 238168404Spjd /* 239168404Spjd * Generic payload members common to all ereports. 240168404Spjd */ 241219089Spjd fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, 242219089Spjd DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 243219089Spjd DATA_TYPE_UINT64, spa_guid(spa), 244219089Spjd FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, 245219089Spjd spa_load_state(spa), NULL); 246168404Spjd 247185029Spjd if (spa != NULL) { 248219089Spjd fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, 249219089Spjd DATA_TYPE_STRING, 250185029Spjd spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? 
251185029Spjd FM_EREPORT_FAILMODE_WAIT : 252185029Spjd spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 253219089Spjd FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, 254219089Spjd NULL); 255185029Spjd } 256185029Spjd 257168404Spjd if (vd != NULL) { 258168404Spjd vdev_t *pvd = vd->vdev_parent; 259168404Spjd 260219089Spjd fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 261219089Spjd DATA_TYPE_UINT64, vd->vdev_guid, 262219089Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 263219089Spjd DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); 264209962Smm if (vd->vdev_path != NULL) 265219089Spjd fm_payload_set(ereport, 266219089Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, 267219089Spjd DATA_TYPE_STRING, vd->vdev_path, NULL); 268209962Smm if (vd->vdev_devid != NULL) 269219089Spjd fm_payload_set(ereport, 270219089Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, 271219089Spjd DATA_TYPE_STRING, vd->vdev_devid, NULL); 272209962Smm if (vd->vdev_fru != NULL) 273219089Spjd fm_payload_set(ereport, 274219089Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, 275219089Spjd DATA_TYPE_STRING, vd->vdev_fru, NULL); 276168404Spjd 277168404Spjd if (pvd != NULL) { 278219089Spjd fm_payload_set(ereport, 279219089Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, 280219089Spjd DATA_TYPE_UINT64, pvd->vdev_guid, 281168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, 282219089Spjd DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, 283219089Spjd NULL); 284168404Spjd if (pvd->vdev_path) 285219089Spjd fm_payload_set(ereport, 286168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, 287219089Spjd DATA_TYPE_STRING, pvd->vdev_path, NULL); 288168404Spjd if (pvd->vdev_devid) 289219089Spjd fm_payload_set(ereport, 290168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, 291219089Spjd DATA_TYPE_STRING, pvd->vdev_devid, NULL); 292168404Spjd } 293168404Spjd } 294168404Spjd 295168404Spjd if (zio != NULL) { 296168404Spjd /* 297168404Spjd * Payload common to all I/Os. 
298168404Spjd */ 299219089Spjd fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, 300219089Spjd DATA_TYPE_INT32, zio->io_error, NULL); 301168404Spjd 302168404Spjd /* 303168404Spjd * If the 'size' parameter is non-zero, it indicates this is a 304168404Spjd * RAID-Z or other I/O where the physical offset and length are 305168404Spjd * provided for us, instead of within the zio_t. 306168404Spjd */ 307168404Spjd if (vd != NULL) { 308219089Spjd if (size) 309219089Spjd fm_payload_set(ereport, 310168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 311219089Spjd DATA_TYPE_UINT64, stateoroffset, 312219089Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 313219089Spjd DATA_TYPE_UINT64, size, NULL); 314219089Spjd else 315219089Spjd fm_payload_set(ereport, 316168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 317219089Spjd DATA_TYPE_UINT64, zio->io_offset, 318168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 319219089Spjd DATA_TYPE_UINT64, zio->io_size, NULL); 320168404Spjd } 321168404Spjd 322168404Spjd /* 323168404Spjd * Payload for I/Os with corresponding logical information. 324168404Spjd */ 325219089Spjd if (zio->io_logical != NULL) 326219089Spjd fm_payload_set(ereport, 327219089Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, 328219089Spjd DATA_TYPE_UINT64, 329219089Spjd zio->io_logical->io_bookmark.zb_objset, 330168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, 331219089Spjd DATA_TYPE_UINT64, 332219089Spjd zio->io_logical->io_bookmark.zb_object, 333168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, 334219089Spjd DATA_TYPE_INT64, 335219089Spjd zio->io_logical->io_bookmark.zb_level, 336168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, 337219089Spjd DATA_TYPE_UINT64, 338219089Spjd zio->io_logical->io_bookmark.zb_blkid, NULL); 339168404Spjd } else if (vd != NULL) { 340168404Spjd /* 341168404Spjd * If we have a vdev but no zio, this is a device fault, and the 342168404Spjd * 'stateoroffset' parameter indicates the previous state of the 343168404Spjd * vdev. 
344168404Spjd */ 345219089Spjd fm_payload_set(ereport, 346219089Spjd FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, 347219089Spjd DATA_TYPE_UINT64, stateoroffset, NULL); 348168404Spjd } 349219089Spjd 350168404Spjd mutex_exit(&spa->spa_errlist_lock); 351168404Spjd 352219089Spjd *ereport_out = ereport; 353219089Spjd *detector_out = detector; 354219089Spjd} 355219089Spjd 356219089Spjd/* if it's <= 128 bytes, save the corruption directly */ 357219089Spjd#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) 358219089Spjd 359219089Spjd#define MAX_RANGES 16 360219089Spjd 361219089Spjdtypedef struct zfs_ecksum_info { 362219089Spjd /* histograms of set and cleared bits by bit number in a 64-bit word */ 363330588Savg uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; 364330588Savg uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; 365219089Spjd 366219089Spjd /* inline arrays of bits set and cleared. */ 367219089Spjd uint64_t zei_bits_set[ZFM_MAX_INLINE]; 368219089Spjd uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; 369219089Spjd 370219089Spjd /* 371219089Spjd * for each range, the number of bits set and cleared. The Hamming 372219089Spjd * distance between the good and bad buffers is the sum of them all. 
373219089Spjd */ 374219089Spjd uint32_t zei_range_sets[MAX_RANGES]; 375219089Spjd uint32_t zei_range_clears[MAX_RANGES]; 376219089Spjd 377219089Spjd struct zei_ranges { 378219089Spjd uint32_t zr_start; 379219089Spjd uint32_t zr_end; 380219089Spjd } zei_ranges[MAX_RANGES]; 381219089Spjd 382219089Spjd size_t zei_range_count; 383219089Spjd uint32_t zei_mingap; 384219089Spjd uint32_t zei_allowed_mingap; 385219089Spjd 386219089Spjd} zfs_ecksum_info_t; 387219089Spjd 388219089Spjdstatic void 389330588Savgupdate_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) 390219089Spjd{ 391219089Spjd size_t i; 392219089Spjd size_t bits = 0; 393219089Spjd uint64_t value = BE_64(value_arg); 394219089Spjd 395219089Spjd /* We store the bits in big-endian (largest-first) order */ 396219089Spjd for (i = 0; i < 64; i++) { 397219089Spjd if (value & (1ull << i)) { 398219089Spjd hist[63 - i]++; 399219089Spjd ++bits; 400219089Spjd } 401219089Spjd } 402219089Spjd /* update the count of bits changed */ 403219089Spjd *count += bits; 404219089Spjd} 405219089Spjd 406219089Spjd/* 407219089Spjd * We've now filled up the range array, and need to increase "mingap" and 408219089Spjd * shrink the range list accordingly. zei_mingap is always the smallest 409219089Spjd * distance between array entries, so we set the new_allowed_gap to be 410219089Spjd * one greater than that. We then go through the list, joining together 411219089Spjd * any ranges which are closer than the new_allowed_gap. 412219089Spjd * 413219089Spjd * By construction, there will be at least one. We also update zei_mingap 414219089Spjd * to the new smallest gap, to prepare for our next invocation. 
415219089Spjd */ 416219089Spjdstatic void 417219089Spjdshrink_ranges(zfs_ecksum_info_t *eip) 418219089Spjd{ 419219089Spjd uint32_t mingap = UINT32_MAX; 420219089Spjd uint32_t new_allowed_gap = eip->zei_mingap + 1; 421219089Spjd 422219089Spjd size_t idx, output; 423219089Spjd size_t max = eip->zei_range_count; 424219089Spjd 425219089Spjd struct zei_ranges *r = eip->zei_ranges; 426219089Spjd 427219089Spjd ASSERT3U(eip->zei_range_count, >, 0); 428219089Spjd ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); 429219089Spjd 430219089Spjd output = idx = 0; 431219089Spjd while (idx < max - 1) { 432219089Spjd uint32_t start = r[idx].zr_start; 433219089Spjd uint32_t end = r[idx].zr_end; 434219089Spjd 435219089Spjd while (idx < max - 1) { 436219089Spjd idx++; 437219089Spjd 438219089Spjd uint32_t nstart = r[idx].zr_start; 439219089Spjd uint32_t nend = r[idx].zr_end; 440219089Spjd 441219089Spjd uint32_t gap = nstart - end; 442219089Spjd if (gap < new_allowed_gap) { 443219089Spjd end = nend; 444219089Spjd continue; 445219089Spjd } 446219089Spjd if (gap < mingap) 447219089Spjd mingap = gap; 448219089Spjd break; 449219089Spjd } 450219089Spjd r[output].zr_start = start; 451219089Spjd r[output].zr_end = end; 452219089Spjd output++; 453219089Spjd } 454219089Spjd ASSERT3U(output, <, eip->zei_range_count); 455219089Spjd eip->zei_range_count = output; 456219089Spjd eip->zei_mingap = mingap; 457219089Spjd eip->zei_allowed_mingap = new_allowed_gap; 458219089Spjd} 459219089Spjd 460219089Spjdstatic void 461219089Spjdadd_range(zfs_ecksum_info_t *eip, int start, int end) 462219089Spjd{ 463219089Spjd struct zei_ranges *r = eip->zei_ranges; 464219089Spjd size_t count = eip->zei_range_count; 465219089Spjd 466219089Spjd if (count >= MAX_RANGES) { 467219089Spjd shrink_ranges(eip); 468219089Spjd count = eip->zei_range_count; 469219089Spjd } 470219089Spjd if (count == 0) { 471219089Spjd eip->zei_mingap = UINT32_MAX; 472219089Spjd eip->zei_allowed_mingap = 1; 473219089Spjd } else { 474219089Spjd int gap 
= start - r[count - 1].zr_end; 475219089Spjd 476219089Spjd if (gap < eip->zei_allowed_mingap) { 477219089Spjd r[count - 1].zr_end = end; 478219089Spjd return; 479219089Spjd } 480219089Spjd if (gap < eip->zei_mingap) 481219089Spjd eip->zei_mingap = gap; 482219089Spjd } 483219089Spjd r[count].zr_start = start; 484219089Spjd r[count].zr_end = end; 485219089Spjd eip->zei_range_count++; 486219089Spjd} 487219089Spjd 488219089Spjdstatic size_t 489219089Spjdrange_total_size(zfs_ecksum_info_t *eip) 490219089Spjd{ 491219089Spjd struct zei_ranges *r = eip->zei_ranges; 492219089Spjd size_t count = eip->zei_range_count; 493219089Spjd size_t result = 0; 494219089Spjd size_t idx; 495219089Spjd 496219089Spjd for (idx = 0; idx < count; idx++) 497219089Spjd result += (r[idx].zr_end - r[idx].zr_start); 498219089Spjd 499219089Spjd return (result); 500219089Spjd} 501219089Spjd 502219089Spjdstatic zfs_ecksum_info_t * 503219089Spjdannotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, 504219089Spjd const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, 505219089Spjd boolean_t drop_if_identical) 506219089Spjd{ 507219089Spjd const uint64_t *good = (const uint64_t *)goodbuf; 508219089Spjd const uint64_t *bad = (const uint64_t *)badbuf; 509219089Spjd 510219089Spjd uint64_t allset = 0; 511219089Spjd uint64_t allcleared = 0; 512219089Spjd 513219089Spjd size_t nui64s = size / sizeof (uint64_t); 514219089Spjd 515219089Spjd size_t inline_size; 516219089Spjd int no_inline = 0; 517219089Spjd size_t idx; 518219089Spjd size_t range; 519219089Spjd 520219089Spjd size_t offset = 0; 521219089Spjd ssize_t start = -1; 522219089Spjd 523219089Spjd zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); 524219089Spjd 525219089Spjd /* don't do any annotation for injected checksum errors */ 526219089Spjd if (info != NULL && info->zbc_injected) 527219089Spjd return (eip); 528219089Spjd 529219089Spjd if (info != NULL && info->zbc_has_cksum) { 530219089Spjd fm_payload_set(ereport, 531219089Spjd 
FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, 532219089Spjd DATA_TYPE_UINT64_ARRAY, 533219089Spjd sizeof (info->zbc_expected) / sizeof (uint64_t), 534219089Spjd (uint64_t *)&info->zbc_expected, 535219089Spjd FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, 536219089Spjd DATA_TYPE_UINT64_ARRAY, 537219089Spjd sizeof (info->zbc_actual) / sizeof (uint64_t), 538219089Spjd (uint64_t *)&info->zbc_actual, 539219089Spjd FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, 540219089Spjd DATA_TYPE_STRING, 541219089Spjd info->zbc_checksum_name, 542219089Spjd NULL); 543219089Spjd 544219089Spjd if (info->zbc_byteswapped) { 545219089Spjd fm_payload_set(ereport, 546219089Spjd FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, 547219089Spjd DATA_TYPE_BOOLEAN, 1, 548219089Spjd NULL); 549219089Spjd } 550219089Spjd } 551219089Spjd 552219089Spjd if (badbuf == NULL || goodbuf == NULL) 553219089Spjd return (eip); 554219089Spjd 555330588Savg ASSERT3U(nui64s, <=, UINT32_MAX); 556219089Spjd ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); 557219089Spjd ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 558219089Spjd ASSERT3U(size, <=, UINT32_MAX); 559219089Spjd 560219089Spjd /* build up the range list by comparing the two buffers. */ 561219089Spjd for (idx = 0; idx < nui64s; idx++) { 562219089Spjd if (good[idx] == bad[idx]) { 563219089Spjd if (start == -1) 564219089Spjd continue; 565219089Spjd 566219089Spjd add_range(eip, start, idx); 567219089Spjd start = -1; 568219089Spjd } else { 569219089Spjd if (start != -1) 570219089Spjd continue; 571219089Spjd 572219089Spjd start = idx; 573219089Spjd } 574219089Spjd } 575219089Spjd if (start != -1) 576219089Spjd add_range(eip, start, idx); 577219089Spjd 578219089Spjd /* See if it will fit in our inline buffers */ 579219089Spjd inline_size = range_total_size(eip); 580219089Spjd if (inline_size > ZFM_MAX_INLINE) 581219089Spjd no_inline = 1; 582219089Spjd 583219089Spjd /* 584219089Spjd * If there is no change and we want to drop if the buffers are 585219089Spjd * identical, do so. 
586219089Spjd */ 587219089Spjd if (inline_size == 0 && drop_if_identical) { 588219089Spjd kmem_free(eip, sizeof (*eip)); 589219089Spjd return (NULL); 590219089Spjd } 591219089Spjd 592219089Spjd /* 593219089Spjd * Now walk through the ranges, filling in the details of the 594219089Spjd * differences. Also convert our uint64_t-array offsets to byte 595219089Spjd * offsets. 596219089Spjd */ 597219089Spjd for (range = 0; range < eip->zei_range_count; range++) { 598219089Spjd size_t start = eip->zei_ranges[range].zr_start; 599219089Spjd size_t end = eip->zei_ranges[range].zr_end; 600219089Spjd 601219089Spjd for (idx = start; idx < end; idx++) { 602219089Spjd uint64_t set, cleared; 603219089Spjd 604219089Spjd // bits set in bad, but not in good 605219089Spjd set = ((~good[idx]) & bad[idx]); 606219089Spjd // bits set in good, but not in bad 607219089Spjd cleared = (good[idx] & (~bad[idx])); 608219089Spjd 609219089Spjd allset |= set; 610219089Spjd allcleared |= cleared; 611219089Spjd 612219089Spjd if (!no_inline) { 613219089Spjd ASSERT3U(offset, <, inline_size); 614219089Spjd eip->zei_bits_set[offset] = set; 615219089Spjd eip->zei_bits_cleared[offset] = cleared; 616219089Spjd offset++; 617219089Spjd } 618219089Spjd 619219089Spjd update_histogram(set, eip->zei_histogram_set, 620219089Spjd &eip->zei_range_sets[range]); 621219089Spjd update_histogram(cleared, eip->zei_histogram_cleared, 622219089Spjd &eip->zei_range_clears[range]); 623219089Spjd } 624219089Spjd 625219089Spjd /* convert to byte offsets */ 626219089Spjd eip->zei_ranges[range].zr_start *= sizeof (uint64_t); 627219089Spjd eip->zei_ranges[range].zr_end *= sizeof (uint64_t); 628219089Spjd } 629219089Spjd eip->zei_allowed_mingap *= sizeof (uint64_t); 630219089Spjd inline_size *= sizeof (uint64_t); 631219089Spjd 632219089Spjd /* fill in ereport */ 633219089Spjd fm_payload_set(ereport, 634219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, 635219089Spjd DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, 636219089Spjd 
(uint32_t *)eip->zei_ranges, 637219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, 638219089Spjd DATA_TYPE_UINT32, eip->zei_allowed_mingap, 639219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, 640219089Spjd DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, 641219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, 642219089Spjd DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, 643219089Spjd NULL); 644219089Spjd 645219089Spjd if (!no_inline) { 646219089Spjd fm_payload_set(ereport, 647219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, 648219089Spjd DATA_TYPE_UINT8_ARRAY, 649219089Spjd inline_size, (uint8_t *)eip->zei_bits_set, 650219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, 651219089Spjd DATA_TYPE_UINT8_ARRAY, 652219089Spjd inline_size, (uint8_t *)eip->zei_bits_cleared, 653219089Spjd NULL); 654219089Spjd } else { 655219089Spjd fm_payload_set(ereport, 656219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, 657330588Savg DATA_TYPE_UINT32_ARRAY, 658219089Spjd NBBY * sizeof (uint64_t), eip->zei_histogram_set, 659219089Spjd FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, 660330588Savg DATA_TYPE_UINT32_ARRAY, 661219089Spjd NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, 662219089Spjd NULL); 663219089Spjd } 664219089Spjd return (eip); 665219089Spjd} 666168404Spjd#endif 667219089Spjd 668219089Spjdvoid 669219089Spjdzfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, 670219089Spjd uint64_t stateoroffset, uint64_t size) 671219089Spjd{ 672219089Spjd#ifdef _KERNEL 673219089Spjd nvlist_t *ereport = NULL; 674219089Spjd nvlist_t *detector = NULL; 675219089Spjd 676219089Spjd zfs_ereport_start(&ereport, &detector, 677219089Spjd subclass, spa, vd, zio, stateoroffset, size); 678219089Spjd 679219089Spjd if (ereport == NULL) 680219089Spjd return; 681219089Spjd 682219089Spjd fm_ereport_post(ereport, EVCH_SLEEP); 683219089Spjd 684219089Spjd fm_nvlist_destroy(ereport, FM_NVA_FREE); 685219089Spjd fm_nvlist_destroy(detector, 
FM_NVA_FREE); 686219089Spjd#endif 687168404Spjd} 688168404Spjd 689219089Spjdvoid 690219089Spjdzfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, 691219089Spjd struct zio *zio, uint64_t offset, uint64_t length, void *arg, 692219089Spjd zio_bad_cksum_t *info) 693219089Spjd{ 694219089Spjd zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); 695219089Spjd 696219089Spjd if (zio->io_vsd != NULL) 697219089Spjd zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); 698219089Spjd else 699219089Spjd zio_vsd_default_cksum_report(zio, report, arg); 700219089Spjd 701219089Spjd /* copy the checksum failure information if it was provided */ 702219089Spjd if (info != NULL) { 703219089Spjd report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); 704219089Spjd bcopy(info, report->zcr_ckinfo, sizeof (*info)); 705219089Spjd } 706219089Spjd 707219089Spjd report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; 708219089Spjd report->zcr_length = length; 709219089Spjd 710219089Spjd#ifdef _KERNEL 711219089Spjd zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, 712219089Spjd FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); 713219089Spjd 714219089Spjd if (report->zcr_ereport == NULL) { 715219089Spjd report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); 716230514Smm if (report->zcr_ckinfo != NULL) { 717230514Smm kmem_free(report->zcr_ckinfo, 718230514Smm sizeof (*report->zcr_ckinfo)); 719230514Smm } 720219089Spjd kmem_free(report, sizeof (*report)); 721219089Spjd return; 722219089Spjd } 723219089Spjd#endif 724219089Spjd 725219089Spjd mutex_enter(&spa->spa_errlist_lock); 726219089Spjd report->zcr_next = zio->io_logical->io_cksum_report; 727219089Spjd zio->io_logical->io_cksum_report = report; 728219089Spjd mutex_exit(&spa->spa_errlist_lock); 729219089Spjd} 730219089Spjd 731219089Spjdvoid 732219089Spjdzfs_ereport_finish_checksum(zio_cksum_report_t *report, 733219089Spjd const void *good_data, const void *bad_data, boolean_t drop_if_identical) 
734219089Spjd{ 735219089Spjd#ifdef _KERNEL 736219089Spjd zfs_ecksum_info_t *info = NULL; 737219089Spjd info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, 738219089Spjd good_data, bad_data, report->zcr_length, drop_if_identical); 739219089Spjd 740219089Spjd if (info != NULL) 741219089Spjd fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); 742219089Spjd 743219089Spjd fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); 744219089Spjd fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); 745219089Spjd report->zcr_ereport = report->zcr_detector = NULL; 746219089Spjd 747219089Spjd if (info != NULL) 748219089Spjd kmem_free(info, sizeof (*info)); 749219089Spjd#endif 750219089Spjd} 751219089Spjd 752219089Spjdvoid 753219089Spjdzfs_ereport_free_checksum(zio_cksum_report_t *rpt) 754219089Spjd{ 755219089Spjd#ifdef _KERNEL 756219089Spjd if (rpt->zcr_ereport != NULL) { 757219089Spjd fm_nvlist_destroy(rpt->zcr_ereport, 758219089Spjd FM_NVA_FREE); 759219089Spjd fm_nvlist_destroy(rpt->zcr_detector, 760219089Spjd FM_NVA_FREE); 761219089Spjd } 762219089Spjd#endif 763219089Spjd rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); 764219089Spjd 765219089Spjd if (rpt->zcr_ckinfo != NULL) 766219089Spjd kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); 767219089Spjd 768219089Spjd kmem_free(rpt, sizeof (*rpt)); 769219089Spjd} 770219089Spjd 771219089Spjdvoid 772219089Spjdzfs_ereport_send_interim_checksum(zio_cksum_report_t *report) 773219089Spjd{ 774219089Spjd#ifdef _KERNEL 775219089Spjd fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); 776219089Spjd#endif 777219089Spjd} 778219089Spjd 779219089Spjdvoid 780219089Spjdzfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, 781219089Spjd struct zio *zio, uint64_t offset, uint64_t length, 782219089Spjd const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) 783219089Spjd{ 784219089Spjd#ifdef _KERNEL 785219089Spjd nvlist_t *ereport = NULL; 786219089Spjd nvlist_t *detector = NULL; 787219089Spjd zfs_ecksum_info_t *info; 

	zfs_ereport_start(&ereport, &detector,
	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length);

	/* zfs_ereport_start() decided this error should be ignored */
	if (ereport == NULL)
		return;

	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
	    B_FALSE);

	if (info != NULL)
		fm_ereport_post(ereport, EVCH_SLEEP);

	fm_nvlist_destroy(ereport, FM_NVA_FREE);
	fm_nvlist_destroy(detector, FM_NVA_FREE);

	if (info != NULL)
		kmem_free(info, sizeof (*info));
#endif
}

/*
 * Post a 'resource.fs.zfs.<name>' sysevent identifying the pool (and,
 * when supplied, the vdev) by GUID.  Used by the wrappers below to signal
 * internal state transitions to the fault-management agents.  Suppressed
 * while the pool is merely being probed via spa_tryimport().
 */
static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	nvlist_t *resource;
	char class[64];

	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
		return;

	if ((resource = fm_nvlist_create(NULL)) == NULL)
		return;

	/* class is "resource.fs.zfs.<name>" */
	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
	    ZFS_ERROR_CLASS, name);
	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
	VERIFY(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
	if (vd)
		VERIFY(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);

	fm_ereport_post(resource, EVCH_SLEEP);

	fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 * has been removed from the system.  This will cause the DE to ignore any
 * recent I/O errors, inferring that they are due to the asynchronous device
 * removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}

/*
 * The 'resource.fs.zfs.statechange' event is an internal signal that the
 * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
 * cause the retire agent to repair any outstanding fault management cases
 * open because the device was not found (fault.fs.zfs.device).
 */
void
zfs_post_state_change(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE);
}