/* zfs_fm.c — FreeBSD ZFS FMA ereport generation (revision 185029) */
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22185029Spjd * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26168404Spjd#include <sys/spa.h> 27168404Spjd#include <sys/spa_impl.h> 28168404Spjd#include <sys/vdev.h> 29168404Spjd#include <sys/vdev_impl.h> 30168404Spjd#include <sys/zio.h> 31168404Spjd 32168404Spjd#include <sys/fm/fs/zfs.h> 33168404Spjd#include <sys/fm/protocol.h> 34168404Spjd#include <sys/fm/util.h> 35168404Spjd 36168404Spjd#ifdef _KERNEL 37168404Spjd/* Including sys/bus.h is just too hard, so I declare what I need here. */ 38168404Spjdextern void devctl_notify(const char *__system, const char *__subsystem, 39168404Spjd const char *__type, const char *__data); 40168404Spjd#endif 41168404Spjd 42168404Spjd/* 43168404Spjd * This general routine is responsible for generating all the different ZFS 44168404Spjd * ereports. 
The payload is dependent on the class, and which arguments are 45168404Spjd * supplied to the function: 46168404Spjd * 47168404Spjd * EREPORT POOL VDEV IO 48168404Spjd * block X X X 49168404Spjd * data X X 50168404Spjd * device X X 51168404Spjd * pool X 52168404Spjd * 53168404Spjd * If we are in a loading state, all errors are chained together by the same 54185029Spjd * SPA-wide ENA (Error Numeric Association). 55168404Spjd * 56168404Spjd * For isolated I/O requests, we get the ENA from the zio_t. The propagation 57168404Spjd * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want 58168404Spjd * to chain together all ereports associated with a logical piece of data. For 59168404Spjd * read I/Os, there are basically three 'types' of I/O, which form a roughly 60168404Spjd * layered diagram: 61168404Spjd * 62168404Spjd * +---------------+ 63168404Spjd * | Aggregate I/O | No associated logical data or device 64168404Spjd * +---------------+ 65168404Spjd * | 66168404Spjd * V 67168404Spjd * +---------------+ Reads associated with a piece of logical data. 68168404Spjd * | Read I/O | This includes reads on behalf of RAID-Z, 69168404Spjd * +---------------+ mirrors, gang blocks, retries, etc. 70168404Spjd * | 71168404Spjd * V 72168404Spjd * +---------------+ Reads associated with a particular device, but 73168404Spjd * | Physical I/O | no logical data. Issued as part of vdev caching 74168404Spjd * +---------------+ and I/O aggregation. 75168404Spjd * 76168404Spjd * Note that 'physical I/O' here is not the same terminology as used in the rest 77168404Spjd * of ZIO. Typically, 'physical I/O' simply means that there is no attached 78168404Spjd * blockpointer. But I/O with no associated block pointer can still be related 79168404Spjd * to a logical piece of data (i.e. RAID-Z requests). 80168404Spjd * 81168404Spjd * Purely physical I/O always have unique ENAs. 
They are not related to a 82168404Spjd * particular piece of logical data, and therefore cannot be chained together. 83168404Spjd * We still generate an ereport, but the DE doesn't correlate it with any 84168404Spjd * logical piece of data. When such an I/O fails, the delegated I/O requests 85168404Spjd * will issue a retry, which will trigger the 'real' ereport with the correct 86168404Spjd * ENA. 87168404Spjd * 88168404Spjd * We keep track of the ENA for a ZIO chain through the 'io_logical' member. 89168404Spjd * When a new logical I/O is issued, we set this to point to itself. Child I/Os 90168404Spjd * then inherit this pointer, so that when it is first set subsequent failures 91185029Spjd * will use the same ENA. For vdev cache fill and queue aggregation I/O, 92185029Spjd * this pointer is set to NULL, and no ereport will be generated (since it 93185029Spjd * doesn't actually correspond to any particular device or piece of data, 94185029Spjd * and the caller will always retry without caching or queueing anyway). 95168404Spjd */ 96168404Spjdvoid 97168404Spjdzfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, 98168404Spjd uint64_t stateoroffset, uint64_t size) 99168404Spjd{ 100168404Spjd#ifdef _KERNEL 101168404Spjd char buf[1024]; 102168404Spjd struct sbuf sb; 103168404Spjd struct timespec ts; 104185029Spjd int state; 105168404Spjd 106168404Spjd /* 107168404Spjd * If we are doing a spa_tryimport(), ignore errors. 108168404Spjd */ 109168404Spjd if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) 110168404Spjd return; 111168404Spjd 112168404Spjd /* 113168404Spjd * If we are in the middle of opening a pool, and the previous attempt 114168404Spjd * failed, don't bother logging any new ereports - we're just going to 115168404Spjd * get the same diagnosis anyway. 
116168404Spjd */ 117168404Spjd if (spa->spa_load_state != SPA_LOAD_NONE && 118168404Spjd spa->spa_last_open_failed) 119168404Spjd return; 120168404Spjd 121185029Spjd if (zio != NULL) { 122185029Spjd /* 123185029Spjd * If this is not a read or write zio, ignore the error. This 124185029Spjd * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 125185029Spjd */ 126185029Spjd if (zio->io_type != ZIO_TYPE_READ && 127185029Spjd zio->io_type != ZIO_TYPE_WRITE) 128185029Spjd return; 129168404Spjd 130185029Spjd /* 131185029Spjd * Ignore any errors from speculative I/Os, as failure is an 132185029Spjd * expected result. 133185029Spjd */ 134185029Spjd if (zio->io_flags & ZIO_FLAG_SPECULATIVE) 135185029Spjd return; 136168404Spjd 137185029Spjd /* 138185029Spjd * If the vdev has already been marked as failing due to a 139185029Spjd * failed probe, then ignore any subsequent I/O errors, as the 140185029Spjd * DE will automatically fault the vdev on the first such 141185029Spjd * failure. 142185029Spjd */ 143185029Spjd if (vd != NULL && 144185029Spjd (!vdev_readable(vd) || !vdev_writeable(vd)) && 145185029Spjd strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) 146185029Spjd return; 147185029Spjd } 148168404Spjd nanotime(&ts); 149168404Spjd 150168404Spjd sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); 151168494Spjd sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); 152168404Spjd 153168404Spjd /* 154168404Spjd * Serialize ereport generation 155168404Spjd */ 156168404Spjd mutex_enter(&spa->spa_errlist_lock); 157168404Spjd 158168404Spjd#if 0 159168404Spjd /* 160168404Spjd * Determine the ENA to use for this event. If we are in a loading 161168404Spjd * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use 162168404Spjd * a root zio-wide ENA. Otherwise, simply use a unique ENA. 
163168404Spjd */ 164168404Spjd if (spa->spa_load_state != SPA_LOAD_NONE) { 165168404Spjd#if 0 166168404Spjd if (spa->spa_ena == 0) 167168404Spjd spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); 168168404Spjd#endif 169168404Spjd ena = spa->spa_ena; 170168404Spjd } else if (zio != NULL && zio->io_logical != NULL) { 171168404Spjd#if 0 172168404Spjd if (zio->io_logical->io_ena == 0) 173168404Spjd zio->io_logical->io_ena = 174168404Spjd fm_ena_generate(0, FM_ENA_FMT1); 175168404Spjd#endif 176168404Spjd ena = zio->io_logical->io_ena; 177168404Spjd } else { 178168404Spjd#if 0 179168404Spjd ena = fm_ena_generate(0, FM_ENA_FMT1); 180168404Spjd#else 181168404Spjd ena = 0; 182168404Spjd#endif 183168404Spjd } 184168404Spjd#endif 185168404Spjd 186168404Spjd /* 187168404Spjd * Construct the full class, detector, and other standard FMA fields. 188168404Spjd */ 189168494Spjd sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION); 190168494Spjd sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass); 191168404Spjd 192168494Spjd sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION); 193168404Spjd 194168404Spjd /* 195168404Spjd * Construct the per-ereport payload, depending on which parameters are 196168404Spjd * passed in. 197168404Spjd */ 198168404Spjd 199168404Spjd /* 200185029Spjd * If we are importing a faulted pool, then we treat it like an open, 201185029Spjd * not an import. Otherwise, the DE will ignore all faults during 202185029Spjd * import, since the default behavior is to mark the devices as 203185029Spjd * persistently unavailable, not leave them in the faulted state. 204185029Spjd */ 205185029Spjd state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; 206185029Spjd 207185029Spjd /* 208168404Spjd * Generic payload members common to all ereports. 
209168404Spjd */ 210185029Spjd sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)); 211168494Spjd sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 212168404Spjd spa_guid(spa)); 213185029Spjd sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state); 214168404Spjd 215185029Spjd if (spa != NULL) { 216185029Spjd sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, 217185029Spjd spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? 218185029Spjd FM_EREPORT_FAILMODE_WAIT : 219185029Spjd spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 220185029Spjd FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC); 221185029Spjd } 222185029Spjd 223168404Spjd if (vd != NULL) { 224168404Spjd vdev_t *pvd = vd->vdev_parent; 225168404Spjd 226168494Spjd sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 227168404Spjd vd->vdev_guid); 228168494Spjd sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, 229168404Spjd vd->vdev_ops->vdev_op_type); 230168404Spjd if (vd->vdev_path) 231168494Spjd sbuf_printf(&sb, " %s=%s", 232168404Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path); 233168404Spjd if (vd->vdev_devid) 234168494Spjd sbuf_printf(&sb, " %s=%s", 235168404Spjd FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid); 236168404Spjd 237168404Spjd if (pvd != NULL) { 238168494Spjd sbuf_printf(&sb, " %s=%ju", 239168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid); 240168494Spjd sbuf_printf(&sb, " %s=%s", 241168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, 242168404Spjd pvd->vdev_ops->vdev_op_type); 243168404Spjd if (pvd->vdev_path) 244168494Spjd sbuf_printf(&sb, " %s=%s", 245168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, 246168404Spjd pvd->vdev_path); 247168404Spjd if (pvd->vdev_devid) 248168494Spjd sbuf_printf(&sb, " %s=%s", 249168404Spjd FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, 250168404Spjd pvd->vdev_devid); 251168404Spjd } 252168404Spjd } 253168404Spjd 254168404Spjd if (zio != NULL) { 255168404Spjd /* 
256168404Spjd * Payload common to all I/Os. 257168404Spjd */ 258168494Spjd sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, 259168404Spjd zio->io_error); 260168404Spjd 261168404Spjd /* 262168404Spjd * If the 'size' parameter is non-zero, it indicates this is a 263168404Spjd * RAID-Z or other I/O where the physical offset and length are 264168404Spjd * provided for us, instead of within the zio_t. 265168404Spjd */ 266168404Spjd if (vd != NULL) { 267168404Spjd if (size) { 268168494Spjd sbuf_printf(&sb, " %s=%ju", 269168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 270168404Spjd stateoroffset); 271168494Spjd sbuf_printf(&sb, " %s=%ju", 272168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size); 273168404Spjd } else { 274168494Spjd sbuf_printf(&sb, " %s=%ju", 275168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, 276168404Spjd zio->io_offset); 277168494Spjd sbuf_printf(&sb, " %s=%ju", 278168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, 279168404Spjd zio->io_size); 280168404Spjd } 281168404Spjd } 282168404Spjd 283168404Spjd /* 284168404Spjd * Payload for I/Os with corresponding logical information. 285168404Spjd */ 286168404Spjd if (zio->io_logical != NULL) { 287168494Spjd sbuf_printf(&sb, " %s=%ju", 288168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, 289168404Spjd zio->io_logical->io_bookmark.zb_object); 290168494Spjd sbuf_printf(&sb, " %s=%ju", 291168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, 292168404Spjd zio->io_logical->io_bookmark.zb_level); 293168494Spjd sbuf_printf(&sb, " %s=%ju", 294168404Spjd FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, 295168404Spjd zio->io_logical->io_bookmark.zb_blkid); 296168404Spjd } 297168404Spjd } else if (vd != NULL) { 298168404Spjd /* 299168404Spjd * If we have a vdev but no zio, this is a device fault, and the 300168404Spjd * 'stateoroffset' parameter indicates the previous state of the 301168404Spjd * vdev. 
302168404Spjd */ 303168494Spjd sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, 304168404Spjd stateoroffset); 305168404Spjd } 306168404Spjd mutex_exit(&spa->spa_errlist_lock); 307168404Spjd 308168404Spjd sbuf_finish(&sb); 309168494Spjd devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb)); 310168404Spjd if (sbuf_overflowed(&sb)) 311168404Spjd printf("ZFS WARNING: sbuf overflowed\n"); 312168404Spjd sbuf_delete(&sb); 313168404Spjd#endif 314168404Spjd} 315168404Spjd 316185029Spjdstatic void 317185029Spjdzfs_post_common(spa_t *spa, vdev_t *vd, const char *name) 318168404Spjd{ 319168404Spjd#ifdef _KERNEL 320168404Spjd char buf[1024]; 321168404Spjd char class[64]; 322168404Spjd struct sbuf sb; 323168404Spjd struct timespec ts; 324168404Spjd 325168404Spjd nanotime(&ts); 326168404Spjd 327168404Spjd sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); 328168494Spjd sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec); 329168404Spjd 330168404Spjd snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE, 331185029Spjd ZFS_ERROR_CLASS, name); 332168494Spjd sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION); 333168494Spjd sbuf_printf(&sb, " %s=%s", FM_CLASS, class); 334168494Spjd sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, 335168404Spjd spa_guid(spa)); 336168404Spjd if (vd) 337168494Spjd sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, 338168404Spjd vd->vdev_guid); 339168404Spjd sbuf_finish(&sb); 340185029Spjd ZFS_LOG(1, "%s", sbuf_data(&sb)); 341168404Spjd devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb)); 342168404Spjd if (sbuf_overflowed(&sb)) 343168404Spjd printf("ZFS WARNING: sbuf overflowed\n"); 344168404Spjd sbuf_delete(&sb); 345168404Spjd#endif 346168404Spjd} 347185029Spjd 348185029Spjd/* 349185029Spjd * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev 350185029Spjd * has been removed from the system. 
This will cause the DE to ignore any 351185029Spjd * recent I/O errors, inferring that they are due to the asynchronous device 352185029Spjd * removal. 353185029Spjd */ 354185029Spjdvoid 355185029Spjdzfs_post_remove(spa_t *spa, vdev_t *vd) 356185029Spjd{ 357185029Spjd zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); 358185029Spjd} 359185029Spjd 360185029Spjd/* 361185029Spjd * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool 362185029Spjd * has the 'autoreplace' property set, and therefore any broken vdevs will be 363185029Spjd * handled by higher level logic, and no vdev fault should be generated. 364185029Spjd */ 365185029Spjdvoid 366185029Spjdzfs_post_autoreplace(spa_t *spa, vdev_t *vd) 367185029Spjd{ 368185029Spjd zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); 369185029Spjd} 370