zfs_fm.c revision 185029
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>

#ifdef _KERNEL
/* Including sys/bus.h is just too hard, so I declare what I need here. */
extern void devctl_notify(const char *__system, const char *__subsystem,
    const char *__type, const char *__data);
#endif

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 * 	EREPORT			POOL	VDEV	IO
 * 	block			X	X	X
 * 	data			X		X
 * 	device			X	X
 * 	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA (Error Numeric Association).
 *
 * For isolated I/O requests, we get the ENA from the zio_t.  The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *      +---------------+
 * 	| Aggregate I/O |	No associated logical data or device
 * 	+---------------+
 *              |
 *              V
 * 	+---------------+	Reads associated with a piece of logical data.
 * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
 * 	+---------------+       mirrors, gang blocks, retries, etc.
 *              |
 *              V
 * 	+---------------+	Reads associated with a particular device, but
 * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
 * 	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the
 * rest of ZIO.  Typically, 'physical I/O' simply means that there is no
 * attached block pointer.  But I/O with no associated block pointer can still
 * be related to a logical piece of data (e.g. RAID-Z requests).
 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that once it is first set, subsequent failures
 * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
 * this pointer is set to NULL, and no ereport will be generated (since it
 * doesn't actually correspond to any particular device or piece of data,
 * and the caller will always retry without caching or queueing anyway).
 */
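/*
 * Illustrative usage (a sketch, not taken from this file): a failed read or
 * write in the ZIO pipeline would typically be reported with something like
 *
 *	zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, zio, 0, 0);
 *
 * while a checksum error on a specific extent of a device might pass the
 * physical offset and length explicitly:
 *
 *	zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, size);
 *
 * The FM_EREPORT_ZFS_* subclass names come from sys/fm/fs/zfs.h; the actual
 * call sites live in the ZIO and vdev code, not here.
 */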
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
	char buf[1024];
	struct sbuf sb;
	struct timespec ts;
	int state;

	/*
	 * If we are doing a spa_tryimport(), ignore errors.
	 */
	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
		return;

	/*
	 * If we are in the middle of opening a pool, and the previous attempt
	 * failed, don't bother logging any new ereports - we're just going to
	 * get the same diagnosis anyway.
	 */
	if (spa->spa_load_state != SPA_LOAD_NONE &&
	    spa->spa_last_open_failed)
		return;

	if (zio != NULL) {
		/*
		 * If this is not a read or write zio, ignore the error.  This
		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
		 */
		if (zio->io_type != ZIO_TYPE_READ &&
		    zio->io_type != ZIO_TYPE_WRITE)
			return;

		/*
		 * Ignore any errors from speculative I/Os, as failure is an
		 * expected result.
		 */
		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
			return;

		/*
		 * If the vdev has already been marked as failing due to a
		 * failed probe, then ignore any subsequent I/O errors, as the
		 * DE will automatically fault the vdev on the first such
		 * failure.
		 */
		if (vd != NULL &&
		    (!vdev_readable(vd) || !vdev_writeable(vd)) &&
		    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
			return;
	}
	nanotime(&ts);

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

#if 0
	/*
	 * Determine the ENA to use for this event.  If we are in a loading
	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
	 */
	if (spa->spa_load_state != SPA_LOAD_NONE) {
#if 0
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
#endif
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
#if 0
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
#endif
		ena = zio->io_logical->io_ena;
	} else {
#if 0
		ena = fm_ena_generate(0, FM_ENA_FMT1);
#else
		ena = 0;
#endif
	}
#endif
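	/*
	 * Note: the ENA selection above is compiled out in this port, so the
	 * notification built below carries no ENA field and related failures
	 * are not chained for the consumer; the block appears to be kept to
	 * mirror the upstream OpenSolaris code.
	 */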

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
	sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);

	sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * If we are importing a faulted pool, then we treat it like an open,
	 * not an import.  Otherwise, the DE will ignore all faults during
	 * import, since the default behavior is to mark the devices as
	 * persistently unavailable, not leave them in the faulted state.
	 */
	state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;

	/*
	 * Generic payload members common to all ereports.
	 */
	sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa));
	sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    spa_guid(spa));
	sbuf_printf(&sb, " %s=%d", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, state);

	if (spa != NULL) {
		sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
		    FM_EREPORT_FAILMODE_WAIT :
		    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
		    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC);
	}

	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;

		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    vd->vdev_guid);
		sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    vd->vdev_ops->vdev_op_type);
		if (vd->vdev_path)
			sbuf_printf(&sb, " %s=%s",
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
		if (vd->vdev_devid)
			sbuf_printf(&sb, " %s=%s",
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);

		if (pvd != NULL) {
			sbuf_printf(&sb, " %s=%ju",
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
			sbuf_printf(&sb, " %s=%s",
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    pvd->vdev_ops->vdev_op_type);
			if (pvd->vdev_path)
				sbuf_printf(&sb, " %s=%s",
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    pvd->vdev_path);
			if (pvd->vdev_devid)
				sbuf_printf(&sb, " %s=%s",
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    pvd->vdev_devid);
		}
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    zio->io_error);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size) {
				sbuf_printf(&sb, " %s=%ju",
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    stateoroffset);
				sbuf_printf(&sb, " %s=%ju",
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
			} else {
				sbuf_printf(&sb, " %s=%ju",
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    zio->io_offset);
				sbuf_printf(&sb, " %s=%ju",
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    zio->io_size);
			}
		}

		/*
		 * Payload for I/Os with corresponding logical information.
		 */
		if (zio->io_logical != NULL) {
			sbuf_printf(&sb, " %s=%ju",
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
			    zio->io_logical->io_bookmark.zb_object);
			sbuf_printf(&sb, " %s=%ju",
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
			    zio->io_logical->io_bookmark.zb_level);
			sbuf_printf(&sb, " %s=%ju",
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
			    zio->io_logical->io_bookmark.zb_blkid);
		}
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    stateoroffset);
	}
	mutex_exit(&spa->spa_errlist_lock);

	sbuf_finish(&sb);
	devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
	if (sbuf_overflowed(&sb))
		printf("ZFS WARNING: sbuf overflowed\n");
	sbuf_delete(&sb);
#endif
}
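
/*
 * Sketch of the effective notification for an "io" ereport (payload key
 * names follow the FM_EREPORT_PAYLOAD_ZFS_* strings in sys/fm/fs/zfs.h;
 * the values below are invented for illustration):
 *
 *	devctl_notify("ZFS", "tank", "io",
 *	    "time=1229295644.123456789 ereport_version=0 class=fs.zfs.io"
 *	    " zfs_scheme_version=0 pool=tank pool_guid=9425951312587754705"
 *	    " pool_context=0 pool_failmode=wait vdev_guid=7139168231976967379"
 *	    " vdev_type=disk vdev_path=/dev/ad4"
 *	    " zio_err=5 zio_offset=270336 zio_size=8192");
 */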

static void
zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	char buf[1024];
	char class[64];
	struct sbuf sb;
	struct timespec ts;

	nanotime(&ts);

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);

	snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
	    ZFS_ERROR_CLASS, name);
	sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION);
	sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
	sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    spa_guid(spa));
	if (vd)
		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    vd->vdev_guid);
	sbuf_finish(&sb);
	ZFS_LOG(1, "%s", sbuf_data(&sb));
	devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
	if (sbuf_overflowed(&sb))
		printf("ZFS WARNING: sbuf overflowed\n");
	sbuf_delete(&sb);
#endif
}

/*
 * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
 * has been removed from the system.  This will cause the DE to ignore any
 * recent I/O errors, inferring that they are due to the asynchronous device
 * removal.
 */
void
zfs_post_remove(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_REMOVED);
}

/*
 * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
 * has the 'autoreplace' property set, and therefore any broken vdevs will be
 * handled by higher level logic, and no vdev fault should be generated.
 */
void
zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
{
	zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE);
}
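
/*
 * Consumers see these notifications as devd(8) events with system "ZFS".
 * A devd.conf(5) rule reacting to the removal event could look roughly like
 * this (a sketch; adjust the action to taste):
 *
 *	notify 10 {
 *		match "system"	"ZFS";
 *		match "type"	"resource.fs.zfs.removed";
 *		action "logger -p kern.notice 'ZFS: vdev removed from pool $subsystem'";
 *	};
 */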