zfs_fm.c revision 172645
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/spa.h>
29#include <sys/spa_impl.h>
30#include <sys/vdev.h>
31#include <sys/vdev_impl.h>
32#include <sys/zio.h>
33
34#include <sys/fm/fs/zfs.h>
35#include <sys/fm/protocol.h>
36#include <sys/fm/util.h>
37
38#ifdef _KERNEL
39/* Including sys/bus.h is just too hard, so I declare what I need here. */
40extern void devctl_notify(const char *__system, const char *__subsystem,
41    const char *__type, const char *__data);
42#endif
43
44/*
45 * This general routine is responsible for generating all the different ZFS
46 * ereports.  The payload is dependent on the class, and which arguments are
47 * supplied to the function:
48 *
49 * 	EREPORT			POOL	VDEV	IO
50 * 	block			X	X	X
51 * 	data			X		X
52 * 	device			X	X
53 * 	pool			X
54 *
55 * If we are in a loading state, all errors are chained together by the same
56 * SPA-wide ENA.
57 *
58 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
59 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
60 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there are basically three 'types' of I/O, which form a roughly
62 * layered diagram:
63 *
64 *      +---------------+
65 * 	| Aggregate I/O |	No associated logical data or device
66 * 	+---------------+
67 *              |
68 *              V
69 * 	+---------------+	Reads associated with a piece of logical data.
70 * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
71 * 	+---------------+       mirrors, gang blocks, retries, etc.
72 *              |
73 *              V
74 * 	+---------------+	Reads associated with a particular device, but
75 * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
76 * 	+---------------+	and I/O aggregation.
77 *
78 * Note that 'physical I/O' here is not the same terminology as used in the rest
79 * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
80 * blockpointer.  But I/O with no associated block pointer can still be related
81 * to a logical piece of data (i.e. RAID-Z requests).
82 *
 * Purely physical I/Os always have unique ENAs.  They are not related to a
84 * particular piece of logical data, and therefore cannot be chained together.
85 * We still generate an ereport, but the DE doesn't correlate it with any
86 * logical piece of data.  When such an I/O fails, the delegated I/O requests
87 * will issue a retry, which will trigger the 'real' ereport with the correct
88 * ENA.
89 *
90 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
91 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
92 * then inherit this pointer, so that when it is first set subsequent failures
93 * will use the same ENA.  If a physical I/O is issued (by passing the
94 * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
95 * unique ENA will be generated.  For an aggregate I/O, this pointer is set to
96 * NULL, and no ereport will be generated (since it doesn't actually correspond
97 * to any particular device or piece of data).
98 */
99void
100zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
101    uint64_t stateoroffset, uint64_t size)
102{
103#ifdef _KERNEL
104	char buf[1024];
105	struct sbuf sb;
106	struct timespec ts;
107
108	/*
109	 * If we are doing a spa_tryimport(), ignore errors.
110	 */
111	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
112		return;
113
114	/*
115	 * If we are in the middle of opening a pool, and the previous attempt
116	 * failed, don't bother logging any new ereports - we're just going to
117	 * get the same diagnosis anyway.
118	 */
119	if (spa->spa_load_state != SPA_LOAD_NONE &&
120	    spa->spa_last_open_failed)
121		return;
122
123	/*
124	 * Ignore any errors from I/Os that we are going to retry anyway - we
125	 * only generate errors from the final failure.
126	 */
127	if (zio && zio_should_retry(zio))
128		return;
129
130	/*
131	 * If this is not a read or write zio, ignore the error.  This can occur
132	 * if the DKIOCFLUSHWRITECACHE ioctl fails.
133	 */
134	if (zio && zio->io_type != ZIO_TYPE_READ &&
135	    zio->io_type != ZIO_TYPE_WRITE)
136		return;
137
138	nanotime(&ts);
139
140	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
141	sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
142
143	/*
144	 * Serialize ereport generation
145	 */
146	mutex_enter(&spa->spa_errlist_lock);
147
148#if 0
149	/*
150	 * Determine the ENA to use for this event.  If we are in a loading
151	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
152	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
153	 */
154	if (spa->spa_load_state != SPA_LOAD_NONE) {
155#if 0
156		if (spa->spa_ena == 0)
157			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
158#endif
159		ena = spa->spa_ena;
160	} else if (zio != NULL && zio->io_logical != NULL) {
161#if 0
162		if (zio->io_logical->io_ena == 0)
163			zio->io_logical->io_ena =
164			    fm_ena_generate(0, FM_ENA_FMT1);
165#endif
166		ena = zio->io_logical->io_ena;
167	} else {
168#if 0
169		ena = fm_ena_generate(0, FM_ENA_FMT1);
170#else
171		ena = 0;
172#endif
173	}
174#endif
175
176	/*
177	 * Construct the full class, detector, and other standard FMA fields.
178	 */
179	sbuf_printf(&sb, " ereport_version=%u", FM_EREPORT_VERSION);
180	sbuf_printf(&sb, " class=%s.%s", ZFS_ERROR_CLASS, subclass);
181
182	sbuf_printf(&sb, " zfs_scheme_version=%u", FM_ZFS_SCHEME_VERSION);
183
184	/*
185	 * Construct the per-ereport payload, depending on which parameters are
186	 * passed in.
187	 */
188
189	/*
190	 * Generic payload members common to all ereports.
191	 *
192	 * The direct reference to spa_name is used rather than spa_name()
193	 * because of the asynchronous nature of the zio pipeline.  spa_name()
194	 * asserts that the config lock is held in some form.  This is always
195	 * the case in I/O context, but because the check for RW_WRITER compares
196	 * against 'curthread', we may be in an asynchronous context and blow
197	 * this assert.  Rather than loosen this assert, we acknowledge that all
198	 * contexts in which this function is called (pool open, I/O) are safe,
199	 * and dereference the name directly.
200	 */
201	sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_POOL, spa->spa_name);
202	sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
203	    spa_guid(spa));
204	sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT,
205	    spa->spa_load_state);
206
207	if (vd != NULL) {
208		vdev_t *pvd = vd->vdev_parent;
209
210		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
211		    vd->vdev_guid);
212		sbuf_printf(&sb, " %s=%s", FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
213		    vd->vdev_ops->vdev_op_type);
214		if (vd->vdev_path)
215			sbuf_printf(&sb, " %s=%s",
216			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path);
217		if (vd->vdev_devid)
218			sbuf_printf(&sb, " %s=%s",
219			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid);
220
221		if (pvd != NULL) {
222			sbuf_printf(&sb, " %s=%ju",
223			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, pvd->vdev_guid);
224			sbuf_printf(&sb, " %s=%s",
225			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
226			    pvd->vdev_ops->vdev_op_type);
227			if (pvd->vdev_path)
228				sbuf_printf(&sb, " %s=%s",
229				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
230				    pvd->vdev_path);
231			if (pvd->vdev_devid)
232				sbuf_printf(&sb, " %s=%s",
233				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
234				    pvd->vdev_devid);
235		}
236	}
237
238	if (zio != NULL) {
239		/*
240		 * Payload common to all I/Os.
241		 */
242		sbuf_printf(&sb, " %s=%u", FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
243		    zio->io_error);
244
245		/*
246		 * If the 'size' parameter is non-zero, it indicates this is a
247		 * RAID-Z or other I/O where the physical offset and length are
248		 * provided for us, instead of within the zio_t.
249		 */
250		if (vd != NULL) {
251			if (size) {
252				sbuf_printf(&sb, " %s=%ju",
253				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
254				    stateoroffset);
255				sbuf_printf(&sb, " %s=%ju",
256				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, size);
257			} else {
258				sbuf_printf(&sb, " %s=%ju",
259				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
260				    zio->io_offset);
261				sbuf_printf(&sb, " %s=%ju",
262				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
263				    zio->io_size);
264			}
265		}
266
267		/*
268		 * Payload for I/Os with corresponding logical information.
269		 */
270		if (zio->io_logical != NULL) {
271			sbuf_printf(&sb, " %s=%ju",
272			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
273			    zio->io_logical->io_bookmark.zb_object);
274			sbuf_printf(&sb, " %s=%ju",
275			    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
276			    zio->io_logical->io_bookmark.zb_level);
277			sbuf_printf(&sb, " %s=%ju",
278			    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
279			    zio->io_logical->io_bookmark.zb_blkid);
280		}
281	} else if (vd != NULL) {
282		/*
283		 * If we have a vdev but no zio, this is a device fault, and the
284		 * 'stateoroffset' parameter indicates the previous state of the
285		 * vdev.
286		 */
287		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
288		    stateoroffset);
289	}
290	mutex_exit(&spa->spa_errlist_lock);
291
292	sbuf_finish(&sb);
293	ZFS_LOG(1, "%s", sbuf_data(&sb));
294	devctl_notify("ZFS", spa->spa_name, subclass, sbuf_data(&sb));
295	if (sbuf_overflowed(&sb))
296		printf("ZFS WARNING: sbuf overflowed\n");
297	sbuf_delete(&sb);
298#endif
299}
300
301/*
302 * The 'resource.fs.zfs.ok' event is an internal signal that the associated
303 * resource (pool or disk) has been identified by ZFS as healthy.  This will
304 * then trigger the DE to close the associated case, if any.
305 */
306void
307zfs_post_ok(spa_t *spa, vdev_t *vd)
308{
309#ifdef _KERNEL
310	char buf[1024];
311	char class[64];
312	struct sbuf sb;
313	struct timespec ts;
314
315	nanotime(&ts);
316
317	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
318	sbuf_printf(&sb, "time=%ju.%ld", (uintmax_t)ts.tv_sec, ts.tv_nsec);
319
320	snprintf(class, sizeof(class), "%s.%s.%s", FM_RSRC_RESOURCE,
321	    ZFS_ERROR_CLASS, FM_RESOURCE_OK);
322	sbuf_printf(&sb, " %s=%hhu", FM_VERSION, FM_RSRC_VERSION);
323	sbuf_printf(&sb, " %s=%s", FM_CLASS, class);
324	sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
325	    spa_guid(spa));
326	if (vd)
327		sbuf_printf(&sb, " %s=%ju", FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
328		    vd->vdev_guid);
329	sbuf_finish(&sb);
330	devctl_notify("ZFS", spa->spa_name, class, sbuf_data(&sb));
331	if (sbuf_overflowed(&sb))
332		printf("ZFS WARNING: sbuf overflowed\n");
333	sbuf_delete(&sb);
334#endif
335}
336