libzfs_status.c revision 168404
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool, as well as the status of
 * exported pools.  Returns one of the ZPOOL_STATUS_* defines describing the
 * status of the pool.  This status is independent (to a certain degree) from
 * the state of the pool.  A pool's state describes only whether or not it is
 * capable of providing the necessary fault tolerance for data.  The status
 * describes the overall status of devices.  A pool that is online can still
 * have a device that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to an FMA message ID.  The
 * explanation is left up to the caller, depending on whether it is a live
 * pool or an import.
 */
43
44#include <libzfs.h>
45#include <string.h>
46#include "libzfs_impl.h"
47
/*
 * Message ID table, indexed by zpool_status_t value.  The order of the
 * entries must be kept in sync with the ZPOOL_STATUS_* defines in libzfs.h.
 * Some status values are larger than the last index of this table; those
 * statuses have no associated message ID.
 */
static char *msgid_table[] = {
	/* [0] */ "ZFS-8000-14",
	/* [1] */ "ZFS-8000-2Q",
	/* [2] */ "ZFS-8000-3C",
	/* [3] */ "ZFS-8000-4J",
	/* [4] */ "ZFS-8000-5E",
	/* [5] */ "ZFS-8000-6X",
	/* [6] */ "ZFS-8000-72",
	/* [7] */ "ZFS-8000-8A",
	/* [8] */ "ZFS-8000-9P",
	/* [9] */ "ZFS-8000-A5"
};
65
/*
 * Message ID table used for active (non-import) pools.  It has the same
 * layout as the static table, but a certain class of static errors is
 * overridden by the faults as analyzed by FMA.  Those faults have separate
 * knowledge articles, and the article referred to by 'zpool status' must
 * match the one indicated by the syslog error message.  The overrides cover
 * missing data as well as a corrupt pool.
 */
static char *msgid_table_active[] = {
	/* [0] */ "ZFS-8000-14",
	/* [1] */ "ZFS-8000-D3",	/* overridden */
	/* [2] */ "ZFS-8000-D3",	/* overridden */
	/* [3] */ "ZFS-8000-4J",
	/* [4] */ "ZFS-8000-5E",
	/* [5] */ "ZFS-8000-6X",
	/* [6] */ "ZFS-8000-CS",	/* overridden */
	/* [7] */ "ZFS-8000-8A",
	/* [8] */ "ZFS-8000-9P",
	/* [9] */ "ZFS-8000-CS",	/* overridden */
};
84
/* Number of entries in msgid_table. */
#define	NMSGID	(sizeof (msgid_table) / sizeof (*msgid_table))
86
87/* ARGSUSED */
88static int
89vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
90{
91	return (state == VDEV_STATE_CANT_OPEN &&
92	    aux == VDEV_AUX_OPEN_FAILED);
93}
94
/*
 * Predicate for find_vdev_problem(): returns 1 when the supplied error count
 * is nonzero and 0 otherwise.  The state and auxiliary state are not
 * consulted.
 */
/* ARGSUSED */
static int
vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
{
	if (errs == 0)
		return (0);

	return (1);
}
101
102/* ARGSUSED */
103static int
104vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
105{
106	return (state == VDEV_STATE_CANT_OPEN);
107}
108
109/* ARGSUSED */
110static int
111vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
112{
113	return (state == VDEV_STATE_OFFLINE);
114}
115
116/*
117 * Detect if any leaf devices that have seen errors or could not be opened.
118 */
119static boolean_t
120find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
121{
122	nvlist_t **child;
123	vdev_stat_t *vs;
124	uint_t c, children;
125	char *type;
126
127	/*
128	 * Ignore problems within a 'replacing' vdev, since we're presumably in
129	 * the process of repairing any such errors, and don't want to call them
130	 * out again.  We'll pick up the fact that a resilver is happening
131	 * later.
132	 */
133	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
134	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
135		return (B_FALSE);
136
137	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
138	    &children) == 0) {
139		for (c = 0; c < children; c++)
140			if (find_vdev_problem(child[c], func))
141				return (B_TRUE);
142	} else {
143		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
144		    (uint64_t **)&vs, &c) == 0);
145
146		if (func(vs->vs_state, vs->vs_aux,
147		    vs->vs_read_errors +
148		    vs->vs_write_errors +
149		    vs->vs_checksum_errors))
150			return (B_TRUE);
151	}
152
153	return (B_FALSE);
154}
155
156/*
157 * Active pool health status.
158 *
159 * To determine the status for a pool, we make several passes over the config,
160 * picking the most egregious error we find.  In order of importance, we do the
161 * following:
162 *
163 *	- Check for a complete and valid configuration
164 *	- Look for any missing devices in a non-replicated config
165 *	- Check for any data errors
166 *	- Check for any missing devices in a replicated config
167 *	- Look for any devices showing errors
168 *	- Check for any resilvering devices
169 *
170 * There can obviously be multiple errors within a single pool, so this routine
171 * only picks the most damaging of all the current errors to report.
172 */
173static zpool_status_t
174check_status(nvlist_t *config, boolean_t isimport)
175{
176	nvlist_t *nvroot;
177	vdev_stat_t *vs;
178	uint_t vsc;
179	uint64_t nerr;
180	uint64_t version;
181
182	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
183	    &version) == 0);
184	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
185	    &nvroot) == 0);
186	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
187	    (uint64_t **)&vs, &vsc) == 0);
188
189	/*
190	 * Newer on-disk version.
191	 */
192	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
193	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
194		return (ZPOOL_STATUS_VERSION_NEWER);
195
196	/*
197	 * Check that the config is complete.
198	 */
199	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
200	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
201		return (ZPOOL_STATUS_BAD_GUID_SUM);
202
203	/*
204	 * Missing devices in non-replicated config.
205	 */
206	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
207	    find_vdev_problem(nvroot, vdev_missing))
208		return (ZPOOL_STATUS_MISSING_DEV_NR);
209
210	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
211	    find_vdev_problem(nvroot, vdev_broken))
212		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
213
214	/*
215	 * Corrupted pool metadata
216	 */
217	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
218	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
219		return (ZPOOL_STATUS_CORRUPT_POOL);
220
221	/*
222	 * Persistent data errors.
223	 */
224	if (!isimport) {
225		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
226		    &nerr) == 0 && nerr != 0)
227			return (ZPOOL_STATUS_CORRUPT_DATA);
228	}
229
230	/*
231	 * Missing devices in a replicated config.
232	 */
233	if (find_vdev_problem(nvroot, vdev_missing))
234		return (ZPOOL_STATUS_MISSING_DEV_R);
235	if (find_vdev_problem(nvroot, vdev_broken))
236		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
237
238	/*
239	 * Devices with errors
240	 */
241	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
242		return (ZPOOL_STATUS_FAILING_DEV);
243
244	/*
245	 * Offlined devices
246	 */
247	if (find_vdev_problem(nvroot, vdev_offlined))
248		return (ZPOOL_STATUS_OFFLINE_DEV);
249
250	/*
251	 * Currently resilvering
252	 */
253	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
254		return (ZPOOL_STATUS_RESILVERING);
255
256	/*
257	 * Outdated, but usable, version
258	 */
259	if (version < ZFS_VERSION)
260		return (ZPOOL_STATUS_VERSION_OLDER);
261
262	return (ZPOOL_STATUS_OK);
263}
264
265zpool_status_t
266zpool_get_status(zpool_handle_t *zhp, char **msgid)
267{
268	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
269
270	if (ret >= NMSGID)
271		*msgid = NULL;
272	else
273		*msgid = msgid_table_active[ret];
274
275	return (ret);
276}
277
278zpool_status_t
279zpool_import_status(nvlist_t *config, char **msgid)
280{
281	zpool_status_t ret = check_status(config, B_TRUE);
282
283	if (ret >= NMSGID)
284		*msgid = NULL;
285	else
286		*msgid = msgid_table[ret];
287
288	return (ret);
289}
290