libzfs_status.c revision 168498
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool, as well as the status of
 * exported pools.  Returns one of the ZPOOL_STATUS_* defines describing the
 * status of the pool.  This status is independent (to a certain degree) from
 * the state of the pool.  A pool's state describes only whether or not it is
 * capable of providing the necessary fault tolerance for data.  The status
 * describes the overall status of devices.  A pool that is online can still
 * have a device that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to a FMA message ID.  The explanation
 * is left up to the caller, depending on whether it is a live pool or an
 * import.
 */
43
44#include <libzfs.h>
45#include <string.h>
46#include <unistd.h>
47#include "libzfs_impl.h"
48
/*
 * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
 * in libzfs.h.  Note that there are some status results which go past the end
 * of this table, and hence have no associated message ID.
 */
static char *zfs_msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5",
	"ZFS-8000-EY"
};
67
/*
 * If the pool is active, a certain class of static errors is overridden by the
 * faults as analyzed by FMA.  These faults have separate knowledge articles,
 * and the article referred to by 'zpool status' must match that indicated by
 * the syslog error message.  We override missing data as well as corrupt pool.
 *
 * This table must have the same number of entries (NMSGID) as
 * zfs_msgid_table above: zpool_get_status() indexes it with any status
 * value < NMSGID.
 */
static char *zfs_msgid_table_active[] = {
	"ZFS-8000-14",
	"ZFS-8000-D3",		/* overridden */
	"ZFS-8000-D3",		/* overridden */
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-CS",		/* overridden */
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-CS",		/* overridden */
	"ZFS-8000-EY"		/* hostid mismatch; was missing, causing an
				 * out-of-bounds read in zpool_get_status() */
};
86
87#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))
88
89/* ARGSUSED */
90static int
91vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
92{
93	return (state == VDEV_STATE_CANT_OPEN &&
94	    aux == VDEV_AUX_OPEN_FAILED);
95}
96
/* ARGSUSED */
static int
vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
{
	/*
	 * Any nonzero cumulative error count (the caller passes the sum of
	 * read, write, and checksum errors) marks the device as failing.
	 */
	if (errs == 0)
		return (0);
	return (1);
}
103
104/* ARGSUSED */
105static int
106vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
107{
108	return (state == VDEV_STATE_CANT_OPEN);
109}
110
111/* ARGSUSED */
112static int
113vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
114{
115	return (state == VDEV_STATE_OFFLINE);
116}
117
/*
 * Detect whether any leaf device under 'vdev' satisfies the given predicate.
 * 'func' is invoked for each leaf with its state, aux value, and the sum of
 * its read, write, and checksum error counts.  Returns B_TRUE as soon as any
 * leaf matches, B_FALSE otherwise.
 */
static boolean_t
find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
{
	nvlist_t **child;
	vdev_stat_t *vs;
	uint_t c, children;
	char *type;

	/*
	 * Ignore problems within a 'replacing' vdev, since we're presumably in
	 * the process of repairing any such errors, and don't want to call them
	 * out again.  We'll pick up the fact that a resilver is happening
	 * later.
	 */
	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
		return (B_FALSE);

	/* Interior vdev (has children): recurse into each child. */
	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			if (find_vdev_problem(child[c], func))
				return (B_TRUE);
	} else {
		/* Leaf vdev: apply the predicate to its stats. */
		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
		    (uint64_t **)&vs, &c) == 0);

		if (func(vs->vs_state, vs->vs_aux,
		    vs->vs_read_errors +
		    vs->vs_write_errors +
		    vs->vs_checksum_errors))
			return (B_TRUE);
	}

	return (B_FALSE);
}
157
/*
 * Active pool health status.
 *
 * To determine the status for a pool, we make several passes over the config,
 * picking the most egregious error we find.  In order of importance, we do the
 * following:
 *
 *	- Check for a complete and valid configuration
 *	- Look for any missing devices in a non-replicated config
 *	- Check for any data errors
 *	- Check for any missing devices in a replicated config
 *	- Look for any devices showing errors
 *	- Check for any resilvering devices
 *
 * There can obviously be multiple errors within a single pool, so this routine
 * only picks the most damaging of all the current errors to report.
 *
 * 'isimport' is B_TRUE when the pool is being examined for import; in that
 * case the checks that only apply to an open pool (persistent data errors,
 * per-device error counts) are skipped.
 */
static zpool_status_t
check_status(nvlist_t *config, boolean_t isimport)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;	/* stats of the root vdev */
	uint_t vsc;
	uint64_t nerr;		/* persistent data error count */
	uint64_t version;	/* on-disk pool version */
	uint64_t stateval;
	uint64_t hostid = 0;

	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &version) == 0);
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
	    (uint64_t **)&vs, &vsc) == 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &stateval) == 0);
	/*
	 * The hostid is optional; if absent, 'hostid' stays 0 and the
	 * mismatch check below is skipped.
	 */
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);

	/*
	 * Pool last accessed by another system.
	 */
	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
	    stateval == POOL_STATE_ACTIVE)
		return (ZPOOL_STATUS_HOSTID_MISMATCH);

	/*
	 * Newer on-disk version.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
		return (ZPOOL_STATUS_VERSION_NEWER);

	/*
	 * Check that the config is complete.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
		return (ZPOOL_STATUS_BAD_GUID_SUM);

	/*
	 * Missing devices in non-replicated config.
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    find_vdev_problem(nvroot, vdev_missing))
		return (ZPOOL_STATUS_MISSING_DEV_NR);

	/*
	 * Corrupt labels in a non-replicated config (pool can't open and
	 * some leaf is broken for a reason other than a failed open).
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    find_vdev_problem(nvroot, vdev_broken))
		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);

	/*
	 * Corrupted pool metadata
	 */
	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
		return (ZPOOL_STATUS_CORRUPT_POOL);

	/*
	 * Persistent data errors.  Only meaningful for an open pool, so
	 * skipped on import.
	 */
	if (!isimport) {
		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
		    &nerr) == 0 && nerr != 0)
			return (ZPOOL_STATUS_CORRUPT_DATA);
	}

	/*
	 * Missing devices in a replicated config.
	 */
	if (find_vdev_problem(nvroot, vdev_missing))
		return (ZPOOL_STATUS_MISSING_DEV_R);
	if (find_vdev_problem(nvroot, vdev_broken))
		return (ZPOOL_STATUS_CORRUPT_LABEL_R);

	/*
	 * Devices with errors
	 */
	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
		return (ZPOOL_STATUS_FAILING_DEV);

	/*
	 * Offlined devices
	 */
	if (find_vdev_problem(nvroot, vdev_offlined))
		return (ZPOOL_STATUS_OFFLINE_DEV);

	/*
	 * Currently resilvering
	 */
	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
		return (ZPOOL_STATUS_RESILVERING);

	/*
	 * Outdated, but usable, version
	 */
	if (version < ZFS_VERSION)
		return (ZPOOL_STATUS_VERSION_OLDER);

	return (ZPOOL_STATUS_OK);
}
278
279zpool_status_t
280zpool_get_status(zpool_handle_t *zhp, char **msgid)
281{
282	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
283
284	if (ret >= NMSGID)
285		*msgid = NULL;
286	else
287		*msgid = zfs_msgid_table_active[ret];
288
289	return (ret);
290}
291
292zpool_status_t
293zpool_import_status(nvlist_t *config, char **msgid)
294{
295	zpool_status_t ret = check_status(config, B_TRUE);
296
297	if (ret >= NMSGID)
298		*msgid = NULL;
299	else
300		*msgid = zfs_msgid_table[ret];
301
302	return (ret);
303}
304