libzfs_status.c revision 185029
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
/*
 * This file contains the functions which analyze the status of a pool.  This
 * includes both the status of an active pool, as well as the status of exported
 * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
 * the pool.  This status is independent (to a certain degree) from the state of
 * the pool.  A pool's state describes only whether or not it is capable of
 * providing the necessary fault tolerance for data.  The status describes the
 * overall status of devices.  A pool that is online can still have a device
 * that is experiencing errors.
 *
 * Only a subset of the possible faults can be detected using 'zpool status',
 * and not all possible errors correspond to an FMA message ID.  The explanation
 * is left up to the caller, depending on whether it is a live pool or an
 * import.
 */
41
42#include <libzfs.h>
43#include <string.h>
44#include <unistd.h>
45#include "libzfs_impl.h"
46
/*
 * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
 * in libzfs.h: a status value is used directly as the index into this table.
 * Note that there are some status results which go past the end of this table,
 * and hence have no associated message ID; anything indexing the table must
 * first check the status value against NMSGID.
 *
 * The array itself is const so that no slot can be repointed at run time.  The
 * element type stays 'char *' (rather than 'const char *') because entries are
 * handed back to callers through 'char **' out parameters.
 */
static char *const zfs_msgid_table[] = {
	"ZFS-8000-14",
	"ZFS-8000-2Q",
	"ZFS-8000-3C",
	"ZFS-8000-4J",
	"ZFS-8000-5E",
	"ZFS-8000-6X",
	"ZFS-8000-72",
	"ZFS-8000-8A",
	"ZFS-8000-9P",
	"ZFS-8000-A5",
	"ZFS-8000-EY",
	"ZFS-8000-HC",
	"ZFS-8000-JQ",
	"ZFS-8000-K4",
};

/* Number of entries in zfs_msgid_table. */
#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))
70
71/* ARGSUSED */
72static int
73vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
74{
75	return (state == VDEV_STATE_CANT_OPEN &&
76	    aux == VDEV_AUX_OPEN_FAILED);
77}
78
79/* ARGSUSED */
80static int
81vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
82{
83	return (state == VDEV_STATE_FAULTED);
84}
85
86/* ARGSUSED */
87static int
88vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
89{
90	return (state == VDEV_STATE_DEGRADED || errs != 0);
91}
92
93/* ARGSUSED */
94static int
95vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
96{
97	return (state == VDEV_STATE_CANT_OPEN);
98}
99
100/* ARGSUSED */
101static int
102vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
103{
104	return (state == VDEV_STATE_OFFLINE);
105}
106
107/*
108 * Detect if any leaf devices that have seen errors or could not be opened.
109 */
110static boolean_t
111find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
112{
113	nvlist_t **child;
114	vdev_stat_t *vs;
115	uint_t c, children;
116	char *type;
117
118	/*
119	 * Ignore problems within a 'replacing' vdev, since we're presumably in
120	 * the process of repairing any such errors, and don't want to call them
121	 * out again.  We'll pick up the fact that a resilver is happening
122	 * later.
123	 */
124	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
125	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
126		return (B_FALSE);
127
128	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
129	    &children) == 0) {
130		for (c = 0; c < children; c++)
131			if (find_vdev_problem(child[c], func))
132				return (B_TRUE);
133	} else {
134		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
135		    (uint64_t **)&vs, &c) == 0);
136
137		if (func(vs->vs_state, vs->vs_aux,
138		    vs->vs_read_errors +
139		    vs->vs_write_errors +
140		    vs->vs_checksum_errors))
141			return (B_TRUE);
142	}
143
144	return (B_FALSE);
145}
146
147/*
148 * Active pool health status.
149 *
150 * To determine the status for a pool, we make several passes over the config,
151 * picking the most egregious error we find.  In order of importance, we do the
152 * following:
153 *
154 *	- Check for a complete and valid configuration
155 *	- Look for any faulted or missing devices in a non-replicated config
156 *	- Check for any data errors
157 *	- Check for any faulted or missing devices in a replicated config
158 *	- Look for any devices showing errors
159 *	- Check for any resilvering devices
160 *
161 * There can obviously be multiple errors within a single pool, so this routine
162 * only picks the most damaging of all the current errors to report.
163 */
164static zpool_status_t
165check_status(nvlist_t *config, boolean_t isimport)
166{
167	nvlist_t *nvroot;
168	vdev_stat_t *vs;
169	uint_t vsc;
170	uint64_t nerr;
171	uint64_t version;
172	uint64_t stateval;
173	uint64_t suspended;
174	uint64_t hostid = 0;
175
176	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
177	    &version) == 0);
178	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
179	    &nvroot) == 0);
180	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
181	    (uint64_t **)&vs, &vsc) == 0);
182	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
183	    &stateval) == 0);
184	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
185
186	/*
187	 * Pool last accessed by another system.
188	 */
189	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
190	    stateval == POOL_STATE_ACTIVE)
191		return (ZPOOL_STATUS_HOSTID_MISMATCH);
192
193	/*
194	 * Newer on-disk version.
195	 */
196	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
197	    vs->vs_aux == VDEV_AUX_VERSION_NEWER)
198		return (ZPOOL_STATUS_VERSION_NEWER);
199
200	/*
201	 * Check that the config is complete.
202	 */
203	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
204	    vs->vs_aux == VDEV_AUX_BAD_GUID_SUM)
205		return (ZPOOL_STATUS_BAD_GUID_SUM);
206
207	/*
208	 * Check whether the pool has suspended due to failed I/O.
209	 */
210	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED,
211	    &suspended) == 0) {
212		if (suspended == ZIO_FAILURE_MODE_CONTINUE)
213			return (ZPOOL_STATUS_IO_FAILURE_CONTINUE);
214		return (ZPOOL_STATUS_IO_FAILURE_WAIT);
215	}
216
217	/*
218	 * Could not read a log.
219	 */
220	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
221	    vs->vs_aux == VDEV_AUX_BAD_LOG) {
222		return (ZPOOL_STATUS_BAD_LOG);
223	}
224
225	/*
226	 * Bad devices in non-replicated config.
227	 */
228	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
229	    find_vdev_problem(nvroot, vdev_faulted))
230		return (ZPOOL_STATUS_FAULTED_DEV_NR);
231
232	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
233	    find_vdev_problem(nvroot, vdev_missing))
234		return (ZPOOL_STATUS_MISSING_DEV_NR);
235
236	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
237	    find_vdev_problem(nvroot, vdev_broken))
238		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
239
240	/*
241	 * Corrupted pool metadata
242	 */
243	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
244	    vs->vs_aux == VDEV_AUX_CORRUPT_DATA)
245		return (ZPOOL_STATUS_CORRUPT_POOL);
246
247	/*
248	 * Persistent data errors.
249	 */
250	if (!isimport) {
251		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
252		    &nerr) == 0 && nerr != 0)
253			return (ZPOOL_STATUS_CORRUPT_DATA);
254	}
255
256	/*
257	 * Missing devices in a replicated config.
258	 */
259	if (find_vdev_problem(nvroot, vdev_faulted))
260		return (ZPOOL_STATUS_FAULTED_DEV_R);
261	if (find_vdev_problem(nvroot, vdev_missing))
262		return (ZPOOL_STATUS_MISSING_DEV_R);
263	if (find_vdev_problem(nvroot, vdev_broken))
264		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
265
266	/*
267	 * Devices with errors
268	 */
269	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
270		return (ZPOOL_STATUS_FAILING_DEV);
271
272	/*
273	 * Offlined devices
274	 */
275	if (find_vdev_problem(nvroot, vdev_offlined))
276		return (ZPOOL_STATUS_OFFLINE_DEV);
277
278	/*
279	 * Currently resilvering
280	 */
281	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
282		return (ZPOOL_STATUS_RESILVERING);
283
284	/*
285	 * Outdated, but usable, version
286	 */
287	if (version < SPA_VERSION)
288		return (ZPOOL_STATUS_VERSION_OLDER);
289
290	return (ZPOOL_STATUS_OK);
291}
292
293zpool_status_t
294zpool_get_status(zpool_handle_t *zhp, char **msgid)
295{
296	zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE);
297
298	if (ret >= NMSGID)
299		*msgid = NULL;
300	else
301		*msgid = zfs_msgid_table[ret];
302
303	return (ret);
304}
305
306zpool_status_t
307zpool_import_status(nvlist_t *config, char **msgid)
308{
309	zpool_status_t ret = check_status(config, B_TRUE);
310
311	if (ret >= NMSGID)
312		*msgid = NULL;
313	else
314		*msgid = zfs_msgid_table[ret];
315
316	return (ret);
317}
318