vdev.c revision 228103
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
};
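
/*
 * Note: in kernel builds the table plugs in the FreeBSD GEOM-backed leaf
 * vdev, while userland builds (e.g. libzpool consumers) fall back to the
 * raw-disk vdev; the remaining entries are shared between the two.
 */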

/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;

TUNABLE_INT("vfs.zfs.scrub_limit", &zfs_scrub_limit);
SYSCTL_INT(_vfs_zfs, OID_AUTO, scrub_limit, CTLFLAG_RDTUN, &zfs_scrub_limit, 0,
    "Maximum scrub/resilver I/O queue");

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
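
/*
 * Illustration: with vdev_top->vdev_ashift == 12 (4 KB sectors), a psize
 * of 5000 bytes rounds up to P2ROUNDUP(5000, 4096) == 8192, which is
 * then compared against each child's asize for that same psize.
 */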

/*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return (pvd->vdev_min_asize / pvd->vdev_children);

	return (pvd->vdev_min_asize);
}
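
/*
 * Illustration: a 4-child raidz top-level vdev with a min_asize of
 * 400 GB requires each leaf to provide at least 100 GB, while mirror
 * and replacing children simply inherit their parent's min_asize.
 */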

void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}
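
/*
 * The running vdev_guid_sum maintained here is what pool open compares
 * against the uberblock's guid sum to detect missing or substituted
 * devices.
 */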

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (ENOTSUP);

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (ENOTSUP);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (EINVAL);
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (ENOTSUP);
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (EINVAL);
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
	}

	if (parent && !parent->vdev_parent) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_smo.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
		    &vd->vdev_resilvering);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.  Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}
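
/*
 * Illustrative sketch (hypothetical caller, not from this file): a
 * minimal leaf config handed to vdev_alloc() carries at least the keys
 * looked up above, e.g.
 *
 *	ZPOOL_CONFIG_TYPE = "disk"
 *	ZPOOL_CONFIG_ID   = <child index; must match 'id' for loads>
 *	ZPOOL_CONFIG_GUID = <vdev guid; required for loads>
 *	ZPOOL_CONFIG_PATH = "/dev/..."
 *
 * with the metaslab array/shift and asize present only when a top-level
 * vdev is being loaded from an existing label.
 */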

void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		space_map_unload(&vd->vdev_dtl[t]);
		space_map_destroy(&vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}
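
/*
 * This interposition is how attach and spare activation work: the new
 * mirror/replacing/spare vdev is placed above the existing child, which
 * keeps its data and its slot in the tree while gaining a sibling to
 * resilver to.
 */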

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio.  Note, we hard-code
	 * in 128k (1 << 17) because it is the current "typical" blocksize.
	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
	 * or we will inconsistently account for existing bp's.
	 */
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
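	/*
	 * Example: for a plain disk, vdev_psize_to_asize(vd, 1 << 17) is
	 * simply 128K, giving 131072 / (131072 >> 9) == 512; raidz
	 * inflates asize with parity, so its ratio comes out
	 * proportionally smaller.
	 */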

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;

static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = ENXIO;
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		while ((pio = zio_walk_parents(zio)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = ENXIO;

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible by reading and writing
 * to several known locations: the pad regions of each vdev label
 * but the first (which we leave alone in case it contains a VTOC).
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE.  This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O.  That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}
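
/*
 * Usage note: vdev_open() issues a synchronous probe via
 * zio_wait(vdev_probe(vd, NULL)), while I/O-path callers pass their own
 * zio, become parents of the shared probe, and get NULL back.
 */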

static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
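	/*
	 * Note: the B_TRUE below short-circuits the zvol check, so the
	 * single-threaded path is currently taken unconditionally and the
	 * taskq-based parallel open that follows is effectively disabled.
	 */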
1095219089Spjd	if (B_TRUE || vdev_uses_zvols(vd)) {
1096219089Spjd		for (int c = 0; c < children; c++)
1097219089Spjd			vd->vdev_child[c]->vdev_open_error =
1098219089Spjd			    vdev_open(vd->vdev_child[c]);
1099219089Spjd		return;
1100219089Spjd	}
1101219089Spjd	tq = taskq_create("vdev_open", children, minclsyspri,
1102219089Spjd	    children, children, TASKQ_PREPOPULATE);
1103219089Spjd
1104219089Spjd	for (int c = 0; c < children; c++)
1105219089Spjd		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1106219089Spjd		    TQ_SLEEP) != 0);
1107219089Spjd
1108219089Spjd	taskq_destroy(tq);
1109219089Spjd}
1110219089Spjd
1111185029Spjd/*
1112168404Spjd * Prepare a virtual device for access.
1113168404Spjd */
1114168404Spjdint
1115168404Spjdvdev_open(vdev_t *vd)
1116168404Spjd{
1117209962Smm	spa_t *spa = vd->vdev_spa;
1118168404Spjd	int error;
1119168404Spjd	uint64_t osize = 0;
1120168404Spjd	uint64_t asize, psize;
1121168404Spjd	uint64_t ashift = 0;
1122168404Spjd
1123219089Spjd	ASSERT(vd->vdev_open_thread == curthread ||
1124219089Spjd	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1125168404Spjd	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1126168404Spjd	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1127168404Spjd	    vd->vdev_state == VDEV_STATE_OFFLINE);
1128168404Spjd
1129168404Spjd	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1130213197Smm	vd->vdev_cant_read = B_FALSE;
1131213197Smm	vd->vdev_cant_write = B_FALSE;
1132219089Spjd	vd->vdev_min_asize = vdev_get_min_asize(vd);
1133168404Spjd
1134219089Spjd	/*
1135219089Spjd	 * If this vdev is not removed, check its fault status.  If it's
1136219089Spjd	 * faulted, bail out of the open.
1137219089Spjd	 */
1138185029Spjd	if (!vd->vdev_removed && vd->vdev_faulted) {
1139168404Spjd		ASSERT(vd->vdev_children == 0);
1140219089Spjd		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1141219089Spjd		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1142185029Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1143219089Spjd		    vd->vdev_label_aux);
1144185029Spjd		return (ENXIO);
1145185029Spjd	} else if (vd->vdev_offline) {
1146185029Spjd		ASSERT(vd->vdev_children == 0);
1147168404Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1148168404Spjd		return (ENXIO);
1149168404Spjd	}
1150168404Spjd
1151168404Spjd	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
1152168404Spjd
1153219089Spjd	/*
1154219089Spjd	 * Reset the vdev_reopening flag so that we actually close
1155219089Spjd	 * the vdev on error.
1156219089Spjd	 */
1157219089Spjd	vd->vdev_reopening = B_FALSE;
1158168404Spjd	if (zio_injection_enabled && error == 0)
1159213198Smm		error = zio_handle_device_injection(vd, NULL, ENXIO);
1160168404Spjd
1161185029Spjd	if (error) {
1162185029Spjd		if (vd->vdev_removed &&
1163185029Spjd		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1164185029Spjd			vd->vdev_removed = B_FALSE;
1165168404Spjd
1166168404Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1167168404Spjd		    vd->vdev_stat.vs_aux);
1168168404Spjd		return (error);
1169168404Spjd	}
1170168404Spjd
1171185029Spjd	vd->vdev_removed = B_FALSE;
1172168404Spjd
1173219089Spjd	/*
1174219089Spjd	 * Recheck the faulted flag now that we have confirmed that
1175219089Spjd	 * the vdev is accessible.  If we're faulted, bail.
1176219089Spjd	 */
1177219089Spjd	if (vd->vdev_faulted) {
1178219089Spjd		ASSERT(vd->vdev_children == 0);
1179219089Spjd		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1180219089Spjd		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1181219089Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1182219089Spjd		    vd->vdev_label_aux);
1183219089Spjd		return (ENXIO);
1184219089Spjd	}
1185219089Spjd
1186185029Spjd	if (vd->vdev_degraded) {
1187185029Spjd		ASSERT(vd->vdev_children == 0);
1188185029Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1189185029Spjd		    VDEV_AUX_ERR_EXCEEDED);
1190185029Spjd	} else {
1191219089Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
1192185029Spjd	}
1193185029Spjd
1194219089Spjd	/*
1195219089Spjd	 * For hole or missing vdevs we just return success.
1196219089Spjd	 */
1197219089Spjd	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
1198219089Spjd		return (0);
1199219089Spjd
1200219089Spjd	for (int c = 0; c < vd->vdev_children; c++) {
1201168404Spjd		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1202168404Spjd			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1203168404Spjd			    VDEV_AUX_NONE);
1204168404Spjd			break;
1205168404Spjd		}
1206219089Spjd	}
1207168404Spjd
1208168404Spjd	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
1209168404Spjd
1210168404Spjd	if (vd->vdev_children == 0) {
1211168404Spjd		if (osize < SPA_MINDEVSIZE) {
1212168404Spjd			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1213168404Spjd			    VDEV_AUX_TOO_SMALL);
1214168404Spjd			return (EOVERFLOW);
1215168404Spjd		}
1216168404Spjd		psize = osize;
1217168404Spjd		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
1218168404Spjd	} else {
1219168404Spjd		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
1220168404Spjd		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
1221168404Spjd			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1222168404Spjd			    VDEV_AUX_TOO_SMALL);
1223168404Spjd			return (EOVERFLOW);
1224168404Spjd		}
1225168404Spjd		psize = 0;
1226168404Spjd		asize = osize;
1227168404Spjd	}
1228168404Spjd
1229168404Spjd	vd->vdev_psize = psize;
1230168404Spjd
1231219089Spjd	/*
1232219089Spjd	 * Make sure the allocatable size hasn't shrunk.
1233219089Spjd	 */
1234219089Spjd	if (asize < vd->vdev_min_asize) {
1235219089Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1236219089Spjd		    VDEV_AUX_BAD_LABEL);
1237219089Spjd		return (EINVAL);
1238219089Spjd	}
1239219089Spjd
1240168404Spjd	if (vd->vdev_asize == 0) {
1241168404Spjd		/*
1242168404Spjd		 * This is the first-ever open, so use the computed values.
1243168404Spjd		 * For testing purposes, a higher ashift can be requested.
1244168404Spjd		 */
1245168404Spjd		vd->vdev_asize = asize;
1246168404Spjd		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1247168404Spjd	} else {
1248168404Spjd		/*
1249168404Spjd		 * Make sure the alignment requirement hasn't increased.
1250168404Spjd		 */
1251168404Spjd		if (ashift > vd->vdev_top->vdev_ashift) {
1252168404Spjd			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1253168404Spjd			    VDEV_AUX_BAD_LABEL);
1254168404Spjd			return (EINVAL);
1255168404Spjd		}
1256219089Spjd	}
1257168404Spjd
1258219089Spjd	/*
1259219089Spjd	 * If all children are healthy and the asize has increased,
1260219089Spjd	 * then we've experienced dynamic LUN growth.  If automatic
1261219089Spjd	 * expansion is enabled then use the additional space.
1262219089Spjd	 */
1263219089Spjd	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
1264219089Spjd	    (vd->vdev_expanding || spa->spa_autoexpand))
1265219089Spjd		vd->vdev_asize = asize;
1266168404Spjd
1267219089Spjd	vdev_set_min_asize(vd);
1268168404Spjd
1269168404Spjd	/*
1270185029Spjd	 * Ensure we can issue some IO before declaring the
1271185029Spjd	 * vdev open for business.
1272185029Spjd	 */
1273185029Spjd	if (vd->vdev_ops->vdev_op_leaf &&
1274185029Spjd	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
1275219089Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1276219089Spjd		    VDEV_AUX_ERR_EXCEEDED);
1277185029Spjd		return (error);
1278185029Spjd	}
1279185029Spjd
1280185029Spjd	/*
1281185029Spjd	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1282209962Smm	 * resilver.  But don't do this if we are doing a reopen for a scrub,
1283209962Smm	 * since this would just restart the scrub we are already doing.
1284168404Spjd	 */
1285209962Smm	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1286209962Smm	    vdev_resilver_needed(vd, NULL, NULL))
1287209962Smm		spa_async_request(spa, SPA_ASYNC_RESILVER);
1288168404Spjd
1289168404Spjd	return (0);
1290168404Spjd}
1291168404Spjd
1292168404Spjd/*
1293168404Spjd * Called once the vdevs are all opened, this routine validates the label
1294168404Spjd * contents.  This needs to be done before vdev_load() so that we don't
1295185029Spjd * inadvertently do repair I/Os to the wrong device.
1296168404Spjd *
1297168404Spjd * This function will only return failure if one of the vdevs indicates that it
1298168404Spjd * has since been destroyed or exported.  This is only possible if
1299168404Spjd * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
1300168404Spjd * will be updated but the function will return 0.
1301168404Spjd */
1302168404Spjdint
1303168404Spjdvdev_validate(vdev_t *vd)
1304168404Spjd{
1305168404Spjd	spa_t *spa = vd->vdev_spa;
1306168404Spjd	nvlist_t *label;
1307219089Spjd	uint64_t guid = 0, top_guid;
1308168404Spjd	uint64_t state;
1309168404Spjd
1310219089Spjd	for (int c = 0; c < vd->vdev_children; c++)
1311168404Spjd		if (vdev_validate(vd->vdev_child[c]) != 0)
1312168926Spjd			return (EBADF);
1313168404Spjd
1314168404Spjd	/*
1315168404Spjd	 * If the device has already failed, or was marked offline, don't do
1316168404Spjd	 * any further validation.  Otherwise, label I/O will fail and we will
1317168404Spjd	 * overwrite the previous state.
1318168404Spjd	 */
1319185029Spjd	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1320219089Spjd		uint64_t aux_guid = 0;
1321219089Spjd		nvlist_t *nvl;
1322168404Spjd
1323168404Spjd		if ((label = vdev_label_read_config(vd)) == NULL) {
1324168404Spjd			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1325168404Spjd			    VDEV_AUX_BAD_LABEL);
1326168404Spjd			return (0);
1327168404Spjd		}
1328168404Spjd
1329219089Spjd		/*
1330219089Spjd		 * Determine if this vdev has been split off into another
1331219089Spjd		 * pool.  If so, then refuse to open it.
1332219089Spjd		 */
1333219089Spjd		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1334219089Spjd		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1335219089Spjd			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1336219089Spjd			    VDEV_AUX_SPLIT_POOL);
1337219089Spjd			nvlist_free(label);
1338219089Spjd			return (0);
1339219089Spjd		}
1340219089Spjd
1341168404Spjd		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1342168404Spjd		    &guid) != 0 || guid != spa_guid(spa)) {
1343168404Spjd			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1344168404Spjd			    VDEV_AUX_CORRUPT_DATA);
1345168404Spjd			nvlist_free(label);
1346168404Spjd			return (0);
1347168404Spjd		}
1348168404Spjd
1349219089Spjd		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1350219089Spjd		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1351219089Spjd		    &aux_guid) != 0)
1352219089Spjd			aux_guid = 0;
1353219089Spjd
1354185029Spjd		/*
1355185029Spjd		 * If this vdev just became a top-level vdev because its
1356185029Spjd		 * sibling was detached, it will have adopted the parent's
1357185029Spjd		 * vdev guid -- but the label may or may not be on disk yet.
1358185029Spjd		 * Fortunately, either version of the label will have the
1359185029Spjd		 * same top guid, so if we're a top-level vdev, we can
1360185029Spjd		 * safely compare to that instead.
1361219089Spjd		 *
1362219089Spjd		 * If we split this vdev off instead, then we also check the
1363219089Spjd		 * original pool's guid.  We don't want to consider the vdev
1364219089Spjd		 * corrupt if it is partway through a split operation.
1365185029Spjd		 */
1366168404Spjd		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1367185029Spjd		    &guid) != 0 ||
1368185029Spjd		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1369185029Spjd		    &top_guid) != 0 ||
1370219089Spjd		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
1371185029Spjd		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1372168404Spjd			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1373168404Spjd			    VDEV_AUX_CORRUPT_DATA);
1374168404Spjd			nvlist_free(label);
1375168404Spjd			return (0);
1376168404Spjd		}
1377168404Spjd
1378168404Spjd		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1379168404Spjd		    &state) != 0) {
1380168404Spjd			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1381168404Spjd			    VDEV_AUX_CORRUPT_DATA);
1382168404Spjd			nvlist_free(label);
1383168404Spjd			return (0);
1384168404Spjd		}
1385168404Spjd
1386168404Spjd		nvlist_free(label);
1387168404Spjd
1388209962Smm		/*
1389219089Spjd		 * If this is a verbatim import, no need to check the
1390209962Smm		 * state of the pool.
1391209962Smm		 */
1392219089Spjd		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1393219089Spjd		    spa_load_state(spa) == SPA_LOAD_OPEN &&
1394168404Spjd		    state != POOL_STATE_ACTIVE)
1395168926Spjd			return (EBADF);
1396185029Spjd
1397185029Spjd		/*
1398185029Spjd		 * If we were able to open and validate a vdev that was
1399185029Spjd		 * previously marked permanently unavailable, clear that state
1400185029Spjd		 * now.
1401185029Spjd		 */
1402185029Spjd		if (vd->vdev_not_present)
1403185029Spjd			vd->vdev_not_present = 0;
1404168404Spjd	}
1405168404Spjd
1406168404Spjd	return (0);
1407168404Spjd}
1408168404Spjd
1409168404Spjd/*
1410168404Spjd * Close a virtual device.
1411168404Spjd */
1412168404Spjdvoid
1413168404Spjdvdev_close(vdev_t *vd)
1414168404Spjd{
1415209962Smm	spa_t *spa = vd->vdev_spa;
1416219089Spjd	vdev_t *pvd = vd->vdev_parent;
1417209962Smm
1418209962Smm	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1419209962Smm
1420219089Spjd	/*
1421219089Spjd	 * If our parent is reopening, then we are as well, unless we are
1422219089Spjd	 * going offline.
1423219089Spjd	 */
1424219089Spjd	if (pvd != NULL && pvd->vdev_reopening)
1425219089Spjd		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1426219089Spjd
1427168404Spjd	vd->vdev_ops->vdev_op_close(vd);
1428168404Spjd
1429185029Spjd	vdev_cache_purge(vd);
1430168404Spjd
1431168404Spjd	/*
1432219089Spjd	 * We record the previous state before we close it, so that if we are
1433168404Spjd	 * doing a reopen(), we don't generate FMA ereports if we notice that
1434168404Spjd	 * it's still faulted.
1435168404Spjd	 */
1436168404Spjd	vd->vdev_prevstate = vd->vdev_state;
1437168404Spjd
1438168404Spjd	if (vd->vdev_offline)
1439168404Spjd		vd->vdev_state = VDEV_STATE_OFFLINE;
1440168404Spjd	else
1441168404Spjd		vd->vdev_state = VDEV_STATE_CLOSED;
1442168404Spjd	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1443168404Spjd}
1444168404Spjd
1445168404Spjdvoid
1446219089Spjdvdev_hold(vdev_t *vd)
1447219089Spjd{
1448219089Spjd	spa_t *spa = vd->vdev_spa;
1449219089Spjd
1450219089Spjd	ASSERT(spa_is_root(spa));
1451219089Spjd	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1452219089Spjd		return;
1453219089Spjd
1454219089Spjd	for (int c = 0; c < vd->vdev_children; c++)
1455219089Spjd		vdev_hold(vd->vdev_child[c]);
1456219089Spjd
1457219089Spjd	if (vd->vdev_ops->vdev_op_leaf)
1458219089Spjd		vd->vdev_ops->vdev_op_hold(vd);
1459219089Spjd}
1460219089Spjd
1461219089Spjdvoid
1462219089Spjdvdev_rele(vdev_t *vd)
1463219089Spjd{
1464219089Spjd	spa_t *spa = vd->vdev_spa;
1465219089Spjd
1466219089Spjd	ASSERT(spa_is_root(spa));
1467219089Spjd	for (int c = 0; c < vd->vdev_children; c++)
1468219089Spjd		vdev_rele(vd->vdev_child[c]);
1469219089Spjd
1470219089Spjd	if (vd->vdev_ops->vdev_op_leaf)
1471219089Spjd		vd->vdev_ops->vdev_op_rele(vd);
1472219089Spjd}
1473219089Spjd
1474219089Spjd/*
1475219089Spjd * Reopen all interior vdevs and any unopened leaves.  We don't actually
1476219089Spjd * reopen leaf vdevs which had previously been opened as they might deadlock
1477219089Spjd * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
1478219089Spjd * If the leaf has never been opened then open it, as usual.
1479219089Spjd */
1480219089Spjdvoid
1481168404Spjdvdev_reopen(vdev_t *vd)
1482168404Spjd{
1483168404Spjd	spa_t *spa = vd->vdev_spa;
1484168404Spjd
1485185029Spjd	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1486168404Spjd
1487219089Spjd	/* set the reopening flag unless we're taking the vdev offline */
1488219089Spjd	vd->vdev_reopening = !vd->vdev_offline;
1489168404Spjd	vdev_close(vd);
1490168404Spjd	(void) vdev_open(vd);
1491168404Spjd
1492168404Spjd	/*
1493168404Spjd	 * Call vdev_validate() here to make sure we have the same device.
1494168404Spjd	 * Otherwise, a device with an invalid label could be successfully
1495168404Spjd	 * opened in response to vdev_reopen().
1496168404Spjd	 */
1497185029Spjd	if (vd->vdev_aux) {
1498185029Spjd		(void) vdev_validate_aux(vd);
1499185029Spjd		if (vdev_readable(vd) && vdev_writeable(vd) &&
1500209962Smm		    vd->vdev_aux == &spa->spa_l2cache &&
1501219089Spjd		    !l2arc_vdev_present(vd))
1502219089Spjd			l2arc_add_vdev(spa, vd);
1503185029Spjd	} else {
1504185029Spjd		(void) vdev_validate(vd);
1505185029Spjd	}
1506168404Spjd
1507168404Spjd	/*
1508185029Spjd	 * Reassess parent vdev's health.
1509168404Spjd	 */
1510185029Spjd	vdev_propagate_state(vd);
1511168404Spjd}
1512168404Spjd
1513168404Spjdint
1514168404Spjdvdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1515168404Spjd{
1516168404Spjd	int error;
1517168404Spjd
1518168404Spjd	/*
1519168404Spjd	 * Normally, partial opens (e.g. of a mirror) are allowed.
1520168404Spjd	 * For a create, however, we want to fail the request if
1521168404Spjd	 * there are any components we can't open.
1522168404Spjd	 */
1523168404Spjd	error = vdev_open(vd);
1524168404Spjd
1525168404Spjd	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1526168404Spjd		vdev_close(vd);
1527168404Spjd		return (error ? error : ENXIO);
1528168404Spjd	}
1529168404Spjd
1530168404Spjd	/*
1531168404Spjd	 * Recursively initialize all labels.
1532168404Spjd	 */
1533168404Spjd	if ((error = vdev_label_init(vd, txg, isreplacing ?
1534168404Spjd	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1535168404Spjd		vdev_close(vd);
1536168404Spjd		return (error);
1537168404Spjd	}
1538168404Spjd
1539168404Spjd	return (0);
1540168404Spjd}
1541168404Spjd
1542168404Spjdvoid
1543219089Spjdvdev_metaslab_set_size(vdev_t *vd)
1544168404Spjd{
1545168404Spjd	/*
1546168404Spjd	 * Aim for roughly 200 metaslabs per vdev.
1547168404Spjd	 */
1548168404Spjd	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1549168404Spjd	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1550168404Spjd}
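
/*
 * Illustrative arithmetic (hypothetical size, not from any real pool):
 * for a 1 TB (2^40 byte) vdev, highbit(2^40 / 200) == 33, so each
 * metaslab spans 2^33 bytes (8 GB) and the vdev gets 128 of them --
 * "roughly 200" after rounding to a power of two.  The MAX() clamp
 * keeps ms_shift from dropping below SPA_MAXBLOCKSHIFT on very small
 * vdevs.
 */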
1551168404Spjd
1552168404Spjdvoid
1553168404Spjdvdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1554168404Spjd{
1555168404Spjd	ASSERT(vd == vd->vdev_top);
1556219089Spjd	ASSERT(!vd->vdev_ishole);
1557168404Spjd	ASSERT(ISP2(flags));
1558219089Spjd	ASSERT(spa_writeable(vd->vdev_spa));
1559168404Spjd
1560168404Spjd	if (flags & VDD_METASLAB)
1561168404Spjd		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1562168404Spjd
1563168404Spjd	if (flags & VDD_DTL)
1564168404Spjd		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1565168404Spjd
1566168404Spjd	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1567168404Spjd}
1568168404Spjd
1569209962Smm/*
1570209962Smm * DTLs.
1571209962Smm *
1572209962Smm * A vdev's DTL (dirty time log) is the set of transaction groups for which
1573219089Spjd * the vdev has less than perfect replication.  There are four kinds of DTL:
1574209962Smm *
1575209962Smm * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1576209962Smm *
1577209962Smm * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1578209962Smm *
1579209962Smm * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1580209962Smm *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1581209962Smm *	txgs that was scrubbed.
1582209962Smm *
1583209962Smm * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1584209962Smm *	persistent errors or just some device being offline.
1585209962Smm *	Unlike the other three, the DTL_OUTAGE map is not generally
1586209962Smm *	maintained; it's only computed when needed, typically to
1587209962Smm *	determine whether a device can be detached.
1588209962Smm *
1589209962Smm * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1590209962Smm * either has the data or it doesn't.
1591209962Smm *
1592209962Smm * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1593209962Smm * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1594209962Smm * if any child is less than fully replicated, then so is its parent.
1595209962Smm * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1596209962Smm * comprising only those txgs which appear in more than 'maxfaults' children;
1597209962Smm * those are the txgs we don't have enough replication to read.  For example,
1598209962Smm * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1599209962Smm * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1600209962Smm * two child DTL_MISSING maps.
1601209962Smm *
1602209962Smm * It should be clear from the above that to compute the DTLs and outage maps
1603209962Smm * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1604209962Smm * Therefore, that is all we keep on disk.  When loading the pool, or after
1605209962Smm * a configuration change, we generate all other DTLs from first principles.
1606209962Smm */
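/*
 * An illustrative (purely hypothetical) example: consider a raidz2
 * top-level vdev whose four leaf children have DTL_MISSING maps of
 * {10}, {10, 11}, {10, 12} and {}.  Since maxfaults == 2, minref is
 * nparity + 1 == 3, so the parent's DTL_MISSING is {10} -- the only
 * txg present in three or more children -- while its DTL_PARTIAL
 * (minref == 1) is the plain union {10, 11, 12}.  See
 * vdev_dtl_reassess() below for the reference-tree computation.
 */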
1607168404Spjdvoid
1608209962Smmvdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1609168404Spjd{
1610209962Smm	space_map_t *sm = &vd->vdev_dtl[t];
1611209962Smm
1612209962Smm	ASSERT(t < DTL_TYPES);
1613209962Smm	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1614219089Spjd	ASSERT(spa_writeable(vd->vdev_spa));
1615209962Smm
1616168404Spjd	mutex_enter(sm->sm_lock);
1617168404Spjd	if (!space_map_contains(sm, txg, size))
1618168404Spjd		space_map_add(sm, txg, size);
1619168404Spjd	mutex_exit(sm->sm_lock);
1620168404Spjd}
1621168404Spjd
1622209962Smmboolean_t
1623209962Smmvdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1624168404Spjd{
1625209962Smm	space_map_t *sm = &vd->vdev_dtl[t];
1626209962Smm	boolean_t dirty = B_FALSE;
1627168404Spjd
1628209962Smm	ASSERT(t < DTL_TYPES);
1629209962Smm	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1630168404Spjd
1631168404Spjd	mutex_enter(sm->sm_lock);
1632209962Smm	if (sm->sm_space != 0)
1633209962Smm		dirty = space_map_contains(sm, txg, size);
1634168404Spjd	mutex_exit(sm->sm_lock);
1635168404Spjd
1636168404Spjd	return (dirty);
1637168404Spjd}
1638168404Spjd
1639209962Smmboolean_t
1640209962Smmvdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1641209962Smm{
1642209962Smm	space_map_t *sm = &vd->vdev_dtl[t];
1643209962Smm	boolean_t empty;
1644209962Smm
1645209962Smm	mutex_enter(sm->sm_lock);
1646209962Smm	empty = (sm->sm_space == 0);
1647209962Smm	mutex_exit(sm->sm_lock);
1648209962Smm
1649209962Smm	return (empty);
1650209962Smm}
1651209962Smm
1652168404Spjd/*
1653168404Spjd * Reassess DTLs after a config change or scrub completion.
1654168404Spjd */
1655168404Spjdvoid
1656168404Spjdvdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1657168404Spjd{
1658168404Spjd	spa_t *spa = vd->vdev_spa;
1659209962Smm	avl_tree_t reftree;
1660209962Smm	int minref;
1661168404Spjd
1662209962Smm	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1663168404Spjd
1664209962Smm	for (int c = 0; c < vd->vdev_children; c++)
1665209962Smm		vdev_dtl_reassess(vd->vdev_child[c], txg,
1666209962Smm		    scrub_txg, scrub_done);
1667209962Smm
1668219089Spjd	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1669209962Smm		return;
1670209962Smm
1671209962Smm	if (vd->vdev_ops->vdev_op_leaf) {
1672219089Spjd		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1673219089Spjd
1674168404Spjd		mutex_enter(&vd->vdev_dtl_lock);
1675185029Spjd		if (scrub_txg != 0 &&
1676219089Spjd		    (spa->spa_scrub_started ||
1677219089Spjd		    (scn && scn->scn_phys.scn_errors == 0))) {
1678185029Spjd			/*
1679185029Spjd			 * We completed a scrub up to scrub_txg.  If we
1680185029Spjd			 * did it without rebooting, then the scrub dtl
1681185029Spjd			 * will be valid, so excise the old region and
1682185029Spjd			 * fold in the scrub dtl.  Otherwise, leave the
1683185029Spjd			 * dtl as-is if there was an error.
1684209962Smm			 *
1685209962Smm			 * There's a little trick here: to excise the beginning
1686209962Smm			 * of the DTL_MISSING map, we put it into a reference
1687209962Smm			 * tree and then add a segment with refcnt -1 that
1688209962Smm			 * covers the range [0, scrub_txg).  This means
1689209962Smm			 * that each txg in that range has refcnt -1 or 0.
1690209962Smm			 * We then add DTL_SCRUB with a refcnt of 2, so that
1691209962Smm			 * entries in the range [0, scrub_txg) will have a
1692209962Smm			 * positive refcnt -- either 1 or 2.  We then convert
1693209962Smm			 * the reference tree into the new DTL_MISSING map.
1694185029Spjd			 */
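			/*
			 * An illustrative walk-through with made-up txgs:
			 * suppose scrub_txg == 100, DTL_MISSING == {50, 60,
			 * 150} and DTL_SCRUB == {60}.  Txg 50 nets
			 * 1 - 1 == 0 (repaired, so excised), txg 60 nets
			 * 1 - 1 + 2 == 2 (still missing), and txg 150 nets
			 * 1 (outside the scrubbed range), so the regenerated
			 * DTL_MISSING below is {60, 150}.
			 */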
1695209962Smm			space_map_ref_create(&reftree);
1696209962Smm			space_map_ref_add_map(&reftree,
1697209962Smm			    &vd->vdev_dtl[DTL_MISSING], 1);
1698209962Smm			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1699209962Smm			space_map_ref_add_map(&reftree,
1700209962Smm			    &vd->vdev_dtl[DTL_SCRUB], 2);
1701209962Smm			space_map_ref_generate_map(&reftree,
1702209962Smm			    &vd->vdev_dtl[DTL_MISSING], 1);
1703209962Smm			space_map_ref_destroy(&reftree);
1704168404Spjd		}
1705209962Smm		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1706209962Smm		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1707209962Smm		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1708168404Spjd		if (scrub_done)
1709209962Smm			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1710209962Smm		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1711209962Smm		if (!vdev_readable(vd))
1712209962Smm			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1713209962Smm		else
1714209962Smm			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1715209962Smm			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1716168404Spjd		mutex_exit(&vd->vdev_dtl_lock);
1717185029Spjd
1718168404Spjd		if (txg != 0)
1719168404Spjd			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1720168404Spjd		return;
1721168404Spjd	}
1722168404Spjd
1723168404Spjd	mutex_enter(&vd->vdev_dtl_lock);
1724209962Smm	for (int t = 0; t < DTL_TYPES; t++) {
1725209962Smm		/* account for child's outage in parent's missing map */
1726209962Smm		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
1727209962Smm		if (t == DTL_SCRUB)
1728209962Smm			continue;			/* leaf vdevs only */
1729209962Smm		if (t == DTL_PARTIAL)
1730209962Smm			minref = 1;			/* i.e. non-zero */
1731209962Smm		else if (vd->vdev_nparity != 0)
1732209962Smm			minref = vd->vdev_nparity + 1;	/* RAID-Z */
1733209962Smm		else
1734209962Smm			minref = vd->vdev_children;	/* any kind of mirror */
1735209962Smm		space_map_ref_create(&reftree);
1736209962Smm		for (int c = 0; c < vd->vdev_children; c++) {
1737209962Smm			vdev_t *cvd = vd->vdev_child[c];
1738209962Smm			mutex_enter(&cvd->vdev_dtl_lock);
1739209962Smm			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
1740209962Smm			mutex_exit(&cvd->vdev_dtl_lock);
1741209962Smm		}
1742209962Smm		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
1743209962Smm		space_map_ref_destroy(&reftree);
1744209962Smm	}
1745168404Spjd	mutex_exit(&vd->vdev_dtl_lock);
1746168404Spjd}
1747168404Spjd
1748168404Spjdstatic int
1749168404Spjdvdev_dtl_load(vdev_t *vd)
1750168404Spjd{
1751168404Spjd	spa_t *spa = vd->vdev_spa;
1752209962Smm	space_map_obj_t *smo = &vd->vdev_dtl_smo;
1753168404Spjd	objset_t *mos = spa->spa_meta_objset;
1754168404Spjd	dmu_buf_t *db;
1755168404Spjd	int error;
1756168404Spjd
1757168404Spjd	ASSERT(vd->vdev_children == 0);
1758168404Spjd
1759168404Spjd	if (smo->smo_object == 0)
1760168404Spjd		return (0);
1761168404Spjd
1762219089Spjd	ASSERT(!vd->vdev_ishole);
1763219089Spjd
1764168404Spjd	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
1765168404Spjd		return (error);
1766168404Spjd
1767185029Spjd	ASSERT3U(db->db_size, >=, sizeof (*smo));
1768185029Spjd	bcopy(db->db_data, smo, sizeof (*smo));
1769168404Spjd	dmu_buf_rele(db, FTAG);
1770168404Spjd
1771168404Spjd	mutex_enter(&vd->vdev_dtl_lock);
1772209962Smm	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
1773209962Smm	    NULL, SM_ALLOC, smo, mos);
1774168404Spjd	mutex_exit(&vd->vdev_dtl_lock);
1775168404Spjd
1776168404Spjd	return (error);
1777168404Spjd}
1778168404Spjd
1779168404Spjdvoid
1780168404Spjdvdev_dtl_sync(vdev_t *vd, uint64_t txg)
1781168404Spjd{
1782168404Spjd	spa_t *spa = vd->vdev_spa;
1783209962Smm	space_map_obj_t *smo = &vd->vdev_dtl_smo;
1784209962Smm	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
1785168404Spjd	objset_t *mos = spa->spa_meta_objset;
1786168404Spjd	space_map_t smsync;
1787168404Spjd	kmutex_t smlock;
1788168404Spjd	dmu_buf_t *db;
1789168404Spjd	dmu_tx_t *tx;
1790168404Spjd
1791219089Spjd	ASSERT(!vd->vdev_ishole);
1792219089Spjd
1793168404Spjd	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1794168404Spjd
1795168404Spjd	if (vd->vdev_detached) {
1796168404Spjd		if (smo->smo_object != 0) {
1797168404Spjd			int err = dmu_object_free(mos, smo->smo_object, tx);
1798168404Spjd			ASSERT3U(err, ==, 0);
1799168404Spjd			smo->smo_object = 0;
1800168404Spjd		}
1801168404Spjd		dmu_tx_commit(tx);
1802168404Spjd		return;
1803168404Spjd	}
1804168404Spjd
1805168404Spjd	if (smo->smo_object == 0) {
1806168404Spjd		ASSERT(smo->smo_objsize == 0);
1807168404Spjd		ASSERT(smo->smo_alloc == 0);
1808168404Spjd		smo->smo_object = dmu_object_alloc(mos,
1809168404Spjd		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1810168404Spjd		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1811168404Spjd		ASSERT(smo->smo_object != 0);
1812168404Spjd		vdev_config_dirty(vd->vdev_top);
1813168404Spjd	}
1814168404Spjd
1815168404Spjd	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1816168404Spjd
1817168404Spjd	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1818168404Spjd	    &smlock);
1819168404Spjd
1820168404Spjd	mutex_enter(&smlock);
1821168404Spjd
1822168404Spjd	mutex_enter(&vd->vdev_dtl_lock);
1823168404Spjd	space_map_walk(sm, space_map_add, &smsync);
1824168404Spjd	mutex_exit(&vd->vdev_dtl_lock);
1825168404Spjd
1826168404Spjd	space_map_truncate(smo, mos, tx);
1827168404Spjd	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
1828168404Spjd
1829168404Spjd	space_map_destroy(&smsync);
1830168404Spjd
1831168404Spjd	mutex_exit(&smlock);
1832168404Spjd	mutex_destroy(&smlock);
1833168404Spjd
1834168404Spjd	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1835168404Spjd	dmu_buf_will_dirty(db, tx);
1836185029Spjd	ASSERT3U(db->db_size, >=, sizeof (*smo));
1837185029Spjd	bcopy(smo, db->db_data, sizeof (*smo));
1838168404Spjd	dmu_buf_rele(db, FTAG);
1839168404Spjd
1840168404Spjd	dmu_tx_commit(tx);
1841168404Spjd}
1842168404Spjd
1843185029Spjd/*
1844209962Smm * Determine whether the specified vdev can be offlined/detached/removed
1845209962Smm * without losing data.
1846209962Smm */
1847209962Smmboolean_t
1848209962Smmvdev_dtl_required(vdev_t *vd)
1849209962Smm{
1850209962Smm	spa_t *spa = vd->vdev_spa;
1851209962Smm	vdev_t *tvd = vd->vdev_top;
1852209962Smm	uint8_t cant_read = vd->vdev_cant_read;
1853209962Smm	boolean_t required;
1854209962Smm
1855209962Smm	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1856209962Smm
1857209962Smm	if (vd == spa->spa_root_vdev || vd == tvd)
1858209962Smm		return (B_TRUE);
1859209962Smm
1860209962Smm	/*
1861209962Smm	 * Temporarily mark the device as unreadable, and then determine
1862209962Smm	 * whether this results in any DTL outages in the top-level vdev.
1863209962Smm	 * If not, we can safely offline/detach/remove the device.
1864209962Smm	 */
1865209962Smm	vd->vdev_cant_read = B_TRUE;
1866209962Smm	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1867209962Smm	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
1868209962Smm	vd->vdev_cant_read = cant_read;
1869209962Smm	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1870209962Smm
1871219089Spjd	if (!required && zio_injection_enabled)
1872219089Spjd		required = !!zio_handle_device_injection(vd, NULL, ECHILD);
1873219089Spjd
1874209962Smm	return (required);
1875209962Smm}
1876209962Smm
1877209962Smm/*
1878185029Spjd * Determine if resilver is needed, and if so the txg range.
1879185029Spjd */
1880185029Spjdboolean_t
1881185029Spjdvdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1882185029Spjd{
1883185029Spjd	boolean_t needed = B_FALSE;
1884185029Spjd	uint64_t thismin = UINT64_MAX;
1885185029Spjd	uint64_t thismax = 0;
1886185029Spjd
1887185029Spjd	if (vd->vdev_children == 0) {
1888185029Spjd		mutex_enter(&vd->vdev_dtl_lock);
1889209962Smm		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1890209962Smm		    vdev_writeable(vd)) {
1891185029Spjd			space_seg_t *ss;
1892185029Spjd
1893209962Smm			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1894185029Spjd			thismin = ss->ss_start - 1;
1895209962Smm			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1896185029Spjd			thismax = ss->ss_end;
1897185029Spjd			needed = B_TRUE;
1898185029Spjd		}
1899185029Spjd		mutex_exit(&vd->vdev_dtl_lock);
1900185029Spjd	} else {
1901209962Smm		for (int c = 0; c < vd->vdev_children; c++) {
1902185029Spjd			vdev_t *cvd = vd->vdev_child[c];
1903185029Spjd			uint64_t cmin, cmax;
1904185029Spjd
1905185029Spjd			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
1906185029Spjd				thismin = MIN(thismin, cmin);
1907185029Spjd				thismax = MAX(thismax, cmax);
1908185029Spjd				needed = B_TRUE;
1909185029Spjd			}
1910185029Spjd		}
1911185029Spjd	}
1912185029Spjd
1913185029Spjd	if (needed && minp) {
1914185029Spjd		*minp = thismin;
1915185029Spjd		*maxp = thismax;
1916185029Spjd	}
1917185029Spjd	return (needed);
1918185029Spjd}
1919185029Spjd
1920168404Spjdvoid
1921168404Spjdvdev_load(vdev_t *vd)
1922168404Spjd{
1923168404Spjd	/*
1924168404Spjd	 * Recursively load all children.
1925168404Spjd	 */
1926209962Smm	for (int c = 0; c < vd->vdev_children; c++)
1927168404Spjd		vdev_load(vd->vdev_child[c]);
1928168404Spjd
1929168404Spjd	/*
1930168404Spjd	 * If this is a top-level vdev, initialize its metaslabs.
1931168404Spjd	 */
1932219089Spjd	if (vd == vd->vdev_top && !vd->vdev_ishole &&
1933168404Spjd	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
1934168404Spjd	    vdev_metaslab_init(vd, 0) != 0))
1935168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1936168404Spjd		    VDEV_AUX_CORRUPT_DATA);
1937168404Spjd
1938168404Spjd	/*
1939168404Spjd	 * If this is a leaf vdev, load its DTL.
1940168404Spjd	 */
1941168404Spjd	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
1942168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1943168404Spjd		    VDEV_AUX_CORRUPT_DATA);
1944168404Spjd}
1945168404Spjd
1946168404Spjd/*
1947185029Spjd * The special vdev case is used for hot spares and l2cache devices.  Its
1948185029Spjd * sole purpose is to set the vdev state for the associated vdev.  To do this,
1949185029Spjd * we make sure that we can open the underlying device, then try to read the
1950185029Spjd * label, and make sure that the label is sane and that it hasn't been
1951185029Spjd * repurposed to another pool.
1952168404Spjd */
1953168404Spjdint
1954185029Spjdvdev_validate_aux(vdev_t *vd)
1955168404Spjd{
1956168404Spjd	nvlist_t *label;
1957168404Spjd	uint64_t guid, version;
1958168404Spjd	uint64_t state;
1959168404Spjd
1960185029Spjd	if (!vdev_readable(vd))
1961185029Spjd		return (0);
1962185029Spjd
1963168404Spjd	if ((label = vdev_label_read_config(vd)) == NULL) {
1964168404Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1965168404Spjd		    VDEV_AUX_CORRUPT_DATA);
1966168404Spjd		return (-1);
1967168404Spjd	}
1968168404Spjd
1969168404Spjd	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
1970185029Spjd	    version > SPA_VERSION ||
1971168404Spjd	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
1972168404Spjd	    guid != vd->vdev_guid ||
1973168404Spjd	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
1974168404Spjd		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1975168404Spjd		    VDEV_AUX_CORRUPT_DATA);
1976168404Spjd		nvlist_free(label);
1977168404Spjd		return (-1);
1978168404Spjd	}
1979168404Spjd
1980168404Spjd	/*
1981168404Spjd	 * We don't actually check the pool state here.  If it's in fact in
1982168404Spjd	 * use by another pool, we update this fact on the fly when requested.
1983168404Spjd	 */
1984168404Spjd	nvlist_free(label);
1985168404Spjd	return (0);
1986168404Spjd}
1987168404Spjd
1988168404Spjdvoid
1989219089Spjdvdev_remove(vdev_t *vd, uint64_t txg)
1990219089Spjd{
1991219089Spjd	spa_t *spa = vd->vdev_spa;
1992219089Spjd	objset_t *mos = spa->spa_meta_objset;
1993219089Spjd	dmu_tx_t *tx;
1994219089Spjd
1995219089Spjd	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1996219089Spjd
1997219089Spjd	if (vd->vdev_dtl_smo.smo_object) {
1998219089Spjd		ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
1999219089Spjd		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
2000219089Spjd		vd->vdev_dtl_smo.smo_object = 0;
2001219089Spjd	}
2002219089Spjd
2003219089Spjd	if (vd->vdev_ms != NULL) {
2004219089Spjd		for (int m = 0; m < vd->vdev_ms_count; m++) {
2005219089Spjd			metaslab_t *msp = vd->vdev_ms[m];
2006219089Spjd
2007219089Spjd			if (msp == NULL || msp->ms_smo.smo_object == 0)
2008219089Spjd				continue;
2009219089Spjd
2010219089Spjd			ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
2011219089Spjd			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
2012219089Spjd			msp->ms_smo.smo_object = 0;
2013219089Spjd		}
2014219089Spjd	}
2015219089Spjd
2016219089Spjd	if (vd->vdev_ms_array) {
2017219089Spjd		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2018219089Spjd		vd->vdev_ms_array = 0;
2019219089Spjd		vd->vdev_ms_shift = 0;
2020219089Spjd	}
2021219089Spjd	dmu_tx_commit(tx);
2022219089Spjd}
2023219089Spjd
2024219089Spjdvoid
2025168404Spjdvdev_sync_done(vdev_t *vd, uint64_t txg)
2026168404Spjd{
2027168404Spjd	metaslab_t *msp;
2028211931Smm	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2029168404Spjd
2030219089Spjd	ASSERT(!vd->vdev_ishole);
2031219089Spjd
2032168404Spjd	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
2033168404Spjd		metaslab_sync_done(msp, txg);
2034211931Smm
2035211931Smm	if (reassess)
2036211931Smm		metaslab_sync_reassess(vd->vdev_mg);
2037168404Spjd}
2038168404Spjd
2039168404Spjdvoid
2040168404Spjdvdev_sync(vdev_t *vd, uint64_t txg)
2041168404Spjd{
2042168404Spjd	spa_t *spa = vd->vdev_spa;
2043168404Spjd	vdev_t *lvd;
2044168404Spjd	metaslab_t *msp;
2045168404Spjd	dmu_tx_t *tx;
2046168404Spjd
2047219089Spjd	ASSERT(!vd->vdev_ishole);
2048219089Spjd
2049168404Spjd	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
2050168404Spjd		ASSERT(vd == vd->vdev_top);
2051168404Spjd		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2052168404Spjd		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2053168404Spjd		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2054168404Spjd		ASSERT(vd->vdev_ms_array != 0);
2055168404Spjd		vdev_config_dirty(vd);
2056168404Spjd		dmu_tx_commit(tx);
2057168404Spjd	}
2058168404Spjd
2059219089Spjd	/*
2060219089Spjd	 * Remove the metadata associated with this vdev once it's empty.
2061219089Spjd	 */
2062219089Spjd	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
2063219089Spjd		vdev_remove(vd, txg);
2064219089Spjd
2065168404Spjd	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2066168404Spjd		metaslab_sync(msp, txg);
2067168404Spjd		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2068168404Spjd	}
2069168404Spjd
2070168404Spjd	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2071168404Spjd		vdev_dtl_sync(lvd, txg);
2072168404Spjd
2073168404Spjd	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2074168404Spjd}
2075168404Spjd
2076168404Spjduint64_t
2077168404Spjdvdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2078168404Spjd{
2079168404Spjd	return (vd->vdev_ops->vdev_op_asize(vd, psize));
2080168404Spjd}
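
/*
 * For example (informally): a mirror's asize equals its psize, while
 * RAID-Z inflates psize by a factor of roughly cols / (cols - nparity)
 * to account for parity sectors, plus a little rounding; see each
 * vdev_ops_t's vdev_op_asize implementation for the exact rule.
 */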
2081168404Spjd
2082185029Spjd/*
2083185029Spjd * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
2084185029Spjd * not be opened, and no I/O is attempted.
2085185029Spjd */
2086185029Spjdint
2087219089Spjdvdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2088168404Spjd{
2089219089Spjd	vdev_t *vd, *tvd;
2090168404Spjd
2091219089Spjd	spa_vdev_state_enter(spa, SCL_NONE);
2092185029Spjd
2093185029Spjd	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2094185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENODEV));
2095185029Spjd
2096185029Spjd	if (!vd->vdev_ops->vdev_op_leaf)
2097185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2098185029Spjd
2099219089Spjd	tvd = vd->vdev_top;
2100219089Spjd
2101185029Spjd	/*
2102219089Spjd	 * We don't directly use the aux state here, but if we do a
2103219089Spjd	 * vdev_reopen(), we need this value to be present to remember why we
2104219089Spjd	 * were faulted.
2105219089Spjd	 */
2106219089Spjd	vd->vdev_label_aux = aux;
2107219089Spjd
2108219089Spjd	/*
2109185029Spjd	 * Faulted state takes precedence over degraded.
2110185029Spjd	 */
2111219089Spjd	vd->vdev_delayed_close = B_FALSE;
2112185029Spjd	vd->vdev_faulted = 1ULL;
2113185029Spjd	vd->vdev_degraded = 0ULL;
2114219089Spjd	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
2115185029Spjd
2116185029Spjd	/*
2117219089Spjd	 * If this device has the only valid copy of the data, then
2118219089Spjd	 * back off and simply mark the vdev as degraded instead.
2119185029Spjd	 */
2120219089Spjd	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
2121185029Spjd		vd->vdev_degraded = 1ULL;
2122185029Spjd		vd->vdev_faulted = 0ULL;
2123185029Spjd
2124185029Spjd		/*
2125185029Spjd		 * If we reopen the device and it's not dead, only then do we
2126185029Spjd		 * mark it degraded.
2127185029Spjd		 */
2128219089Spjd		vdev_reopen(tvd);
2129185029Spjd
2130219089Spjd		if (vdev_readable(vd))
2131219089Spjd			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
2132185029Spjd	}
2133185029Spjd
2134185029Spjd	return (spa_vdev_state_exit(spa, vd, 0));
2135168404Spjd}
2136168404Spjd
2137185029Spjd/*
2138185029Spjd * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
2139185029Spjd * user that something is wrong.  The vdev continues to operate as normal as far
2140185029Spjd * as I/O is concerned.
2141185029Spjd */
2142185029Spjdint
2143219089Spjdvdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2144168404Spjd{
2145185029Spjd	vdev_t *vd;
2146168404Spjd
2147219089Spjd	spa_vdev_state_enter(spa, SCL_NONE);
2148168404Spjd
2149185029Spjd	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2150185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENODEV));
2151168404Spjd
2152185029Spjd	if (!vd->vdev_ops->vdev_op_leaf)
2153185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2154185029Spjd
2155185029Spjd	/*
2156185029Spjd	 * If the vdev is already faulted, then don't do anything.
2157185029Spjd	 */
2158185029Spjd	if (vd->vdev_faulted || vd->vdev_degraded)
2159185029Spjd		return (spa_vdev_state_exit(spa, NULL, 0));
2160185029Spjd
2161185029Spjd	vd->vdev_degraded = 1ULL;
2162185029Spjd	if (!vdev_is_dead(vd))
2163185029Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
2164219089Spjd		    aux);
2165185029Spjd
2166185029Spjd	return (spa_vdev_state_exit(spa, vd, 0));
2167168404Spjd}
2168168404Spjd
2169185029Spjd/*
2170185029Spjd * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
2171185029Spjd * any attached spare device should be detached when the device finishes
2172185029Spjd * resilvering.  Second, the online should be treated like a 'test' online case,
2173185029Spjd * so no FMA events are generated if the device fails to open.
2174185029Spjd */
2175168404Spjdint
2176185029Spjdvdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2177168404Spjd{
2178219089Spjd	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2179168404Spjd
2180219089Spjd	spa_vdev_state_enter(spa, SCL_NONE);
2181168404Spjd
2182185029Spjd	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2183185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENODEV));
2184168404Spjd
2185168404Spjd	if (!vd->vdev_ops->vdev_op_leaf)
2186185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2187168404Spjd
2188219089Spjd	tvd = vd->vdev_top;
2189168404Spjd	vd->vdev_offline = B_FALSE;
2190168404Spjd	vd->vdev_tmpoffline = B_FALSE;
2191185029Spjd	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2192185029Spjd	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2193219089Spjd
2194219089Spjd	/* XXX - L2ARC 1.0 does not support expansion */
2195219089Spjd	if (!vd->vdev_aux) {
2196219089Spjd		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2197219089Spjd			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2198219089Spjd	}
2199219089Spjd
2200219089Spjd	vdev_reopen(tvd);
2201185029Spjd	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2202168404Spjd
2203219089Spjd	if (!vd->vdev_aux) {
2204219089Spjd		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2205219089Spjd			pvd->vdev_expanding = B_FALSE;
2206219089Spjd	}
2207219089Spjd
2208185029Spjd	if (newstate)
2209185029Spjd		*newstate = vd->vdev_state;
2210185029Spjd	if ((flags & ZFS_ONLINE_UNSPARE) &&
2211185029Spjd	    !vdev_is_dead(vd) && vd->vdev_parent &&
2212185029Spjd	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2213185029Spjd	    vd->vdev_parent->vdev_child[0] == vd)
2214185029Spjd		vd->vdev_unspare = B_TRUE;
2215168404Spjd
2216219089Spjd	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
2217219089Spjd
2218219089Spjd		/* XXX - L2ARC 1.0 does not support expansion */
2219219089Spjd		if (vd->vdev_aux)
2220219089Spjd			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
2221219089Spjd		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2222219089Spjd	}
2223209962Smm	return (spa_vdev_state_exit(spa, vd, 0));
2224168404Spjd}
2225168404Spjd
2226219089Spjdstatic int
2227219089Spjdvdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
2228168404Spjd{
2229213197Smm	vdev_t *vd, *tvd;
2230219089Spjd	int error = 0;
2231219089Spjd	uint64_t generation;
2232219089Spjd	metaslab_group_t *mg;
2233168404Spjd
2234219089Spjdtop:
2235219089Spjd	spa_vdev_state_enter(spa, SCL_ALLOC);
2236168404Spjd
2237185029Spjd	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2238185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENODEV));
2239168404Spjd
2240168404Spjd	if (!vd->vdev_ops->vdev_op_leaf)
2241185029Spjd		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2242168404Spjd
2243213197Smm	tvd = vd->vdev_top;
2244219089Spjd	mg = tvd->vdev_mg;
2245219089Spjd	generation = spa->spa_config_generation + 1;
2246213197Smm
2247168404Spjd	/*
2248168404Spjd	 * If the device isn't already offline, try to offline it.
2249168404Spjd	 */
2250168404Spjd	if (!vd->vdev_offline) {
2251168404Spjd		/*
2252209962Smm		 * If this device has the only valid copy of some data,
2253213197Smm		 * don't allow it to be offlined. Log devices are always
2254213197Smm		 * expendable.
2255168404Spjd		 */
2256213197Smm		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2257213197Smm		    vdev_dtl_required(vd))
2258185029Spjd			return (spa_vdev_state_exit(spa, NULL, EBUSY));
2259168404Spjd
2260168404Spjd		/*
2261219089Spjd		 * If the top-level is a slog and it has had allocations
2262219089Spjd		 * then proceed.  We check that the vdev's metaslab group
2263219089Spjd		 * is not NULL since it's possible that we may have just
2264219089Spjd		 * added this vdev but not yet initialized its metaslabs.
2265219089Spjd		 */
2266219089Spjd		if (tvd->vdev_islog && mg != NULL) {
2267219089Spjd			/*
2268219089Spjd			 * Prevent any future allocations.
2269219089Spjd			 */
2270219089Spjd			metaslab_group_passivate(mg);
2271219089Spjd			(void) spa_vdev_state_exit(spa, vd, 0);
2272219089Spjd
2273219089Spjd			error = spa_offline_log(spa);
2274219089Spjd
2275219089Spjd			spa_vdev_state_enter(spa, SCL_ALLOC);
2276219089Spjd
2277219089Spjd			/*
2278219089Spjd			 * Check to see if the config has changed.
2279219089Spjd			 */
2280219089Spjd			if (error || generation != spa->spa_config_generation) {
2281219089Spjd				metaslab_group_activate(mg);
2282219089Spjd				if (error)
2283219089Spjd					return (spa_vdev_state_exit(spa,
2284219089Spjd					    vd, error));
2285219089Spjd				(void) spa_vdev_state_exit(spa, vd, 0);
2286219089Spjd				goto top;
2287219089Spjd			}
2288219089Spjd			ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
2289219089Spjd		}
2290219089Spjd
2291219089Spjd		/*
2292168404Spjd		 * Offline this device and reopen its top-level vdev.
2293213197Smm		 * If the top-level vdev is a log device then just offline
2294213197Smm		 * it. Otherwise, if this action results in the top-level
2295213197Smm		 * vdev becoming unusable, undo it and fail the request.
2296168404Spjd		 */
2297168404Spjd		vd->vdev_offline = B_TRUE;
2298213197Smm		vdev_reopen(tvd);
2299213197Smm
2300213197Smm		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2301213197Smm		    vdev_is_dead(tvd)) {
2302168404Spjd			vd->vdev_offline = B_FALSE;
2303213197Smm			vdev_reopen(tvd);
2304185029Spjd			return (spa_vdev_state_exit(spa, NULL, EBUSY));
2305168404Spjd		}
2306219089Spjd
2307219089Spjd		/*
2308219089Spjd		 * Add the device back into the metaslab rotor so that
2309219089Spjd		 * once we online the device it's open for business.
2310219089Spjd		 */
2311219089Spjd		if (tvd->vdev_islog && mg != NULL)
2312219089Spjd			metaslab_group_activate(mg);
2313168404Spjd	}
2314168404Spjd
2315185029Spjd	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2316168404Spjd
2317219089Spjd	return (spa_vdev_state_exit(spa, vd, 0));
2318219089Spjd}
2319213197Smm
2320219089Spjdint
2321219089Spjdvdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2322219089Spjd{
2323219089Spjd	int error;
2324213197Smm
2325219089Spjd	mutex_enter(&spa->spa_vdev_top_lock);
2326219089Spjd	error = vdev_offline_locked(spa, guid, flags);
2327219089Spjd	mutex_exit(&spa->spa_vdev_top_lock);
2328219089Spjd
2329219089Spjd	return (error);
2330168404Spjd}
2331168404Spjd
2332168404Spjd/*
2333168404Spjd * Clear the error counts associated with this vdev.  Unlike vdev_online() and
2334168404Spjd * vdev_offline(), we assume the spa config is locked.  We also clear all
2335168404Spjd * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
2336168404Spjd */
2337168404Spjdvoid
2338168404Spjdvdev_clear(spa_t *spa, vdev_t *vd)
2339168404Spjd{
2340185029Spjd	vdev_t *rvd = spa->spa_root_vdev;
2341168404Spjd
2342185029Spjd	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2343185029Spjd
2344168404Spjd	if (vd == NULL)
2345185029Spjd		vd = rvd;
2346168404Spjd
2347168404Spjd	vd->vdev_stat.vs_read_errors = 0;
2348168404Spjd	vd->vdev_stat.vs_write_errors = 0;
2349168404Spjd	vd->vdev_stat.vs_checksum_errors = 0;
2350168404Spjd
2351185029Spjd	for (int c = 0; c < vd->vdev_children; c++)
2352168404Spjd		vdev_clear(spa, vd->vdev_child[c]);
2353185029Spjd
2354185029Spjd	/*
2355185029Spjd	 * If we're in the FAULTED state or have experienced failed I/O, then
2356185029Spjd	 * clear the persistent state and attempt to reopen the device.  We
2357185029Spjd	 * also mark the vdev config dirty, so that the new faulted state is
2358185029Spjd	 * written out to disk.
2359185029Spjd	 */
2360185029Spjd	if (vd->vdev_faulted || vd->vdev_degraded ||
2361185029Spjd	    !vdev_readable(vd) || !vdev_writeable(vd)) {
2362185029Spjd
2363219089Spjd		/*
2364219089Spjd		 * When reopening in response to a clear event, it may be due to
2365219089Spjd		 * a fmadm repair request.  In this case, if the device is
2366219089Spjd		 * still broken, we still want to post the ereport again.
2367219089Spjd		 */
2368219089Spjd		vd->vdev_forcefault = B_TRUE;
2369219089Spjd
2370219089Spjd		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2371185029Spjd		vd->vdev_cant_read = B_FALSE;
2372185029Spjd		vd->vdev_cant_write = B_FALSE;
2373185029Spjd
2374219089Spjd		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2375185029Spjd
2376219089Spjd		vd->vdev_forcefault = B_FALSE;
2377219089Spjd
2378219089Spjd		if (vd != rvd && vdev_writeable(vd->vdev_top))
2379185029Spjd			vdev_state_dirty(vd->vdev_top);
2380185029Spjd
2381185029Spjd		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2382185029Spjd			spa_async_request(spa, SPA_ASYNC_RESILVER);
2383185029Spjd
2384185029Spjd		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2385185029Spjd	}
2386219089Spjd
2387219089Spjd	/*
2388219089Spjd	 * When clearing a FMA-diagnosed fault, we always want to
2389219089Spjd	 * unspare the device, as we assume that the original spare was
2390219089Spjd	 * done in response to the FMA fault.
2391219089Spjd	 */
2392219089Spjd	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2393219089Spjd	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2394219089Spjd	    vd->vdev_parent->vdev_child[0] == vd)
2395219089Spjd		vd->vdev_unspare = B_TRUE;
2396168404Spjd}
2397168404Spjd
2398185029Spjdboolean_t
2399168404Spjdvdev_is_dead(vdev_t *vd)
2400168404Spjd{
2401219089Spjd	/*
2402219089Spjd	 * Holes and missing devices are always considered "dead".
2403219089Spjd	 * This simplifies the code since we don't have to check for
2404219089Spjd	 * these types of devices in the various code paths.
2405219089Spjd	 * Instead we rely on the fact that we skip over dead devices
2406219089Spjd	 * before issuing I/O to them.
2407219089Spjd	 */
2408219089Spjd	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2409219089Spjd	    vd->vdev_ops == &vdev_missing_ops);
2410168404Spjd}
2411168404Spjd
2412185029Spjdboolean_t
2413185029Spjdvdev_readable(vdev_t *vd)
2414168404Spjd{
2415185029Spjd	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2416185029Spjd}
2417168404Spjd
2418185029Spjdboolean_t
2419185029Spjdvdev_writeable(vdev_t *vd)
2420185029Spjd{
2421185029Spjd	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2422185029Spjd}
2423168404Spjd
2424185029Spjdboolean_t
2425208370Smmvdev_allocatable(vdev_t *vd)
2426208370Smm{
2427209962Smm	uint64_t state = vd->vdev_state;
2428209962Smm
2429208370Smm	/*
2430209962Smm	 * We currently allow allocations from vdevs which may be in the
2431208370Smm	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2432208370Smm	 * fails to reopen then we'll catch it later when we're holding
2433209962Smm	 * the proper locks.  Note that we have to get the vdev state
2434209962Smm	 * in a local variable because although it changes atomically,
2435209962Smm	 * we're asking two separate questions about it.
2436208370Smm	 */
2437209962Smm	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2438219089Spjd	    !vd->vdev_cant_write && !vd->vdev_ishole);
2439208370Smm}
2440208370Smm
2441208370Smmboolean_t
2442185029Spjdvdev_accessible(vdev_t *vd, zio_t *zio)
2443185029Spjd{
2444185029Spjd	ASSERT(zio->io_vd == vd);
2445168404Spjd
2446185029Spjd	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2447185029Spjd		return (B_FALSE);
2448168404Spjd
2449185029Spjd	if (zio->io_type == ZIO_TYPE_READ)
2450185029Spjd		return (!vd->vdev_cant_read);
2451168404Spjd
2452185029Spjd	if (zio->io_type == ZIO_TYPE_WRITE)
2453185029Spjd		return (!vd->vdev_cant_write);
2454168404Spjd
2455185029Spjd	return (B_TRUE);
2456168404Spjd}
2457168404Spjd
2458168404Spjd/*
2459168404Spjd * Get statistics for the given vdev.
2460168404Spjd */
2461168404Spjdvoid
2462168404Spjdvdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2463168404Spjd{
2464168404Spjd	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
2465168404Spjd
2466168404Spjd	mutex_enter(&vd->vdev_stat_lock);
2467168404Spjd	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2468168404Spjd	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2469168404Spjd	vs->vs_state = vd->vdev_state;
2470219089Spjd	vs->vs_rsize = vdev_get_min_asize(vd);
2471219089Spjd	if (vd->vdev_ops->vdev_op_leaf)
2472219089Spjd		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2473168404Spjd	mutex_exit(&vd->vdev_stat_lock);
2474168404Spjd
2475168404Spjd	/*
2476168404Spjd	 * If we're getting stats on the root vdev, aggregate the I/O counts
2477168404Spjd	 * over all top-level vdevs (i.e. the direct children of the root).
2478168404Spjd	 */
2479168404Spjd	if (vd == rvd) {
2480185029Spjd		for (int c = 0; c < rvd->vdev_children; c++) {
2481168404Spjd			vdev_t *cvd = rvd->vdev_child[c];
2482168404Spjd			vdev_stat_t *cvs = &cvd->vdev_stat;
2483168404Spjd
2484168404Spjd			mutex_enter(&vd->vdev_stat_lock);
2485185029Spjd			for (int t = 0; t < ZIO_TYPES; t++) {
2486168404Spjd				vs->vs_ops[t] += cvs->vs_ops[t];
2487168404Spjd				vs->vs_bytes[t] += cvs->vs_bytes[t];
2488168404Spjd			}
2489219089Spjd			cvs->vs_scan_removing = cvd->vdev_removing;
2490168404Spjd			mutex_exit(&vd->vdev_stat_lock);
2491168404Spjd		}
2492168404Spjd	}
2493168404Spjd}
2494168404Spjd
2495168404Spjdvoid
2496185029Spjdvdev_clear_stats(vdev_t *vd)
2497168404Spjd{
2498185029Spjd	mutex_enter(&vd->vdev_stat_lock);
2499185029Spjd	vd->vdev_stat.vs_space = 0;
2500185029Spjd	vd->vdev_stat.vs_dspace = 0;
2501185029Spjd	vd->vdev_stat.vs_alloc = 0;
2502185029Spjd	mutex_exit(&vd->vdev_stat_lock);
2503185029Spjd}
2504185029Spjd
2505185029Spjdvoid
2506219089Spjdvdev_scan_stat_init(vdev_t *vd)
2507219089Spjd{
2508219089Spjd	vdev_stat_t *vs = &vd->vdev_stat;
2509219089Spjd
2510219089Spjd	for (int c = 0; c < vd->vdev_children; c++)
2511219089Spjd		vdev_scan_stat_init(vd->vdev_child[c]);
2512219089Spjd
2513219089Spjd	mutex_enter(&vd->vdev_stat_lock);
2514219089Spjd	vs->vs_scan_processed = 0;
2515219089Spjd	mutex_exit(&vd->vdev_stat_lock);
2516219089Spjd}
2517219089Spjd
2518219089Spjdvoid
2519185029Spjdvdev_stat_update(zio_t *zio, uint64_t psize)
2520185029Spjd{
2521209962Smm	spa_t *spa = zio->io_spa;
2522209962Smm	vdev_t *rvd = spa->spa_root_vdev;
2523185029Spjd	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2524168404Spjd	vdev_t *pvd;
2525168404Spjd	uint64_t txg = zio->io_txg;
2526168404Spjd	vdev_stat_t *vs = &vd->vdev_stat;
2527168404Spjd	zio_type_t type = zio->io_type;
2528168404Spjd	int flags = zio->io_flags;
2529168404Spjd
2530185029Spjd	/*
2531185029Spjd	 * If this i/o is a gang leader, it didn't do any actual work.
2532185029Spjd	 */
2533185029Spjd	if (zio->io_gang_tree)
2534185029Spjd		return;
2535185029Spjd
2536168404Spjd	if (zio->io_error == 0) {
2537185029Spjd		/*
2538185029Spjd		 * If this is a root i/o, don't count it -- we've already
2539185029Spjd		 * counted the top-level vdevs, and vdev_get_stats() will
2540185029Spjd		 * aggregate them when asked.  This reduces contention on
2541185029Spjd		 * the root vdev_stat_lock and implicitly handles blocks
2542185029Spjd		 * that compress away to holes, for which there is no i/o.
2543185029Spjd		 * (Holes never create vdev children, so all the counters
2544185029Spjd		 * remain zero, which is what we want.)
2545185029Spjd		 *
2546185029Spjd		 * Note: this only applies to successful i/o (io_error == 0)
2547185029Spjd		 * because unlike i/o counts, errors are not additive.
2548185029Spjd		 * When reading a ditto block, for example, failure of
2549185029Spjd		 * one top-level vdev does not imply a root-level error.
2550185029Spjd		 */
2551185029Spjd		if (vd == rvd)
2552185029Spjd			return;
2553185029Spjd
2554185029Spjd		ASSERT(vd == zio->io_vd);
2555209962Smm
2556209962Smm		if (flags & ZIO_FLAG_IO_BYPASS)
2557209962Smm			return;
2558209962Smm
2559209962Smm		mutex_enter(&vd->vdev_stat_lock);
2560209962Smm
2561185029Spjd		if (flags & ZIO_FLAG_IO_REPAIR) {
2562219089Spjd			if (flags & ZIO_FLAG_SCAN_THREAD) {
2563219089Spjd				dsl_scan_phys_t *scn_phys =
2564219089Spjd				    &spa->spa_dsl_pool->dp_scan->scn_phys;
2565219089Spjd				uint64_t *processed = &scn_phys->scn_processed;
2566219089Spjd
2567219089Spjd				/* XXX cleanup? */
2568219089Spjd				if (vd->vdev_ops->vdev_op_leaf)
2569219089Spjd					atomic_add_64(processed, psize);
2570219089Spjd				vs->vs_scan_processed += psize;
2571219089Spjd			}
2572219089Spjd
2573209962Smm			if (flags & ZIO_FLAG_SELF_HEAL)
2574185029Spjd				vs->vs_self_healed += psize;
2575168404Spjd		}
2576209962Smm
2577209962Smm		vs->vs_ops[type]++;
2578209962Smm		vs->vs_bytes[type] += psize;
2579209962Smm
2580209962Smm		mutex_exit(&vd->vdev_stat_lock);
2581168404Spjd		return;
2582168404Spjd	}
2583168404Spjd
2584168404Spjd	if (flags & ZIO_FLAG_SPECULATIVE)
2585168404Spjd		return;
2586168404Spjd
2587213198Smm	/*
2588213198Smm	 * If this is an I/O error that is going to be retried, then ignore the
2589213198Smm	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
2590213198Smm	 * hard errors, when in reality they can happen for any number of
2591213198Smm	 * innocuous reasons (bus resets, MPxIO link failure, etc).
2592213198Smm	 */
2593213198Smm	if (zio->io_error == EIO &&
2594213198Smm	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2595213198Smm		return;
2596213198Smm
2597219089Spjd	/*
2598219089Spjd	 * Intent log writes won't propagate their error to the root
2599219089Spjd	 * I/O so don't mark these types of failures as pool-level
2600219089Spjd	 * errors.
2601219089Spjd	 */
2602219089Spjd	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
2603219089Spjd		return;
2604219089Spjd
2605185029Spjd	mutex_enter(&vd->vdev_stat_lock);
2606209962Smm	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2607185029Spjd		if (zio->io_error == ECKSUM)
2608185029Spjd			vs->vs_checksum_errors++;
2609185029Spjd		else
2610185029Spjd			vs->vs_read_errors++;
2611168404Spjd	}
2612209962Smm	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2613185029Spjd		vs->vs_write_errors++;
2614185029Spjd	mutex_exit(&vd->vdev_stat_lock);
2615168404Spjd
2616209962Smm	if (type == ZIO_TYPE_WRITE && txg != 0 &&
2617209962Smm	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
2618219089Spjd	    (flags & ZIO_FLAG_SCAN_THREAD) ||
2619219089Spjd	    spa->spa_claiming)) {
2620209962Smm		/*
2621219089Spjd		 * This is either a normal write (not a repair), or it's
2622219089Spjd		 * a repair induced by the scrub thread, or it's a repair
2623219089Spjd		 * made by zil_claim() during spa_load() in the first txg.
2624219089Spjd		 * In the normal case, we commit the DTL change in the same
2625219089Spjd		 * txg as the block was born.  In the scrub-induced repair
2626219089Spjd		 * case, we know that scrubs run in first-pass syncing context,
2627219089Spjd		 * so we commit the DTL change in spa_syncing_txg(spa).
2628219089Spjd		 * In the zil_claim() case, we commit in spa_first_txg(spa).
2629209962Smm		 *
2630209962Smm		 * We currently do not make DTL entries for failed spontaneous
2631209962Smm		 * self-healing writes triggered by normal (non-scrubbing)
2632209962Smm		 * reads, because we have no transactional context in which to
2633209962Smm		 * do so -- and it's not clear that it'd be desirable anyway.
2634209962Smm		 */
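		/*
		 * Summarizing the cases handled below:
		 *   normal write        -> commit in txg (the block's birth)
		 *   scan-thread repair  -> commit in spa_syncing_txg(spa)
		 *   zil_claim() repair  -> commit in spa_first_txg(spa)
		 */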
2635209962Smm		if (vd->vdev_ops->vdev_op_leaf) {
2636209962Smm			uint64_t commit_txg = txg;
2637219089Spjd			if (flags & ZIO_FLAG_SCAN_THREAD) {
2638209962Smm				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2639209962Smm				ASSERT(spa_sync_pass(spa) == 1);
2640209962Smm				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2641219089Spjd				commit_txg = spa_syncing_txg(spa);
2642219089Spjd			} else if (spa->spa_claiming) {
2643219089Spjd				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2644219089Spjd				commit_txg = spa_first_txg(spa);
2645209962Smm			}
2646219089Spjd			ASSERT(commit_txg >= spa_syncing_txg(spa));
2647209962Smm			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2648168404Spjd				return;
2649209962Smm			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2650209962Smm				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2651209962Smm			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2652168404Spjd		}
2653209962Smm		if (vd != rvd)
2654209962Smm			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2655168404Spjd	}
2656168404Spjd}
2657168404Spjd
2658168404Spjd/*
2659219089Spjd * Update the in-core space usage stats for this vdev, its metaslab class,
2660219089Spjd * and the root vdev.
2661168404Spjd */
2662168404Spjdvoid
2663219089Spjdvdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
2664219089Spjd    int64_t space_delta)
2665168404Spjd{
2666168404Spjd	int64_t dspace_delta = space_delta;
2667185029Spjd	spa_t *spa = vd->vdev_spa;
2668185029Spjd	vdev_t *rvd = spa->spa_root_vdev;
2669219089Spjd	metaslab_group_t *mg = vd->vdev_mg;
2670219089Spjd	metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2671168404Spjd
2672185029Spjd	ASSERT(vd == vd->vdev_top);
2673168404Spjd
2674185029Spjd	/*
2675185029Spjd	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
2676185029Spjd	 * factor.  We must calculate this here and not at the root vdev
2677185029Spjd	 * because the root vdev's psize-to-asize is simply the max of its
2678185029Spjd	 * children's, thus not accurate enough for us.
2679185029Spjd	 */
2680185029Spjd	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2681213197Smm	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2682185029Spjd	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2683185029Spjd	    vd->vdev_deflate_ratio;
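
	/*
	 * Illustrative math (hypothetical ratios): with
	 * SPA_MINBLOCKSHIFT == 9, a deflate ratio of 512 means no
	 * expansion, so a 1 MB space_delta yields
	 * (1 MB >> 9) * 512 == 1 MB of dspace; a ratio of 256 (2x
	 * expansion) would deflate the same delta to 512 KB.
	 */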
2684185029Spjd
2685185029Spjd	mutex_enter(&vd->vdev_stat_lock);
2686219089Spjd	vd->vdev_stat.vs_alloc += alloc_delta;
2687185029Spjd	vd->vdev_stat.vs_space += space_delta;
2688185029Spjd	vd->vdev_stat.vs_dspace += dspace_delta;
2689185029Spjd	mutex_exit(&vd->vdev_stat_lock);
2690185029Spjd
2691219089Spjd	if (mc == spa_normal_class(spa)) {
2692185029Spjd		mutex_enter(&rvd->vdev_stat_lock);
2693219089Spjd		rvd->vdev_stat.vs_alloc += alloc_delta;
2694185029Spjd		rvd->vdev_stat.vs_space += space_delta;
2695185029Spjd		rvd->vdev_stat.vs_dspace += dspace_delta;
2696185029Spjd		mutex_exit(&rvd->vdev_stat_lock);
2697185029Spjd	}
2698219089Spjd
2699219089Spjd	if (mc != NULL) {
2700219089Spjd		ASSERT(rvd == vd->vdev_parent);
2701219089Spjd		ASSERT(vd->vdev_ms_count != 0);
2702219089Spjd
2703219089Spjd		metaslab_class_space_update(mc,
2704219089Spjd		    alloc_delta, defer_delta, space_delta, dspace_delta);
2705219089Spjd	}
2706168404Spjd}
2707168404Spjd
2708168404Spjd/*
2709168404Spjd * Mark a top-level vdev's config as dirty, placing it on the dirty list
2710168404Spjd * so that it will be written out next time the vdev configuration is synced.
2711168404Spjd * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2712168404Spjd */
2713168404Spjdvoid
2714168404Spjdvdev_config_dirty(vdev_t *vd)
2715168404Spjd{
2716168404Spjd	spa_t *spa = vd->vdev_spa;
2717168404Spjd	vdev_t *rvd = spa->spa_root_vdev;
2718168404Spjd	int c;
2719168404Spjd
2720219089Spjd	ASSERT(spa_writeable(spa));
2721219089Spjd
2722168404Spjd	/*
2723209962Smm	 * If this is an aux vdev (as with l2cache and spare devices), then we
2724209962Smm	 * update the vdev config manually and set the sync flag.
2725185029Spjd	 */
2726185029Spjd	if (vd->vdev_aux != NULL) {
2727185029Spjd		spa_aux_vdev_t *sav = vd->vdev_aux;
2728185029Spjd		nvlist_t **aux;
2729185029Spjd		uint_t naux;
2730185029Spjd
2731185029Spjd		for (c = 0; c < sav->sav_count; c++) {
2732185029Spjd			if (sav->sav_vdevs[c] == vd)
2733185029Spjd				break;
2734185029Spjd		}
2735185029Spjd
2736185029Spjd		if (c == sav->sav_count) {
2737185029Spjd			/*
2738185029Spjd			 * We're being removed.  There's nothing more to do.
2739185029Spjd			 */
2740185029Spjd			ASSERT(sav->sav_sync == B_TRUE);
2741185029Spjd			return;
2742185029Spjd		}
2743185029Spjd
2744185029Spjd		sav->sav_sync = B_TRUE;
2745185029Spjd
2746209962Smm		if (nvlist_lookup_nvlist_array(sav->sav_config,
2747209962Smm		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
2748209962Smm			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
2749209962Smm			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
2750209962Smm		}
2751185029Spjd
2752185029Spjd		ASSERT(c < naux);
2753185029Spjd
2754185029Spjd		/*
2755185029Spjd		 * Setting the nvlist in the middle of the array is a little
2756185029Spjd		 * sketchy, but it will work.
2757185029Spjd		 */
2758185029Spjd		nvlist_free(aux[c]);
2759219089Spjd		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
2760185029Spjd
2761185029Spjd		return;
2762185029Spjd	}
2763185029Spjd
2764185029Spjd	/*
2765185029Spjd	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
2766185029Spjd	 * must either hold SCL_CONFIG as writer, or must be the sync thread
2767185029Spjd	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
2768168404Spjd	 * so this is sufficient to ensure mutual exclusion.
2769168404Spjd	 */
2770185029Spjd	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2771185029Spjd	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2772185029Spjd	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
2773168404Spjd
2774168404Spjd	if (vd == rvd) {
2775168404Spjd		for (c = 0; c < rvd->vdev_children; c++)
2776168404Spjd			vdev_config_dirty(rvd->vdev_child[c]);
2777168404Spjd	} else {
2778168404Spjd		ASSERT(vd == vd->vdev_top);
2779168404Spjd
2780219089Spjd		if (!list_link_active(&vd->vdev_config_dirty_node) &&
2781219089Spjd		    !vd->vdev_ishole)
2782185029Spjd			list_insert_head(&spa->spa_config_dirty_list, vd);
2783168404Spjd	}
2784168404Spjd}
2785168404Spjd
2786168404Spjdvoid
2787168404Spjdvdev_config_clean(vdev_t *vd)
2788168404Spjd{
2789168404Spjd	spa_t *spa = vd->vdev_spa;
2790168404Spjd
2791185029Spjd	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2792185029Spjd	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2793185029Spjd	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
2794168404Spjd
2795185029Spjd	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
2796185029Spjd	list_remove(&spa->spa_config_dirty_list, vd);
2797168404Spjd}
2798168404Spjd
2799185029Spjd/*
2800185029Spjd * Mark a top-level vdev's state as dirty, so that the next pass of
2801185029Spjd * spa_sync() can convert this into vdev_config_dirty().  We distinguish
2802185029Spjd * the state changes from larger config changes because they require
2803185029Spjd * much less locking, and are often needed for administrative actions.
2804185029Spjd */
2805168404Spjdvoid
2806185029Spjdvdev_state_dirty(vdev_t *vd)
2807185029Spjd{
2808185029Spjd	spa_t *spa = vd->vdev_spa;
2809185029Spjd
2810219089Spjd	ASSERT(spa_writeable(spa));
2811185029Spjd	ASSERT(vd == vd->vdev_top);
2812185029Spjd
2813185029Spjd	/*
2814185029Spjd	 * The state list is protected by the SCL_STATE lock.  The caller
2815185029Spjd	 * must either hold SCL_STATE as writer, or must be the sync thread
2816185029Spjd	 * (which holds SCL_STATE as reader).  There's only one sync thread,
2817185029Spjd	 * so this is sufficient to ensure mutual exclusion.
2818185029Spjd	 */
2819185029Spjd	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2820185029Spjd	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2821185029Spjd	    spa_config_held(spa, SCL_STATE, RW_READER)));
2822185029Spjd
2823219089Spjd	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
2824185029Spjd		list_insert_head(&spa->spa_state_dirty_list, vd);
2825185029Spjd}
2826185029Spjd
2827185029Spjdvoid
2828185029Spjdvdev_state_clean(vdev_t *vd)
2829185029Spjd{
2830185029Spjd	spa_t *spa = vd->vdev_spa;
2831185029Spjd
2832185029Spjd	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2833185029Spjd	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2834185029Spjd	    spa_config_held(spa, SCL_STATE, RW_READER)));
2835185029Spjd
2836185029Spjd	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
2837185029Spjd	list_remove(&spa->spa_state_dirty_list, vd);
2838185029Spjd}
2839185029Spjd
2840185029Spjd/*
2841185029Spjd * Propagate vdev state up from children to parent.
2842185029Spjd */
2843185029Spjdvoid
2844168404Spjdvdev_propagate_state(vdev_t *vd)
2845168404Spjd{
2846209962Smm	spa_t *spa = vd->vdev_spa;
2847209962Smm	vdev_t *rvd = spa->spa_root_vdev;
2848168404Spjd	int degraded = 0, faulted = 0;
2849168404Spjd	int corrupted = 0;
2850168404Spjd	vdev_t *child;
2851168404Spjd
2852185029Spjd	if (vd->vdev_children > 0) {
2853219089Spjd		for (int c = 0; c < vd->vdev_children; c++) {
2854185029Spjd			child = vd->vdev_child[c];
2855168404Spjd
2856219089Spjd			/*
2857219089Spjd			 * Don't factor holes into the decision.
2858219089Spjd			 */
2859219089Spjd			if (child->vdev_ishole)
2860219089Spjd				continue;
2861219089Spjd
2862185029Spjd			if (!vdev_readable(child) ||
2863209962Smm			    (!vdev_writeable(child) && spa_writeable(spa))) {
2864185029Spjd				/*
2865185029Spjd				 * Root special: if there is a top-level log
2866185029Spjd				 * device, treat the root vdev as if it were
2867185029Spjd				 * degraded.
2868185029Spjd				 */
2869185029Spjd				if (child->vdev_islog && vd == rvd)
2870185029Spjd					degraded++;
2871185029Spjd				else
2872185029Spjd					faulted++;
2873185029Spjd			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
2874185029Spjd				degraded++;
2875185029Spjd			}
2876185029Spjd
2877185029Spjd			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
2878185029Spjd				corrupted++;
2879185029Spjd		}
2880185029Spjd
2881185029Spjd		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
2882185029Spjd
2883185029Spjd		/*
2884185029Spjd		 * Root special: if there is a top-level vdev that cannot be
2885185029Spjd		 * opened due to corrupted metadata, then propagate the root
2886185029Spjd		 * vdev's aux state as 'corrupt' rather than 'insufficient
2887185029Spjd		 * replicas'.
2888185029Spjd		 */
2889185029Spjd		if (corrupted && vd == rvd &&
2890185029Spjd		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
2891185029Spjd			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
2892185029Spjd			    VDEV_AUX_CORRUPT_DATA);
2893168404Spjd	}
2894168404Spjd
2895185029Spjd	if (vd->vdev_parent)
2896185029Spjd		vdev_propagate_state(vd->vdev_parent);
2897168404Spjd}
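/*
 * Worked example (hedged; the exact thresholds live in each vdev's
 * vdev_op_state_change implementation): for a two-way mirror with one
 * unreadable child, the loop above yields faulted == 1 and
 * degraded == 0, so the mirror is marked DEGRADED; only when both
 * children fail does it become CANT_OPEN with "insufficient replicas".
 */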
2898168404Spjd
2899168404Spjd/*
2900168404Spjd * Set a vdev's state.  If this is during an open, we don't update the parent
2901168404Spjd * state, because we're in the process of opening children depth-first.
2902168404Spjd * Otherwise, we propagate the change to the parent.
2903168404Spjd *
2904168404Spjd * If this routine places a device in a faulted state, an appropriate ereport is
2905168404Spjd * generated.
2906168404Spjd */
2907168404Spjdvoid
2908168404Spjdvdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
2909168404Spjd{
2910168404Spjd	uint64_t save_state;
2911185029Spjd	spa_t *spa = vd->vdev_spa;
2912168404Spjd
2913168404Spjd	if (state == vd->vdev_state) {
2914168404Spjd		vd->vdev_stat.vs_aux = aux;
2915168404Spjd		return;
2916168404Spjd	}
2917168404Spjd
2918168404Spjd	save_state = vd->vdev_state;
2919168404Spjd
2920168404Spjd	vd->vdev_state = state;
2921168404Spjd	vd->vdev_stat.vs_aux = aux;
2922168404Spjd
2923173373Spjd	/*
2924173373Spjd	 * If we are setting the vdev state to anything but an open state, then
2925219089Spjd	 * always close the underlying device unless the device has requested
2926219089Spjd	 * a delayed close (i.e. we're about to remove or fault the device).
2927219089Spjd	 * Otherwise, we keep accessible but invalid devices open forever.
2928219089Spjd	 * We don't call vdev_close() itself, because that implies some extra
2929219089Spjd	 * checks (offline, etc) that we don't want here.  This is limited to
2930219089Spjd	 * leaf devices, because otherwise closing the device will affect other
2931219089Spjd	 * children.
2932173373Spjd	 */
2933219089Spjd	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
2934219089Spjd	    vd->vdev_ops->vdev_op_leaf)
2935173373Spjd		vd->vdev_ops->vdev_op_close(vd);
2936173373Spjd
2937219089Spjd	/*
2938219089Spjd	 * If we have brought this vdev back into service, we need
2939219089Spjd	 * to notify fmd so that it can gracefully repair any outstanding
2940219089Spjd	 * cases due to a missing device.  We do this even for transitions
2941219089Spjd	 * that probably don't correlate to a repaired fault, and let the
2942219089Spjd	 * zfs-retire agent sort it out.  If this is a transient state it's
2943219089Spjd	 * OK, as the retire agent will double-check the state of the vdev
2944219089Spjd	 * before repairing it.
2945219089Spjd	 */
2946219089Spjd	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
2947219089Spjd	    vd->vdev_prevstate != state)
2948219089Spjd		zfs_post_state_change(spa, vd);
2949219089Spjd
2950185029Spjd	if (vd->vdev_removed &&
2951185029Spjd	    state == VDEV_STATE_CANT_OPEN &&
2952185029Spjd	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
2953168404Spjd		/*
2954185029Spjd		 * If the previous state is set to VDEV_STATE_REMOVED, then this
2955185029Spjd		 * device was previously marked removed and someone attempted to
2956185029Spjd		 * reopen it.  If this failed due to a nonexistent device, then
2957185029Spjd		 * keep the device in the REMOVED state.  We do the same when
2958185029Spjd		 * this is one of our special test online cases, which is only
2959185029Spjd		 * attempting to online the device and shouldn't generate an
2960185029Spjd		 * FMA fault.
2961185029Spjd		 */
2962185029Spjd		vd->vdev_state = VDEV_STATE_REMOVED;
2963185029Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2964185029Spjd	} else if (state == VDEV_STATE_REMOVED) {
2965185029Spjd		vd->vdev_removed = B_TRUE;
2966185029Spjd	} else if (state == VDEV_STATE_CANT_OPEN) {
2967185029Spjd		/*
2968219089Spjd		 * If we fail to open a vdev during an import or recovery, we
2969219089Spjd		 * mark it as "not available", which signifies that it was
2970219089Spjd		 * never there to begin with.  Failure to open such a device
2971219089Spjd		 * is not considered an error.
2972168404Spjd		 */
2973219089Spjd		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
2974219089Spjd		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
2975168404Spjd		    vd->vdev_ops->vdev_op_leaf)
2976168404Spjd			vd->vdev_not_present = 1;
2977168404Spjd
2978168404Spjd		/*
2979168404Spjd		 * Post the appropriate ereport.  If the 'prevstate' field is
2980168404Spjd		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
2981168404Spjd		 * that this is part of a vdev_reopen().  In this case, we don't
2982168404Spjd		 * want to post the ereport if the device was already in the
2983168404Spjd		 * CANT_OPEN state beforehand.
2984185029Spjd		 *
2985185029Spjd		 * If the 'checkremove' flag is set, then this is an attempt to
2986185029Spjd		 * online the device in response to an insertion event.  If we
2987185029Spjd		 * hit this case, then we have detected an insertion event for a
2988185029Spjd		 * faulted or offline device that wasn't in the removed state.
2989185029Spjd		 * In this scenario, we don't post an ereport because we are
2990185029Spjd		 * about to replace the device, or attempt an online with
2991185029Spjd		 * vdev_forcefault, which will generate the fault for us.
2992168404Spjd		 */
2993185029Spjd		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
2994185029Spjd		    !vd->vdev_not_present && !vd->vdev_checkremove &&
2995185029Spjd		    vd != spa->spa_root_vdev) {
2996168404Spjd			const char *class;
2997168404Spjd
2998168404Spjd			switch (aux) {
2999168404Spjd			case VDEV_AUX_OPEN_FAILED:
3000168404Spjd				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
3001168404Spjd				break;
3002168404Spjd			case VDEV_AUX_CORRUPT_DATA:
3003168404Spjd				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
3004168404Spjd				break;
3005168404Spjd			case VDEV_AUX_NO_REPLICAS:
3006168404Spjd				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
3007168404Spjd				break;
3008168404Spjd			case VDEV_AUX_BAD_GUID_SUM:
3009168404Spjd				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
3010168404Spjd				break;
3011168404Spjd			case VDEV_AUX_TOO_SMALL:
3012168404Spjd				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
3013168404Spjd				break;
3014168404Spjd			case VDEV_AUX_BAD_LABEL:
3015168404Spjd				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3016168404Spjd				break;
3017168404Spjd			default:
3018168404Spjd				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3019168404Spjd			}
3020168404Spjd
3021185029Spjd			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3022168404Spjd		}
3023185029Spjd
3024185029Spjd		/* Erase any notion of persistent removed state */
3025185029Spjd		vd->vdev_removed = B_FALSE;
3026185029Spjd	} else {
3027185029Spjd		vd->vdev_removed = B_FALSE;
3028168404Spjd	}
3029168404Spjd
3030209962Smm	if (!isopen && vd->vdev_parent)
3031209962Smm		vdev_propagate_state(vd->vdev_parent);
3032185029Spjd}
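/*
 * Hypothetical usage sketch: an administrative path that degrades a
 * leaf outside of vdev_open() might call:
 *
 *	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
 *	    VDEV_AUX_ERR_EXCEEDED);
 *
 * With isopen == B_FALSE the change propagates to the parents;
 * vdev_open() passes B_TRUE to defer propagation until its
 * depth-first traversal completes.
 */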
3033168404Spjd
3034185029Spjd/*
3035185029Spjd * Check the vdev configuration to ensure that it's capable of supporting
3036193163Sdfr * a root pool.
3037193163Sdfr *
3038193163Sdfr * On Solaris, we do not support RAID-Z or partial configurations.  In
3039193163Sdfr * addition, only a single top-level vdev is allowed and none of the
3040193163Sdfr * leaves can be wholedisks.
3041193163Sdfr *
3042193163Sdfr * For FreeBSD, we can boot from any configuration.  One limitation is
3043193163Sdfr * that the boot filesystem must be either uncompressed or compressed
3044193163Sdfr * with lzjb compression, but it is not clear how to enforce that
3045193163Sdfr * here.
3046185029Spjd */
3047185029Spjdboolean_t
3048185029Spjdvdev_is_bootable(vdev_t *vd)
3049185029Spjd{
3050213197Smm#ifdef sun
3051185029Spjd	if (!vd->vdev_ops->vdev_op_leaf) {
3052185029Spjd		char *vdev_type = vd->vdev_ops->vdev_op_type;
3053185029Spjd
3054185029Spjd		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3055185029Spjd		    vd->vdev_children > 1) {
3056185029Spjd			return (B_FALSE);
3057185029Spjd		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
3058185029Spjd		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3059185029Spjd			return (B_FALSE);
3060185029Spjd		}
3061185029Spjd	} else if (vd->vdev_wholedisk == 1) {
3062185029Spjd		return (B_FALSE);
3063185029Spjd	}
3064185029Spjd
3065219089Spjd	for (int c = 0; c < vd->vdev_children; c++) {
3066185029Spjd		if (!vdev_is_bootable(vd->vdev_child[c]))
3067185029Spjd			return (B_FALSE);
3068185029Spjd	}
3069213197Smm#endif	/* sun */
3070185029Spjd	return (B_TRUE);
3071168404Spjd}
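/*
 * Hedged caller sketch: a consumer that must preserve bootability
 * (e.g. before reshaping a pool that hosts the boot filesystem)
 * would reject layouts that fail this check:
 *
 *	if (!vdev_is_bootable(rvd))
 *		return (ENOTSUP);
 */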
3072213197Smm
3073219089Spjd/*
3074219089Spjd * Load the state from the original vdev tree (ovd), which we've
3075219089Spjd * retrieved from the MOS config object.  The persistent per-leaf
3076219089Spjd * state (offline, faulted, degraded, removed) is transferred to the
3077219089Spjd * matching device in the current vdev tree (nvd).
3078219089Spjd */
3079213197Smmvoid
3080219089Spjdvdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3081213197Smm{
3082219089Spjd	spa_t *spa = nvd->vdev_spa;
3083213197Smm
3084219089Spjd	ASSERT(nvd->vdev_top->vdev_islog);
3085219089Spjd	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3086219089Spjd	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3087213197Smm
3088219089Spjd	for (int c = 0; c < nvd->vdev_children; c++)
3089219089Spjd		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3090213197Smm
3091219089Spjd	if (nvd->vdev_ops->vdev_op_leaf) {
3092213197Smm		/*
3093219089Spjd		 * Restore the persistent vdev state
3094213197Smm		 */
3095219089Spjd		nvd->vdev_offline = ovd->vdev_offline;
3096219089Spjd		nvd->vdev_faulted = ovd->vdev_faulted;
3097219089Spjd		nvd->vdev_degraded = ovd->vdev_degraded;
3098219089Spjd		nvd->vdev_removed = ovd->vdev_removed;
3099213197Smm	}
3100213197Smm}
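/*
 * Hedged usage note: at pool-load time the caller pairs each log
 * top-level in the current tree with its counterpart (matched by
 * guid) in the tree read back from the MOS config, e.g.:
 *
 *	vdev_load_log_state(nvd, ovd);
 *
 * where nvd and ovd are the matching log top-level vdevs.
 */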
3101219089Spjd
3102219089Spjd/*
3103219089Spjd * Determine if a log device has valid content.  If the vdev was
3104219089Spjd * removed or faulted in the MOS config, then we know that the
3105219089Spjd * content on the log device has already been written to the pool.
3106219089Spjd */
3107219089Spjdboolean_t
3108219089Spjdvdev_log_state_valid(vdev_t *vd)
3109219089Spjd{
3110219089Spjd	if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3111219089Spjd	    !vd->vdev_removed)
3112219089Spjd		return (B_TRUE);
3113219089Spjd
3114219089Spjd	for (int c = 0; c < vd->vdev_children; c++)
3115219089Spjd		if (vdev_log_state_valid(vd->vdev_child[c]))
3116219089Spjd			return (B_TRUE);
3117219089Spjd
3118219089Spjd	return (B_FALSE);
3119219089Spjd}
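/*
 * Hedged example: for a mirrored log whose MOS config marks one leaf
 * removed and leaves the other healthy, the healthy leaf makes the
 * recursion return B_TRUE, so the log is still treated as holding
 * unreplayed content.
 */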
3120219089Spjd
3121219089Spjd/*
3122219089Spjd * Expand a vdev if possible.
3123219089Spjd */
3124219089Spjdvoid
3125219089Spjdvdev_expand(vdev_t *vd, uint64_t txg)
3126219089Spjd{
3127219089Spjd	ASSERT(vd->vdev_top == vd);
3128219089Spjd	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3129219089Spjd
3130219089Spjd	if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3131219089Spjd		VERIFY(vdev_metaslab_init(vd, txg) == 0);
3132219089Spjd		vdev_config_dirty(vd);
3133219089Spjd	}
3134219089Spjd}
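/*
 * Hedged caller sketch (loosely modeled on the autoexpand path): after
 * a device grows, the caller reopens the top-level vdev under SCL_ALL
 * and then asks for the new metaslabs in the upcoming txg:
 *
 *	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 *	vdev_reopen(tvd);
 *	vdev_expand(tvd, spa_last_synced_txg(spa) + 1);
 *	spa_config_exit(spa, SCL_ALL, FTAG);
 */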
3135219089Spjd
3136219089Spjd/*
3137219089Spjd * Split a vdev.
3138219089Spjd */
3139219089Spjdvoid
3140219089Spjdvdev_split(vdev_t *vd)
3141219089Spjd{
3142219089Spjd	vdev_t *cvd, *pvd = vd->vdev_parent;
3143219089Spjd
3144219089Spjd	vdev_remove_child(pvd, vd);
3145219089Spjd	vdev_compact_children(pvd);
3146219089Spjd
3147219089Spjd	cvd = pvd->vdev_child[0];
3148219089Spjd	if (pvd->vdev_children == 1) {
3149219089Spjd		vdev_remove_parent(cvd);
3150219089Spjd		cvd->vdev_splitting = B_TRUE;
3151219089Spjd	}
3152219089Spjd	vdev_propagate_state(cvd);
3153219089Spjd}
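/*
 * Hedged note: this is the per-vdev half of "zpool split" (cf.
 * spa_vdev_split_mirror()).  Detaching a child from a two-way mirror
 * leaves the parent with a single child, so the mirror interposition
 * is removed and the surviving child is tagged vdev_splitting, e.g.:
 *
 *	vdev_split(mirror_vd->vdev_child[1]);
 */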
3154