spa.c revision 211931
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * This file contains all the routines used when modifying on-disk SPA state.
29 * This includes opening, importing, destroying, exporting a pool, and syncing a
30 * pool.
31 */
32
33#include <sys/zfs_context.h>
34#include <sys/fm/fs/zfs.h>
35#include <sys/spa_impl.h>
36#include <sys/zio.h>
37#include <sys/zio_checksum.h>
38#include <sys/zio_compress.h>
39#include <sys/dmu.h>
40#include <sys/dmu_tx.h>
41#include <sys/zap.h>
42#include <sys/zil.h>
43#include <sys/vdev_impl.h>
44#include <sys/metaslab.h>
45#include <sys/uberblock_impl.h>
46#include <sys/txg.h>
47#include <sys/avl.h>
48#include <sys/dmu_traverse.h>
49#include <sys/dmu_objset.h>
50#include <sys/unique.h>
51#include <sys/dsl_pool.h>
52#include <sys/dsl_dataset.h>
53#include <sys/dsl_dir.h>
54#include <sys/dsl_prop.h>
55#include <sys/dsl_synctask.h>
56#include <sys/fs/zfs.h>
57#include <sys/arc.h>
58#include <sys/callb.h>
59#include <sys/sunddi.h>
60#include <sys/spa_boot.h>
61
62#include "zfs_prop.h"
63#include "zfs_comutil.h"
64
65/* Check hostid on import? */
66static int check_hostid = 1;
67
68SYSCTL_DECL(_vfs_zfs);
69TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
70SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
71    "Check hostid on import?");
72
73enum zti_modes {
74	zti_mode_fixed,			/* value is # of threads (min 1) */
75	zti_mode_online_percent,	/* value is % of online CPUs */
76	zti_mode_tune,			/* fill from zio_taskq_tune_* */
77	zti_mode_null,			/* don't create a taskq */
78	zti_nmodes
79};
80
81#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
82#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
83#define	ZTI_TUNE	{ zti_mode_tune, 0 }
84#define	ZTI_NULL	{ zti_mode_null, 0 }
85
86#define	ZTI_ONE		ZTI_FIX(1)
87
88typedef struct zio_taskq_info {
89	enum zti_modes zti_mode;
90	uint_t zti_value;
91} zio_taskq_info_t;
92
93static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
94		"issue", "issue_high", "intr", "intr_high"
95};
96
97/*
98 * Define the taskq threads for the following I/O types:
99 * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
100 */
101const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
102	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
103	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
104	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
105	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
106	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
107	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
108	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
109};
110
111enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
112uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
113
114static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
115static boolean_t spa_has_active_shared_spare(spa_t *spa);
116
117/*
118 * ==========================================================================
119 * SPA properties routines
120 * ==========================================================================
121 */
122
123/*
124 * Add a (source=src, propname=propval) list to an nvlist.
125 */
126static void
127spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
128    uint64_t intval, zprop_source_t src)
129{
130	const char *propname = zpool_prop_to_name(prop);
131	nvlist_t *propval;
132
133	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
134	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
135
136	if (strval != NULL)
137		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
138	else
139		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
140
141	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
142	nvlist_free(propval);
143}
144
145/*
146 * Get property values from the spa configuration.
147 */
148static void
149spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
150{
151	uint64_t size;
152	uint64_t used;
153	uint64_t cap, version;
154	zprop_source_t src = ZPROP_SRC_NONE;
155	spa_config_dirent_t *dp;
156
157	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
158
159	if (spa->spa_root_vdev != NULL) {
160		size = spa_get_space(spa);
161		used = spa_get_alloc(spa);
162		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
163		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
164		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
165		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
166		    size - used, src);
167
168		cap = (size == 0) ? 0 : (used * 100 / size);
169		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
170
171		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
172		    spa->spa_root_vdev->vdev_state, src);
173
174		version = spa_version(spa);
175		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
176			src = ZPROP_SRC_DEFAULT;
177		else
178			src = ZPROP_SRC_LOCAL;
179		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
180	}
181
182	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
183
184	if (spa->spa_root != NULL)
185		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
186		    0, ZPROP_SRC_LOCAL);
187
188	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
189		if (dp->scd_path == NULL) {
190			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
191			    "none", 0, ZPROP_SRC_LOCAL);
192		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
193			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
194			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
195		}
196	}
197}
198
199/*
200 * Get zpool property values.
201 */
202int
203spa_prop_get(spa_t *spa, nvlist_t **nvp)
204{
205	zap_cursor_t zc;
206	zap_attribute_t za;
207	objset_t *mos = spa->spa_meta_objset;
208	int err;
209
210	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
211
212	mutex_enter(&spa->spa_props_lock);
213
214	/*
215	 * Get properties from the spa config.
216	 */
217	spa_prop_get_config(spa, nvp);
218
219	/* If no pool property object, no more props to get. */
220	if (spa->spa_pool_props_object == 0) {
221		mutex_exit(&spa->spa_props_lock);
222		return (0);
223	}
224
225	/*
226	 * Get properties from the MOS pool property object.
227	 */
228	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
229	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
230	    zap_cursor_advance(&zc)) {
231		uint64_t intval = 0;
232		char *strval = NULL;
233		zprop_source_t src = ZPROP_SRC_DEFAULT;
234		zpool_prop_t prop;
235
236		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
237			continue;
238
239		switch (za.za_integer_length) {
240		case 8:
241			/* integer property */
242			if (za.za_first_integer !=
243			    zpool_prop_default_numeric(prop))
244				src = ZPROP_SRC_LOCAL;
245
246			if (prop == ZPOOL_PROP_BOOTFS) {
247				dsl_pool_t *dp;
248				dsl_dataset_t *ds = NULL;
249
250				dp = spa_get_dsl(spa);
251				rw_enter(&dp->dp_config_rwlock, RW_READER);
252				if (err = dsl_dataset_hold_obj(dp,
253				    za.za_first_integer, FTAG, &ds)) {
254					rw_exit(&dp->dp_config_rwlock);
255					break;
256				}
257
258				strval = kmem_alloc(
259				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
260				    KM_SLEEP);
261				dsl_dataset_name(ds, strval);
262				dsl_dataset_rele(ds, FTAG);
263				rw_exit(&dp->dp_config_rwlock);
264			} else {
265				strval = NULL;
266				intval = za.za_first_integer;
267			}
268
269			spa_prop_add_list(*nvp, prop, strval, intval, src);
270
271			if (strval != NULL)
272				kmem_free(strval,
273				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
274
275			break;
276
277		case 1:
278			/* string property */
279			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
280			err = zap_lookup(mos, spa->spa_pool_props_object,
281			    za.za_name, 1, za.za_num_integers, strval);
282			if (err) {
283				kmem_free(strval, za.za_num_integers);
284				break;
285			}
286			spa_prop_add_list(*nvp, prop, strval, 0, src);
287			kmem_free(strval, za.za_num_integers);
288			break;
289
290		default:
291			break;
292		}
293	}
294	zap_cursor_fini(&zc);
295	mutex_exit(&spa->spa_props_lock);
296out:
297	if (err && err != ENOENT) {
298		nvlist_free(*nvp);
299		*nvp = NULL;
300		return (err);
301	}
302
303	return (0);
304}
305
306/*
307 * Validate the given pool properties nvlist and modify the list
308 * for the property values to be set.
309 */
310static int
311spa_prop_validate(spa_t *spa, nvlist_t *props)
312{
313	nvpair_t *elem;
314	int error = 0, reset_bootfs = 0;
315	uint64_t objnum;
316
317	elem = NULL;
318	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
319		zpool_prop_t prop;
320		char *propname, *strval;
321		uint64_t intval;
322		objset_t *os;
323		char *slash;
324
325		propname = nvpair_name(elem);
326
327		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
328			return (EINVAL);
329
330		switch (prop) {
331		case ZPOOL_PROP_VERSION:
332			error = nvpair_value_uint64(elem, &intval);
333			if (!error &&
334			    (intval < spa_version(spa) || intval > SPA_VERSION))
335				error = EINVAL;
336			break;
337
338		case ZPOOL_PROP_DELEGATION:
339		case ZPOOL_PROP_AUTOREPLACE:
340		case ZPOOL_PROP_LISTSNAPS:
341			error = nvpair_value_uint64(elem, &intval);
342			if (!error && intval > 1)
343				error = EINVAL;
344			break;
345
346		case ZPOOL_PROP_BOOTFS:
347			/*
348			 * If the pool version is less than SPA_VERSION_BOOTFS,
349			 * or the pool is still being created (version == 0),
350			 * the bootfs property cannot be set.
351			 */
352			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
353				error = ENOTSUP;
354				break;
355			}
356
357			/*
358			 * Make sure the vdev config is bootable
359			 */
360			if (!vdev_is_bootable(spa->spa_root_vdev)) {
361				error = ENOTSUP;
362				break;
363			}
364
365			reset_bootfs = 1;
366
367			error = nvpair_value_string(elem, &strval);
368
369			if (!error) {
370				uint64_t compress;
371
372				if (strval == NULL || strval[0] == '\0') {
373					objnum = zpool_prop_default_numeric(
374					    ZPOOL_PROP_BOOTFS);
375					break;
376				}
377
378				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
379				    DS_MODE_USER | DS_MODE_READONLY, &os))
380					break;
381
382				/* We don't support gzip bootable datasets */
383				if ((error = dsl_prop_get_integer(strval,
384				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
385				    &compress, NULL)) == 0 &&
386				    !BOOTFS_COMPRESS_VALID(compress)) {
387					error = ENOTSUP;
388				} else {
389					objnum = dmu_objset_id(os);
390				}
391				dmu_objset_close(os);
392			}
393			break;
394
395		case ZPOOL_PROP_FAILUREMODE:
396			error = nvpair_value_uint64(elem, &intval);
397			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
398			    intval > ZIO_FAILURE_MODE_PANIC))
399				error = EINVAL;
400
401			/*
402			 * This is a special case which only occurs when
403			 * the pool has completely failed. This allows
404			 * the user to change the in-core failmode property
405			 * without syncing it out to disk (I/Os might
406			 * currently be blocked). We do this by returning
407			 * EIO to the caller (spa_prop_set) to trick it
408			 * into thinking we encountered a property validation
409			 * error.
410			 */
411			if (!error && spa_suspended(spa)) {
412				spa->spa_failmode = intval;
413				error = EIO;
414			}
415			break;
416
417		case ZPOOL_PROP_CACHEFILE:
418			if ((error = nvpair_value_string(elem, &strval)) != 0)
419				break;
420
421			if (strval[0] == '\0')
422				break;
423
424			if (strcmp(strval, "none") == 0)
425				break;
426
427			if (strval[0] != '/') {
428				error = EINVAL;
429				break;
430			}
431
432			slash = strrchr(strval, '/');
433			ASSERT(slash != NULL);
434
435			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
436			    strcmp(slash, "/..") == 0)
437				error = EINVAL;
438			break;
439		}
440
441		if (error)
442			break;
443	}
444
445	if (!error && reset_bootfs) {
446		error = nvlist_remove(props,
447		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
448
449		if (!error) {
450			error = nvlist_add_uint64(props,
451			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
452		}
453	}
454
455	return (error);
456}
457
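/*
 * Handle an in-core change to the 'cachefile' property: record the new
 * path (or the default, or none) at the head of the spa's config dirent
 * list and, if need_sync is set, request an async config update so the
 * cache file is rewritten.
 */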
458void
459spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
460{
461	char *cachefile;
462	spa_config_dirent_t *dp;
463
464	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
465	    &cachefile) != 0)
466		return;
467
468	dp = kmem_alloc(sizeof (spa_config_dirent_t),
469	    KM_SLEEP);
470
471	if (cachefile[0] == '\0')
472		dp->scd_path = spa_strdup(spa_config_path);
473	else if (strcmp(cachefile, "none") == 0)
474		dp->scd_path = NULL;
475	else
476		dp->scd_path = spa_strdup(cachefile);
477
478	list_insert_head(&spa->spa_config_list, dp);
479	if (need_sync)
480		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
481}
482
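/*
 * Set zpool properties.  After validation, any property other than
 * cachefile and altroot is pushed out to disk via the spa_sync_props()
 * sync task.
 */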
483int
484spa_prop_set(spa_t *spa, nvlist_t *nvp)
485{
486	int error;
487	nvpair_t *elem;
488	boolean_t need_sync = B_FALSE;
489	zpool_prop_t prop;
490
491	if ((error = spa_prop_validate(spa, nvp)) != 0)
492		return (error);
493
494	elem = NULL;
495	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
496		if ((prop = zpool_name_to_prop(
497		    nvpair_name(elem))) == ZPROP_INVAL)
498			return (EINVAL);
499
500		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
501			continue;
502
503		need_sync = B_TRUE;
504		break;
505	}
506
507	if (need_sync)
508		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
509		    spa, nvp, 3));
510	else
511		return (0);
512}
513
514/*
515 * If the bootfs property value is dsobj, clear it.
516 */
517void
518spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
519{
520	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
521		VERIFY(zap_remove(spa->spa_meta_objset,
522		    spa->spa_pool_props_object,
523		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
524		spa->spa_bootfs = 0;
525	}
526}
527
528/*
529 * ==========================================================================
530 * SPA state manipulation (open/create/destroy/import/export)
531 * ==========================================================================
532 */
533
534static int
535spa_error_entry_compare(const void *a, const void *b)
536{
537	spa_error_entry_t *sa = (spa_error_entry_t *)a;
538	spa_error_entry_t *sb = (spa_error_entry_t *)b;
539	int ret;
540
541	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
542	    sizeof (zbookmark_t));
543
544	if (ret < 0)
545		return (-1);
546	else if (ret > 0)
547		return (1);
548	else
549		return (0);
550}
551
552/*
553 * Utility function which retrieves copies of the current logs and
554 * re-initializes them in the process.
555 */
556void
557spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
558{
559	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
560
561	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
562	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
563
564	avl_create(&spa->spa_errlist_scrub,
565	    spa_error_entry_compare, sizeof (spa_error_entry_t),
566	    offsetof(spa_error_entry_t, se_avl));
567	avl_create(&spa->spa_errlist_last,
568	    spa_error_entry_compare, sizeof (spa_error_entry_t),
569	    offsetof(spa_error_entry_t, se_avl));
570}
571
572/*
573 * Activate an uninitialized pool.
574 */
575static void
576spa_activate(spa_t *spa, int mode)
577{
578	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
579
580	spa->spa_state = POOL_STATE_ACTIVE;
581	spa->spa_mode = mode;
582
583	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
584	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
585
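
	/*
	 * Create the zio taskqs for each (I/O type, taskq type) pair,
	 * using the modes and thread counts from the zio_taskqs[][] table.
	 */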
586	for (int t = 0; t < ZIO_TYPES; t++) {
587		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
588			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
589			enum zti_modes mode = ztip->zti_mode;
590			uint_t value = ztip->zti_value;
591			char name[32];
592
593			(void) snprintf(name, sizeof (name),
594			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
595
596			if (mode == zti_mode_tune) {
597				mode = zio_taskq_tune_mode;
598				value = zio_taskq_tune_value;
599				if (mode == zti_mode_tune)
600					mode = zti_mode_online_percent;
601			}
602
603			switch (mode) {
604			case zti_mode_fixed:
605				ASSERT3U(value, >=, 1);
606				value = MAX(value, 1);
607
608				spa->spa_zio_taskq[t][q] = taskq_create(name,
609				    value, maxclsyspri, 50, INT_MAX,
610				    TASKQ_PREPOPULATE);
611				break;
612
613			case zti_mode_online_percent:
614				spa->spa_zio_taskq[t][q] = taskq_create(name,
615				    value, maxclsyspri, 50, INT_MAX,
616				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
617				break;
618
619			case zti_mode_null:
620				spa->spa_zio_taskq[t][q] = NULL;
621				break;
622
623			case zti_mode_tune:
624			default:
625				panic("unrecognized mode for "
626				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
627				    "in spa_activate()",
628				    t, q, mode, value);
629				break;
630			}
631		}
632	}
633
634	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
635	    offsetof(vdev_t, vdev_config_dirty_node));
636	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
637	    offsetof(vdev_t, vdev_state_dirty_node));
638
639	txg_list_create(&spa->spa_vdev_txg_list,
640	    offsetof(struct vdev, vdev_txg_node));
641
642	avl_create(&spa->spa_errlist_scrub,
643	    spa_error_entry_compare, sizeof (spa_error_entry_t),
644	    offsetof(spa_error_entry_t, se_avl));
645	avl_create(&spa->spa_errlist_last,
646	    spa_error_entry_compare, sizeof (spa_error_entry_t),
647	    offsetof(spa_error_entry_t, se_avl));
648}
649
650/*
651 * Opposite of spa_activate().
652 */
653static void
654spa_deactivate(spa_t *spa)
655{
656	ASSERT(spa->spa_sync_on == B_FALSE);
657	ASSERT(spa->spa_dsl_pool == NULL);
658	ASSERT(spa->spa_root_vdev == NULL);
659	ASSERT(spa->spa_async_zio_root == NULL);
660	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
661
662	txg_list_destroy(&spa->spa_vdev_txg_list);
663
664	list_destroy(&spa->spa_config_dirty_list);
665	list_destroy(&spa->spa_state_dirty_list);
666
667	for (int t = 0; t < ZIO_TYPES; t++) {
668		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
669			if (spa->spa_zio_taskq[t][q] != NULL)
670				taskq_destroy(spa->spa_zio_taskq[t][q]);
671			spa->spa_zio_taskq[t][q] = NULL;
672		}
673	}
674
675	metaslab_class_destroy(spa->spa_normal_class);
676	spa->spa_normal_class = NULL;
677
678	metaslab_class_destroy(spa->spa_log_class);
679	spa->spa_log_class = NULL;
680
681	/*
682	 * If this was part of an import or the open otherwise failed, we may
683	 * still have errors left in the queues.  Empty them just in case.
684	 */
685	spa_errlog_drain(spa);
686
687	avl_destroy(&spa->spa_errlist_scrub);
688	avl_destroy(&spa->spa_errlist_last);
689
690	spa->spa_state = POOL_STATE_UNINITIALIZED;
691}
692
693/*
694 * Verify a pool configuration, and construct the vdev tree appropriately.  This
695 * will create all the necessary vdevs in the appropriate layout, with each vdev
696 * in the CLOSED state.  This will prep the pool before open/creation/import.
697 * All vdev validation is done by the vdev_alloc() routine.
698 */
699static int
700spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
701    uint_t id, int atype)
702{
703	nvlist_t **child;
704	uint_t c, children;
705	int error;
706
707	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
708		return (error);
709
710	if ((*vdp)->vdev_ops->vdev_op_leaf)
711		return (0);
712
713	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
714	    &child, &children);
715
716	if (error == ENOENT)
717		return (0);
718
719	if (error) {
720		vdev_free(*vdp);
721		*vdp = NULL;
722		return (EINVAL);
723	}
724
725	for (c = 0; c < children; c++) {
726		vdev_t *vd;
727		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
728		    atype)) != 0) {
729			vdev_free(*vdp);
730			*vdp = NULL;
731			return (error);
732		}
733	}
734
735	ASSERT(*vdp != NULL);
736
737	return (0);
738}
739
740/*
741 * Opposite of spa_load().
742 */
743static void
744spa_unload(spa_t *spa)
745{
746	int i;
747
748	ASSERT(MUTEX_HELD(&spa_namespace_lock));
749
750	/*
751	 * Stop async tasks.
752	 */
753	spa_async_suspend(spa);
754
755	/*
756	 * Stop syncing.
757	 */
758	if (spa->spa_sync_on) {
759		txg_sync_stop(spa->spa_dsl_pool);
760		spa->spa_sync_on = B_FALSE;
761	}
762
763	/*
764	 * Wait for any outstanding async I/O to complete.
765	 */
766	if (spa->spa_async_zio_root != NULL) {
767		(void) zio_wait(spa->spa_async_zio_root);
768		spa->spa_async_zio_root = NULL;
769	}
770
771	/*
772	 * Close the dsl pool.
773	 */
774	if (spa->spa_dsl_pool) {
775		dsl_pool_close(spa->spa_dsl_pool);
776		spa->spa_dsl_pool = NULL;
777	}
778
779	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
780
781	/*
782	 * Drop and purge level 2 cache
783	 */
784	spa_l2cache_drop(spa);
785
786	/*
787	 * Close all vdevs.
788	 */
789	if (spa->spa_root_vdev)
790		vdev_free(spa->spa_root_vdev);
791	ASSERT(spa->spa_root_vdev == NULL);
792
793	for (i = 0; i < spa->spa_spares.sav_count; i++)
794		vdev_free(spa->spa_spares.sav_vdevs[i]);
795	if (spa->spa_spares.sav_vdevs) {
796		kmem_free(spa->spa_spares.sav_vdevs,
797		    spa->spa_spares.sav_count * sizeof (void *));
798		spa->spa_spares.sav_vdevs = NULL;
799	}
800	if (spa->spa_spares.sav_config) {
801		nvlist_free(spa->spa_spares.sav_config);
802		spa->spa_spares.sav_config = NULL;
803	}
804	spa->spa_spares.sav_count = 0;
805
806	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
807		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
808	if (spa->spa_l2cache.sav_vdevs) {
809		kmem_free(spa->spa_l2cache.sav_vdevs,
810		    spa->spa_l2cache.sav_count * sizeof (void *));
811		spa->spa_l2cache.sav_vdevs = NULL;
812	}
813	if (spa->spa_l2cache.sav_config) {
814		nvlist_free(spa->spa_l2cache.sav_config);
815		spa->spa_l2cache.sav_config = NULL;
816	}
817	spa->spa_l2cache.sav_count = 0;
818
819	spa->spa_async_suspended = 0;
820
821	spa_config_exit(spa, SCL_ALL, FTAG);
822}
823
824/*
825 * Load (or re-load) the current list of vdevs describing the active spares for
826 * this pool.  When this is called, we have some form of basic information in
827 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
828 * then re-generate a more complete list including status information.
829 */
830static void
831spa_load_spares(spa_t *spa)
832{
833	nvlist_t **spares;
834	uint_t nspares;
835	int i;
836	vdev_t *vd, *tvd;
837
838	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
839
840	/*
841	 * First, close and free any existing spare vdevs.
842	 */
843	for (i = 0; i < spa->spa_spares.sav_count; i++) {
844		vd = spa->spa_spares.sav_vdevs[i];
845
846		/* Undo the call to spa_activate() below */
847		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
848		    B_FALSE)) != NULL && tvd->vdev_isspare)
849			spa_spare_remove(tvd);
850		vdev_close(vd);
851		vdev_free(vd);
852	}
853
854	if (spa->spa_spares.sav_vdevs)
855		kmem_free(spa->spa_spares.sav_vdevs,
856		    spa->spa_spares.sav_count * sizeof (void *));
857
858	if (spa->spa_spares.sav_config == NULL)
859		nspares = 0;
860	else
861		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
862		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
863
864	spa->spa_spares.sav_count = (int)nspares;
865	spa->spa_spares.sav_vdevs = NULL;
866
867	if (nspares == 0)
868		return;
869
870	/*
871	 * Construct the array of vdevs, opening them to get status in the
872	 * process.   For each spare, there are potentially two different vdev_t
873	 * structures associated with it: one in the list of spares (used only
874	 * for basic validation purposes) and one in the active vdev
875	 * configuration (if it's spared in).  During this phase we open and
876	 * validate each vdev on the spare list.  If the vdev also exists in the
877	 * active configuration, then we also mark this vdev as an active spare.
878	 */
879	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
880	    KM_SLEEP);
881	for (i = 0; i < spa->spa_spares.sav_count; i++) {
882		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
883		    VDEV_ALLOC_SPARE) == 0);
884		ASSERT(vd != NULL);
885
886		spa->spa_spares.sav_vdevs[i] = vd;
887
888		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
889		    B_FALSE)) != NULL) {
890			if (!tvd->vdev_isspare)
891				spa_spare_add(tvd);
892
893			/*
894			 * We only mark the spare active if we were successfully
895			 * able to load the vdev.  Otherwise, importing a pool
896			 * with a bad active spare would result in strange
897			 * behavior, because multiple pools would think the spare
898			 * is actively in use.
899			 *
900			 * There is a vulnerability here to an equally bizarre
901			 * circumstance, where a dead active spare is later
902			 * brought back to life (onlined or otherwise).  Given
903			 * the rarity of this scenario, and the extra complexity
904			 * it adds, we ignore the possibility.
905			 */
906			if (!vdev_is_dead(tvd))
907				spa_spare_activate(tvd);
908		}
909
910		vd->vdev_top = vd;
911		vd->vdev_aux = &spa->spa_spares;
912
913		if (vdev_open(vd) != 0)
914			continue;
915
916		if (vdev_validate_aux(vd) == 0)
917			spa_spare_add(vd);
918	}
919
920	/*
921	 * Recompute the stashed list of spares, with status information
922	 * this time.
923	 */
924	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
925	    DATA_TYPE_NVLIST_ARRAY) == 0);
926
927	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
928	    KM_SLEEP);
929	for (i = 0; i < spa->spa_spares.sav_count; i++)
930		spares[i] = vdev_config_generate(spa,
931		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
932	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
933	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
934	for (i = 0; i < spa->spa_spares.sav_count; i++)
935		nvlist_free(spares[i]);
936	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
937}
938
939/*
940 * Load (or re-load) the current list of vdevs describing the active l2cache for
941 * this pool.  When this is called, we have some form of basic information in
942 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
943 * then re-generate a more complete list including status information.
944 * Devices which are already active have their details maintained, and are
945 * not re-opened.
946 */
947static void
948spa_load_l2cache(spa_t *spa)
949{
950	nvlist_t **l2cache;
951	uint_t nl2cache;
952	int i, j, oldnvdevs;
953	uint64_t guid, size;
954	vdev_t *vd, **oldvdevs, **newvdevs;
955	spa_aux_vdev_t *sav = &spa->spa_l2cache;
956
957	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
958
959	if (sav->sav_config != NULL) {
960		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
961		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
962		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
963	} else {
964		nl2cache = 0;
965	}
966
967	oldvdevs = sav->sav_vdevs;
968	oldnvdevs = sav->sav_count;
969	sav->sav_vdevs = NULL;
970	sav->sav_count = 0;
971
972	/*
973	 * Process new nvlist of vdevs.
974	 */
975	for (i = 0; i < nl2cache; i++) {
976		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
977		    &guid) == 0);
978
979		newvdevs[i] = NULL;
980		for (j = 0; j < oldnvdevs; j++) {
981			vd = oldvdevs[j];
982			if (vd != NULL && guid == vd->vdev_guid) {
983				/*
984				 * Retain previous vdev for add/remove ops.
985				 */
986				newvdevs[i] = vd;
987				oldvdevs[j] = NULL;
988				break;
989			}
990		}
991
992		if (newvdevs[i] == NULL) {
993			/*
994			 * Create new vdev
995			 */
996			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
997			    VDEV_ALLOC_L2CACHE) == 0);
998			ASSERT(vd != NULL);
999			newvdevs[i] = vd;
1000
1001			/*
1002			 * Commit this vdev as an l2cache device,
1003			 * even if it fails to open.
1004			 */
1005			spa_l2cache_add(vd);
1006
1007			vd->vdev_top = vd;
1008			vd->vdev_aux = sav;
1009
1010			spa_l2cache_activate(vd);
1011
1012			if (vdev_open(vd) != 0)
1013				continue;
1014
1015			(void) vdev_validate_aux(vd);
1016
1017			if (!vdev_is_dead(vd)) {
1018				size = vdev_get_rsize(vd);
1019				l2arc_add_vdev(spa, vd,
1020				    VDEV_LABEL_START_SIZE,
1021				    size - VDEV_LABEL_START_SIZE);
1022			}
1023		}
1024	}
1025
1026	/*
1027	 * Purge vdevs that were dropped
1028	 */
1029	for (i = 0; i < oldnvdevs; i++) {
1030		uint64_t pool;
1031
1032		vd = oldvdevs[i];
1033		if (vd != NULL) {
1034			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1035			    pool != 0ULL && l2arc_vdev_present(vd))
1036				l2arc_remove_vdev(vd);
1037			(void) vdev_close(vd);
1038			spa_l2cache_remove(vd);
1039		}
1040	}
1041
1042	if (oldvdevs)
1043		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1044
1045	if (sav->sav_config == NULL)
1046		goto out;
1047
1048	sav->sav_vdevs = newvdevs;
1049	sav->sav_count = (int)nl2cache;
1050
1051	/*
1052	 * Recompute the stashed list of l2cache devices, with status
1053	 * information this time.
1054	 */
1055	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1056	    DATA_TYPE_NVLIST_ARRAY) == 0);
1057
1058	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1059	for (i = 0; i < sav->sav_count; i++)
1060		l2cache[i] = vdev_config_generate(spa,
1061		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
1062	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1063	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1064out:
1065	for (i = 0; i < sav->sav_count; i++)
1066		nvlist_free(l2cache[i]);
1067	if (sav->sav_count)
1068		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1069}
1070
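/*
 * Read a packed nvlist from the MOS object 'obj' (its size is stored in
 * the object's bonus buffer) and unpack it into '*value'.
 */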
1071static int
1072load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1073{
1074	dmu_buf_t *db;
1075	char *packed = NULL;
1076	size_t nvsize = 0;
1077	int error;
1078	*value = NULL;
1079
1080	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1081	nvsize = *(uint64_t *)db->db_data;
1082	dmu_buf_rele(db, FTAG);
1083
1084	packed = kmem_alloc(nvsize, KM_SLEEP);
1085	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1086	    DMU_READ_PREFETCH);
1087	if (error == 0)
1088		error = nvlist_unpack(packed, nvsize, value, 0);
1089	kmem_free(packed, nvsize);
1090
1091	return (error);
1092}
1093
1094/*
1095 * Checks to see if the given vdev could not be opened, in which case we post a
1096 * sysevent to notify the autoreplace code that the device has been removed.
1097 */
1098static void
1099spa_check_removed(vdev_t *vd)
1100{
1101	int c;
1102
1103	for (c = 0; c < vd->vdev_children; c++)
1104		spa_check_removed(vd->vdev_child[c]);
1105
1106	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1107		zfs_post_autoreplace(vd->vdev_spa, vd);
1108		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1109	}
1110}
1111
1112/*
1113 * Check for missing log devices
1114 */
1115int
1116spa_check_logs(spa_t *spa)
1117{
1118	switch (spa->spa_log_state) {
1119	case SPA_LOG_MISSING:
1120		/* need to recheck in case slog has been restored */
1121	case SPA_LOG_UNKNOWN:
1122		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1123		    DS_FIND_CHILDREN)) {
1124			spa->spa_log_state = SPA_LOG_MISSING;
1125			return (1);
1126		}
1127		break;
1128
1129	case SPA_LOG_CLEAR:
1130		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
1131		    DS_FIND_CHILDREN);
1132		break;
1133	}
1134	spa->spa_log_state = SPA_LOG_GOOD;
1135	return (0);
1136}
1137
1138/*
1139 * Load an existing storage pool, using the pool's builtin spa_config as a
1140 * source of configuration information.
1141 */
1142static int
1143spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1144{
1145	int error = 0;
1146	nvlist_t *nvroot = NULL;
1147	vdev_t *rvd;
1148	uberblock_t *ub = &spa->spa_uberblock;
1149	uint64_t config_cache_txg = spa->spa_config_txg;
1150	uint64_t pool_guid;
1151	uint64_t version;
1152	uint64_t autoreplace = 0;
1153	int orig_mode = spa->spa_mode;
1154	char *ereport = FM_EREPORT_ZFS_POOL;
1155
1156	/*
1157	 * If this is an untrusted config, access the pool in read-only mode.
1158	 * This prevents things like resilvering recently removed devices.
1159	 */
1160	if (!mosconfig)
1161		spa->spa_mode = FREAD;
1162
1163	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1164
1165	spa->spa_load_state = state;
1166
1167	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1168	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1169		error = EINVAL;
1170		goto out;
1171	}
1172
1173	/*
1174	 * Versioning wasn't explicitly added to the label until later, so if
1175	 * it's not present treat it as the initial version.
1176	 */
1177	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1178		version = SPA_VERSION_INITIAL;
1179
1180	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1181	    &spa->spa_config_txg);
1182
1183	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1184	    spa_guid_exists(pool_guid, 0)) {
1185		error = EEXIST;
1186		goto out;
1187	}
1188
1189	spa->spa_load_guid = pool_guid;
1190
1191	/*
1192	 * Create "The Godfather" zio to hold all async IOs
1193	 */
1194	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1195	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1196
1197	/*
1198	 * Parse the configuration into a vdev tree.  We explicitly set the
1199	 * value that will be returned by spa_version() since parsing the
1200	 * configuration requires knowing the version number.
1201	 */
1202	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1203	spa->spa_ubsync.ub_version = version;
1204	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1205	spa_config_exit(spa, SCL_ALL, FTAG);
1206
1207	if (error != 0)
1208		goto out;
1209
1210	ASSERT(spa->spa_root_vdev == rvd);
1211	ASSERT(spa_guid(spa) == pool_guid);
1212
1213	/*
1214	 * Try to open all vdevs, loading each label in the process.
1215	 */
1216	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1217	error = vdev_open(rvd);
1218	spa_config_exit(spa, SCL_ALL, FTAG);
1219	if (error != 0)
1220		goto out;
1221
1222	/*
1223	 * We need to validate the vdev labels against the configuration that
1224	 * we have in hand, which is dependent on the setting of mosconfig. If
1225	 * mosconfig is true then we're validating the vdev labels based on
1226	 * that config. Otherwise, we're validating against the cached config
1227	 * (zpool.cache) that was read when we loaded the zfs module, and then
1228	 * later we will recursively call spa_load() and validate against
1229	 * the vdev config.
1230	 */
1231	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1232	error = vdev_validate(rvd);
1233	spa_config_exit(spa, SCL_ALL, FTAG);
1234	if (error != 0)
1235		goto out;
1236
1237	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1238		error = ENXIO;
1239		goto out;
1240	}
1241
1242	/*
1243	 * Find the best uberblock.
1244	 */
1245	vdev_uberblock_load(NULL, rvd, ub);
1246
1247	/*
1248	 * If we weren't able to find a single valid uberblock, return failure.
1249	 */
1250	if (ub->ub_txg == 0) {
1251		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1252		    VDEV_AUX_CORRUPT_DATA);
1253		error = ENXIO;
1254		goto out;
1255	}
1256
1257	/*
1258	 * If the pool is newer than the code, we can't open it.
1259	 */
1260	if (ub->ub_version > SPA_VERSION) {
1261		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1262		    VDEV_AUX_VERSION_NEWER);
1263		error = ENOTSUP;
1264		goto out;
1265	}
1266
1267	/*
1268	 * If the vdev guid sum doesn't match the uberblock, we have an
1269	 * incomplete configuration.
1270	 */
1271	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1272		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1273		    VDEV_AUX_BAD_GUID_SUM);
1274		error = ENXIO;
1275		goto out;
1276	}
1277
1278	/*
1279	 * Initialize internal SPA structures.
1280	 */
1281	spa->spa_state = POOL_STATE_ACTIVE;
1282	spa->spa_ubsync = spa->spa_uberblock;
1283	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1284	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1285	if (error) {
1286		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1287		    VDEV_AUX_CORRUPT_DATA);
1288		goto out;
1289	}
1290	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1291
1292	if (zap_lookup(spa->spa_meta_objset,
1293	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1294	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1295		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1296		    VDEV_AUX_CORRUPT_DATA);
1297		error = EIO;
1298		goto out;
1299	}
1300
1301	if (!mosconfig) {
1302		nvlist_t *newconfig;
1303		uint64_t hostid;
1304
1305		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1306			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1307			    VDEV_AUX_CORRUPT_DATA);
1308			error = EIO;
1309			goto out;
1310		}
1311
1312		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
1313		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1314			char *hostname;
1315			unsigned long myhostid = 0;
1316
1317			VERIFY(nvlist_lookup_string(newconfig,
1318			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1319
1320			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1321			if (check_hostid && hostid != 0 && myhostid != 0 &&
1322			    (unsigned long)hostid != myhostid) {
1323				cmn_err(CE_WARN, "pool '%s' could not be "
1324				    "loaded as it was last accessed by "
1325				    "another system (host: %s hostid: 0x%lx). "
1326				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1327				    spa_name(spa), hostname,
1328				    (unsigned long)hostid);
1329				error = EBADF;
1330				goto out;
1331			}
1332		}
1333
1334		spa_config_set(spa, newconfig);
1335		spa_unload(spa);
1336		spa_deactivate(spa);
1337		spa_activate(spa, orig_mode);
1338
1339		return (spa_load(spa, newconfig, state, B_TRUE));
1340	}
1341
1342	if (zap_lookup(spa->spa_meta_objset,
1343	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1344	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1345		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1346		    VDEV_AUX_CORRUPT_DATA);
1347		error = EIO;
1348		goto out;
1349	}
1350
1351	/*
1352	 * Load the bit that tells us to use the new accounting function
1353	 * (raid-z deflation).  If we have an older pool, this will not
1354	 * be present.
1355	 */
1356	error = zap_lookup(spa->spa_meta_objset,
1357	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1358	    sizeof (uint64_t), 1, &spa->spa_deflate);
1359	if (error != 0 && error != ENOENT) {
1360		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1361		    VDEV_AUX_CORRUPT_DATA);
1362		error = EIO;
1363		goto out;
1364	}
1365
1366	/*
1367	 * Load the persistent error log.  If we have an older pool, this will
1368	 * not be present.
1369	 */
1370	error = zap_lookup(spa->spa_meta_objset,
1371	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1372	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1373	if (error != 0 && error != ENOENT) {
1374		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1375		    VDEV_AUX_CORRUPT_DATA);
1376		error = EIO;
1377		goto out;
1378	}
1379
1380	error = zap_lookup(spa->spa_meta_objset,
1381	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1382	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1383	if (error != 0 && error != ENOENT) {
1384		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1385		    VDEV_AUX_CORRUPT_DATA);
1386		error = EIO;
1387		goto out;
1388	}
1389
1390	/*
1391	 * Load the history object.  If we have an older pool, this
1392	 * will not be present.
1393	 */
1394	error = zap_lookup(spa->spa_meta_objset,
1395	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1396	    sizeof (uint64_t), 1, &spa->spa_history);
1397	if (error != 0 && error != ENOENT) {
1398		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1399		    VDEV_AUX_CORRUPT_DATA);
1400		error = EIO;
1401		goto out;
1402	}
1403
1404	/*
1405	 * Load any hot spares for this pool.
1406	 */
1407	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1408	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1409	if (error != 0 && error != ENOENT) {
1410		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1411		    VDEV_AUX_CORRUPT_DATA);
1412		error = EIO;
1413		goto out;
1414	}
1415	if (error == 0) {
1416		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1417		if (load_nvlist(spa, spa->spa_spares.sav_object,
1418		    &spa->spa_spares.sav_config) != 0) {
1419			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1420			    VDEV_AUX_CORRUPT_DATA);
1421			error = EIO;
1422			goto out;
1423		}
1424
1425		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1426		spa_load_spares(spa);
1427		spa_config_exit(spa, SCL_ALL, FTAG);
1428	}
1429
1430	/*
1431	 * Load any level 2 ARC devices for this pool.
1432	 */
1433	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1434	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1435	    &spa->spa_l2cache.sav_object);
1436	if (error != 0 && error != ENOENT) {
1437		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1438		    VDEV_AUX_CORRUPT_DATA);
1439		error = EIO;
1440		goto out;
1441	}
1442	if (error == 0) {
1443		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1444		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1445		    &spa->spa_l2cache.sav_config) != 0) {
1446			vdev_set_state(rvd, B_TRUE,
1447			    VDEV_STATE_CANT_OPEN,
1448			    VDEV_AUX_CORRUPT_DATA);
1449			error = EIO;
1450			goto out;
1451		}
1452
1453		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1454		spa_load_l2cache(spa);
1455		spa_config_exit(spa, SCL_ALL, FTAG);
1456	}
1457
1458	if (spa_check_logs(spa)) {
1459		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1460		    VDEV_AUX_BAD_LOG);
1461		error = ENXIO;
1462		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
1463		goto out;
1464	}
1465
1466
1467	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1468
1469	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1470	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1471
1472	if (error && error != ENOENT) {
1473		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1474		    VDEV_AUX_CORRUPT_DATA);
1475		error = EIO;
1476		goto out;
1477	}
1478
1479	if (error == 0) {
1480		(void) zap_lookup(spa->spa_meta_objset,
1481		    spa->spa_pool_props_object,
1482		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1483		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1484		(void) zap_lookup(spa->spa_meta_objset,
1485		    spa->spa_pool_props_object,
1486		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1487		    sizeof (uint64_t), 1, &autoreplace);
1488		(void) zap_lookup(spa->spa_meta_objset,
1489		    spa->spa_pool_props_object,
1490		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1491		    sizeof (uint64_t), 1, &spa->spa_delegation);
1492		(void) zap_lookup(spa->spa_meta_objset,
1493		    spa->spa_pool_props_object,
1494		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1495		    sizeof (uint64_t), 1, &spa->spa_failmode);
1496	}
1497
1498	/*
1499	 * If the 'autoreplace' property is set, then post a resource notifying
1500	 * the ZFS DE that it should not issue any faults for unopenable
1501	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1502	 * unopenable vdevs so that the normal autoreplace handler can take
1503	 * over.
1504	 */
1505	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1506		spa_check_removed(spa->spa_root_vdev);
1507
1508	/*
1509	 * Load the vdev state for all toplevel vdevs.
1510	 */
1511	vdev_load(rvd);
1512
1513	/*
1514	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1515	 */
1516	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1517	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1518	spa_config_exit(spa, SCL_ALL, FTAG);
1519
1520	/*
1521	 * Check the state of the root vdev.  If it can't be opened, it
1522	 * indicates one or more toplevel vdevs are faulted.
1523	 */
1524	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1525		error = ENXIO;
1526		goto out;
1527	}
1528
1529	if (spa_writeable(spa)) {
1530		dmu_tx_t *tx;
1531		int need_update = B_FALSE;
1532
1533		ASSERT(state != SPA_LOAD_TRYIMPORT);
1534
1535		/*
1536		 * Claim log blocks that haven't been committed yet.
1537		 * This must all happen in a single txg.
1538		 */
1539		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1540		    spa_first_txg(spa));
1541		(void) dmu_objset_find(spa_name(spa),
1542		    zil_claim, tx, DS_FIND_CHILDREN);
1543		dmu_tx_commit(tx);
1544
1545		spa->spa_sync_on = B_TRUE;
1546		txg_sync_start(spa->spa_dsl_pool);
1547
1548		/*
1549		 * Wait for all claims to sync.
1550		 */
1551		txg_wait_synced(spa->spa_dsl_pool, 0);
1552
1553		/*
1554		 * If the config cache is stale, or we have uninitialized
1555		 * metaslabs (see spa_vdev_add()), then update the config.
1556		 *
1557		 * If spa_load_verbatim is true, trust the current
1558		 * in-core spa_config and update the disk labels.
1559		 */
1560		if (config_cache_txg != spa->spa_config_txg ||
1561		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
1562			need_update = B_TRUE;
1563
1564		for (int c = 0; c < rvd->vdev_children; c++)
1565			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1566				need_update = B_TRUE;
1567
1568		/*
1569		 * Update the config cache asynchronously in case we're the
1570		 * root pool, in which case the config cache isn't writable yet.
1571		 */
1572		if (need_update)
1573			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1574
1575		/*
1576		 * Check all DTLs to see if anything needs resilvering.
1577		 */
1578		if (vdev_resilver_needed(rvd, NULL, NULL))
1579			spa_async_request(spa, SPA_ASYNC_RESILVER);
1580	}
1581
1582	error = 0;
1583out:
1584	spa->spa_minref = refcount_count(&spa->spa_refcount);
1585	if (error && error != EBADF)
1586		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1587	spa->spa_load_state = SPA_LOAD_NONE;
1588	spa->spa_ena = 0;
1589
1590	return (error);
1591}
1592
1593/*
1594 * Pool Open/Import
1595 *
1596 * The import case is identical to an open except that the configuration is sent
1597 * down from userland, instead of grabbed from the configuration cache.  For the
1598 * case of an open, the pool configuration will exist in the
1599 * POOL_STATE_UNINITIALIZED state.
1600 *
1601 * The stats information (gen/count/ustats) is used to gather vdev statistics at
1602 * the same time the pool is opened, without keeping the spa_t around in some
1603 * ambiguous state.
1604 */
1605static int
1606spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1607{
1608	spa_t *spa;
1609	int error;
1610	int locked = B_FALSE;
1611
1612	*spapp = NULL;
1613
1614	/*
1615	 * As disgusting as this is, we need to support recursive calls to this
1616	 * function because dsl_dir_open() is called during spa_load(), and ends
1617	 * up calling spa_open() again.  The real fix is to figure out how to
1618	 * avoid dsl_dir_open() calling this in the first place.
1619	 */
1620	if (mutex_owner(&spa_namespace_lock) != curthread) {
1621		mutex_enter(&spa_namespace_lock);
1622		locked = B_TRUE;
1623	}
1624
1625	if ((spa = spa_lookup(pool)) == NULL) {
1626		if (locked)
1627			mutex_exit(&spa_namespace_lock);
1628		return (ENOENT);
1629	}
1630	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1631
1632		spa_activate(spa, spa_mode_global);
1633
1634		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1635
1636		if (error == EBADF) {
1637			/*
1638			 * If vdev_validate() returns failure (indicated by
1639			 * EBADF), it means that one of the vdevs indicates
1640			 * that the pool has been exported or destroyed.  If
1641			 * this is the case, the config cache is out of sync and
1642			 * we should remove the pool from the namespace.
1643			 */
1644			spa_unload(spa);
1645			spa_deactivate(spa);
1646			spa_config_sync(spa, B_TRUE, B_TRUE);
1647			spa_remove(spa);
1648			if (locked)
1649				mutex_exit(&spa_namespace_lock);
1650			return (ENOENT);
1651		}
1652
1653		if (error) {
1654			/*
1655			 * We can't open the pool, but we still have useful
1656			 * information: the state of each vdev after the
1657			 * attempted vdev_open().  Return this to the user.
1658			 */
1659			if (config != NULL && spa->spa_root_vdev != NULL)
1660				*config = spa_config_generate(spa, NULL, -1ULL,
1661				    B_TRUE);
1662			spa_unload(spa);
1663			spa_deactivate(spa);
1664			spa->spa_last_open_failed = B_TRUE;
1665			if (locked)
1666				mutex_exit(&spa_namespace_lock);
1667			*spapp = NULL;
1668			return (error);
1669		} else {
1670			spa->spa_last_open_failed = B_FALSE;
1671		}
1672	}
1673
1674	spa_open_ref(spa, tag);
1675
1676	if (locked)
1677		mutex_exit(&spa_namespace_lock);
1678
1679	*spapp = spa;
1680
1681	if (config != NULL)
1682		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1683
1684	return (0);
1685}
1686
1687int
1688spa_open(const char *name, spa_t **spapp, void *tag)
1689{
1690	return (spa_open_common(name, spapp, tag, NULL));
1691}
1692
1693/*
1694 * Lookup the given spa_t, incrementing the inject count in the process,
1695 * preventing it from being exported or destroyed.
1696 */
1697spa_t *
1698spa_inject_addref(char *name)
1699{
1700	spa_t *spa;
1701
1702	mutex_enter(&spa_namespace_lock);
1703	if ((spa = spa_lookup(name)) == NULL) {
1704		mutex_exit(&spa_namespace_lock);
1705		return (NULL);
1706	}
1707	spa->spa_inject_ref++;
1708	mutex_exit(&spa_namespace_lock);
1709
1710	return (spa);
1711}
1712
1713void
1714spa_inject_delref(spa_t *spa)
1715{
1716	mutex_enter(&spa_namespace_lock);
1717	spa->spa_inject_ref--;
1718	mutex_exit(&spa_namespace_lock);
1719}
1720
1721/*
1722 * Add spares device information to the nvlist.
1723 */
1724static void
1725spa_add_spares(spa_t *spa, nvlist_t *config)
1726{
1727	nvlist_t **spares;
1728	uint_t i, nspares;
1729	nvlist_t *nvroot;
1730	uint64_t guid;
1731	vdev_stat_t *vs;
1732	uint_t vsc;
1733	uint64_t pool;
1734
1735	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
1736
1737	if (spa->spa_spares.sav_count == 0)
1738		return;
1739
1740	VERIFY(nvlist_lookup_nvlist(config,
1741	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1742	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1743	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1744	if (nspares != 0) {
1745		VERIFY(nvlist_add_nvlist_array(nvroot,
1746		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1747		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1748		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1749
1750		/*
1751		 * Go through and find any spares which have since been
1752		 * repurposed as an active spare.  If this is the case, update
1753		 * their status appropriately.
1754		 */
1755		for (i = 0; i < nspares; i++) {
1756			VERIFY(nvlist_lookup_uint64(spares[i],
1757			    ZPOOL_CONFIG_GUID, &guid) == 0);
1758			if (spa_spare_exists(guid, &pool, NULL) &&
1759			    pool != 0ULL) {
1760				VERIFY(nvlist_lookup_uint64_array(
1761				    spares[i], ZPOOL_CONFIG_STATS,
1762				    (uint64_t **)&vs, &vsc) == 0);
1763				vs->vs_state = VDEV_STATE_CANT_OPEN;
1764				vs->vs_aux = VDEV_AUX_SPARED;
1765			}
1766		}
1767	}
1768}
1769
1770/*
1771 * Add l2cache device information to the nvlist, including vdev stats.
1772 */
1773static void
1774spa_add_l2cache(spa_t *spa, nvlist_t *config)
1775{
1776	nvlist_t **l2cache;
1777	uint_t i, j, nl2cache;
1778	nvlist_t *nvroot;
1779	uint64_t guid;
1780	vdev_t *vd;
1781	vdev_stat_t *vs;
1782	uint_t vsc;
1783
1784	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
1785
1786	if (spa->spa_l2cache.sav_count == 0)
1787		return;
1788
1789	VERIFY(nvlist_lookup_nvlist(config,
1790	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1791	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1792	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1793	if (nl2cache != 0) {
1794		VERIFY(nvlist_add_nvlist_array(nvroot,
1795		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1796		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1797		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1798
1799		/*
1800		 * Update level 2 cache device stats.
1801		 */
1802
1803		for (i = 0; i < nl2cache; i++) {
1804			VERIFY(nvlist_lookup_uint64(l2cache[i],
1805			    ZPOOL_CONFIG_GUID, &guid) == 0);
1806
1807			vd = NULL;
1808			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1809				if (guid ==
1810				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1811					vd = spa->spa_l2cache.sav_vdevs[j];
1812					break;
1813				}
1814			}
1815			ASSERT(vd != NULL);
1816
1817			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1818			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1819			vdev_get_stats(vd, vs);
1820		}
1821	}
1822}
1823
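/*
 * Open the named pool (if possible) and return its configuration nvlist,
 * augmented with error counts, suspension state, and spare/l2cache status.
 * The alternate root is copied out even if the pool is faulted.
 */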
1824int
1825spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1826{
1827	int error;
1828	spa_t *spa;
1829
1830	*config = NULL;
1831	error = spa_open_common(name, &spa, FTAG, config);
1832
1833	if (spa != NULL) {
1834		/*
1835		 * This still leaves a window of inconsistency where the spares
1836		 * or l2cache devices could change and the config would be
1837		 * self-inconsistent.
1838		 */
1839		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1840
1841		if (*config != NULL) {
1842			VERIFY(nvlist_add_uint64(*config,
1843			    ZPOOL_CONFIG_ERRCOUNT,
1844			    spa_get_errlog_size(spa)) == 0);
1845
1846			if (spa_suspended(spa))
1847				VERIFY(nvlist_add_uint64(*config,
1848				    ZPOOL_CONFIG_SUSPENDED,
1849				    spa->spa_failmode) == 0);
1850
1851			spa_add_spares(spa, *config);
1852			spa_add_l2cache(spa, *config);
1853		}
1854	}
1855
1856	/*
1857	 * We want to get the alternate root even for faulted pools, so we cheat
1858	 * and call spa_lookup() directly.
1859	 */
1860	if (altroot) {
1861		if (spa == NULL) {
1862			mutex_enter(&spa_namespace_lock);
1863			spa = spa_lookup(name);
1864			if (spa)
1865				spa_altroot(spa, altroot, buflen);
1866			else
1867				altroot[0] = '\0';
1868			spa = NULL;
1869			mutex_exit(&spa_namespace_lock);
1870		} else {
1871			spa_altroot(spa, altroot, buflen);
1872		}
1873	}
1874
1875	if (spa != NULL) {
1876		spa_config_exit(spa, SCL_CONFIG, FTAG);
1877		spa_close(spa, FTAG);
1878	}
1879
1880	return (error);
1881}
1882
1883/*
1884 * Validate that the auxiliary device array is well formed.  We must have an
1885 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
1886 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1887 * specified, as long as they are well-formed.
1888 */
1889static int
1890spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1891    spa_aux_vdev_t *sav, const char *config, uint64_t version,
1892    vdev_labeltype_t label)
1893{
1894	nvlist_t **dev;
1895	uint_t i, ndev;
1896	vdev_t *vd;
1897	int error;
1898
1899	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1900
1901	/*
1902	 * It's acceptable to have no devs specified.
1903	 */
1904	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1905		return (0);
1906
1907	if (ndev == 0)
1908		return (EINVAL);
1909
1910	/*
1911	 * Make sure the pool is formatted with a version that supports this
1912	 * device type.
1913	 */
1914	if (spa_version(spa) < version)
1915		return (ENOTSUP);
1916
1917	/*
1918	 * Set the pending device list so we correctly handle device in-use
1919	 * checking.
1920	 */
1921	sav->sav_pending = dev;
1922	sav->sav_npending = ndev;
1923
1924	for (i = 0; i < ndev; i++) {
1925		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1926		    mode)) != 0)
1927			goto out;
1928
1929		if (!vd->vdev_ops->vdev_op_leaf) {
1930			vdev_free(vd);
1931			error = EINVAL;
1932			goto out;
1933		}
1934
1935		/*
1936		 * The L2ARC currently only supports disk devices in
1937		 * kernel context.  For user-level testing, we allow it.
1938		 */
1939#ifdef _KERNEL
1940		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1941		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1942			error = ENOTBLK;
1943			goto out;
1944		}
1945#endif
1946		vd->vdev_top = vd;
1947
1948		if ((error = vdev_open(vd)) == 0 &&
1949		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1950			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1951			    vd->vdev_guid) == 0);
1952		}
1953
1954		vdev_free(vd);
1955
1956		if (error &&
1957		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1958			goto out;
1959		else
1960			error = 0;
1961	}
1962
1963out:
1964	sav->sav_pending = NULL;
1965	sav->sav_npending = 0;
1966	return (error);
1967}
1968
1969static int
1970spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1971{
1972	int error;
1973
1974	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1975
1976	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1977	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1978	    VDEV_LABEL_SPARE)) != 0) {
1979		return (error);
1980	}
1981
1982	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1983	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1984	    VDEV_LABEL_L2CACHE));
1985}
1986
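/*
 * Merge the given list of auxiliary devices (spares or l2cache) into
 * sav_config under the named array, concatenating with any devices that
 * are already present.
 */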
1987static void
1988spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1989    const char *config)
1990{
1991	int i;
1992
1993	if (sav->sav_config != NULL) {
1994		nvlist_t **olddevs;
1995		uint_t oldndevs;
1996		nvlist_t **newdevs;
1997
1998		/*
1999		 * Generate a new dev list by concatenating with the
2000		 * current dev list.
2001		 */
2002		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2003		    &olddevs, &oldndevs) == 0);
2004
2005		newdevs = kmem_alloc(sizeof (void *) *
2006		    (ndevs + oldndevs), KM_SLEEP);
2007		for (i = 0; i < oldndevs; i++)
2008			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2009			    KM_SLEEP) == 0);
2010		for (i = 0; i < ndevs; i++)
2011			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2012			    KM_SLEEP) == 0);
2013
2014		VERIFY(nvlist_remove(sav->sav_config, config,
2015		    DATA_TYPE_NVLIST_ARRAY) == 0);
2016
2017		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2018		    config, newdevs, ndevs + oldndevs) == 0);
2019		for (i = 0; i < oldndevs + ndevs; i++)
2020			nvlist_free(newdevs[i]);
2021		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2022	} else {
2023		/*
2024		 * Generate a new dev list.
2025		 */
2026		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2027		    KM_SLEEP) == 0);
2028		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2029		    devs, ndevs) == 0);
2030	}
2031}
2032
2033/*
2034 * Stop and drop level 2 ARC devices
2035 */
2036void
2037spa_l2cache_drop(spa_t *spa)
2038{
2039	vdev_t *vd;
2040	int i;
2041	spa_aux_vdev_t *sav = &spa->spa_l2cache;
2042
2043	for (i = 0; i < sav->sav_count; i++) {
2044		uint64_t pool;
2045
2046		vd = sav->sav_vdevs[i];
2047		ASSERT(vd != NULL);
2048
2049		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2050		    pool != 0ULL && l2arc_vdev_present(vd))
2051			l2arc_remove_vdev(vd);
2052		if (vd->vdev_isl2cache)
2053			spa_l2cache_remove(vd);
2054		vdev_clear_stats(vd);
2055		(void) vdev_close(vd);
2056	}
2057}
2058
2059/*
2060 * Pool Creation
2061 */
2062int
2063spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2064    const char *history_str, nvlist_t *zplprops)
2065{
2066	spa_t *spa;
2067	char *altroot = NULL;
2068	vdev_t *rvd;
2069	dsl_pool_t *dp;
2070	dmu_tx_t *tx;
2071	int c, error = 0;
2072	uint64_t txg = TXG_INITIAL;
2073	nvlist_t **spares, **l2cache;
2074	uint_t nspares, nl2cache;
2075	uint64_t version;
2076
2077	/*
2078	 * If this pool already exists, return failure.
2079	 */
2080	mutex_enter(&spa_namespace_lock);
2081	if (spa_lookup(pool) != NULL) {
2082		mutex_exit(&spa_namespace_lock);
2083		return (EEXIST);
2084	}
2085
2086	/*
2087	 * Allocate a new spa_t structure.
2088	 */
2089	(void) nvlist_lookup_string(props,
2090	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2091	spa = spa_add(pool, altroot);
2092	spa_activate(spa, spa_mode_global);
2093
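	/*
	 * Start the uberblock one txg behind TXG_INITIAL so that the pool's
	 * first sync happens in TXG_INITIAL itself.
	 */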
2094	spa->spa_uberblock.ub_txg = txg - 1;
2095
2096	if (props && (error = spa_prop_validate(spa, props))) {
2097		spa_deactivate(spa);
2098		spa_remove(spa);
2099		mutex_exit(&spa_namespace_lock);
2100		return (error);
2101	}
2102
2103	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2104	    &version) != 0)
2105		version = SPA_VERSION;
2106	ASSERT(version <= SPA_VERSION);
2107	spa->spa_uberblock.ub_version = version;
2108	spa->spa_ubsync = spa->spa_uberblock;
2109
2110	/*
2111	 * Create "The Godfather" zio to hold all async IOs
2112	 */
2113	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2114	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2115
2116	/*
2117	 * Create the root vdev.
2118	 */
2119	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2120
2121	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2122
2123	ASSERT(error != 0 || rvd != NULL);
2124	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2125
2126	if (error == 0 && !zfs_allocatable_devs(nvroot))
2127		error = EINVAL;
2128
2129	if (error == 0 &&
2130	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2131	    (error = spa_validate_aux(spa, nvroot, txg,
2132	    VDEV_ALLOC_ADD)) == 0) {
2133		for (c = 0; c < rvd->vdev_children; c++)
2134			vdev_init(rvd->vdev_child[c], txg);
2135		vdev_config_dirty(rvd);
2136	}
2137
2138	spa_config_exit(spa, SCL_ALL, FTAG);
2139
2140	if (error != 0) {
2141		spa_unload(spa);
2142		spa_deactivate(spa);
2143		spa_remove(spa);
2144		mutex_exit(&spa_namespace_lock);
2145		return (error);
2146	}
2147
2148	/*
2149	 * Get the list of spares, if specified.
2150	 */
2151	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2152	    &spares, &nspares) == 0) {
2153		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
2154		    KM_SLEEP) == 0);
2155		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2156		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2157		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2158		spa_load_spares(spa);
2159		spa_config_exit(spa, SCL_ALL, FTAG);
2160		spa->spa_spares.sav_sync = B_TRUE;
2161	}
2162
2163	/*
2164	 * Get the list of level 2 cache devices, if specified.
2165	 */
2166	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2167	    &l2cache, &nl2cache) == 0) {
2168		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2169		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2170		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2171		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2172		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2173		spa_load_l2cache(spa);
2174		spa_config_exit(spa, SCL_ALL, FTAG);
2175		spa->spa_l2cache.sav_sync = B_TRUE;
2176	}
2177
2178	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
2179	spa->spa_meta_objset = dp->dp_meta_objset;
2180
2181	tx = dmu_tx_create_assigned(dp, txg);
2182
2183	/*
2184	 * Create the pool config object.
2185	 */
2186	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
2187	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
2188	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2189
2190	if (zap_add(spa->spa_meta_objset,
2191	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2192	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2193		cmn_err(CE_PANIC, "failed to add pool config");
2194	}
2195
2196	/* Newly created pools with the right version are always deflated. */
2197	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2198		spa->spa_deflate = TRUE;
2199		if (zap_add(spa->spa_meta_objset,
2200		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2201		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2202			cmn_err(CE_PANIC, "failed to add deflate");
2203		}
2204	}
2205
2206	/*
2207	 * Create the deferred-free bplist object.  Turn off compression
2208	 * because sync-to-convergence takes longer if the blocksize
2209	 * keeps changing.
2210	 */
2211	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2212	    1 << 14, tx);
2213	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2214	    ZIO_COMPRESS_OFF, tx);
2215
2216	if (zap_add(spa->spa_meta_objset,
2217	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2218	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2219		cmn_err(CE_PANIC, "failed to add bplist");
2220	}
2221
2222	/*
2223	 * Create the pool's history object.
2224	 */
2225	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2226		spa_history_create_obj(spa, tx);
2227
2228	/*
2229	 * Set pool properties.
2230	 */
2231	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2232	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2233	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2234	if (props != NULL) {
2235		spa_configfile_set(spa, props, B_FALSE);
2236		spa_sync_props(spa, props, CRED(), tx);
2237	}
2238
2239	dmu_tx_commit(tx);
2240
2241	spa->spa_sync_on = B_TRUE;
2242	txg_sync_start(spa->spa_dsl_pool);
2243
2244	/*
2245	 * We explicitly wait for the first transaction to complete so that our
2246	 * bean counters are appropriately updated.
2247	 */
2248	txg_wait_synced(spa->spa_dsl_pool, txg);
2249
2250	spa_config_sync(spa, B_FALSE, B_TRUE);
2251
2252	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2253		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2254
2255	spa->spa_minref = refcount_count(&spa->spa_refcount);
2256
2257	mutex_exit(&spa_namespace_lock);
2258
2259	return (0);
2260}
2261
2262#ifdef sun
2263#ifdef _KERNEL
2264/*
2265 * Build a "root" vdev for a top-level vdev read in from a rootpool
2266 * device label.
2267 */
2268static void
2269spa_build_rootpool_config(nvlist_t *config)
2270{
2271	nvlist_t *nvtop, *nvroot;
2272	uint64_t pgid;
2273
2274	/*
2275	 * Add this top-level vdev to the child array.
2276	 */
2277	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
2278	    == 0);
2279	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
2280	    == 0);
2281
2282	/*
2283	 * Put this pool's top-level vdevs into a root vdev.
2284	 */
2285	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2286	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
2287	    == 0);
2288	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
2289	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
2290	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
2291	    &nvtop, 1) == 0);
2292
2293	/*
2294	 * Replace the existing vdev_tree with the new root vdev in
2295	 * this pool's configuration (remove the old, add the new).
2296	 */
2297	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
2298	nvlist_free(nvroot);
2299}
2300
2301/*
2302 * Get the root pool information from the root disk, then import the root pool
2303 * at system boot time.
2304 */
2305extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
2306
2307int
2308spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf,
2309    uint64_t *besttxg)
2310{
2311	nvlist_t *config;
2312	uint64_t txg;
2313	int error;
2314
2315	if (error = vdev_disk_read_rootlabel(devpath, devid, &config))
2316		return (error);
2317
2318	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
2319
2320	if (bestconf != NULL)
2321		*bestconf = config;
2322	else
2323		nvlist_free(config);
2324	*besttxg = txg;
2325	return (0);
2326}
2327
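/*
 * A root device is only usable if its label does not mark it offline,
 * faulted, or removed.
 */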
2328boolean_t
2329spa_rootdev_validate(nvlist_t *nv)
2330{
2331	uint64_t ival;
2332
2333	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
2334	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
2335	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
2336		return (B_FALSE);
2337
2338	return (B_TRUE);
2339}
2340
2341
2342/*
2343 * Given the boot device's physical path or devid, check if the device
2344 * is in a valid state.  If so, return the configuration from the vdev
2345 * label.
2346 */
2347int
2348spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
2349{
2350	nvlist_t *conf = NULL;
2351	uint64_t txg = 0;
2352	nvlist_t *nvtop, **child;
2353	char *type;
2354	char *bootpath = NULL;
2355	uint_t children, c;
2356	char *tmp;
2357	int error;
2358
2359	if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
2360		*tmp = '\0';
2361	if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
2362		cmn_err(CE_NOTE, "error reading device label");
2363		return (error);
2364	}
2365	if (txg == 0) {
2366		cmn_err(CE_NOTE, "this device is detached");
2367		nvlist_free(conf);
2368		return (EINVAL);
2369	}
2370
2371	VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
2372	    &nvtop) == 0);
2373	VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
2374
2375	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
2376		if (spa_rootdev_validate(nvtop)) {
2377			goto out;
2378		} else {
2379			nvlist_free(conf);
2380			return (EINVAL);
2381		}
2382	}
2383
2384	ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
2385
2386	VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
2387	    &child, &children) == 0);
2388
2389	/*
2390	 * Go through the vdevs in the mirror to see if the given device
2391	 * has the most recent txg. Only the device with the most
2392	 * recent txg has valid information and should be booted.
2393	 */
2394	for (c = 0; c < children; c++) {
2395		char *cdevid, *cpath;
2396		uint64_t tmptxg;
2397
2398		cpath = NULL;
2399		cdevid = NULL;
2400		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
2401		    &cpath) != 0 && nvlist_lookup_string(child[c],
2402		    ZPOOL_CONFIG_DEVID, &cdevid) != 0)
2403			return (EINVAL);
2404		if ((spa_check_rootconf(cpath, cdevid, NULL,
2405		    &tmptxg) == 0) && (tmptxg > txg)) {
2406			txg = tmptxg;
2407			VERIFY(nvlist_lookup_string(child[c],
2408			    ZPOOL_CONFIG_PATH, &bootpath) == 0);
2409		}
2410	}
2411
2412	/* Does the best device match the one we've booted from? */
2413	if (bootpath) {
2414		cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
2415		return (EINVAL);
2416	}
2417out:
2418	*bestconf = conf;
2419	return (0);
2420}
2421
2422/*
2423 * Import a root pool.
2424 *
2425 * For x86, devpath_list will consist of the devid and/or physpath name of
2426 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
2427 * The GRUB "findroot" command will return the vdev we should boot.
2428 *
2429 * For Sparc, devpath_list consists of the physpath name of the booting device,
2430 * whether the root pool is a single-device pool or a mirrored pool.
2431 * e.g.
2432 *	"/pci@1f,0/ide@d/disk@0,0:a"
2433 */
2434int
2435spa_import_rootpool(char *devpath, char *devid)
2436{
2437	nvlist_t *conf = NULL;
2438	char *pname;
2439	int error;
2440	spa_t *spa;
2441
2442	/*
2443	 * Get the vdev pathname and configuration from the most
2444	 * recently updated vdev (highest txg).
2445	 */
2446	if (error = spa_get_rootconf(devpath, devid, &conf))
2447		goto msg_out;
2448
2449	/*
2450	 * Add a type "root" vdev to the config.
2451	 */
2452	spa_build_rootpool_config(conf);
2453
2454	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2455
2456	mutex_enter(&spa_namespace_lock);
2457	if ((spa = spa_lookup(pname)) != NULL) {
2458		/*
2459		 * Remove the existing root pool from the namespace so that we
2460		 * can replace it with the correct config we just read in.
2461		 */
2462		spa_remove(spa);
2463	}
2464
2465	spa = spa_add(pname, NULL);
2466	spa->spa_is_root = B_TRUE;
2467	spa->spa_load_verbatim = B_TRUE;
2468
2469	VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0);
2470	mutex_exit(&spa_namespace_lock);
2471
2472	nvlist_free(conf);
2473	return (0);
2474
2475msg_out:
2476	cmn_err(CE_NOTE, "\n"
2477	    "  ***************************************************  \n"
2478	    "  *  This device is not bootable!                   *  \n"
2479	    "  *  It is either offlined or detached or faulted.  *  \n"
2480	    "  *  Please try to boot from a different device.    *  \n"
2481	    "  ***************************************************  ");
2482
2483	return (error);
2484}
2485#endif
2486#endif	/* sun */
2487
2488/*
2489 * Take a pool and insert it into the namespace as if it had been loaded at
2490 * boot.
2491 */
2492int
2493spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
2494{
2495	spa_t *spa;
2496	char *altroot = NULL;
2497
2498	mutex_enter(&spa_namespace_lock);
2499	if (spa_lookup(pool) != NULL) {
2500		mutex_exit(&spa_namespace_lock);
2501		return (EEXIST);
2502	}
2503
2504	(void) nvlist_lookup_string(props,
2505	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2506	spa = spa_add(pool, altroot);
2507
2508	spa->spa_load_verbatim = B_TRUE;
2509
2510	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
2511
2512	if (props != NULL)
2513		spa_configfile_set(spa, props, B_FALSE);
2514
2515	spa_config_sync(spa, B_FALSE, B_TRUE);
2516
2517	mutex_exit(&spa_namespace_lock);
2518
2519	return (0);
2520}
2521
2522/*
2523 * Import a non-root pool into the system.
2524 */
2525int
2526spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2527{
2528	spa_t *spa;
2529	char *altroot = NULL;
2530	int error;
2531	nvlist_t *nvroot;
2532	nvlist_t **spares, **l2cache;
2533	uint_t nspares, nl2cache;
2534
2535	/*
2536	 * If a pool with this name exists, return failure.
2537	 */
2538	mutex_enter(&spa_namespace_lock);
2539	if ((spa = spa_lookup(pool)) != NULL) {
2540		mutex_exit(&spa_namespace_lock);
2541		return (EEXIST);
2542	}
2543
2544	/*
2545	 * Create and initialize the spa structure.
2546	 */
2547	(void) nvlist_lookup_string(props,
2548	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2549	spa = spa_add(pool, altroot);
2550	spa_activate(spa, spa_mode_global);
2551
2552	/*
2553	 * Don't start async tasks until we know everything is healthy.
2554	 */
2555	spa_async_suspend(spa);
2556
2557	/*
2558	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
2559	 * because the user-supplied config is actually the one to trust when
2560	 * doing an import.
2561	 */
2562	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
2563
2564	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2565	/*
2566	 * Toss any existing sparelist, as it is no longer valid and
2567	 * conflicts with spa_has_spare().
2568	 */
2569	if (spa->spa_spares.sav_config) {
2570		nvlist_free(spa->spa_spares.sav_config);
2571		spa->spa_spares.sav_config = NULL;
2572		spa_load_spares(spa);
2573	}
2574	if (spa->spa_l2cache.sav_config) {
2575		nvlist_free(spa->spa_l2cache.sav_config);
2576		spa->spa_l2cache.sav_config = NULL;
2577		spa_load_l2cache(spa);
2578	}
2579
2580	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2581	    &nvroot) == 0);
2582	if (error == 0)
2583		error = spa_validate_aux(spa, nvroot, -1ULL,
2584		    VDEV_ALLOC_SPARE);
2585	if (error == 0)
2586		error = spa_validate_aux(spa, nvroot, -1ULL,
2587		    VDEV_ALLOC_L2CACHE);
2588	spa_config_exit(spa, SCL_ALL, FTAG);
2589
2590	if (props != NULL)
2591		spa_configfile_set(spa, props, B_FALSE);
2592
2593	if (error != 0 || (props && spa_writeable(spa) &&
2594	    (error = spa_prop_set(spa, props)))) {
2595		spa_unload(spa);
2596		spa_deactivate(spa);
2597		spa_remove(spa);
2598		mutex_exit(&spa_namespace_lock);
2599		return (error);
2600	}
2601
2602	spa_async_resume(spa);
2603
2604	/*
2605	 * Override any spares and level 2 cache devices as specified by
2606	 * the user, as these may have correct device names/devids, etc.
2607	 */
2608	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2609	    &spares, &nspares) == 0) {
2610		if (spa->spa_spares.sav_config)
2611			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2612			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2613		else
2614			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2615			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2616		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2617		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2618		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2619		spa_load_spares(spa);
2620		spa_config_exit(spa, SCL_ALL, FTAG);
2621		spa->spa_spares.sav_sync = B_TRUE;
2622	}
2623	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2624	    &l2cache, &nl2cache) == 0) {
2625		if (spa->spa_l2cache.sav_config)
2626			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2627			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2628		else
2629			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2630			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2631		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2632		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2633		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2634		spa_load_l2cache(spa);
2635		spa_config_exit(spa, SCL_ALL, FTAG);
2636		spa->spa_l2cache.sav_sync = B_TRUE;
2637	}
2638
2639	if (spa_writeable(spa)) {
2640		/*
2641		 * Update the config cache to include the newly-imported pool.
2642		 */
2643		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2644	}
2645
2646	mutex_exit(&spa_namespace_lock);
2647
2648	return (0);
2649}
2650
2651/*
2652 * This (illegal) pool name is used when temporarily importing a spa_t in order
2653 * to get the vdev stats associated with the imported devices.
2654 */
2655#define	TRYIMPORT_NAME	"$import"
2656
2657nvlist_t *
2658spa_tryimport(nvlist_t *tryconfig)
2659{
2660	nvlist_t *config = NULL;
2661	char *poolname;
2662	spa_t *spa;
2663	uint64_t state;
2664	int error;
2665
2666	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2667		return (NULL);
2668
2669	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2670		return (NULL);
2671
2672	/*
2673	 * Create and initialize the spa structure.
2674	 */
2675	mutex_enter(&spa_namespace_lock);
2676	spa = spa_add(TRYIMPORT_NAME, NULL);
2677	spa_activate(spa, FREAD);
2678
2679	/*
2680	 * Pass off the heavy lifting to spa_load().
2681	 * Pass TRUE for mosconfig because the user-supplied config
2682	 * is actually the one to trust when doing an import.
2683	 */
2684	error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2685
2686	/*
2687	 * If 'tryconfig' was at least parsable, return the current config.
2688	 */
2689	if (spa->spa_root_vdev != NULL) {
2690		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2691		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2692		    poolname) == 0);
2693		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2694		    state) == 0);
2695		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2696		    spa->spa_uberblock.ub_timestamp) == 0);
2697
2698		/*
2699		 * If the bootfs property exists on this pool then we
2700		 * copy it out so that external consumers can tell which
2701		 * pools are bootable.
2702		 */
2703		if ((!error || error == EEXIST) && spa->spa_bootfs) {
2704			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2705
2706			/*
2707			 * We have to play games with the name since the
2708			 * pool was opened as TRYIMPORT_NAME.
2709			 */
2710			if (dsl_dsobj_to_dsname(spa_name(spa),
2711			    spa->spa_bootfs, tmpname) == 0) {
2712				char *cp;
2713				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2714
2715				cp = strchr(tmpname, '/');
2716				if (cp == NULL) {
2717					(void) strlcpy(dsname, tmpname,
2718					    MAXPATHLEN);
2719				} else {
2720					(void) snprintf(dsname, MAXPATHLEN,
2721					    "%s/%s", poolname, ++cp);
2722				}
2723				VERIFY(nvlist_add_string(config,
2724				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
2725				kmem_free(dsname, MAXPATHLEN);
2726			}
2727			kmem_free(tmpname, MAXPATHLEN);
2728		}
2729
2730		/*
2731		 * Add the list of hot spares and level 2 cache devices.
2732		 */
2733		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2734		spa_add_spares(spa, config);
2735		spa_add_l2cache(spa, config);
2736		spa_config_exit(spa, SCL_CONFIG, FTAG);
2737	}
2738
2739	spa_unload(spa);
2740	spa_deactivate(spa);
2741	spa_remove(spa);
2742	mutex_exit(&spa_namespace_lock);
2743
2744	return (config);
2745}
2746
2747/*
2748 * Pool export/destroy
2749 *
2750 * The act of destroying or exporting a pool is very simple.  We make sure there
2751 * is no more pending I/O and that all references to the pool are gone.  Then, we
2752 * update the pool state and sync all the labels to disk, removing the
2753 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
2754 * we don't sync the labels or remove the configuration cache.
2755 */
2756static int
2757spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
2758    boolean_t force, boolean_t hardforce)
2759{
2760	spa_t *spa;
2761
2762	if (oldconfig)
2763		*oldconfig = NULL;
2764
2765	if (!(spa_mode_global & FWRITE))
2766		return (EROFS);
2767
2768	mutex_enter(&spa_namespace_lock);
2769	if ((spa = spa_lookup(pool)) == NULL) {
2770		mutex_exit(&spa_namespace_lock);
2771		return (ENOENT);
2772	}
2773
2774	/*
2775	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2776	 * reacquire the namespace lock, and see if we can export.
2777	 */
2778	spa_open_ref(spa, FTAG);
2779	mutex_exit(&spa_namespace_lock);
2780	spa_async_suspend(spa);
2781	mutex_enter(&spa_namespace_lock);
2782	spa_close(spa, FTAG);
2783
2784	/*
2785	 * The pool will be in core if it's openable,
2786	 * in which case we can modify its state.
2787	 */
2788	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2789		/*
2790		 * Objsets may be open only because they're dirty, so we
2791		 * have to force the pool to sync before checking spa_refcnt.
2792		 */
2793		txg_wait_synced(spa->spa_dsl_pool, 0);
2794
2795		/*
2796		 * A pool cannot be exported or destroyed if there are active
2797		 * references.  If we are resetting a pool, allow references by
2798		 * fault injection handlers.
2799		 */
2800		if (!spa_refcount_zero(spa) ||
2801		    (spa->spa_inject_ref != 0 &&
2802		    new_state != POOL_STATE_UNINITIALIZED)) {
2803			spa_async_resume(spa);
2804			mutex_exit(&spa_namespace_lock);
2805			return (EBUSY);
2806		}
2807
2808		/*
2809		 * A pool cannot be exported if it has an active shared spare.
2810		 * This is to prevent other pools from stealing the active spare
2811		 * from an exported pool.  The user can still force the export
2812		 * if desired.
2813		 */
2814		if (!force && new_state == POOL_STATE_EXPORTED &&
2815		    spa_has_active_shared_spare(spa)) {
2816			spa_async_resume(spa);
2817			mutex_exit(&spa_namespace_lock);
2818			return (EXDEV);
2819		}
2820
2821		/*
2822		 * We want this to be reflected on every label,
2823		 * so mark them all dirty.  spa_unload() will do the
2824		 * final sync that pushes these changes out.
2825		 */
2826		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
2827			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2828			spa->spa_state = new_state;
2829			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2830			vdev_config_dirty(spa->spa_root_vdev);
2831			spa_config_exit(spa, SCL_ALL, FTAG);
2832		}
2833	}
2834
2835	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2836
2837	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2838		spa_unload(spa);
2839		spa_deactivate(spa);
2840	}
2841
2842	if (oldconfig && spa->spa_config)
2843		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2844
2845	if (new_state != POOL_STATE_UNINITIALIZED) {
2846		if (!hardforce)
2847			spa_config_sync(spa, B_TRUE, B_TRUE);
2848		spa_remove(spa);
2849	}
2850	mutex_exit(&spa_namespace_lock);
2851
2852	return (0);
2853}
2854
2855/*
2856 * Destroy a storage pool.
2857 */
2858int
2859spa_destroy(char *pool)
2860{
2861	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
2862	    B_FALSE, B_FALSE));
2863}
2864
2865/*
2866 * Export a storage pool.
2867 */
2868int
2869spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
2870    boolean_t hardforce)
2871{
2872	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
2873	    force, hardforce));
2874}
2875
2876/*
2877 * Similar to spa_export(), this unloads the spa_t without actually removing it
2878 * from the namespace in any way.
2879 */
2880int
2881spa_reset(char *pool)
2882{
2883	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
2884	    B_FALSE, B_FALSE));
2885}
2886
2887/*
2888 * ==========================================================================
2889 * Device manipulation
2890 * ==========================================================================
2891 */
2892
2893/*
2894 * Add a device to a storage pool.
2895 */
2896int
2897spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2898{
2899	uint64_t txg;
2900	int error;
2901	vdev_t *rvd = spa->spa_root_vdev;
2902	vdev_t *vd, *tvd;
2903	nvlist_t **spares, **l2cache;
2904	uint_t nspares, nl2cache;
2905
2906	txg = spa_vdev_enter(spa);
2907
2908	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2909	    VDEV_ALLOC_ADD)) != 0)
2910		return (spa_vdev_exit(spa, NULL, txg, error));
2911
2912	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
2913
2914	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2915	    &nspares) != 0)
2916		nspares = 0;
2917
2918	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2919	    &nl2cache) != 0)
2920		nl2cache = 0;
2921
2922	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
2923		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2924
2925	if (vd->vdev_children != 0 &&
2926	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
2927		return (spa_vdev_exit(spa, vd, txg, error));
2928
2929	/*
2930	 * We must validate the spares and l2cache devices after checking the
2931	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2932	 */
2933	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
2934		return (spa_vdev_exit(spa, vd, txg, error));
2935
2936	/*
2937	 * Transfer each new top-level vdev from vd to rvd.
2938	 */
2939	for (int c = 0; c < vd->vdev_children; c++) {
2940		tvd = vd->vdev_child[c];
2941		vdev_remove_child(vd, tvd);
2942		tvd->vdev_id = rvd->vdev_children;
2943		vdev_add_child(rvd, tvd);
2944		vdev_config_dirty(tvd);
2945	}
2946
2947	if (nspares != 0) {
2948		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2949		    ZPOOL_CONFIG_SPARES);
2950		spa_load_spares(spa);
2951		spa->spa_spares.sav_sync = B_TRUE;
2952	}
2953
2954	if (nl2cache != 0) {
2955		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2956		    ZPOOL_CONFIG_L2CACHE);
2957		spa_load_l2cache(spa);
2958		spa->spa_l2cache.sav_sync = B_TRUE;
2959	}
2960
2961	/*
2962	 * We have to be careful when adding new vdevs to an existing pool.
2963	 * If other threads start allocating from these vdevs before we
2964	 * sync the config cache, and we lose power, then upon reboot we may
2965	 * fail to open the pool because there are DVAs that the config cache
2966	 * can't translate.  Therefore, we first add the vdevs without
2967	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2968	 * and then let spa_config_update() initialize the new metaslabs.
2969	 *
2970	 * spa_load() checks for added-but-not-initialized vdevs, so that
2971	 * if we lose power at any point in this sequence, the remaining
2972	 * steps will be completed the next time we load the pool.
2973	 */
2974	(void) spa_vdev_exit(spa, vd, txg, 0);
2975
2976	mutex_enter(&spa_namespace_lock);
2977	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2978	mutex_exit(&spa_namespace_lock);
2979
2980	return (0);
2981}
2982
2983/*
2984 * Attach a device to a mirror.  The arguments are the path to any device
2985 * in the mirror, and the nvroot for the new device.  If the path specifies
2986 * a device that is not mirrored, we automatically insert the mirror vdev.
2987 *
2988 * If 'replacing' is specified, the new device is intended to replace the
2989 * existing device; in this case the two devices are made into their own
2990 * mirror using the 'replacing' vdev, which is functionally identical to
2991 * the mirror vdev (it actually reuses all the same ops) but has a few
2992 * extra rules: you can't attach to it after it's been created, and upon
2993 * completion of resilvering, the first disk (the one being replaced)
2994 * is automatically detached.
2995 */
2996int
2997spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2998{
2999	uint64_t txg, open_txg;
3000	vdev_t *rvd = spa->spa_root_vdev;
3001	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3002	vdev_ops_t *pvops;
3003	dmu_tx_t *tx;
3004	char *oldvdpath, *newvdpath;
3005	int newvd_isspare;
3006	int error;
3007
3008	txg = spa_vdev_enter(spa);
3009
3010	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3011
3012	if (oldvd == NULL)
3013		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3014
3015	if (!oldvd->vdev_ops->vdev_op_leaf)
3016		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3017
3018	pvd = oldvd->vdev_parent;
3019
3020	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3021	    VDEV_ALLOC_ADD)) != 0)
3022		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3023
3024	if (newrootvd->vdev_children != 1)
3025		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3026
3027	newvd = newrootvd->vdev_child[0];
3028
3029	if (!newvd->vdev_ops->vdev_op_leaf)
3030		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3031
3032	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3033		return (spa_vdev_exit(spa, newrootvd, txg, error));
3034
3035	/*
3036	 * Spares can't replace logs
3037	 */
3038	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3039		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3040
3041	if (!replacing) {
3042		/*
3043		 * For attach, the only allowable parent is a mirror or the root
3044		 * vdev.
3045		 */
3046		if (pvd->vdev_ops != &vdev_mirror_ops &&
3047		    pvd->vdev_ops != &vdev_root_ops)
3048			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3049
3050		pvops = &vdev_mirror_ops;
3051	} else {
3052		/*
3053		 * Active hot spares can only be replaced by inactive hot
3054		 * spares.
3055		 */
3056		if (pvd->vdev_ops == &vdev_spare_ops &&
3057		    pvd->vdev_child[1] == oldvd &&
3058		    !spa_has_spare(spa, newvd->vdev_guid))
3059			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3060
3061		/*
3062		 * If the source is a hot spare, and the parent isn't already a
3063		 * spare, then we want to create a new hot spare.  Otherwise, we
3064		 * want to create a replacing vdev.  The user is not allowed to
3065		 * attach to a spared vdev child unless the 'isspare' state is
3066		 * the same (spare replaces spare, non-spare replaces
3067		 * non-spare).
3068		 */
3069		if (pvd->vdev_ops == &vdev_replacing_ops)
3070			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3071		else if (pvd->vdev_ops == &vdev_spare_ops &&
3072		    newvd->vdev_isspare != oldvd->vdev_isspare)
3073			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3074		else if (pvd->vdev_ops != &vdev_spare_ops &&
3075		    newvd->vdev_isspare)
3076			pvops = &vdev_spare_ops;
3077		else
3078			pvops = &vdev_replacing_ops;
3079	}
3080
3081	/*
3082	 * Compare the new device size with the replaceable/attachable
3083	 * device size.
3084	 */
3085	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
3086		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3087
3088	/*
3089	 * The new device cannot have a higher alignment requirement
3090	 * than the top-level vdev.
3091	 */
3092	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3093		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3094
3095	/*
3096	 * If this is an in-place replacement, update oldvd's path and devid
3097	 * to make it distinguishable from newvd, and unopenable from now on.
3098	 */
3099	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3100		spa_strfree(oldvd->vdev_path);
3101		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3102		    KM_SLEEP);
3103		(void) sprintf(oldvd->vdev_path, "%s/%s",
3104		    newvd->vdev_path, "old");
3105		if (oldvd->vdev_devid != NULL) {
3106			spa_strfree(oldvd->vdev_devid);
3107			oldvd->vdev_devid = NULL;
3108		}
3109	}
3110
3111	/*
3112	 * If the parent is not a mirror, or if we're replacing, insert the new
3113	 * mirror/replacing/spare vdev above oldvd.
3114	 */
3115	if (pvd->vdev_ops != pvops)
3116		pvd = vdev_add_parent(oldvd, pvops);
3117
3118	ASSERT(pvd->vdev_top->vdev_parent == rvd);
3119	ASSERT(pvd->vdev_ops == pvops);
3120	ASSERT(oldvd->vdev_parent == pvd);
3121
3122	/*
3123	 * Extract the new device from its root and add it to pvd.
3124	 */
3125	vdev_remove_child(newrootvd, newvd);
3126	newvd->vdev_id = pvd->vdev_children;
3127	vdev_add_child(pvd, newvd);
3128
3129	/*
3130	 * If newvd is smaller than oldvd, but larger than its rsize,
3131	 * the addition of newvd may have decreased our parent's asize.
3132	 */
3133	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
3134
3135	tvd = newvd->vdev_top;
3136	ASSERT(pvd->vdev_top == tvd);
3137	ASSERT(tvd->vdev_parent == rvd);
3138
3139	vdev_config_dirty(tvd);
3140
3141	/*
3142	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
3143	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
3144	 */
3145	open_txg = txg + TXG_CONCURRENT_STATES - 1;
3146
3147	vdev_dtl_dirty(newvd, DTL_MISSING,
3148	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
3149
3150	if (newvd->vdev_isspare) {
3151		spa_spare_activate(newvd);
3152		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
3153	}
3154
3155	oldvdpath = spa_strdup(oldvd->vdev_path);
3156	newvdpath = spa_strdup(newvd->vdev_path);
3157	newvd_isspare = newvd->vdev_isspare;
3158
3159	/*
3160	 * Mark newvd's DTL dirty in this txg.
3161	 */
3162	vdev_dirty(tvd, VDD_DTL, newvd, txg);
3163
3164	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
3165
3166	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
3167	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
3168		spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
3169		    CRED(),  "%s vdev=%s %s vdev=%s",
3170		    replacing && newvd_isspare ? "spare in" :
3171		    replacing ? "replace" : "attach", newvdpath,
3172		    replacing ? "for" : "to", oldvdpath);
3173		dmu_tx_commit(tx);
3174	} else {
3175		dmu_tx_abort(tx);
3176	}
3177
3178	spa_strfree(oldvdpath);
3179	spa_strfree(newvdpath);
3180
3181	/*
3182	 * Kick off a resilver to update newvd.
3183	 */
3184	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
3185
3186	return (0);
3187}
3188
3189/*
3190 * Detach a device from a mirror or replacing vdev.
3191 * If 'replace_done' is specified, only detach if the parent
3192 * is a replacing vdev.
3193 */
3194int
3195spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
3196{
3197	uint64_t txg;
3198	int error;
3199	vdev_t *rvd = spa->spa_root_vdev;
3200	vdev_t *vd, *pvd, *cvd, *tvd;
3201	boolean_t unspare = B_FALSE;
3202	uint64_t unspare_guid;
3203	size_t len;
3204
3205	txg = spa_vdev_enter(spa);
3206
3207	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3208
3209	if (vd == NULL)
3210		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3211
3212	if (!vd->vdev_ops->vdev_op_leaf)
3213		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3214
3215	pvd = vd->vdev_parent;
3216
3217	/*
3218	 * If the parent/child relationship is not as expected, don't do it.
3219	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
3220	 * vdev that's replacing B with C.  The user's intent in replacing
3221	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
3222	 * the replace by detaching C, the expected behavior is to end up
3223	 * M(A,B).  But suppose that right after deciding to detach C,
3224	 * the replacement of B completes.  We would have M(A,C), and then
3225	 * ask to detach C, which would leave us with just A -- not what
3226	 * the user wanted.  To prevent this, we make sure that the
3227	 * parent/child relationship hasn't changed -- in this example,
3228	 * that C's parent is still the replacing vdev R.
3229	 */
3230	if (pvd->vdev_guid != pguid && pguid != 0)
3231		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3232
3233	/*
3234	 * If replace_done is specified, only remove this device if it's
3235	 * the first child of a replacing vdev.  For the 'spare' vdev, either
3236	 * disk can be removed.
3237	 */
3238	if (replace_done) {
3239		if (pvd->vdev_ops == &vdev_replacing_ops) {
3240			if (vd->vdev_id != 0)
3241				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3242		} else if (pvd->vdev_ops != &vdev_spare_ops) {
3243			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3244		}
3245	}
3246
3247	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
3248	    spa_version(spa) >= SPA_VERSION_SPARES);
3249
3250	/*
3251	 * Only mirror, replacing, and spare vdevs support detach.
3252	 */
3253	if (pvd->vdev_ops != &vdev_replacing_ops &&
3254	    pvd->vdev_ops != &vdev_mirror_ops &&
3255	    pvd->vdev_ops != &vdev_spare_ops)
3256		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3257
3258	/*
3259	 * If this device has the only valid copy of some data,
3260	 * we cannot safely detach it.
3261	 */
3262	if (vdev_dtl_required(vd))
3263		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3264
3265	ASSERT(pvd->vdev_children >= 2);
3266
3267	/*
3268	 * If we are detaching the second disk from a replacing vdev, then
3269	 * check to see if we changed the original vdev's path to have "/old"
3270	 * at the end in spa_vdev_attach().  If so, undo that change now.
3271	 */
3272	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
3273	    pvd->vdev_child[0]->vdev_path != NULL &&
3274	    pvd->vdev_child[1]->vdev_path != NULL) {
3275		ASSERT(pvd->vdev_child[1] == vd);
3276		cvd = pvd->vdev_child[0];
3277		len = strlen(vd->vdev_path);
3278		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
3279		    strcmp(cvd->vdev_path + len, "/old") == 0) {
3280			spa_strfree(cvd->vdev_path);
3281			cvd->vdev_path = spa_strdup(vd->vdev_path);
3282		}
3283	}
3284
3285	/*
3286	 * If we are detaching the original disk from a spare, then it implies
3287	 * that the spare should become a real disk, and be removed from the
3288	 * active spare list for the pool.
3289	 */
3290	if (pvd->vdev_ops == &vdev_spare_ops &&
3291	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
3292		unspare = B_TRUE;
3293
3294	/*
3295	 * Erase the disk labels so the disk can be used for other things.
3296	 * This must be done after all other error cases are handled,
3297	 * but before we disembowel vd (so we can still do I/O to it).
3298	 * But if we can't do it, don't treat the error as fatal --
3299	 * it may be that the unwritability of the disk is the reason
3300	 * it's being detached!
3301	 */
3302	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3303
3304	/*
3305	 * Remove vd from its parent and compact the parent's children.
3306	 */
3307	vdev_remove_child(pvd, vd);
3308	vdev_compact_children(pvd);
3309
3310	/*
3311	 * Remember one of the remaining children so we can get tvd below.
3312	 */
3313	cvd = pvd->vdev_child[0];
3314
3315	/*
3316	 * If we need to remove the remaining child from the list of hot spares,
3317	 * do it now, marking the vdev as no longer a spare in the process.
3318	 * We must do this before vdev_remove_parent(), because that can
3319	 * change the GUID if it creates a new toplevel GUID.  For a similar
3320	 * reason, we must remove the spare now, in the same txg as the detach;
3321	 * otherwise someone could attach a new sibling, change the GUID, and
3322	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
3323	 */
3324	if (unspare) {
3325		ASSERT(cvd->vdev_isspare);
3326		spa_spare_remove(cvd);
3327		unspare_guid = cvd->vdev_guid;
3328		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3329	}
3330
3331	/*
3332	 * If the parent mirror/replacing vdev only has one child,
3333	 * the parent is no longer needed.  Remove it from the tree.
3334	 */
3335	if (pvd->vdev_children == 1)
3336		vdev_remove_parent(cvd);
3337
3338	/*
3339	 * We don't set tvd until now because the parent we just removed
3340	 * may have been the previous top-level vdev.
3341	 */
3342	tvd = cvd->vdev_top;
3343	ASSERT(tvd->vdev_parent == rvd);
3344
3345	/*
3346	 * Reevaluate the parent vdev state.
3347	 */
3348	vdev_propagate_state(cvd);
3349
3350	/*
3351	 * If the device we just detached was smaller than the others, it may be
3352	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
3353	 * can't fail because the existing metaslabs are already in core, so
3354	 * there's nothing to read from disk.
3355	 */
3356	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
3357
3358	vdev_config_dirty(tvd);
3359
3360	/*
3361	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
3362	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
3363	 * But first make sure we're not on any *other* txg's DTL list, to
3364	 * prevent vd from being accessed after it's freed.
3365	 */
3366	for (int t = 0; t < TXG_SIZE; t++)
3367		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
3368	vd->vdev_detached = B_TRUE;
3369	vdev_dirty(tvd, VDD_DTL, vd, txg);
3370
3371	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
3372
3373	error = spa_vdev_exit(spa, vd, txg, 0);
3374
3375	/*
3376	 * If this was the removal of the original device in a hot spare vdev,
3377	 * then we want to go through and remove the device from the hot spare
3378	 * list of every other pool.
3379	 */
3380	if (unspare) {
3381		spa_t *myspa = spa;
3382		spa = NULL;
3383		mutex_enter(&spa_namespace_lock);
3384		while ((spa = spa_next(spa)) != NULL) {
3385			if (spa->spa_state != POOL_STATE_ACTIVE)
3386				continue;
3387			if (spa == myspa)
3388				continue;
3389			spa_open_ref(spa, FTAG);
3390			mutex_exit(&spa_namespace_lock);
3391			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3392			mutex_enter(&spa_namespace_lock);
3393			spa_close(spa, FTAG);
3394		}
3395		mutex_exit(&spa_namespace_lock);
3396	}
3397
3398	return (error);
3399}
3400
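/*
 * Find the nvlist in the given array whose ZPOOL_CONFIG_GUID matches
 * target_guid; return NULL if there is no such entry.
 */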
3401static nvlist_t *
3402spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
3403{
3404	for (int i = 0; i < count; i++) {
3405		uint64_t guid;
3406
3407		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
3408		    &guid) == 0);
3409
3410		if (guid == target_guid)
3411			return (nvpp[i]);
3412	}
3413
3414	return (NULL);
3415}
3416
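/*
 * Rebuild the named nvlist array in 'config' with 'dev_to_remove' omitted,
 * freeing the temporary copies once the array has been replaced.
 */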
3417static void
3418spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
3419	nvlist_t *dev_to_remove)
3420{
3421	nvlist_t **newdev = NULL;
3422
3423	if (count > 1)
3424		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
3425
3426	for (int i = 0, j = 0; i < count; i++) {
3427		if (dev[i] == dev_to_remove)
3428			continue;
3429		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
3430	}
3431
3432	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
3433	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
3434
3435	for (int i = 0; i < count - 1; i++)
3436		nvlist_free(newdev[i]);
3437
3438	if (count > 1)
3439		kmem_free(newdev, (count - 1) * sizeof (void *));
3440}
3441
3442/*
3443 * Remove a device from the pool.  Currently, this supports removing only hot
3444 * spares and level 2 ARC devices.
3445 */
3446int
3447spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3448{
3449	vdev_t *vd;
3450	nvlist_t **spares, **l2cache, *nv;
3451	uint_t nspares, nl2cache;
3452	uint64_t txg = 0;
3453	int error = 0;
3454	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
3455
3456	if (!locked)
3457		txg = spa_vdev_enter(spa);
3458
3459	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3460
3461	if (spa->spa_spares.sav_vdevs != NULL &&
3462	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3463	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
3464	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
3465		/*
3466		 * Only remove the hot spare if it's not currently in use
3467		 * in this pool.
3468		 */
3469		if (vd == NULL || unspare) {
3470			spa_vdev_remove_aux(spa->spa_spares.sav_config,
3471			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
3472			spa_load_spares(spa);
3473			spa->spa_spares.sav_sync = B_TRUE;
3474		} else {
3475			error = EBUSY;
3476		}
3477	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
3478	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3479	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
3480	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
3481		/*
3482		 * Cache devices can always be removed.
3483		 */
3484		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
3485		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
3486		spa_load_l2cache(spa);
3487		spa->spa_l2cache.sav_sync = B_TRUE;
3488	} else if (vd != NULL) {
3489		/*
3490		 * Normal vdevs cannot be removed (yet).
3491		 */
3492		error = ENOTSUP;
3493	} else {
3494		/*
3495		 * There is no vdev of any kind with the specified guid.
3496		 */
3497		error = ENOENT;
3498	}
3499
3500	if (!locked)
3501		return (spa_vdev_exit(spa, NULL, txg, error));
3502
3503	return (error);
3504}
3505
3506/*
3507 * Find any device that's done replacing, or a vdev marked 'unspare' that's
3508 * currently spared, so we can detach it.
3509 */
3510static vdev_t *
3511spa_vdev_resilver_done_hunt(vdev_t *vd)
3512{
3513	vdev_t *newvd, *oldvd;
3514	int c;
3515
3516	for (c = 0; c < vd->vdev_children; c++) {
3517		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3518		if (oldvd != NULL)
3519			return (oldvd);
3520	}
3521
3522	/*
3523	 * Check for a completed replacement.
3524	 */
3525	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3526		oldvd = vd->vdev_child[0];
3527		newvd = vd->vdev_child[1];
3528
3529		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
3530		    !vdev_dtl_required(oldvd))
3531			return (oldvd);
3532	}
3533
3534	/*
3535	 * Check for a completed resilver with the 'unspare' flag set.
3536	 */
3537	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3538		newvd = vd->vdev_child[0];
3539		oldvd = vd->vdev_child[1];
3540
3541		if (newvd->vdev_unspare &&
3542		    vdev_dtl_empty(newvd, DTL_MISSING) &&
3543		    !vdev_dtl_required(oldvd)) {
3544			newvd->vdev_unspare = 0;
3545			return (oldvd);
3546		}
3547	}
3548
3549	return (NULL);
3550}
3551
3552static void
3553spa_vdev_resilver_done(spa_t *spa)
3554{
3555	vdev_t *vd, *pvd, *ppvd;
3556	uint64_t guid, sguid, pguid, ppguid;
3557
3558	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3559
3560	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3561		pvd = vd->vdev_parent;
3562		ppvd = pvd->vdev_parent;
3563		guid = vd->vdev_guid;
3564		pguid = pvd->vdev_guid;
3565		ppguid = ppvd->vdev_guid;
3566		sguid = 0;
3567		/*
3568		 * If we have just finished replacing a hot spared device, then
3569		 * we need to detach the parent's first child (the original hot
3570		 * spare) as well.
3571		 */
3572		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
3573			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3574			ASSERT(ppvd->vdev_children == 2);
3575			sguid = ppvd->vdev_child[1]->vdev_guid;
3576		}
3577		spa_config_exit(spa, SCL_ALL, FTAG);
3578		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
3579			return;
3580		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
3581			return;
3582		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3583	}
3584
3585	spa_config_exit(spa, SCL_ALL, FTAG);
3586}
3587
3588/*
3589 * Update the stored path or FRU for this vdev.  Dirty the vdev configuration,
3590 * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
3591 */
3592int
3593spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
3594    boolean_t ispath)
3595{
3596	vdev_t *vd;
3597	uint64_t txg;
3598
3599	txg = spa_vdev_enter(spa);
3600
3601	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3602		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3603
3604	if (!vd->vdev_ops->vdev_op_leaf)
3605		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3606
3607	if (ispath) {
3608		spa_strfree(vd->vdev_path);
3609		vd->vdev_path = spa_strdup(value);
3610	} else {
3611		if (vd->vdev_fru != NULL)
3612			spa_strfree(vd->vdev_fru);
3613		vd->vdev_fru = spa_strdup(value);
3614	}
3615
3616	vdev_config_dirty(vd->vdev_top);
3617
3618	return (spa_vdev_exit(spa, NULL, txg, 0));
3619}
3620
3621int
3622spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3623{
3624	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
3625}
3626
3627int
3628spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
3629{
3630	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
3631}
3632
3633/*
3634 * ==========================================================================
3635 * SPA Scrubbing
3636 * ==========================================================================
3637 */
3638
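/*
 * Request a scrub or resilver of the pool, or cancel the scrub in progress;
 * the actual work is handed off to the DSL pool scrub code.
 */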
3639int
3640spa_scrub(spa_t *spa, pool_scrub_type_t type)
3641{
3642	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
3643
3644	if ((uint_t)type >= POOL_SCRUB_TYPES)
3645		return (ENOTSUP);
3646
3647	/*
3648	 * If a resilver was requested, but there is no DTL on a
3649	 * writeable leaf device, we have nothing to do.
3650	 */
3651	if (type == POOL_SCRUB_RESILVER &&
3652	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
3653		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3654		return (0);
3655	}
3656
3657	if (type == POOL_SCRUB_EVERYTHING &&
3658	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
3659	    spa->spa_dsl_pool->dp_scrub_isresilver)
3660		return (EBUSY);
3661
3662	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
3663		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
3664	} else if (type == POOL_SCRUB_NONE) {
3665		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
3666	} else {
3667		return (EINVAL);
3668	}
3669}
3670
3671/*
3672 * ==========================================================================
3673 * SPA async task processing
3674 * ==========================================================================
3675 */
3676
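/*
 * Recursively transition any vdev with 'remove wanted' set to the REMOVED
 * state, clearing its error counters without doing a full vdev_clear().
 */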
3677static void
3678spa_async_remove(spa_t *spa, vdev_t *vd)
3679{
3680	if (vd->vdev_remove_wanted) {
3681		vd->vdev_remove_wanted = 0;
3682		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
3683
3684		/*
3685		 * We want to clear the stats, but we don't want to do a full
3686		 * vdev_clear() as that will cause us to throw away
3687		 * degraded/faulted state as well as attempt to reopen the
3688		 * device, all of which is a waste.
3689		 */
3690		vd->vdev_stat.vs_read_errors = 0;
3691		vd->vdev_stat.vs_write_errors = 0;
3692		vd->vdev_stat.vs_checksum_errors = 0;
3693
3694		vdev_state_dirty(vd->vdev_top);
3695	}
3696
3697	for (int c = 0; c < vd->vdev_children; c++)
3698		spa_async_remove(spa, vd->vdev_child[c]);
3699}
3700
3701static void
3702spa_async_probe(spa_t *spa, vdev_t *vd)
3703{
3704	if (vd->vdev_probe_wanted) {
3705		vd->vdev_probe_wanted = 0;
3706		vdev_reopen(vd);	/* vdev_open() does the actual probe */
3707	}
3708
3709	for (int c = 0; c < vd->vdev_children; c++)
3710		spa_async_probe(spa, vd->vdev_child[c]);
3711}
3712
3713static void
3714spa_async_thread(void *arg)
3715{
3716	spa_t *spa = arg;
3717	int tasks;
3718
3719	ASSERT(spa->spa_sync_on);
3720
3721	mutex_enter(&spa->spa_async_lock);
3722	tasks = spa->spa_async_tasks;
3723	spa->spa_async_tasks = 0;
3724	mutex_exit(&spa->spa_async_lock);
3725
3726	/*
3727	 * See if the config needs to be updated.
3728	 */
3729	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3730		mutex_enter(&spa_namespace_lock);
3731		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3732		mutex_exit(&spa_namespace_lock);
3733	}
3734
3735	/*
3736	 * See if any devices need to be marked REMOVED.
3737	 */
3738	if (tasks & SPA_ASYNC_REMOVE) {
3739		spa_vdev_state_enter(spa);
3740		spa_async_remove(spa, spa->spa_root_vdev);
3741		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
3742			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
3743		for (int i = 0; i < spa->spa_spares.sav_count; i++)
3744			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
3745		(void) spa_vdev_state_exit(spa, NULL, 0);
3746	}
3747
3748	/*
3749	 * See if any devices need to be probed.
3750	 */
3751	if (tasks & SPA_ASYNC_PROBE) {
3752		spa_vdev_state_enter(spa);
3753		spa_async_probe(spa, spa->spa_root_vdev);
3754		(void) spa_vdev_state_exit(spa, NULL, 0);
3755	}
3756
3757	/*
3758	 * If any devices are done replacing, detach them.
3759	 */
3760	if (tasks & SPA_ASYNC_RESILVER_DONE)
3761		spa_vdev_resilver_done(spa);
3762
3763	/*
3764	 * Kick off a resilver.
3765	 */
3766	if (tasks & SPA_ASYNC_RESILVER)
3767		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
3768
3769	/*
3770	 * Let the world know that we're done.
3771	 */
3772	mutex_enter(&spa->spa_async_lock);
3773	spa->spa_async_thread = NULL;
3774	cv_broadcast(&spa->spa_async_cv);
3775	mutex_exit(&spa->spa_async_lock);
3776	thread_exit();
3777}
3778
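/*
 * Prevent further async task dispatch and wait for any running async
 * thread to complete.
 */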
3779void
3780spa_async_suspend(spa_t *spa)
3781{
3782	mutex_enter(&spa->spa_async_lock);
3783	spa->spa_async_suspended++;
3784	while (spa->spa_async_thread != NULL)
3785		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3786	mutex_exit(&spa->spa_async_lock);
3787}
3788
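/*
 * Re-enable async task dispatch; pairs with spa_async_suspend().
 */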
3789void
3790spa_async_resume(spa_t *spa)
3791{
3792	mutex_enter(&spa->spa_async_lock);
3793	ASSERT(spa->spa_async_suspended != 0);
3794	spa->spa_async_suspended--;
3795	mutex_exit(&spa->spa_async_lock);
3796}
3797
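/*
 * Create the async thread if there are pending tasks, dispatch is not
 * suspended, no thread is already running, and the root vnode is writable.
 */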
3798static void
3799spa_async_dispatch(spa_t *spa)
3800{
3801	mutex_enter(&spa->spa_async_lock);
3802	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3803	    spa->spa_async_thread == NULL &&
3804	    rootdir != NULL && !vn_is_readonly(rootdir))
3805		spa->spa_async_thread = thread_create(NULL, 0,
3806		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3807	mutex_exit(&spa->spa_async_lock);
3808}
3809
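/*
 * Record an async task to be handled by the next spa_async_dispatch().
 */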
3810void
3811spa_async_request(spa_t *spa, int task)
3812{
3813	mutex_enter(&spa->spa_async_lock);
3814	spa->spa_async_tasks |= task;
3815	mutex_exit(&spa->spa_async_lock);
3816}
3817
3818/*
3819 * ==========================================================================
3820 * SPA syncing routines
3821 * ==========================================================================
3822 */
3823
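/*
 * Free every block on the sync bplist (the deferred frees from the
 * previous txg), then vacate the list within an assigned transaction.
 */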
3824static void
3825spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3826{
3827	bplist_t *bpl = &spa->spa_sync_bplist;
3828	dmu_tx_t *tx;
3829	blkptr_t blk;
3830	uint64_t itor = 0;
3831	zio_t *zio;
3832	int error;
3833	uint8_t c = 1;
3834
3835	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
3836
3837	while (bplist_iterate(bpl, &itor, &blk) == 0) {
3838		ASSERT(blk.blk_birth < txg);
3839		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
3840		    ZIO_FLAG_MUSTSUCCEED));
3841	}
3842
3843	error = zio_wait(zio);
3844	ASSERT3U(error, ==, 0);
3845
3846	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3847	bplist_vacate(bpl, tx);
3848
3849	/*
3850	 * Pre-dirty the first block so we sync to convergence faster.
3851	 * (Usually only the first block is needed.)
3852	 */
3853	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3854	dmu_tx_commit(tx);
3855}
3856
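/*
 * Pack 'nv' in XDR format, write it to MOS object 'obj' in full
 * SPA_CONFIG_BLOCKSIZE blocks, and record the packed size in the
 * object's bonus buffer.
 */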
3857static void
3858spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3859{
3860	char *packed = NULL;
3861	size_t bufsize;
3862	size_t nvsize = 0;
3863	dmu_buf_t *db;
3864
3865	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3866
3867	/*
3868	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
3869	 * information.  This avoids the dbuf_will_dirty() path and
3870	 * saves us a pre-read to get data we don't actually care about.
3871	 */
3872	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
3873	packed = kmem_alloc(bufsize, KM_SLEEP);
3874
3875	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3876	    KM_SLEEP) == 0);
3877	bzero(packed + nvsize, bufsize - nvsize);
3878
3879	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
3880
3881	kmem_free(packed, bufsize);
3882
3883	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3884	dmu_buf_will_dirty(db, tx);
3885	*(uint64_t *)db->db_data = nvsize;
3886	dmu_buf_rele(db, FTAG);
3887}
3888
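/*
 * Sync an auxiliary vdev list (spares or L2ARC devices) to its packed
 * nvlist object in the MOS, creating that object on first use.
 */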
3889static void
3890spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3891    const char *config, const char *entry)
3892{
3893	nvlist_t *nvroot;
3894	nvlist_t **list;
3895	int i;
3896
3897	if (!sav->sav_sync)
3898		return;
3899
3900	/*
3901	 * Update the MOS nvlist describing the list of available devices.
3902	 * spa_validate_aux() will have already made sure this nvlist is
3903	 * valid and the vdevs are labeled appropriately.
3904	 */
3905	if (sav->sav_object == 0) {
3906		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3907		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3908		    sizeof (uint64_t), tx);
3909		VERIFY(zap_update(spa->spa_meta_objset,
3910		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3911		    &sav->sav_object, tx) == 0);
3912	}
3913
3914	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3915	if (sav->sav_count == 0) {
3916		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3917	} else {
3918		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3919		for (i = 0; i < sav->sav_count; i++)
3920			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3921			    B_FALSE, B_FALSE, B_TRUE);
3922		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3923		    sav->sav_count) == 0);
3924		for (i = 0; i < sav->sav_count; i++)
3925			nvlist_free(list[i]);
3926		kmem_free(list, sav->sav_count * sizeof (void *));
3927	}
3928
3929	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3930	nvlist_free(nvroot);
3931
3932	sav->sav_sync = B_FALSE;
3933}
3934
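/*
 * If any vdev configs are dirty, write the current pool config to the MOS
 * config object and stash it in spa_config_syncing until the txg commits.
 */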
3935static void
3936spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3937{
3938	nvlist_t *config;
3939
3940	if (list_is_empty(&spa->spa_config_dirty_list))
3941		return;
3942
3943	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3944
3945	config = spa_config_generate(spa, spa->spa_root_vdev,
3946	    dmu_tx_get_txg(tx), B_FALSE);
3947
3948	spa_config_exit(spa, SCL_STATE, FTAG);
3949
3950	if (spa->spa_config_syncing)
3951		nvlist_free(spa->spa_config_syncing);
3952	spa->spa_config_syncing = config;
3953
3954	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3955}
3956
3957/*
3958 * Set zpool properties.
3959 */
3960static void
3961spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3962{
3963	spa_t *spa = arg1;
3964	objset_t *mos = spa->spa_meta_objset;
3965	nvlist_t *nvp = arg2;
3966	nvpair_t *elem;
3967	uint64_t intval;
3968	char *strval;
3969	zpool_prop_t prop;
3970	const char *propname;
3971	zprop_type_t proptype;
3972
3973	mutex_enter(&spa->spa_props_lock);
3974
3975	elem = NULL;
3976	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3977		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3978		case ZPOOL_PROP_VERSION:
3979			/*
3980			 * Only set version for non-zpool-creation cases
3981			 * (set/import). spa_create() needs special care
3982			 * for version setting.
3983			 */
3984			if (tx->tx_txg != TXG_INITIAL) {
3985				VERIFY(nvpair_value_uint64(elem,
3986				    &intval) == 0);
3987				ASSERT(intval <= SPA_VERSION);
3988				ASSERT(intval >= spa_version(spa));
3989				spa->spa_uberblock.ub_version = intval;
3990				vdev_config_dirty(spa->spa_root_vdev);
3991			}
3992			break;
3993
3994		case ZPOOL_PROP_ALTROOT:
3995			/*
3996			 * 'altroot' is a non-persistent property. It should
3997			 * have been set temporarily at creation or import time.
3998			 */
3999			ASSERT(spa->spa_root != NULL);
4000			break;
4001
4002		case ZPOOL_PROP_CACHEFILE:
4003			/*
4004			 * 'cachefile' is also a non-persistent property.
4005			 */
4006			break;
4007		default:
4008			/*
4009			 * Set pool property values in the poolprops mos object.
4010			 */
4011			if (spa->spa_pool_props_object == 0) {
4012				objset_t *mos = spa->spa_meta_objset;
4013
4014				VERIFY((spa->spa_pool_props_object =
4015				    zap_create(mos, DMU_OT_POOL_PROPS,
4016				    DMU_OT_NONE, 0, tx)) > 0);
4017
4018				VERIFY(zap_update(mos,
4019				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
4020				    8, 1, &spa->spa_pool_props_object, tx)
4021				    == 0);
4022			}
4023
4024			/* normalize the property name */
4025			propname = zpool_prop_to_name(prop);
4026			proptype = zpool_prop_get_type(prop);
4027
4028			if (nvpair_type(elem) == DATA_TYPE_STRING) {
4029				ASSERT(proptype == PROP_TYPE_STRING);
4030				VERIFY(nvpair_value_string(elem, &strval) == 0);
4031				VERIFY(zap_update(mos,
4032				    spa->spa_pool_props_object, propname,
4033				    1, strlen(strval) + 1, strval, tx) == 0);
4034
4035			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
4036				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
4037
4038				if (proptype == PROP_TYPE_INDEX) {
4039					const char *unused;
4040					VERIFY(zpool_prop_index_to_string(
4041					    prop, intval, &unused) == 0);
4042				}
4043				VERIFY(zap_update(mos,
4044				    spa->spa_pool_props_object, propname,
4045				    8, 1, &intval, tx) == 0);
4046			} else {
4047				ASSERT(0); /* not allowed */
4048			}
4049
4050			switch (prop) {
4051			case ZPOOL_PROP_DELEGATION:
4052				spa->spa_delegation = intval;
4053				break;
4054			case ZPOOL_PROP_BOOTFS:
4055				spa->spa_bootfs = intval;
4056				break;
4057			case ZPOOL_PROP_FAILUREMODE:
4058				spa->spa_failmode = intval;
4059				break;
4060			default:
4061				break;
4062			}
4063		}
4064
4065		/* log internal history if this is not a zpool create */
4066		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
4067		    tx->tx_txg != TXG_INITIAL) {
4068			spa_history_internal_log(LOG_POOL_PROPSET,
4069			    spa, tx, cr, "%s %lld %s",
4070			    nvpair_name(elem), intval, spa_name(spa));
4071		}
4072	}
4073
4074	mutex_exit(&spa->spa_props_lock);
4075}
4076
4077/*
4078 * Sync the specified transaction group.  New blocks may be dirtied as
4079 * part of the process, so we iterate until it converges.
4080 */
4081void
4082spa_sync(spa_t *spa, uint64_t txg)
4083{
4084	dsl_pool_t *dp = spa->spa_dsl_pool;
4085	objset_t *mos = spa->spa_meta_objset;
4086	bplist_t *bpl = &spa->spa_sync_bplist;
4087	vdev_t *rvd = spa->spa_root_vdev;
4088	vdev_t *vd;
4089	dmu_tx_t *tx;
4090	int dirty_vdevs;
4091	int error;
4092
4093	/*
4094	 * Lock out configuration changes.
4095	 */
4096	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4097
4098	spa->spa_syncing_txg = txg;
4099	spa->spa_sync_pass = 0;
4100
4101	/*
4102	 * If there are any pending vdev state changes, convert them
4103	 * into config changes that go out with this transaction group.
4104	 */
4105	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4106	while (list_head(&spa->spa_state_dirty_list) != NULL) {
4107		/*
4108		 * We need the write lock here because, for aux vdevs,
4109		 * calling vdev_config_dirty() modifies sav_config.
4110		 * This is ugly and will become unnecessary when we
4111		 * eliminate the aux vdev wart by integrating all vdevs
4112		 * into the root vdev tree.
4113		 */
4114		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
4115		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
4116		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
4117			vdev_state_clean(vd);
4118			vdev_config_dirty(vd);
4119		}
4120		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
4121		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
4122	}
4123	spa_config_exit(spa, SCL_STATE, FTAG);
4124
4125	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
4126
4127	tx = dmu_tx_create_assigned(dp, txg);
4128
4129	/*
4130	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
4131	 * set spa_deflate if we have no raid-z vdevs.
4132	 */
4133	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
4134	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
4135		int i;
4136
4137		for (i = 0; i < rvd->vdev_children; i++) {
4138			vd = rvd->vdev_child[i];
4139			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
4140				break;
4141		}
4142		if (i == rvd->vdev_children) {
4143			spa->spa_deflate = TRUE;
4144			VERIFY(0 == zap_add(spa->spa_meta_objset,
4145			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4146			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
4147		}
4148	}
4149
4150	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
4151	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
4152		dsl_pool_create_origin(dp, tx);
4153
4154		/* Keeping the origin open increases spa_minref */
4155		spa->spa_minref += 3;
4156	}
4157
4158	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
4159	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
4160		dsl_pool_upgrade_clones(dp, tx);
4161	}
4162
4163	/*
4164	 * If anything has changed in this txg, push the deferred frees
4165	 * from the previous txg.  If not, leave them alone so that we
4166	 * don't generate work on an otherwise idle system.
4167	 */
4168	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
4169	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
4170	    !txg_list_empty(&dp->dp_sync_tasks, txg))
4171		spa_sync_deferred_frees(spa, txg);
4172
4173	/*
4174	 * Iterate to convergence.
4175	 */
4176	do {
4177		spa->spa_sync_pass++;
4178
4179		spa_sync_config_object(spa, tx);
4180		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
4181		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
4182		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
4183		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
4184		spa_errlog_sync(spa, txg);
4185		dsl_pool_sync(dp, txg);
4186
4187		dirty_vdevs = 0;
4188		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
4189			vdev_sync(vd, txg);
4190			dirty_vdevs++;
4191		}
4192
4193		bplist_sync(bpl, tx);
4194	} while (dirty_vdevs);
4195
4196	bplist_close(bpl);
4197
4198	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4199
4200	/*
4201	 * Rewrite the vdev configuration (which includes the uberblock)
4202	 * to commit the transaction group.
4203	 *
4204	 * If there are no dirty vdevs, we sync the uberblock to a few
4205	 * random top-level vdevs that are known to be visible in the
4206	 * config cache (see spa_vdev_add() for a complete description).
4207	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
4208	 */
4209	for (;;) {
4210		/*
4211		 * We hold SCL_STATE to prevent vdev open/close/etc.
4212		 * while we're attempting to write the vdev labels.
4213		 */
4214		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4215
4216		if (list_is_empty(&spa->spa_config_dirty_list)) {
4217			vdev_t *svd[SPA_DVAS_PER_BP];
4218			int svdcount = 0;
4219			int children = rvd->vdev_children;
4220			int c0 = spa_get_random(children);
4221			int c;
4222
4223			for (c = 0; c < children; c++) {
4224				vd = rvd->vdev_child[(c0 + c) % children];
4225				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4226					continue;
4227				svd[svdcount++] = vd;
4228				if (svdcount == SPA_DVAS_PER_BP)
4229					break;
4230			}
4231			error = vdev_config_sync(svd, svdcount, txg);
4232		} else {
4233			error = vdev_config_sync(rvd->vdev_child,
4234			    rvd->vdev_children, txg);
4235		}
4236
4237		spa_config_exit(spa, SCL_STATE, FTAG);
4238
4239		if (error == 0)
4240			break;
4241		zio_suspend(spa, NULL);
4242		zio_resume_wait(spa);
4243	}
4244	dmu_tx_commit(tx);
4245
4246	/*
4247	 * Clear the dirty config list.
4248	 */
4249	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
4250		vdev_config_clean(vd);
4251
4252	/*
4253	 * Now that the new config has synced transactionally,
4254	 * let it become visible to the config cache.
4255	 */
4256	if (spa->spa_config_syncing != NULL) {
4257		spa_config_set(spa, spa->spa_config_syncing);
4258		spa->spa_config_txg = txg;
4259		spa->spa_config_syncing = NULL;
4260	}
4261
4262	spa->spa_ubsync = spa->spa_uberblock;
4263
4264	/*
4265	 * Clean up the ZIL records for the synced txg.
4266	 */
4267	dsl_pool_zil_clean(dp);
4268
4269	/*
4270	 * Update usable space statistics.
4271	 */
4272	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
4273		vdev_sync_done(vd, txg);
4274
4275	/*
4276	 * It had better be the case that we didn't dirty anything
4277	 * since vdev_config_sync().
4278	 */
4279	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4280	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4281	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4282	ASSERT(bpl->bpl_queue == NULL);
4283
4284	spa_config_exit(spa, SCL_CONFIG, FTAG);
4285
4286	/*
4287	 * If any async tasks have been requested, kick them off.
4288	 */
4289	spa_async_dispatch(spa);
4290}
4291
4292/*
4293 * Sync all pools.  We don't want to hold the namespace lock across these
4294 * operations, so we take a reference on the spa_t and drop the lock during the
4295 * sync.
4296 */
4297void
4298spa_sync_allpools(void)
4299{
4300	spa_t *spa = NULL;
4301	mutex_enter(&spa_namespace_lock);
4302	while ((spa = spa_next(spa)) != NULL) {
4303		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
4304			continue;
4305		spa_open_ref(spa, FTAG);
4306		mutex_exit(&spa_namespace_lock);
4307		txg_wait_synced(spa_get_dsl(spa), 0);
4308		mutex_enter(&spa_namespace_lock);
4309		spa_close(spa, FTAG);
4310	}
4311	mutex_exit(&spa_namespace_lock);
4312}
4313
4314/*
4315 * ==========================================================================
4316 * Miscellaneous routines
4317 * ==========================================================================
4318 */
4319
4320/*
4321 * Remove all pools in the system.
4322 */
4323void
4324spa_evict_all(void)
4325{
4326	spa_t *spa;
4327
4328	/*
4329	 * Remove all cached state.  All pools should be closed now,
4330	 * so every spa in the AVL tree should be unreferenced.
4331	 */
4332	mutex_enter(&spa_namespace_lock);
4333	while ((spa = spa_next(NULL)) != NULL) {
4334		/*
4335		 * Stop async tasks.  The async thread may need to detach
4336		 * a device that's been replaced, which requires grabbing
4337		 * spa_namespace_lock, so we must drop it here.
4338		 */
4339		spa_open_ref(spa, FTAG);
4340		mutex_exit(&spa_namespace_lock);
4341		spa_async_suspend(spa);
4342		mutex_enter(&spa_namespace_lock);
4343		spa_close(spa, FTAG);
4344
4345		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4346			spa_unload(spa);
4347			spa_deactivate(spa);
4348		}
4349		spa_remove(spa);
4350	}
4351	mutex_exit(&spa_namespace_lock);
4352}
4353
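/*
 * Look up a vdev by guid in the root vdev tree and, if 'aux' is set, in
 * the L2ARC and spare lists as well.
 */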
4354vdev_t *
4355spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
4356{
4357	vdev_t *vd;
4358	int i;
4359
4360	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
4361		return (vd);
4362
4363	if (aux) {
4364		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
4365			vd = spa->spa_l2cache.sav_vdevs[i];
4366			if (vd->vdev_guid == guid)
4367				return (vd);
4368		}
4369
4370		for (i = 0; i < spa->spa_spares.sav_count; i++) {
4371			vd = spa->spa_spares.sav_vdevs[i];
4372			if (vd->vdev_guid == guid)
4373				return (vd);
4374		}
4375	}
4376
4377	return (NULL);
4378}
4379
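/*
 * Raise the pool's on-disk version and wait for the resulting config
 * change to sync out.
 */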
4380void
4381spa_upgrade(spa_t *spa, uint64_t version)
4382{
4383	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4384
4385	/*
4386	 * This should only be called for a non-faulted pool, and since a pool
4387	 * with a newer (unsupported) version would be unopenable, the current
4388	 * version can never exceed SPA_VERSION here.
4389	 */
4390	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4391	ASSERT(version >= spa->spa_uberblock.ub_version);
4392
4393	spa->spa_uberblock.ub_version = version;
4394	vdev_config_dirty(spa->spa_root_vdev);
4395
4396	spa_config_exit(spa, SCL_ALL, FTAG);
4397
4398	txg_wait_synced(spa_get_dsl(spa), 0);
4399}
4400
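/*
 * Return B_TRUE if 'guid' matches a configured or pending spare.
 */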
4401boolean_t
4402spa_has_spare(spa_t *spa, uint64_t guid)
4403{
4404	int i;
4405	uint64_t spareguid;
4406	spa_aux_vdev_t *sav = &spa->spa_spares;
4407
4408	for (i = 0; i < sav->sav_count; i++)
4409		if (sav->sav_vdevs[i]->vdev_guid == guid)
4410			return (B_TRUE);
4411
4412	for (i = 0; i < sav->sav_npending; i++) {
4413		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4414		    &spareguid) == 0 && spareguid == guid)
4415			return (B_TRUE);
4416	}
4417
4418	return (B_FALSE);
4419}
4420
4421/*
4422 * Check if a pool has an active shared spare device.
4423 * Note: the reference count of an active spare is 2: once as a spare and once as a replacement.
4424 */
4425static boolean_t
4426spa_has_active_shared_spare(spa_t *spa)
4427{
4428	int i, refcnt;
4429	uint64_t pool;
4430	spa_aux_vdev_t *sav = &spa->spa_spares;
4431
4432	for (i = 0; i < sav->sav_count; i++) {
4433		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
4434		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
4435		    refcnt > 2)
4436			return (B_TRUE);
4437	}
4438
4439	return (B_FALSE);
4440}
4441
4442/*
4443 * Post a sysevent corresponding to the given event.  The 'name' must be one of
4444 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4445 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4446 * in the userland libzpool, as we don't want consumers to misinterpret ztest
4447 * or zdb as real changes.
4448 */
4449void
4450spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4451{
4452#if 0
4453#ifdef _KERNEL
4454	sysevent_t		*ev;
4455	sysevent_attr_list_t	*attr = NULL;
4456	sysevent_value_t	value;
4457	sysevent_id_t		eid;
4458
4459	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4460	    SE_SLEEP);
4461
4462	value.value_type = SE_DATA_TYPE_STRING;
4463	value.value.sv_string = spa_name(spa);
4464	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4465		goto done;
4466
4467	value.value_type = SE_DATA_TYPE_UINT64;
4468	value.value.sv_uint64 = spa_guid(spa);
4469	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4470		goto done;
4471
4472	if (vd) {
4473		value.value_type = SE_DATA_TYPE_UINT64;
4474		value.value.sv_uint64 = vd->vdev_guid;
4475		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4476		    SE_SLEEP) != 0)
4477			goto done;
4478
4479		if (vd->vdev_path) {
4480			value.value_type = SE_DATA_TYPE_STRING;
4481			value.value.sv_string = vd->vdev_path;
4482			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4483			    &value, SE_SLEEP) != 0)
4484				goto done;
4485		}
4486	}
4487
4488	if (sysevent_attach_attributes(ev, attr) != 0)
4489		goto done;
4490	attr = NULL;
4491
4492	(void) log_sysevent(ev, SE_SLEEP, &eid);
4493
4494done:
4495	if (attr)
4496		sysevent_free_attr(attr);
4497	sysevent_free(ev);
4498#endif
4499#endif
4500}
4501