spa.c revision 168821
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * This file contains all the routines used when modifying on-disk SPA state.
31 * This includes opening, importing, destroying, exporting a pool, and syncing a
32 * pool.
33 */
34
35#include <sys/zfs_context.h>
36#include <sys/fm/fs/zfs.h>
37#include <sys/spa_impl.h>
38#include <sys/zio.h>
39#include <sys/zio_checksum.h>
40#include <sys/zio_compress.h>
41#include <sys/dmu.h>
42#include <sys/dmu_tx.h>
43#include <sys/zap.h>
44#include <sys/zil.h>
45#include <sys/vdev_impl.h>
46#include <sys/metaslab.h>
47#include <sys/uberblock_impl.h>
48#include <sys/txg.h>
49#include <sys/avl.h>
50#include <sys/dmu_traverse.h>
51#include <sys/dmu_objset.h>
52#include <sys/unique.h>
53#include <sys/dsl_pool.h>
54#include <sys/dsl_dataset.h>
55#include <sys/dsl_dir.h>
56#include <sys/dsl_prop.h>
57#include <sys/dsl_synctask.h>
58#include <sys/fs/zfs.h>
59#include <sys/callb.h>
60
61int zio_taskq_threads = 0;
62SYSCTL_DECL(_vfs_zfs);
63SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
64TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads);
65SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW,
66    &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type");
67
68
69/*
70 * ==========================================================================
71 * SPA state manipulation (open/create/destroy/import/export)
72 * ==========================================================================
73 */
74
75static int
76spa_error_entry_compare(const void *a, const void *b)
77{
78	spa_error_entry_t *sa = (spa_error_entry_t *)a;
79	spa_error_entry_t *sb = (spa_error_entry_t *)b;
80	int ret;
81
82	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
83	    sizeof (zbookmark_t));
84
85	if (ret < 0)
86		return (-1);
87	else if (ret > 0)
88		return (1);
89	else
90		return (0);
91}
92
93/*
94 * Utility function which retrieves copies of the current logs and
95 * re-initializes them in the process.
96 */
97void
98spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
99{
100	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
101
102	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
103	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
104
105	avl_create(&spa->spa_errlist_scrub,
106	    spa_error_entry_compare, sizeof (spa_error_entry_t),
107	    offsetof(spa_error_entry_t, se_avl));
108	avl_create(&spa->spa_errlist_last,
109	    spa_error_entry_compare, sizeof (spa_error_entry_t),
110	    offsetof(spa_error_entry_t, se_avl));
111}
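
#if 0	/* Illustrative sketch only -- not part of the original file. */
/*
 * Hypothetical consumer of spa_get_errlists(): the caller must hold
 * spa_errlist_lock while the trees are swapped out, and afterwards owns
 * the old trees, whose nodes it must free itself.  "example_drain_errlists"
 * is an invented name, and the kmem_free() size assumes each entry was
 * allocated as a single spa_error_entry_t.
 */
static void
example_drain_errlists(spa_t *spa)
{
	avl_tree_t last, scrub;
	spa_error_entry_t *se;
	void *cookie;

	mutex_enter(&spa->spa_errlist_lock);
	spa_get_errlists(spa, &last, &scrub);	/* fresh trees swapped in */
	mutex_exit(&spa->spa_errlist_lock);

	cookie = NULL;
	while ((se = avl_destroy_nodes(&last, &cookie)) != NULL)
		kmem_free(se, sizeof (spa_error_entry_t));
	avl_destroy(&last);

	cookie = NULL;
	while ((se = avl_destroy_nodes(&scrub, &cookie)) != NULL)
		kmem_free(se, sizeof (spa_error_entry_t));
	avl_destroy(&scrub);
}
#endif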
112
113/*
114 * Activate an uninitialized pool.
115 */
116static void
117spa_activate(spa_t *spa)
118{
119	int t;
120	int nthreads = zio_taskq_threads;
121	char name[32];
122
123	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
124
125	spa->spa_state = POOL_STATE_ACTIVE;
126
127	spa->spa_normal_class = metaslab_class_create();
128
129	if (nthreads == 0)
130		nthreads = max_ncpus;
131	for (t = 0; t < ZIO_TYPES; t++) {
132		snprintf(name, sizeof(name), "spa_zio_issue %d", t);
133		spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
134		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
135		snprintf(name, sizeof(name), "spa_zio_intr %d", t);
136		spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
137		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
138	}
139
140	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
141
142	mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
143	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
144	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
145	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
146	cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
147	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
148	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
149	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
150
151	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
152	    offsetof(vdev_t, vdev_dirty_node));
153
154	txg_list_create(&spa->spa_vdev_txg_list,
155	    offsetof(struct vdev, vdev_txg_node));
156
157	avl_create(&spa->spa_errlist_scrub,
158	    spa_error_entry_compare, sizeof (spa_error_entry_t),
159	    offsetof(spa_error_entry_t, se_avl));
160	avl_create(&spa->spa_errlist_last,
161	    spa_error_entry_compare, sizeof (spa_error_entry_t),
162	    offsetof(spa_error_entry_t, se_avl));
163}
164
165/*
166 * Opposite of spa_activate().
167 */
168static void
169spa_deactivate(spa_t *spa)
170{
171	int t;
172
173	ASSERT(spa->spa_sync_on == B_FALSE);
174	ASSERT(spa->spa_dsl_pool == NULL);
175	ASSERT(spa->spa_root_vdev == NULL);
176
177	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
178
179	txg_list_destroy(&spa->spa_vdev_txg_list);
180
181	list_destroy(&spa->spa_dirty_list);
182
183	for (t = 0; t < ZIO_TYPES; t++) {
184		taskq_destroy(spa->spa_zio_issue_taskq[t]);
185		taskq_destroy(spa->spa_zio_intr_taskq[t]);
186		spa->spa_zio_issue_taskq[t] = NULL;
187		spa->spa_zio_intr_taskq[t] = NULL;
188	}
189
190	metaslab_class_destroy(spa->spa_normal_class);
191	spa->spa_normal_class = NULL;
192
193	/*
194	 * If this was part of an import or the open otherwise failed, we may
195	 * still have errors left in the queues.  Empty them just in case.
196	 */
197	spa_errlog_drain(spa);
198
199	avl_destroy(&spa->spa_errlist_scrub);
200	avl_destroy(&spa->spa_errlist_last);
201
202	rw_destroy(&spa->spa_traverse_lock);
203	mutex_destroy(&spa->spa_uberblock_lock);
204	mutex_destroy(&spa->spa_errlog_lock);
205	mutex_destroy(&spa->spa_errlist_lock);
206	mutex_destroy(&spa->spa_config_lock.scl_lock);
207	cv_destroy(&spa->spa_config_lock.scl_cv);
208	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
209	mutex_destroy(&spa->spa_history_lock);
210	mutex_destroy(&spa->spa_props_lock);
211
212	spa->spa_state = POOL_STATE_UNINITIALIZED;
213}
214
215/*
216 * Verify a pool configuration, and construct the vdev tree appropriately.  This
217 * will create all the necessary vdevs in the appropriate layout, with each vdev
218 * in the CLOSED state.  This will prep the pool before open/creation/import.
219 * All vdev validation is done by the vdev_alloc() routine.
220 */
221static int
222spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
223    uint_t id, int atype)
224{
225	nvlist_t **child;
226	uint_t c, children;
227	int error;
228
229	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
230		return (error);
231
232	if ((*vdp)->vdev_ops->vdev_op_leaf)
233		return (0);
234
235	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
236	    &child, &children) != 0) {
237		vdev_free(*vdp);
238		*vdp = NULL;
239		return (EINVAL);
240	}
241
242	for (c = 0; c < children; c++) {
243		vdev_t *vd;
244		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
245		    atype)) != 0) {
246			vdev_free(*vdp);
247			*vdp = NULL;
248			return (error);
249		}
250	}
251
252	ASSERT(*vdp != NULL);
253
254	return (0);
255}
256
257/*
258 * Opposite of spa_load().
259 */
260static void
261spa_unload(spa_t *spa)
262{
263	int i;
264
265	/*
266	 * Stop async tasks.
267	 */
268	spa_async_suspend(spa);
269
270	/*
271	 * Stop syncing.
272	 */
273	if (spa->spa_sync_on) {
274		txg_sync_stop(spa->spa_dsl_pool);
275		spa->spa_sync_on = B_FALSE;
276	}
277
278	/*
279	 * Wait for any outstanding prefetch I/O to complete.
280	 */
281	spa_config_enter(spa, RW_WRITER, FTAG);
282	spa_config_exit(spa, FTAG);
283
284	/*
285	 * Close the dsl pool.
286	 */
287	if (spa->spa_dsl_pool) {
288		dsl_pool_close(spa->spa_dsl_pool);
289		spa->spa_dsl_pool = NULL;
290	}
291
292	/*
293	 * Close all vdevs.
294	 */
295	if (spa->spa_root_vdev)
296		vdev_free(spa->spa_root_vdev);
297	ASSERT(spa->spa_root_vdev == NULL);
298
299	for (i = 0; i < spa->spa_nspares; i++)
300		vdev_free(spa->spa_spares[i]);
301	if (spa->spa_spares) {
302		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
303		spa->spa_spares = NULL;
304	}
305	if (spa->spa_sparelist) {
306		nvlist_free(spa->spa_sparelist);
307		spa->spa_sparelist = NULL;
308	}
309
310	spa->spa_async_suspended = 0;
311}
312
313/*
314 * Load (or re-load) the current list of vdevs describing the active spares for
315 * this pool.  When this is called, we have some form of basic information in
316 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
317 * re-generate a more complete list including status information.
318 */
319static void
320spa_load_spares(spa_t *spa)
321{
322	nvlist_t **spares;
323	uint_t nspares;
324	int i;
325	vdev_t *vd, *tvd;
326
327	/*
328	 * First, close and free any existing spare vdevs.
329	 */
330	for (i = 0; i < spa->spa_nspares; i++) {
331		vd = spa->spa_spares[i];
332
333		/* Undo the call to spa_activate() below */
334		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
335		    tvd->vdev_isspare)
336			spa_spare_remove(tvd);
337		vdev_close(vd);
338		vdev_free(vd);
339	}
340
341	if (spa->spa_spares)
342		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
343
344	if (spa->spa_sparelist == NULL)
345		nspares = 0;
346	else
347		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
348		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
349
350	spa->spa_nspares = (int)nspares;
351	spa->spa_spares = NULL;
352
353	if (nspares == 0)
354		return;
355
356	/*
357	 * Construct the array of vdevs, opening them to get status in the
358	 * process.  For each spare, there are potentially two different vdev_t
359	 * structures associated with it: one in the list of spares (used only
360	 * for basic validation purposes) and one in the active vdev
361	 * configuration (if it's spared in).  During this phase we open and
362	 * validate each vdev on the spare list.  If the vdev also exists in the
363	 * active configuration, then we mark this vdev as an active spare.
364	 */
365	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
366	for (i = 0; i < spa->spa_nspares; i++) {
367		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
368		    VDEV_ALLOC_SPARE) == 0);
369		ASSERT(vd != NULL);
370
371		spa->spa_spares[i] = vd;
372
373		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
374			if (!tvd->vdev_isspare)
375				spa_spare_add(tvd);
376
377			/*
378			 * We only mark the spare active if we were successfully
379			 * able to load the vdev.  Otherwise, importing a pool
380			 * with a bad active spare would result in strange
381	 * behavior, because multiple pools would think the spare
382			 * is actively in use.
383			 *
384			 * There is a vulnerability here to an equally bizarre
385			 * circumstance, where a dead active spare is later
386			 * brought back to life (onlined or otherwise).  Given
387			 * the rarity of this scenario, and the extra complexity
388			 * it adds, we ignore the possibility.
389			 */
390			if (!vdev_is_dead(tvd))
391				spa_spare_activate(tvd);
392		}
393
394		if (vdev_open(vd) != 0)
395			continue;
396
397		vd->vdev_top = vd;
398		(void) vdev_validate_spare(vd);
399	}
400
401	/*
402	 * Recompute the stashed list of spares, with status information
403	 * this time.
404	 */
405	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
406	    DATA_TYPE_NVLIST_ARRAY) == 0);
407
408	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
409	for (i = 0; i < spa->spa_nspares; i++)
410		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
411		    B_TRUE, B_TRUE);
412	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
413	    spares, spa->spa_nspares) == 0);
414	for (i = 0; i < spa->spa_nspares; i++)
415		nvlist_free(spares[i]);
416	kmem_free(spares, spa->spa_nspares * sizeof (void *));
417}
418
419static int
420load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
421{
422	dmu_buf_t *db;
423	char *packed = NULL;
424	size_t nvsize = 0;
425	int error;
426	*value = NULL;
427
428	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
429	nvsize = *(uint64_t *)db->db_data;
430	dmu_buf_rele(db, FTAG);
431
432	packed = kmem_alloc(nvsize, KM_SLEEP);
433	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
434	if (error == 0)
435		error = nvlist_unpack(packed, nvsize, value, 0);
436	kmem_free(packed, nvsize);
437
438	return (error);
439}
440
441/*
442 * Load an existing storage pool, using the pool's builtin spa_config as a
443 * source of configuration information.
444 */
445static int
446spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
447{
448	int error = 0;
449	nvlist_t *nvroot = NULL;
450	vdev_t *rvd;
451	uberblock_t *ub = &spa->spa_uberblock;
452	uint64_t config_cache_txg = spa->spa_config_txg;
453	uint64_t pool_guid;
454	uint64_t version;
455	zio_t *zio;
456
457	spa->spa_load_state = state;
458
459	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
460	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
461		error = EINVAL;
462		goto out;
463	}
464
465	/*
466	 * Versioning wasn't explicitly added to the label until later, so if
467	 * it's not present treat it as the initial version.
468	 */
469	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
470		version = ZFS_VERSION_INITIAL;
471
472	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
473	    &spa->spa_config_txg);
474
475	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
476	    spa_guid_exists(pool_guid, 0)) {
477		error = EEXIST;
478		goto out;
479	}
480
481	spa->spa_load_guid = pool_guid;
482
483	/*
484	 * Parse the configuration into a vdev tree.  We explicitly set the
485	 * value that will be returned by spa_version() since parsing the
486	 * configuration requires knowing the version number.
487	 */
488	spa_config_enter(spa, RW_WRITER, FTAG);
489	spa->spa_ubsync.ub_version = version;
490	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
491	spa_config_exit(spa, FTAG);
492
493	if (error != 0)
494		goto out;
495
496	ASSERT(spa->spa_root_vdev == rvd);
497	ASSERT(spa_guid(spa) == pool_guid);
498
499	/*
500	 * Try to open all vdevs, loading each label in the process.
501	 */
502	if (vdev_open(rvd) != 0) {
503		error = ENXIO;
504		goto out;
505	}
506
507	/*
508	 * Validate the labels for all leaf vdevs.  We need to grab the config
509	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
510	 * flag.
511	 */
512	spa_config_enter(spa, RW_READER, FTAG);
513	error = vdev_validate(rvd);
514	spa_config_exit(spa, FTAG);
515
516	if (error != 0) {
517		error = EBADF;
518		goto out;
519	}
520
521	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
522		error = ENXIO;
523		goto out;
524	}
525
526	/*
527	 * Find the best uberblock.
528	 */
529	bzero(ub, sizeof (uberblock_t));
530
531	zio = zio_root(spa, NULL, NULL,
532	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
533	vdev_uberblock_load(zio, rvd, ub);
534	error = zio_wait(zio);
535
536	/*
537	 * If we weren't able to find a single valid uberblock, return failure.
538	 */
539	if (ub->ub_txg == 0) {
540		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
541		    VDEV_AUX_CORRUPT_DATA);
542		error = ENXIO;
543		goto out;
544	}
545
546	/*
547	 * If the pool is newer than the code, we can't open it.
548	 */
549	if (ub->ub_version > ZFS_VERSION) {
550		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
551		    VDEV_AUX_VERSION_NEWER);
552		error = ENOTSUP;
553		goto out;
554	}
555
556	/*
557	 * If the vdev guid sum doesn't match the uberblock, we have an
558	 * incomplete configuration.
559	 */
560	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
561		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
562		    VDEV_AUX_BAD_GUID_SUM);
563		error = ENXIO;
564		goto out;
565	}
566
567	/*
568	 * Initialize internal SPA structures.
569	 */
570	spa->spa_state = POOL_STATE_ACTIVE;
571	spa->spa_ubsync = spa->spa_uberblock;
572	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
573	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
574	if (error) {
575		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
576		    VDEV_AUX_CORRUPT_DATA);
577		goto out;
578	}
579	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
580
581	if (zap_lookup(spa->spa_meta_objset,
582	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
583	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
584		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
585		    VDEV_AUX_CORRUPT_DATA);
586		error = EIO;
587		goto out;
588	}
589
590	if (!mosconfig) {
591		nvlist_t *newconfig;
592		uint64_t hostid;
593
594		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
595			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
596			    VDEV_AUX_CORRUPT_DATA);
597			error = EIO;
598			goto out;
599		}
600
601		/*
602		 * hostid is set after the root file system is mounted, so
603		 * ignore the check until it's done.
604		 */
605		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
606		    &hostid) == 0 && root_mounted()) {
607			char *hostname;
608			unsigned long myhostid = 0;
609
610			VERIFY(nvlist_lookup_string(newconfig,
611			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
612
613			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
614			if ((unsigned long)hostid != myhostid) {
615				cmn_err(CE_WARN, "pool '%s' could not be "
616				    "loaded as it was last accessed by "
617				    "another system (host: %s hostid: 0x%lx).  "
618				    "See: http://www.sun.com/msg/ZFS-8000-EY",
619				    spa->spa_name, hostname,
620				    (unsigned long)hostid);
621				error = EBADF;
622				goto out;
623			}
624		}
625
626		spa_config_set(spa, newconfig);
627		spa_unload(spa);
628		spa_deactivate(spa);
629		spa_activate(spa);
630
631		return (spa_load(spa, newconfig, state, B_TRUE));
632	}
633
634	if (zap_lookup(spa->spa_meta_objset,
635	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
636	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
637		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
638		    VDEV_AUX_CORRUPT_DATA);
639		error = EIO;
640		goto out;
641	}
642
643	/*
644	 * Load the bit that tells us to use the new accounting function
645	 * (raid-z deflation).  If we have an older pool, this will not
646	 * be present.
647	 */
648	error = zap_lookup(spa->spa_meta_objset,
649	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
650	    sizeof (uint64_t), 1, &spa->spa_deflate);
651	if (error != 0 && error != ENOENT) {
652		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
653		    VDEV_AUX_CORRUPT_DATA);
654		error = EIO;
655		goto out;
656	}
657
658	/*
659	 * Load the persistent error log.  If we have an older pool, this will
660	 * not be present.
661	 */
662	error = zap_lookup(spa->spa_meta_objset,
663	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
664	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
665	if (error != 0 && error != ENOENT) {
666		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
667		    VDEV_AUX_CORRUPT_DATA);
668		error = EIO;
669		goto out;
670	}
671
672	error = zap_lookup(spa->spa_meta_objset,
673	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
674	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
675	if (error != 0 && error != ENOENT) {
676		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
677		    VDEV_AUX_CORRUPT_DATA);
678		error = EIO;
679		goto out;
680	}
681
682	/*
683	 * Load the history object.  If we have an older pool, this
684	 * will not be present.
685	 */
686	error = zap_lookup(spa->spa_meta_objset,
687	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
688	    sizeof (uint64_t), 1, &spa->spa_history);
689	if (error != 0 && error != ENOENT) {
690		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
691		    VDEV_AUX_CORRUPT_DATA);
692		error = EIO;
693		goto out;
694	}
695
696	/*
697	 * Load any hot spares for this pool.
698	 */
699	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
700	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
701	if (error != 0 && error != ENOENT) {
702		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
703		    VDEV_AUX_CORRUPT_DATA);
704		error = EIO;
705		goto out;
706	}
707	if (error == 0) {
708		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
709		if (load_nvlist(spa, spa->spa_spares_object,
710		    &spa->spa_sparelist) != 0) {
711			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
712			    VDEV_AUX_CORRUPT_DATA);
713			error = EIO;
714			goto out;
715		}
716
717		spa_config_enter(spa, RW_WRITER, FTAG);
718		spa_load_spares(spa);
719		spa_config_exit(spa, FTAG);
720	}
721
722	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
723	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
724
725	if (error && error != ENOENT) {
726		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
727		    VDEV_AUX_CORRUPT_DATA);
728		error = EIO;
729		goto out;
730	}
731
732	if (error == 0) {
733		(void) zap_lookup(spa->spa_meta_objset,
734		    spa->spa_pool_props_object,
735		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
736		    sizeof (uint64_t), 1, &spa->spa_bootfs);
737	}
738
739	/*
740	 * Load the vdev state for all toplevel vdevs.
741	 */
742	vdev_load(rvd);
743
744	/*
745	 * Propagate the leaf DTLs we just loaded all the way up the tree.
746	 */
747	spa_config_enter(spa, RW_WRITER, FTAG);
748	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
749	spa_config_exit(spa, FTAG);
750
751	/*
752	 * Check the state of the root vdev.  If it can't be opened, it
753	 * indicates one or more toplevel vdevs are faulted.
754	 */
755	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
756		error = ENXIO;
757		goto out;
758	}
759
760	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
761		dmu_tx_t *tx;
762		int need_update = B_FALSE;
763		int c;
764
765		/*
766		 * Claim log blocks that haven't been committed yet.
767		 * This must all happen in a single txg.
768		 */
769		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
770		    spa_first_txg(spa));
771		(void) dmu_objset_find(spa->spa_name,
772		    zil_claim, tx, DS_FIND_CHILDREN);
773		dmu_tx_commit(tx);
774
775		spa->spa_sync_on = B_TRUE;
776		txg_sync_start(spa->spa_dsl_pool);
777
778		/*
779		 * Wait for all claims to sync.
780		 */
781		txg_wait_synced(spa->spa_dsl_pool, 0);
782
783		/*
784		 * If the config cache is stale, or we have uninitialized
785		 * metaslabs (see spa_vdev_add()), then update the config.
786		 */
787		if (config_cache_txg != spa->spa_config_txg ||
788		    state == SPA_LOAD_IMPORT)
789			need_update = B_TRUE;
790
791		for (c = 0; c < rvd->vdev_children; c++)
792			if (rvd->vdev_child[c]->vdev_ms_array == 0)
793				need_update = B_TRUE;
794
795		/*
796		 * Update the config cache asynchronously in case we're the
797		 * root pool, in which case the config cache isn't writable yet.
798		 */
799		if (need_update)
800			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
801	}
802
803	error = 0;
804out:
805	if (error && error != EBADF)
806		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
807	spa->spa_load_state = SPA_LOAD_NONE;
808	spa->spa_ena = 0;
809
810	return (error);
811}
812
813/*
814 * Pool Open/Import
815 *
816 * The import case is identical to an open except that the configuration is sent
817 * down from userland, instead of grabbed from the configuration cache.  For the
818 * case of an open, the pool configuration will exist in the
819 * POOL_STATE_UNINITIALIZED state.
820 *
821 * The stats information (gen/count/ustats) is used to gather vdev statistics at
822 * the same time we open the pool, without having to keep around the spa_t in some
823 * ambiguous state.
824 */
825static int
826spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
827{
828	spa_t *spa;
829	int error;
830	int loaded = B_FALSE;
831	int locked = B_FALSE;
832
833	*spapp = NULL;
834
835	/*
836	 * As disgusting as this is, we need to support recursive calls to this
837	 * function because dsl_dir_open() is called during spa_load(), and ends
838	 * up calling spa_open() again.  The real fix is to figure out how to
839	 * avoid dsl_dir_open() calling this in the first place.
840	 */
841	if (mutex_owner(&spa_namespace_lock) != curthread) {
842		mutex_enter(&spa_namespace_lock);
843		locked = B_TRUE;
844	}
845
846	if ((spa = spa_lookup(pool)) == NULL) {
847		if (locked)
848			mutex_exit(&spa_namespace_lock);
849		return (ENOENT);
850	}
851	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
852
853		spa_activate(spa);
854
855		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
856
857		if (error == EBADF) {
858			/*
859			 * If vdev_validate() returns failure (indicated by
860			 * EBADF), it means that one of the vdevs reports that
861			 * the pool has been exported or destroyed.  If
862			 * this is the case, the config cache is out of sync and
863			 * we should remove the pool from the namespace.
864			 */
865			zfs_post_ok(spa, NULL);
866			spa_unload(spa);
867			spa_deactivate(spa);
868			spa_remove(spa);
869			spa_config_sync();
870			if (locked)
871				mutex_exit(&spa_namespace_lock);
872			return (ENOENT);
873		}
874
875		if (error) {
876			/*
877			 * We can't open the pool, but we still have useful
878			 * information: the state of each vdev after the
879			 * attempted vdev_open().  Return this to the user.
880			 */
881			if (config != NULL && spa->spa_root_vdev != NULL) {
882				spa_config_enter(spa, RW_READER, FTAG);
883				*config = spa_config_generate(spa, NULL, -1ULL,
884				    B_TRUE);
885				spa_config_exit(spa, FTAG);
886			}
887			spa_unload(spa);
888			spa_deactivate(spa);
889			spa->spa_last_open_failed = B_TRUE;
890			if (locked)
891				mutex_exit(&spa_namespace_lock);
892			*spapp = NULL;
893			return (error);
894		} else {
895			zfs_post_ok(spa, NULL);
896			spa->spa_last_open_failed = B_FALSE;
897		}
898
899		loaded = B_TRUE;
900	}
901
902	spa_open_ref(spa, tag);
903	if (locked)
904		mutex_exit(&spa_namespace_lock);
905
906	*spapp = spa;
907
908	if (config != NULL) {
909		spa_config_enter(spa, RW_READER, FTAG);
910		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
911		spa_config_exit(spa, FTAG);
912	}
913
914	/*
915	 * If we just loaded the pool, resilver anything that's out of date.
916	 */
917	if (loaded && (spa_mode & FWRITE))
918		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
919
920	return (0);
921}
922
923int
924spa_open(const char *name, spa_t **spapp, void *tag)
925{
926	return (spa_open_common(name, spapp, tag, NULL));
927}
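
#if 0	/* Illustrative sketch only -- not part of the original file. */
/*
 * Hypothetical caller of spa_open()/spa_close().  The tag handed to
 * spa_open() must be the same one later passed to spa_close(); FTAG is
 * the convention used throughout this file.  "example_with_pool" is an
 * invented name, and error handling is kept minimal.
 */
static int
example_with_pool(const char *name)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);		/* e.g. ENOENT if the pool is unknown */

	/* ... operate on the pool while the open reference is held ... */

	spa_close(spa, FTAG);
	return (0);
}
#endif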
928
929/*
930 * Look up the given spa_t, incrementing the inject count in the process,
931 * preventing it from being exported or destroyed.
932 */
933spa_t *
934spa_inject_addref(char *name)
935{
936	spa_t *spa;
937
938	mutex_enter(&spa_namespace_lock);
939	if ((spa = spa_lookup(name)) == NULL) {
940		mutex_exit(&spa_namespace_lock);
941		return (NULL);
942	}
943	spa->spa_inject_ref++;
944	mutex_exit(&spa_namespace_lock);
945
946	return (spa);
947}
948
949void
950spa_inject_delref(spa_t *spa)
951{
952	mutex_enter(&spa_namespace_lock);
953	spa->spa_inject_ref--;
954	mutex_exit(&spa_namespace_lock);
955}
956
957static void
958spa_add_spares(spa_t *spa, nvlist_t *config)
959{
960	nvlist_t **spares;
961	uint_t i, nspares;
962	nvlist_t *nvroot;
963	uint64_t guid;
964	vdev_stat_t *vs;
965	uint_t vsc;
966	uint64_t pool;
967
968	if (spa->spa_nspares == 0)
969		return;
970
971	VERIFY(nvlist_lookup_nvlist(config,
972	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
973	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
974	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
975	if (nspares != 0) {
976		VERIFY(nvlist_add_nvlist_array(nvroot,
977		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
978		VERIFY(nvlist_lookup_nvlist_array(nvroot,
979		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
980
981		/*
982		 * Go through and find any spares which have since been
983		 * repurposed as an active spare.  If this is the case, update
984		 * their status appropriately.
985		 */
986		for (i = 0; i < nspares; i++) {
987			VERIFY(nvlist_lookup_uint64(spares[i],
988			    ZPOOL_CONFIG_GUID, &guid) == 0);
989			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
990				VERIFY(nvlist_lookup_uint64_array(
991				    spares[i], ZPOOL_CONFIG_STATS,
992				    (uint64_t **)&vs, &vsc) == 0);
993				vs->vs_state = VDEV_STATE_CANT_OPEN;
994				vs->vs_aux = VDEV_AUX_SPARED;
995			}
996		}
997	}
998}
999
1000int
1001spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1002{
1003	int error;
1004	spa_t *spa;
1005
1006	*config = NULL;
1007	error = spa_open_common(name, &spa, FTAG, config);
1008
1009	if (spa && *config != NULL) {
1010		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1011		    spa_get_errlog_size(spa)) == 0);
1012
1013		spa_add_spares(spa, *config);
1014	}
1015
1016	/*
1017	 * We want to get the alternate root even for faulted pools, so we cheat
1018	 * and call spa_lookup() directly.
1019	 */
1020	if (altroot) {
1021		if (spa == NULL) {
1022			mutex_enter(&spa_namespace_lock);
1023			spa = spa_lookup(name);
1024			if (spa)
1025				spa_altroot(spa, altroot, buflen);
1026			else
1027				altroot[0] = '\0';
1028			spa = NULL;
1029			mutex_exit(&spa_namespace_lock);
1030		} else {
1031			spa_altroot(spa, altroot, buflen);
1032		}
1033	}
1034
1035	if (spa != NULL)
1036		spa_close(spa, FTAG);
1037
1038	return (error);
1039}
1040
1041/*
1042 * Validate that the 'spares' array is well formed.  We must have an array of
1043 * nvlists, each of which describes a valid leaf vdev.  If this is an import (mode
1044 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
1045 * as they are well-formed.
1046 */
1047static int
1048spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1049{
1050	nvlist_t **spares;
1051	uint_t i, nspares;
1052	vdev_t *vd;
1053	int error;
1054
1055	/*
1056	 * It's acceptable to have no spares specified.
1057	 */
1058	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1059	    &spares, &nspares) != 0)
1060		return (0);
1061
1062	if (nspares == 0)
1063		return (EINVAL);
1064
1065	/*
1066	 * Make sure the pool is formatted with a version that supports hot
1067	 * spares.
1068	 */
1069	if (spa_version(spa) < ZFS_VERSION_SPARES)
1070		return (ENOTSUP);
1071
1072	/*
1073	 * Set the pending spare list so we correctly handle device in-use
1074	 * checking.
1075	 */
1076	spa->spa_pending_spares = spares;
1077	spa->spa_pending_nspares = nspares;
1078
1079	for (i = 0; i < nspares; i++) {
1080		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
1081		    mode)) != 0)
1082			goto out;
1083
1084		if (!vd->vdev_ops->vdev_op_leaf) {
1085			vdev_free(vd);
1086			error = EINVAL;
1087			goto out;
1088		}
1089
1090		vd->vdev_top = vd;
1091
1092		if ((error = vdev_open(vd)) == 0 &&
1093		    (error = vdev_label_init(vd, crtxg,
1094		    VDEV_LABEL_SPARE)) == 0) {
1095			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
1096			    vd->vdev_guid) == 0);
1097		}
1098
1099		vdev_free(vd);
1100
1101		if (error && mode != VDEV_ALLOC_SPARE)
1102			goto out;
1103		else
1104			error = 0;
1105	}
1106
1107out:
1108	spa->spa_pending_spares = NULL;
1109	spa->spa_pending_nspares = 0;
1110	return (error);
1111}
1112
1113/*
1114 * Pool Creation
1115 */
1116int
1117spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
1118{
1119	spa_t *spa;
1120	vdev_t *rvd;
1121	dsl_pool_t *dp;
1122	dmu_tx_t *tx;
1123	int c, error = 0;
1124	uint64_t txg = TXG_INITIAL;
1125	nvlist_t **spares;
1126	uint_t nspares;
1127
1128	/*
1129	 * If this pool already exists, return failure.
1130	 */
1131	mutex_enter(&spa_namespace_lock);
1132	if (spa_lookup(pool) != NULL) {
1133		mutex_exit(&spa_namespace_lock);
1134		return (EEXIST);
1135	}
1136
1137	/*
1138	 * Allocate a new spa_t structure.
1139	 */
1140	spa = spa_add(pool, altroot);
1141	spa_activate(spa);
1142
1143	spa->spa_uberblock.ub_txg = txg - 1;
1144	spa->spa_uberblock.ub_version = ZFS_VERSION;
1145	spa->spa_ubsync = spa->spa_uberblock;
1146
1147	/*
1148	 * Create the root vdev.
1149	 */
1150	spa_config_enter(spa, RW_WRITER, FTAG);
1151
1152	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1153
1154	ASSERT(error != 0 || rvd != NULL);
1155	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1156
1157	if (error == 0 && rvd->vdev_children == 0)
1158		error = EINVAL;
1159
1160	if (error == 0 &&
1161	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1162	    (error = spa_validate_spares(spa, nvroot, txg,
1163	    VDEV_ALLOC_ADD)) == 0) {
1164		for (c = 0; c < rvd->vdev_children; c++)
1165			vdev_init(rvd->vdev_child[c], txg);
1166		vdev_config_dirty(rvd);
1167	}
1168
1169	spa_config_exit(spa, FTAG);
1170
1171	if (error != 0) {
1172		spa_unload(spa);
1173		spa_deactivate(spa);
1174		spa_remove(spa);
1175		mutex_exit(&spa_namespace_lock);
1176		return (error);
1177	}
1178
1179	/*
1180	 * Get the list of spares, if specified.
1181	 */
1182	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1183	    &spares, &nspares) == 0) {
1184		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
1185		    KM_SLEEP) == 0);
1186		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1187		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1188		spa_config_enter(spa, RW_WRITER, FTAG);
1189		spa_load_spares(spa);
1190		spa_config_exit(spa, FTAG);
1191		spa->spa_sync_spares = B_TRUE;
1192	}
1193
1194	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1195	spa->spa_meta_objset = dp->dp_meta_objset;
1196
1197	tx = dmu_tx_create_assigned(dp, txg);
1198
1199	/*
1200	 * Create the pool config object.
1201	 */
1202	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1203	    DMU_OT_PACKED_NVLIST, 1 << 14,
1204	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1205
1206	if (zap_add(spa->spa_meta_objset,
1207	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1208	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
1209		cmn_err(CE_PANIC, "failed to add pool config");
1210	}
1211
1212	/* Newly created pools are always deflated. */
1213	spa->spa_deflate = TRUE;
1214	if (zap_add(spa->spa_meta_objset,
1215	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1216	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
1217		cmn_err(CE_PANIC, "failed to add deflate");
1218	}
1219
1220	/*
1221	 * Create the deferred-free bplist object.  Turn off compression
1222	 * because sync-to-convergence takes longer if the blocksize
1223	 * keeps changing.
1224	 */
1225	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
1226	    1 << 14, tx);
1227	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
1228	    ZIO_COMPRESS_OFF, tx);
1229
1230	if (zap_add(spa->spa_meta_objset,
1231	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1232	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
1233		cmn_err(CE_PANIC, "failed to add bplist");
1234	}
1235
1236	/*
1237	 * Create the pool's history object.
1238	 */
1239	spa_history_create_obj(spa, tx);
1240
1241	dmu_tx_commit(tx);
1242
1243	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
1244	spa->spa_sync_on = B_TRUE;
1245	txg_sync_start(spa->spa_dsl_pool);
1246
1247	/*
1248	 * We explicitly wait for the first transaction to complete so that our
1249	 * bean counters are appropriately updated.
1250	 */
1251	txg_wait_synced(spa->spa_dsl_pool, txg);
1252
1253	spa_config_sync();
1254
1255	mutex_exit(&spa_namespace_lock);
1256
1257	return (0);
1258}
1259
1260/*
1261 * Import the given pool into the system.  We set up the necessary spa_t and
1262 * then call spa_load() to do the dirty work.
1263 */
1264int
1265spa_import(const char *pool, nvlist_t *config, const char *altroot)
1266{
1267	spa_t *spa;
1268	int error;
1269	nvlist_t *nvroot;
1270	nvlist_t **spares;
1271	uint_t nspares;
1272
1273	if (!(spa_mode & FWRITE))
1274		return (EROFS);
1275
1276	/*
1277	 * If a pool with this name exists, return failure.
1278	 */
1279	mutex_enter(&spa_namespace_lock);
1280	if (spa_lookup(pool) != NULL) {
1281		mutex_exit(&spa_namespace_lock);
1282		return (EEXIST);
1283	}
1284
1285	/*
1286	 * Create and initialize the spa structure.
1287	 */
1288	spa = spa_add(pool, altroot);
1289	spa_activate(spa);
1290
1291	/*
1292	 * Pass off the heavy lifting to spa_load().
1293	 * Pass TRUE for mosconfig because the user-supplied config
1294	 * is actually the one to trust when doing an import.
1295	 */
1296	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
1297
1298	spa_config_enter(spa, RW_WRITER, FTAG);
1299	/*
1300	 * Toss any existing sparelist, as it doesn't have any validity anymore,
1301	 * and conflicts with spa_has_spare().
1302	 */
1303	if (spa->spa_sparelist) {
1304		nvlist_free(spa->spa_sparelist);
1305		spa->spa_sparelist = NULL;
1306		spa_load_spares(spa);
1307	}
1308
1309	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1310	    &nvroot) == 0);
1311	if (error == 0)
1312		error = spa_validate_spares(spa, nvroot, -1ULL,
1313		    VDEV_ALLOC_SPARE);
1314	spa_config_exit(spa, FTAG);
1315
1316	if (error != 0) {
1317		spa_unload(spa);
1318		spa_deactivate(spa);
1319		spa_remove(spa);
1320		mutex_exit(&spa_namespace_lock);
1321		return (error);
1322	}
1323
1324	/*
1325	 * Override any spares as specified by the user, as these may have
1326	 * correct device names/devids, etc.
1327	 */
1328	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1329	    &spares, &nspares) == 0) {
1330		if (spa->spa_sparelist)
1331			VERIFY(nvlist_remove(spa->spa_sparelist,
1332			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1333		else
1334			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1335			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1336		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1337		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1338		spa_config_enter(spa, RW_WRITER, FTAG);
1339		spa_load_spares(spa);
1340		spa_config_exit(spa, FTAG);
1341		spa->spa_sync_spares = B_TRUE;
1342	}
1343
1344	/*
1345	 * Update the config cache to include the newly-imported pool.
1346	 */
1347	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1348
1349	mutex_exit(&spa_namespace_lock);
1350
1351	/*
1352	 * Resilver anything that's out of date.
1353	 */
1354	if (spa_mode & FWRITE)
1355		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1356
1357	return (0);
1358}
1359
1360/*
1361 * This (illegal) pool name is used when temporarily importing a spa_t in order
1362 * to get the vdev stats associated with the imported devices.
1363 */
1364#define	TRYIMPORT_NAME	"$import"
1365
1366nvlist_t *
1367spa_tryimport(nvlist_t *tryconfig)
1368{
1369	nvlist_t *config = NULL;
1370	char *poolname;
1371	spa_t *spa;
1372	uint64_t state;
1373
1374	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
1375		return (NULL);
1376
1377	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
1378		return (NULL);
1379
1380	/*
1381	 * Create and initialize the spa structure.
1382	 */
1383	mutex_enter(&spa_namespace_lock);
1384	spa = spa_add(TRYIMPORT_NAME, NULL);
1385	spa_activate(spa);
1386
1387	/*
1388	 * Pass off the heavy lifting to spa_load().
1389	 * Pass TRUE for mosconfig because the user-supplied config
1390	 * is actually the one to trust when doing an import.
1391	 */
1392	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
1393
1394	/*
1395	 * If 'tryconfig' was at least parsable, return the current config.
1396	 */
1397	if (spa->spa_root_vdev != NULL) {
1398		spa_config_enter(spa, RW_READER, FTAG);
1399		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1400		spa_config_exit(spa, FTAG);
1401		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
1402		    poolname) == 0);
1403		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
1404		    state) == 0);
1405		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
1406		    spa->spa_uberblock.ub_timestamp) == 0);
1407
1408		/*
1409		 * Add the list of hot spares.
1410		 */
1411		spa_add_spares(spa, config);
1412	}
1413
1414	spa_unload(spa);
1415	spa_deactivate(spa);
1416	spa_remove(spa);
1417	mutex_exit(&spa_namespace_lock);
1418
1419	return (config);
1420}
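
#if 0	/* Illustrative sketch only -- not part of the original file. */
/*
 * Hypothetical caller of spa_tryimport(): on success the returned config
 * carries the pool name, state, and timestamp added above, and the caller
 * is responsible for freeing it.  "example_tryimport" is an invented name.
 */
static void
example_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config;
	char *name;
	uint64_t state;

	if ((config = spa_tryimport(tryconfig)) == NULL)
		return;		/* tryconfig wasn't even parsable */

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &name) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &state) == 0);

	/* ... report name, state, and vdev stats to the user ... */

	nvlist_free(config);
}
#endif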
1421
1422/*
1423 * Pool export/destroy
1424 *
1425 * The act of destroying or exporting a pool is very simple.  We make sure there
1426 * is no more pending I/O and any references to the pool are gone.  Then, we
1427 * update the pool state and sync all the labels to disk, removing the
1428 * configuration from the cache afterwards.
1429 */
1430static int
1431spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
1432{
1433	spa_t *spa;
1434
1435	if (oldconfig)
1436		*oldconfig = NULL;
1437
1438	if (!(spa_mode & FWRITE))
1439		return (EROFS);
1440
1441	mutex_enter(&spa_namespace_lock);
1442	if ((spa = spa_lookup(pool)) == NULL) {
1443		mutex_exit(&spa_namespace_lock);
1444		return (ENOENT);
1445	}
1446
1447	/*
1448	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
1449	 * reacquire the namespace lock, and see if we can export.
1450	 */
1451	spa_open_ref(spa, FTAG);
1452	mutex_exit(&spa_namespace_lock);
1453	spa_async_suspend(spa);
1454	mutex_enter(&spa_namespace_lock);
1455	spa_close(spa, FTAG);
1456
1457	/*
1458	 * The pool will be in core if it's openable,
1459	 * in which case we can modify its state.
1460	 */
1461	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
1462		/*
1463		 * Objsets may be open only because they're dirty, so we
1464		 * have to force it to sync before checking spa_refcnt.
1465		 */
1466		spa_scrub_suspend(spa);
1467		txg_wait_synced(spa->spa_dsl_pool, 0);
1468
1469		/*
1470		 * A pool cannot be exported or destroyed if there are active
1471		 * references.  If we are resetting a pool, allow references by
1472		 * fault injection handlers.
1473		 */
1474		if (!spa_refcount_zero(spa) ||
1475		    (spa->spa_inject_ref != 0 &&
1476		    new_state != POOL_STATE_UNINITIALIZED)) {
1477			spa_scrub_resume(spa);
1478			spa_async_resume(spa);
1479			mutex_exit(&spa_namespace_lock);
1480			return (EBUSY);
1481		}
1482
1483		spa_scrub_resume(spa);
1484		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
1485
1486		/*
1487		 * We want this to be reflected on every label,
1488		 * so mark them all dirty.  spa_unload() will do the
1489		 * final sync that pushes these changes out.
1490		 */
1491		if (new_state != POOL_STATE_UNINITIALIZED) {
1492			spa_config_enter(spa, RW_WRITER, FTAG);
1493			spa->spa_state = new_state;
1494			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
1495			vdev_config_dirty(spa->spa_root_vdev);
1496			spa_config_exit(spa, FTAG);
1497		}
1498	}
1499
1500	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
1501		spa_unload(spa);
1502		spa_deactivate(spa);
1503	}
1504
1505	if (oldconfig && spa->spa_config)
1506		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
1507
1508	if (new_state != POOL_STATE_UNINITIALIZED) {
1509		spa_remove(spa);
1510		spa_config_sync();
1511	}
1512	mutex_exit(&spa_namespace_lock);
1513
1514	return (0);
1515}
1516
1517/*
1518 * Destroy a storage pool.
1519 */
1520int
1521spa_destroy(char *pool)
1522{
1523	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
1524}
1525
1526/*
1527 * Export a storage pool.
1528 */
1529int
1530spa_export(char *pool, nvlist_t **oldconfig)
1531{
1532	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
1533}
1534
1535/*
1536 * Similar to spa_export(), this unloads the spa_t without actually removing it
1537 * from the namespace in any way.
1538 */
1539int
1540spa_reset(char *pool)
1541{
1542	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
1543}
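
#if 0	/* Illustrative sketch only -- not part of the original file. */
/*
 * Hypothetical caller of spa_export(): the call fails with EBUSY while
 * the pool still has active references, and on success the last known
 * configuration is handed back to the caller, who must free it.
 * "example_export_pool" is an invented name.
 */
static int
example_export_pool(char *pool)
{
	nvlist_t *oldconfig = NULL;
	int error;

	if ((error = spa_export(pool, &oldconfig)) != 0)
		return (error);

	if (oldconfig != NULL)
		nvlist_free(oldconfig);
	return (0);
}
#endif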
1544
1545
1546/*
1547 * ==========================================================================
1548 * Device manipulation
1549 * ==========================================================================
1550 */
1551
1552/*
1553 * Add capacity to a storage pool.
1554 */
1555int
1556spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
1557{
1558	uint64_t txg;
1559	int c, error;
1560	vdev_t *rvd = spa->spa_root_vdev;
1561	vdev_t *vd, *tvd;
1562	nvlist_t **spares;
1563	uint_t i, nspares;
1564
1565	txg = spa_vdev_enter(spa);
1566
1567	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
1568	    VDEV_ALLOC_ADD)) != 0)
1569		return (spa_vdev_exit(spa, NULL, txg, error));
1570
1571	spa->spa_pending_vdev = vd;
1572
1573	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1574	    &spares, &nspares) != 0)
1575		nspares = 0;
1576
1577	if (vd->vdev_children == 0 && nspares == 0) {
1578		spa->spa_pending_vdev = NULL;
1579		return (spa_vdev_exit(spa, vd, txg, EINVAL));
1580	}
1581
1582	if (vd->vdev_children != 0) {
1583		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
1584			spa->spa_pending_vdev = NULL;
1585			return (spa_vdev_exit(spa, vd, txg, error));
1586		}
1587	}
1588
1589	/*
1590	 * We must validate the spares after checking the children.  Otherwise,
1591	 * vdev_inuse() will blindly overwrite the spare.
1592	 */
1593	if ((error = spa_validate_spares(spa, nvroot, txg,
1594	    VDEV_ALLOC_ADD)) != 0) {
1595		spa->spa_pending_vdev = NULL;
1596		return (spa_vdev_exit(spa, vd, txg, error));
1597	}
1598
1599	spa->spa_pending_vdev = NULL;
1600
1601	/*
1602	 * Transfer each new top-level vdev from vd to rvd.
1603	 */
1604	for (c = 0; c < vd->vdev_children; c++) {
1605		tvd = vd->vdev_child[c];
1606		vdev_remove_child(vd, tvd);
1607		tvd->vdev_id = rvd->vdev_children;
1608		vdev_add_child(rvd, tvd);
1609		vdev_config_dirty(tvd);
1610	}
1611
1612	if (nspares != 0) {
1613		if (spa->spa_sparelist != NULL) {
1614			nvlist_t **oldspares;
1615			uint_t oldnspares;
1616			nvlist_t **newspares;
1617
1618			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
1619			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
1620
1621			newspares = kmem_alloc(sizeof (void *) *
1622			    (nspares + oldnspares), KM_SLEEP);
1623			for (i = 0; i < oldnspares; i++)
1624				VERIFY(nvlist_dup(oldspares[i],
1625				    &newspares[i], KM_SLEEP) == 0);
1626			for (i = 0; i < nspares; i++)
1627				VERIFY(nvlist_dup(spares[i],
1628				    &newspares[i + oldnspares],
1629				    KM_SLEEP) == 0);
1630
1631			VERIFY(nvlist_remove(spa->spa_sparelist,
1632			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1633
1634			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1635			    ZPOOL_CONFIG_SPARES, newspares,
1636			    nspares + oldnspares) == 0);
1637			for (i = 0; i < oldnspares + nspares; i++)
1638				nvlist_free(newspares[i]);
1639			kmem_free(newspares, (oldnspares + nspares) *
1640			    sizeof (void *));
1641		} else {
1642			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1643			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1644			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1645			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1646		}
1647
1648		spa_load_spares(spa);
1649		spa->spa_sync_spares = B_TRUE;
1650	}
1651
1652	/*
1653	 * We have to be careful when adding new vdevs to an existing pool.
1654	 * If other threads start allocating from these vdevs before we
1655	 * sync the config cache, and we lose power, then upon reboot we may
1656	 * fail to open the pool because there are DVAs that the config cache
1657	 * can't translate.  Therefore, we first add the vdevs without
1658	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
1659	 * and then let spa_config_update() initialize the new metaslabs.
1660	 *
1661	 * spa_load() checks for added-but-not-initialized vdevs, so that
1662	 * if we lose power at any point in this sequence, the remaining
1663	 * steps will be completed the next time we load the pool.
1664	 */
1665	(void) spa_vdev_exit(spa, vd, txg, 0);
1666
1667	mutex_enter(&spa_namespace_lock);
1668	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1669	mutex_exit(&spa_namespace_lock);
1670
1671	return (0);
1672}
1673
1674/*
1675 * Attach a device to a mirror.  The arguments are the path to any device
1676 * in the mirror, and the nvroot for the new device.  If the path specifies
1677 * a device that is not mirrored, we automatically insert the mirror vdev.
1678 *
1679 * If 'replacing' is specified, the new device is intended to replace the
1680 * existing device; in this case the two devices are made into their own
1681 * mirror using the 'replacing' vdev, which is functionally identical to
1682 * the mirror vdev (it actually reuses all the same ops) but has a few
1683 * extra rules: you can't attach to it after it's been created, and upon
1684 * completion of resilvering, the first disk (the one being replaced)
1685 * is automatically detached.
1686 */
1687int
1688spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1689{
1690	uint64_t txg, open_txg;
1691	int error;
1692	vdev_t *rvd = spa->spa_root_vdev;
1693	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
1694	vdev_ops_t *pvops;
1695
1696	txg = spa_vdev_enter(spa);
1697
1698	oldvd = vdev_lookup_by_guid(rvd, guid);
1699
1700	if (oldvd == NULL)
1701		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1702
1703	if (!oldvd->vdev_ops->vdev_op_leaf)
1704		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1705
1706	pvd = oldvd->vdev_parent;
1707
1708	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
1709	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1710		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1711
1712	newvd = newrootvd->vdev_child[0];
1713
1714	if (!newvd->vdev_ops->vdev_op_leaf)
1715		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1716
1717	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1718		return (spa_vdev_exit(spa, newrootvd, txg, error));
1719
1720	if (!replacing) {
1721		/*
1722		 * For attach, the only allowable parent is a mirror or the root
1723		 * vdev.
1724		 */
1725		if (pvd->vdev_ops != &vdev_mirror_ops &&
1726		    pvd->vdev_ops != &vdev_root_ops)
1727			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1728
1729		pvops = &vdev_mirror_ops;
1730	} else {
1731		/*
1732		 * Active hot spares can only be replaced by inactive hot
1733		 * spares.
1734		 */
1735		if (pvd->vdev_ops == &vdev_spare_ops &&
1736		    pvd->vdev_child[1] == oldvd &&
1737		    !spa_has_spare(spa, newvd->vdev_guid))
1738			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1739
1740		/*
1741		 * If the source is a hot spare, and the parent isn't already a
1742		 * spare, then we want to create a new hot spare.  Otherwise, we
1743		 * want to create a replacing vdev.  The user is not allowed to
1744		 * attach to a spared vdev child unless the 'isspare' state is
1745		 * the same (spare replaces spare, non-spare replaces
1746		 * non-spare).
1747		 */
1748		if (pvd->vdev_ops == &vdev_replacing_ops)
1749			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1750		else if (pvd->vdev_ops == &vdev_spare_ops &&
1751		    newvd->vdev_isspare != oldvd->vdev_isspare)
1752			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1753		else if (pvd->vdev_ops != &vdev_spare_ops &&
1754		    newvd->vdev_isspare)
1755			pvops = &vdev_spare_ops;
1756		else
1757			pvops = &vdev_replacing_ops;
1758	}
1759
1760	/*
1761	 * Compare the new device size with the replaceable/attachable
1762	 * device size.
1763	 */
1764	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
1765		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
1766
1767	/*
1768	 * The new device cannot have a higher alignment requirement
1769	 * than the top-level vdev.
1770	 */
1771	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
1772		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
1773
1774	/*
1775	 * If this is an in-place replacement, update oldvd's path and devid
1776	 * to make it distinguishable from newvd, and unopenable from now on.
1777	 */
1778	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
1779		spa_strfree(oldvd->vdev_path);
1780		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
1781		    KM_SLEEP);
1782		(void) sprintf(oldvd->vdev_path, "%s/%s",
1783		    newvd->vdev_path, "old");
1784		if (oldvd->vdev_devid != NULL) {
1785			spa_strfree(oldvd->vdev_devid);
1786			oldvd->vdev_devid = NULL;
1787		}
1788	}
1789
1790	/*
1791	 * If the parent is not a mirror, or if we're replacing, insert the new
1792	 * mirror/replacing/spare vdev above oldvd.
1793	 */
1794	if (pvd->vdev_ops != pvops)
1795		pvd = vdev_add_parent(oldvd, pvops);
1796
1797	ASSERT(pvd->vdev_top->vdev_parent == rvd);
1798	ASSERT(pvd->vdev_ops == pvops);
1799	ASSERT(oldvd->vdev_parent == pvd);
1800
1801	/*
1802	 * Extract the new device from its root and add it to pvd.
1803	 */
1804	vdev_remove_child(newrootvd, newvd);
1805	newvd->vdev_id = pvd->vdev_children;
1806	vdev_add_child(pvd, newvd);
1807
1808	/*
1809	 * If newvd is smaller than oldvd, but larger than its rsize,
1810	 * the addition of newvd may have decreased our parent's asize.
1811	 */
1812	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
1813
1814	tvd = newvd->vdev_top;
1815	ASSERT(pvd->vdev_top == tvd);
1816	ASSERT(tvd->vdev_parent == rvd);
1817
1818	vdev_config_dirty(tvd);
1819
1820	/*
1821	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
1822	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
1823	 */
1824	open_txg = txg + TXG_CONCURRENT_STATES - 1;
1825
1826	mutex_enter(&newvd->vdev_dtl_lock);
1827	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
1828	    open_txg - TXG_INITIAL + 1);
1829	mutex_exit(&newvd->vdev_dtl_lock);
1830
1831	if (newvd->vdev_isspare)
1832		spa_spare_activate(newvd);
1833
1834	/*
1835	 * Mark newvd's DTL dirty in this txg.
1836	 */
1837	vdev_dirty(tvd, VDD_DTL, newvd, txg);
1838
1839	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
1840
1841	/*
1842	 * Kick off a resilver to update newvd.
1843	 */
1844	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1845
1846	return (0);
1847}
1848
1849/*
1850 * Detach a device from a mirror or replacing vdev.
1851 * If 'replace_done' is specified, only detach if the parent
1852 * is a replacing vdev.
1853 */
1854int
1855spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
1856{
1857	uint64_t txg;
1858	int c, t, error;
1859	vdev_t *rvd = spa->spa_root_vdev;
1860	vdev_t *vd, *pvd, *cvd, *tvd;
1861	boolean_t unspare = B_FALSE;
1862	uint64_t unspare_guid;
1863
1864	txg = spa_vdev_enter(spa);
1865
1866	vd = vdev_lookup_by_guid(rvd, guid);
1867
1868	if (vd == NULL)
1869		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1870
1871	if (!vd->vdev_ops->vdev_op_leaf)
1872		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1873
1874	pvd = vd->vdev_parent;
1875
1876	/*
1877	 * If replace_done is specified, only remove this device if it's
1878	 * the first child of a replacing vdev.  For the 'spare' vdev, either
1879	 * disk can be removed.
1880	 */
1881	if (replace_done) {
1882		if (pvd->vdev_ops == &vdev_replacing_ops) {
1883			if (vd->vdev_id != 0)
1884				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1885		} else if (pvd->vdev_ops != &vdev_spare_ops) {
1886			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1887		}
1888	}
1889
1890	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
1891	    spa_version(spa) >= ZFS_VERSION_SPARES);
1892
1893	/*
1894	 * Only mirror, replacing, and spare vdevs support detach.
1895	 */
1896	if (pvd->vdev_ops != &vdev_replacing_ops &&
1897	    pvd->vdev_ops != &vdev_mirror_ops &&
1898	    pvd->vdev_ops != &vdev_spare_ops)
1899		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1900
1901	/*
1902	 * If there's only one replica, you can't detach it.
1903	 */
1904	if (pvd->vdev_children <= 1)
1905		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1906
1907	/*
1908	 * If all siblings have non-empty DTLs, this device may have the only
1909	 * valid copy of the data, which means we cannot safely detach it.
1910	 *
1911	 * XXX -- as in the vdev_offline() case, we really want a more
1912	 * precise DTL check.
1913	 */
1914	for (c = 0; c < pvd->vdev_children; c++) {
1915		uint64_t dirty;
1916
1917		cvd = pvd->vdev_child[c];
1918		if (cvd == vd)
1919			continue;
1920		if (vdev_is_dead(cvd))
1921			continue;
1922		mutex_enter(&cvd->vdev_dtl_lock);
1923		dirty = cvd->vdev_dtl_map.sm_space |
1924		    cvd->vdev_dtl_scrub.sm_space;
1925		mutex_exit(&cvd->vdev_dtl_lock);
1926		if (!dirty)
1927			break;
1928	}
1929
1930	/*
1931	 * If we are a replacing or spare vdev, then we can always detach the
1932	 * latter child, as that is how one cancels the operation.
1933	 */
1934	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
1935	    c == pvd->vdev_children)
1936		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1937
1938	/*
1939	 * If we are detaching the original disk from a spare, then it implies
1940	 * that the spare should become a real disk, and be removed from the
1941	 * active spare list for the pool.
1942	 */
1943	if (pvd->vdev_ops == &vdev_spare_ops &&
1944	    vd->vdev_id == 0)
1945		unspare = B_TRUE;
1946
1947	/*
1948	 * Erase the disk labels so the disk can be used for other things.
1949	 * This must be done after all other error cases are handled,
1950	 * but before we disembowel vd (so we can still do I/O to it).
1951	 * But if we can't do it, don't treat the error as fatal --
1952	 * it may be that the unwritability of the disk is the reason
1953	 * it's being detached!
1954	 */
1955	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1956
1957	/*
1958	 * Remove vd from its parent and compact the parent's children.
1959	 */
1960	vdev_remove_child(pvd, vd);
1961	vdev_compact_children(pvd);
1962
1963	/*
1964	 * Remember one of the remaining children so we can get tvd below.
1965	 */
1966	cvd = pvd->vdev_child[0];
1967
1968	/*
1969	 * If we need to remove the remaining child from the list of hot spares,
1970	 * do it now, marking the vdev as no longer a spare in the process.  We
1971	 * must do this before vdev_remove_parent(), because that can change the
1972	 * GUID if it creates a new toplevel GUID.
1973	 */
1974	if (unspare) {
1975		ASSERT(cvd->vdev_isspare);
1976		spa_spare_remove(cvd);
1977		unspare_guid = cvd->vdev_guid;
1978	}
1979
1980	/*
1981	 * If the parent mirror/replacing vdev only has one child,
1982	 * the parent is no longer needed.  Remove it from the tree.
1983	 */
1984	if (pvd->vdev_children == 1)
1985		vdev_remove_parent(cvd);
1986
1987	/*
1988	 * We don't set tvd until now because the parent we just removed
1989	 * may have been the previous top-level vdev.
1990	 */
1991	tvd = cvd->vdev_top;
1992	ASSERT(tvd->vdev_parent == rvd);
1993
1994	/*
1995	 * Reevaluate the parent vdev state.
1996	 */
1997	vdev_propagate_state(cvd->vdev_parent);
1998
1999	/*
2000	 * If the device we just detached was smaller than the others, it may be
2001	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
2002	 * can't fail because the existing metaslabs are already in core, so
2003	 * there's nothing to read from disk.
2004	 */
2005	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
2006
2007	vdev_config_dirty(tvd);
2008
2009	/*
2010	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
2011	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
2012	 * But first make sure we're not on any *other* txg's DTL list, to
2013	 * prevent vd from being accessed after it's freed.
2014	 */
2015	for (t = 0; t < TXG_SIZE; t++)
2016		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
2017	vd->vdev_detached = B_TRUE;
2018	vdev_dirty(tvd, VDD_DTL, vd, txg);
2019
2020	error = spa_vdev_exit(spa, vd, txg, 0);
2021
2022	/*
2023	 * If this was the removal of the original device in a hot spare vdev,
2024	 * then we want to go through and remove the device from the hot spare
2025	 * list of every other pool.
2026	 */
2027	if (unspare) {
2028		spa = NULL;
2029		mutex_enter(&spa_namespace_lock);
2030		while ((spa = spa_next(spa)) != NULL) {
2031			if (spa->spa_state != POOL_STATE_ACTIVE)
2032				continue;
2033
2034			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
2035		}
2036		mutex_exit(&spa_namespace_lock);
2037	}
2038
2039	return (error);
2040}
2041
2042/*
2043 * Remove a device from the pool.  Currently, this supports removing only hot
2044 * spares.
2045 */
2046int
2047spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2048{
2049	vdev_t *vd;
2050	nvlist_t **spares, *nv, **newspares;
2051	uint_t i, j, nspares;
2052	int ret = 0;
2053
2054	spa_config_enter(spa, RW_WRITER, FTAG);
2055
2056	vd = spa_lookup_by_guid(spa, guid);
2057
2058	nv = NULL;
2059	if (spa->spa_spares != NULL &&
2060	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2061	    &spares, &nspares) == 0) {
2062		for (i = 0; i < nspares; i++) {
2063			uint64_t theguid;
2064
2065			VERIFY(nvlist_lookup_uint64(spares[i],
2066			    ZPOOL_CONFIG_GUID, &theguid) == 0);
2067			if (theguid == guid) {
2068				nv = spares[i];
2069				break;
2070			}
2071		}
2072	}
2073
2074	/*
2075	 * We only support removing a hot spare, and only if it's not currently
2076	 * in use in this pool.
2077	 */
2078	if (nv == NULL && vd == NULL) {
2079		ret = ENOENT;
2080		goto out;
2081	}
2082
2083	if (nv == NULL && vd != NULL) {
2084		ret = ENOTSUP;
2085		goto out;
2086	}
2087
2088	if (!unspare && nv != NULL && vd != NULL) {
2089		ret = EBUSY;
2090		goto out;
2091	}
2092
2093	if (nspares == 1) {
2094		newspares = NULL;
2095	} else {
2096		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2097		    KM_SLEEP);
2098		for (i = 0, j = 0; i < nspares; i++) {
2099			if (spares[i] != nv)
2100				VERIFY(nvlist_dup(spares[i],
2101				    &newspares[j++], KM_SLEEP) == 0);
2102		}
2103	}
2104
2105	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2106	    DATA_TYPE_NVLIST_ARRAY) == 0);
2107	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2108	    newspares, nspares - 1) == 0);
2109	for (i = 0; i < nspares - 1; i++)
2110		nvlist_free(newspares[i]);
2111	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2112	spa_load_spares(spa);
2113	spa->spa_sync_spares = B_TRUE;
2114
2115out:
2116	spa_config_exit(spa, FTAG);
2117
2118	return (ret);
2119}
2120
2121/*
2122 * Find any device that's done replacing, so we can detach it.
2123 */
2124static vdev_t *
2125spa_vdev_replace_done_hunt(vdev_t *vd)
2126{
2127	vdev_t *newvd, *oldvd;
2128	int c;
2129
2130	for (c = 0; c < vd->vdev_children; c++) {
2131		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
2132		if (oldvd != NULL)
2133			return (oldvd);
2134	}
2135
2136	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
2137		oldvd = vd->vdev_child[0];
2138		newvd = vd->vdev_child[1];
2139
2140		mutex_enter(&newvd->vdev_dtl_lock);
2141		if (newvd->vdev_dtl_map.sm_space == 0 &&
2142		    newvd->vdev_dtl_scrub.sm_space == 0) {
2143			mutex_exit(&newvd->vdev_dtl_lock);
2144			return (oldvd);
2145		}
2146		mutex_exit(&newvd->vdev_dtl_lock);
2147	}
2148
2149	return (NULL);
2150}
2151
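/*
 * Walk the vdev tree detaching devices whose replacement has completed:
 * for each replacing vdev found by spa_vdev_replace_done_hunt(), detach
 * the old device, and if that device had been hot-spared, detach the
 * spare vdev's other child (the hot spare itself) as well.
 */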
2152static void
2153spa_vdev_replace_done(spa_t *spa)
2154{
2155	vdev_t *vd;
2156	vdev_t *pvd;
2157	uint64_t guid;
2158	uint64_t pguid = 0;
2159
2160	spa_config_enter(spa, RW_READER, FTAG);
2161
2162	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
2163		guid = vd->vdev_guid;
2164		/*
2165		 * If we have just finished replacing a hot spared device, then
2166		 * we need to detach the spare vdev's second child (the hot
2167		 * spare itself) as well.
2168		 */
2169		pvd = vd->vdev_parent;
2170		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2171		    pvd->vdev_id == 0) {
2172			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
2173			ASSERT(pvd->vdev_parent->vdev_children == 2);
2174			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
2175		}
2176		spa_config_exit(spa, FTAG);
2177		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
2178			return;
2179		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
2180			return;
2181		spa_config_enter(spa, RW_READER, FTAG);
2182	}
2183
2184	spa_config_exit(spa, FTAG);
2185}
2186
2187/*
2188 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
2189 * on spa_vdev_enter/exit() to synchronize the labels and cache.
2190 */
2191int
2192spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
2193{
2194	vdev_t *rvd, *vd;
2195	uint64_t txg;
2196
2197	rvd = spa->spa_root_vdev;
2198
2199	txg = spa_vdev_enter(spa);
2200
2201	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
2202		/*
2203		 * Determine if this is a reference to a hot spare.  In that
2204		 * case, update the path as stored in the spare list.
2205		 */
2206		nvlist_t **spares;
2207		uint_t i, nspares;
2208		if (spa->spa_sparelist != NULL) {
2209			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
2210			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2211			for (i = 0; i < nspares; i++) {
2212				uint64_t theguid;
2213				VERIFY(nvlist_lookup_uint64(spares[i],
2214				    ZPOOL_CONFIG_GUID, &theguid) == 0);
2215				if (theguid == guid)
2216					break;
2217			}
2218
2219			if (i == nspares)
2220				return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2221
2222			VERIFY(nvlist_add_string(spares[i],
2223			    ZPOOL_CONFIG_PATH, newpath) == 0);
2224			spa_load_spares(spa);
2225			spa->spa_sync_spares = B_TRUE;
2226			return (spa_vdev_exit(spa, NULL, txg, 0));
2227		} else {
2228			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2229		}
2230	}
2231
2232	if (!vd->vdev_ops->vdev_op_leaf)
2233		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2234
2235	spa_strfree(vd->vdev_path);
2236	vd->vdev_path = spa_strdup(newpath);
2237
2238	vdev_config_dirty(vd->vdev_top);
2239
2240	return (spa_vdev_exit(spa, NULL, txg, 0));
2241}
2242
2243/*
2244 * ==========================================================================
2245 * SPA Scrubbing
2246 * ==========================================================================
2247 */
2248
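/*
 * Completion callback for the scrub/resilver reads issued by
 * spa_scrub_io_start(): free the data buffer, charge any non-speculative
 * error to both the pool and the offending vdev, and wake up waiters on
 * spa_scrub_io_cv once the in-flight count drops below the limit.
 */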
2249static void
2250spa_scrub_io_done(zio_t *zio)
2251{
2252	spa_t *spa = zio->io_spa;
2253
2254	zio_data_buf_free(zio->io_data, zio->io_size);
2255
2256	mutex_enter(&spa->spa_scrub_lock);
2257	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2258		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
2259		spa->spa_scrub_errors++;
2260		mutex_enter(&vd->vdev_stat_lock);
2261		vd->vdev_stat.vs_scrub_errors++;
2262		mutex_exit(&vd->vdev_stat_lock);
2263	}
2264
2265	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
2266		cv_broadcast(&spa->spa_scrub_io_cv);
2267
2268	ASSERT(spa->spa_scrub_inflight >= 0);
2269
2270	mutex_exit(&spa->spa_scrub_lock);
2271}
2272
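/*
 * Issue an asynchronous read of 'bp' for scrubbing or resilvering.
 * Throttles against spa_scrub_maxinflight, allocates a buffer that
 * spa_scrub_io_done() frees, and marks intent log blocks speculative so
 * that read failures on them are not counted as scrub errors.
 */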
2273static void
2274spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
2275    zbookmark_t *zb)
2276{
2277	size_t size = BP_GET_LSIZE(bp);
2278	void *data;
2279
2280	mutex_enter(&spa->spa_scrub_lock);
2281	/*
2282	 * Throttle the scrub so we don't hand the vdevs too much work at once.
2283	 */
2284	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
2285		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2286	}
2287	spa->spa_scrub_inflight++;
2288	mutex_exit(&spa->spa_scrub_lock);
2289
2290	data = zio_data_buf_alloc(size);
2291
2292	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
2293		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
2294
2295	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
2296
2297	zio_nowait(zio_read(NULL, spa, bp, data, size,
2298	    spa_scrub_io_done, NULL, priority, flags, zb));
2299}
2300
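/*
 * Traversal callback invoked for each block visited during a scrub or
 * resilver: record traversal errors, update per-vdev progress counters,
 * and issue a read if this is a full scrub or if the block's birth txg
 * falls within a DTL during a resilver.
 */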
2301/* ARGSUSED */
2302static int
2303spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
2304{
2305	blkptr_t *bp = &bc->bc_blkptr;
2306	vdev_t *vd = spa->spa_root_vdev;
2307	dva_t *dva = bp->blk_dva;
2308	int needs_resilver = B_FALSE;
2309	int d;
2310
2311	if (bc->bc_errno) {
2312		/*
2313		 * We can't scrub this block, but we can continue to scrub
2314		 * the rest of the pool.  Note the error and move along.
2315		 */
2316		mutex_enter(&spa->spa_scrub_lock);
2317		spa->spa_scrub_errors++;
2318		mutex_exit(&spa->spa_scrub_lock);
2319
2320		mutex_enter(&vd->vdev_stat_lock);
2321		vd->vdev_stat.vs_scrub_errors++;
2322		mutex_exit(&vd->vdev_stat_lock);
2323
2324		return (ERESTART);
2325	}
2326
2327	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
2328
2329	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
2330		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
2331
2332		ASSERT(vd != NULL);
2333
2334		/*
2335		 * Keep track of how much data we've examined so that
2336		 * zpool(1M) status can make useful progress reports.
2337		 */
2338		mutex_enter(&vd->vdev_stat_lock);
2339		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
2340		mutex_exit(&vd->vdev_stat_lock);
2341
2342		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
2343			if (DVA_GET_GANG(&dva[d])) {
2344				/*
2345				 * Gang members may be spread across multiple
2346				 * vdevs, so the best we can do is look at the
2347				 * pool-wide DTL.
2348				 * XXX -- it would be better to change our
2349				 * allocation policy to ensure that this can't
2350				 * happen.
2351				 */
2352				vd = spa->spa_root_vdev;
2353			}
2354			if (vdev_dtl_contains(&vd->vdev_dtl_map,
2355			    bp->blk_birth, 1))
2356				needs_resilver = B_TRUE;
2357		}
2358	}
2359
2360	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
2361		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
2362		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
2363	else if (needs_resilver)
2364		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
2365		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
2366
2367	return (0);
2368}
2369
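/*
 * Body of the scrub/resilver thread created by spa_scrub().  Walks the
 * pool via traverse_more(), honoring suspend/stop/restart requests, waits
 * for outstanding scrub I/O to drain, then updates the DTLs and scrub
 * statistics and schedules follow-up work (replace-done processing, or a
 * new scrub/resilver if a restart was requested).
 */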
2370static void
2371spa_scrub_thread(void *arg)
2372{
2373	spa_t *spa = arg;
2374	callb_cpr_t cprinfo;
2375	traverse_handle_t *th = spa->spa_scrub_th;
2376	vdev_t *rvd = spa->spa_root_vdev;
2377	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
2378	int error = 0;
2379	boolean_t complete;
2380
2381	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
2382
2383	/*
2384	 * If we're restarting due to a snapshot create/delete,
2385	 * wait for that to complete.
2386	 */
2387	txg_wait_synced(spa_get_dsl(spa), 0);
2388
2389	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
2390	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2391	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
2392
2393	spa_config_enter(spa, RW_WRITER, FTAG);
2394	vdev_reopen(rvd);		/* purge all vdev caches */
2395	vdev_config_dirty(rvd);		/* rewrite all disk labels */
2396	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
2397	spa_config_exit(spa, FTAG);
2398
2399	mutex_enter(&spa->spa_scrub_lock);
2400	spa->spa_scrub_errors = 0;
2401	spa->spa_scrub_active = 1;
2402	ASSERT(spa->spa_scrub_inflight == 0);
2403
2404	while (!spa->spa_scrub_stop) {
2405		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2406		while (spa->spa_scrub_suspended) {
2407			spa->spa_scrub_active = 0;
2408			cv_broadcast(&spa->spa_scrub_cv);
2409			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2410			spa->spa_scrub_active = 1;
2411		}
2412		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
2413
2414		if (spa->spa_scrub_restart_txg != 0)
2415			break;
2416
2417		mutex_exit(&spa->spa_scrub_lock);
2418		error = traverse_more(th);
2419		mutex_enter(&spa->spa_scrub_lock);
2420		if (error != EAGAIN)
2421			break;
2422	}
2423
2424	while (spa->spa_scrub_inflight)
2425		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2426
2427	spa->spa_scrub_active = 0;
2428	cv_broadcast(&spa->spa_scrub_cv);
2429
2430	mutex_exit(&spa->spa_scrub_lock);
2431
2432	spa_config_enter(spa, RW_WRITER, FTAG);
2433
2434	mutex_enter(&spa->spa_scrub_lock);
2435
2436	/*
2437	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
2438	 * AND the spa config lock to synchronize with any config changes
2439	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
2440	 */
2441	if (spa->spa_scrub_restart_txg != 0)
2442		error = ERESTART;
2443
2444	if (spa->spa_scrub_stop)
2445		error = EINTR;
2446
2447	/*
2448	 * Even if there were uncorrectable errors, we consider the scrub
2449	 * completed.  The downside is that if there is a transient error during
2450	 * a resilver, we won't resilver the data properly to the target.  But
2451	 * if the damage is permanent (more likely) we will resilver forever,
2452	 * which isn't really acceptable.  Since there is enough information for
2453	 * the user to know what has failed and why, this seems like a more
2454	 * tractable approach.
2455	 */
2456	complete = (error == 0);
2457
2458	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
2459	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2460	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
2461	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
2462
2463	mutex_exit(&spa->spa_scrub_lock);
2464
2465	/*
2466	 * If the scrub/resilver completed, update all DTLs to reflect this.
2467	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
2468	 */
2469	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
2470	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
2471	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
2472	spa_errlog_rotate(spa);
2473
2474	spa_config_exit(spa, FTAG);
2475
2476	mutex_enter(&spa->spa_scrub_lock);
2477
2478	/*
2479	 * We may have finished replacing a device.
2480	 * Let the async thread assess this and handle the detach.
2481	 */
2482	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2483
2484	/*
2485	 * If we were told to restart, our final act is to start a new scrub.
2486	 */
2487	if (error == ERESTART)
2488		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
2489		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
2490
2491	spa->spa_scrub_type = POOL_SCRUB_NONE;
2492	spa->spa_scrub_active = 0;
2493	spa->spa_scrub_thread = NULL;
2494	cv_broadcast(&spa->spa_scrub_cv);
2495	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
2496	thread_exit();
2497}
2498
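/*
 * Suspend scrubbing and wait for all in-flight scrub I/O to drain.
 * Calls may nest; scrubbing resumes only when the matching
 * spa_scrub_resume() calls bring the suspend count back to zero.
 */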
2499void
2500spa_scrub_suspend(spa_t *spa)
2501{
2502	mutex_enter(&spa->spa_scrub_lock);
2503	spa->spa_scrub_suspended++;
2504	while (spa->spa_scrub_active) {
2505		cv_broadcast(&spa->spa_scrub_cv);
2506		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2507	}
2508	while (spa->spa_scrub_inflight)
2509		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2510	mutex_exit(&spa->spa_scrub_lock);
2511}
2512
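/*
 * Drop one level of scrub suspension, waking the scrub thread when the
 * suspend count reaches zero.
 */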
2513void
2514spa_scrub_resume(spa_t *spa)
2515{
2516	mutex_enter(&spa->spa_scrub_lock);
2517	ASSERT(spa->spa_scrub_suspended != 0);
2518	if (--spa->spa_scrub_suspended == 0)
2519		cv_broadcast(&spa->spa_scrub_cv);
2520	mutex_exit(&spa->spa_scrub_lock);
2521}
2522
2523void
2524spa_scrub_restart(spa_t *spa, uint64_t txg)
2525{
2526	/*
2527	 * Something happened (e.g. snapshot create/delete) that means
2528	 * we must restart any in-progress scrubs.  The itinerary will
2529	 * fix this properly.
2530	 */
2531	mutex_enter(&spa->spa_scrub_lock);
2532	spa->spa_scrub_restart_txg = txg;
2533	mutex_exit(&spa->spa_scrub_lock);
2534}
2535
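/*
 * Start, upgrade, or stop a scrub or resilver.  Any scrub already in
 * progress is stopped first (a resilver only if 'force' is set).  The
 * resilvering range is derived from the pool-wide DTL: a requested scrub
 * is upgraded to a resilver if the DTL is non-empty, and a requested
 * resilver with an empty DTL only checks for completed replacements.
 * The actual work is done asynchronously by spa_scrub_thread().  Callers
 * in this file include spa_vdev_attach(), spa_async_thread(), and
 * spa_evict_all().
 */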
2536int
2537spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
2538{
2539	space_seg_t *ss;
2540	uint64_t mintxg, maxtxg;
2541	vdev_t *rvd = spa->spa_root_vdev;
2542
2543	if ((uint_t)type >= POOL_SCRUB_TYPES)
2544		return (ENOTSUP);
2545
2546	mutex_enter(&spa->spa_scrub_lock);
2547
2548	/*
2549	 * If there's a scrub or resilver already in progress, stop it.
2550	 */
2551	while (spa->spa_scrub_thread != NULL) {
2552		/*
2553		 * Don't stop a resilver unless forced.
2554		 */
2555		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
2556			mutex_exit(&spa->spa_scrub_lock);
2557			return (EBUSY);
2558		}
2559		spa->spa_scrub_stop = 1;
2560		cv_broadcast(&spa->spa_scrub_cv);
2561		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2562	}
2563
2564	/*
2565	 * Terminate the previous traverse.
2566	 */
2567	if (spa->spa_scrub_th != NULL) {
2568		traverse_fini(spa->spa_scrub_th);
2569		spa->spa_scrub_th = NULL;
2570	}
2571
2572	if (rvd == NULL) {
2573		ASSERT(spa->spa_scrub_stop == 0);
2574		ASSERT(spa->spa_scrub_type == type);
2575		ASSERT(spa->spa_scrub_restart_txg == 0);
2576		mutex_exit(&spa->spa_scrub_lock);
2577		return (0);
2578	}
2579
2580	mintxg = TXG_INITIAL - 1;
2581	maxtxg = spa_last_synced_txg(spa) + 1;
2582
2583	mutex_enter(&rvd->vdev_dtl_lock);
2584
2585	if (rvd->vdev_dtl_map.sm_space == 0) {
2586		/*
2587		 * The pool-wide DTL is empty.
2588		 * If this is a resilver, there's nothing to do except
2589		 * check whether any in-progress replacements have completed.
2590		 */
2591		if (type == POOL_SCRUB_RESILVER) {
2592			type = POOL_SCRUB_NONE;
2593			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2594		}
2595	} else {
2596		/*
2597		 * The pool-wide DTL is non-empty.
2598		 * If this is a normal scrub, upgrade to a resilver instead.
2599		 */
2600		if (type == POOL_SCRUB_EVERYTHING)
2601			type = POOL_SCRUB_RESILVER;
2602	}
2603
2604	if (type == POOL_SCRUB_RESILVER) {
2605		/*
2606		 * Determine the resilvering boundaries.
2607		 *
2608		 * Note: (mintxg, maxtxg) is an open interval,
2609		 * i.e. mintxg and maxtxg themselves are not included.
2610		 *
2611		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
2612		 * so we don't claim to resilver a txg that's still changing.
2613		 */
2614		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
2615		mintxg = ss->ss_start - 1;
2616		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
2617		maxtxg = MIN(ss->ss_end, maxtxg);
2618	}
2619
2620	mutex_exit(&rvd->vdev_dtl_lock);
2621
2622	spa->spa_scrub_stop = 0;
2623	spa->spa_scrub_type = type;
2624	spa->spa_scrub_restart_txg = 0;
2625
2626	if (type != POOL_SCRUB_NONE) {
2627		spa->spa_scrub_mintxg = mintxg;
2628		spa->spa_scrub_maxtxg = maxtxg;
2629		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
2630		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
2631		    ZIO_FLAG_CANFAIL);
2632		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
2633		spa->spa_scrub_thread = thread_create(NULL, 0,
2634		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
2635	}
2636
2637	mutex_exit(&spa->spa_scrub_lock);
2638
2639	return (0);
2640}
2641
2642/*
2643 * ==========================================================================
2644 * SPA async task processing
2645 * ==========================================================================
2646 */
2647
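/*
 * Reopen any top-level vdevs that have set vdev_reopen_wanted, holding
 * the config lock as writer.
 */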
2648static void
2649spa_async_reopen(spa_t *spa)
2650{
2651	vdev_t *rvd = spa->spa_root_vdev;
2652	vdev_t *tvd;
2653	int c;
2654
2655	spa_config_enter(spa, RW_WRITER, FTAG);
2656
2657	for (c = 0; c < rvd->vdev_children; c++) {
2658		tvd = rvd->vdev_child[c];
2659		if (tvd->vdev_reopen_wanted) {
2660			tvd->vdev_reopen_wanted = 0;
2661			vdev_reopen(tvd);
2662		}
2663	}
2664
2665	spa_config_exit(spa, FTAG);
2666}
2667
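/*
 * Worker thread that processes the tasks posted via spa_async_request():
 * config cache updates, vdev reopens, detaching completed replacements,
 * and kicking off scrubs or resilvers.
 */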
2668static void
2669spa_async_thread(void *arg)
2670{
2671	spa_t *spa = arg;
2672	int tasks;
2673
2674	ASSERT(spa->spa_sync_on);
2675
2676	mutex_enter(&spa->spa_async_lock);
2677	tasks = spa->spa_async_tasks;
2678	spa->spa_async_tasks = 0;
2679	mutex_exit(&spa->spa_async_lock);
2680
2681	/*
2682	 * See if the config needs to be updated.
2683	 */
2684	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
2685		mutex_enter(&spa_namespace_lock);
2686		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2687		mutex_exit(&spa_namespace_lock);
2688	}
2689
2690	/*
2691	 * See if any devices need to be reopened.
2692	 */
2693	if (tasks & SPA_ASYNC_REOPEN)
2694		spa_async_reopen(spa);
2695
2696	/*
2697	 * If any devices are done replacing, detach them.
2698	 */
2699	if (tasks & SPA_ASYNC_REPLACE_DONE)
2700		spa_vdev_replace_done(spa);
2701
2702	/*
2703	 * Kick off a scrub.
2704	 */
2705	if (tasks & SPA_ASYNC_SCRUB)
2706		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
2707
2708	/*
2709	 * Kick off a resilver.
2710	 */
2711	if (tasks & SPA_ASYNC_RESILVER)
2712		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2713
2714	/*
2715	 * Let the world know that we're done.
2716	 */
2717	mutex_enter(&spa->spa_async_lock);
2718	spa->spa_async_thread = NULL;
2719	cv_broadcast(&spa->spa_async_cv);
2720	mutex_exit(&spa->spa_async_lock);
2721	thread_exit();
2722}
2723
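/*
 * Prevent new async tasks from being dispatched and wait for any running
 * async thread to finish.  Balanced by spa_async_resume().
 */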
2724void
2725spa_async_suspend(spa_t *spa)
2726{
2727	mutex_enter(&spa->spa_async_lock);
2728	spa->spa_async_suspended++;
2729	while (spa->spa_async_thread != NULL)
2730		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
2731	mutex_exit(&spa->spa_async_lock);
2732}
2733
2734void
2735spa_async_resume(spa_t *spa)
2736{
2737	mutex_enter(&spa->spa_async_lock);
2738	ASSERT(spa->spa_async_suspended != 0);
2739	spa->spa_async_suspended--;
2740	mutex_exit(&spa->spa_async_lock);
2741}
2742
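/*
 * If there is pending async work, we are not suspended, and the root
 * filesystem is writable, create a thread to process the work.
 */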
2743static void
2744spa_async_dispatch(spa_t *spa)
2745{
2746	mutex_enter(&spa->spa_async_lock);
2747	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
2748	    spa->spa_async_thread == NULL &&
2749	    rootdir != NULL && !vn_is_readonly(rootdir))
2750		spa->spa_async_thread = thread_create(NULL, 0,
2751		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
2752	mutex_exit(&spa->spa_async_lock);
2753}
2754
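/*
 * Post one or more SPA_ASYNC_* tasks; they are picked up the next time
 * spa_async_dispatch() runs at the end of spa_sync().  For example, the
 * scrub thread calls spa_async_request(spa, SPA_ASYNC_REPLACE_DONE) to
 * have any finished replacement detached.
 */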
2755void
2756spa_async_request(spa_t *spa, int task)
2757{
2758	mutex_enter(&spa->spa_async_lock);
2759	spa->spa_async_tasks |= task;
2760	mutex_exit(&spa->spa_async_lock);
2761}
2762
2763/*
2764 * ==========================================================================
2765 * SPA syncing routines
2766 * ==========================================================================
2767 */
2768
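/*
 * Free every block on the deferred-free bplist left over from the
 * previous txg, then vacate the list and pre-dirty its first block so
 * the upcoming sync converges quickly.
 */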
2769static void
2770spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
2771{
2772	bplist_t *bpl = &spa->spa_sync_bplist;
2773	dmu_tx_t *tx;
2774	blkptr_t blk;
2775	uint64_t itor = 0;
2776	zio_t *zio;
2777	int error;
2778	uint8_t c = 1;
2779
2780	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
2781
2782	while (bplist_iterate(bpl, &itor, &blk) == 0)
2783		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
2784
2785	error = zio_wait(zio);
2786	ASSERT3U(error, ==, 0);
2787
2788	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2789	bplist_vacate(bpl, tx);
2790
2791	/*
2792	 * Pre-dirty the first block so we sync to convergence faster.
2793	 * (Usually only the first block is needed.)
2794	 */
2795	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
2796	dmu_tx_commit(tx);
2797}
2798
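/*
 * Pack 'nv' in XDR encoding and write it into the given packed-nvlist
 * object, recording the packed size in the object's bonus buffer.
 */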
2799static void
2800spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
2801{
2802	char *packed = NULL;
2803	size_t nvsize = 0;
2804	dmu_buf_t *db;
2805
2806	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
2807
2808	packed = kmem_alloc(nvsize, KM_SLEEP);
2809
2810	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
2811	    KM_SLEEP) == 0);
2812
2813	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
2814
2815	kmem_free(packed, nvsize);
2816
2817	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
2818	dmu_buf_will_dirty(db, tx);
2819	*(uint64_t *)db->db_data = nvsize;
2820	dmu_buf_rele(db, FTAG);
2821}
2822
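/*
 * If the spare list has changed, rewrite the MOS object describing the
 * available spares (creating it on first use) and clear spa_sync_spares.
 */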
2823static void
2824spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
2825{
2826	nvlist_t *nvroot;
2827	nvlist_t **spares;
2828	int i;
2829
2830	if (!spa->spa_sync_spares)
2831		return;
2832
2833	/*
2834	 * Update the MOS nvlist describing the list of available spares.
2835	 * spa_validate_spares() will have already made sure this nvlist is
2836	 * valid and the vdevs are labelled appropriately.
2837	 */
2838	if (spa->spa_spares_object == 0) {
2839		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
2840		    DMU_OT_PACKED_NVLIST, 1 << 14,
2841		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2842		VERIFY(zap_update(spa->spa_meta_objset,
2843		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
2844		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
2845	}
2846
2847	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2848	if (spa->spa_nspares == 0) {
2849		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2850		    NULL, 0) == 0);
2851	} else {
2852		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
2853		    KM_SLEEP);
2854		for (i = 0; i < spa->spa_nspares; i++)
2855			spares[i] = vdev_config_generate(spa,
2856			    spa->spa_spares[i], B_FALSE, B_TRUE);
2857		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2858		    spares, spa->spa_nspares) == 0);
2859		for (i = 0; i < spa->spa_nspares; i++)
2860			nvlist_free(spares[i]);
2861		kmem_free(spares, spa->spa_nspares * sizeof (void *));
2862	}
2863
2864	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
2865	nvlist_free(nvroot);
2866
2867	spa->spa_sync_spares = B_FALSE;
2868}
2869
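/*
 * If any vdevs are dirty, generate a fresh pool config for this txg and
 * write it to the MOS config object.  spa_sync() makes the new config
 * visible to the config cache once the txg commits.
 */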
2870static void
2871spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
2872{
2873	nvlist_t *config;
2874
2875	if (list_is_empty(&spa->spa_dirty_list))
2876		return;
2877
2878	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
2879
2880	if (spa->spa_config_syncing)
2881		nvlist_free(spa->spa_config_syncing);
2882	spa->spa_config_syncing = config;
2883
2884	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
2885}
2886
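/*
 * Sync task (dispatched by spa_set_props()) that stores pool properties
 * in the MOS pool-props ZAP, creating the ZAP on first use.  Only the
 * 'bootfs' property is handled here at present.
 */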
2887static void
2888spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
2889{
2890	spa_t *spa = arg1;
2891	nvlist_t *nvp = arg2;
2892	nvpair_t *nvpair;
2893	objset_t *mos = spa->spa_meta_objset;
2894	uint64_t zapobj;
2895
2896	mutex_enter(&spa->spa_props_lock);
2897	if (spa->spa_pool_props_object == 0) {
2898		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
2899		VERIFY(zapobj > 0);
2900
2901		spa->spa_pool_props_object = zapobj;
2902
2903		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
2904		    DMU_POOL_PROPS, 8, 1,
2905		    &spa->spa_pool_props_object, tx) == 0);
2906	}
2907	mutex_exit(&spa->spa_props_lock);
2908
2909	nvpair = NULL;
2910	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
2911		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
2912		case ZFS_PROP_BOOTFS:
2913			VERIFY(nvlist_lookup_uint64(nvp,
2914			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
2915			VERIFY(zap_update(mos,
2916			    spa->spa_pool_props_object,
2917			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
2918			    &spa->spa_bootfs, tx) == 0);
2919			break;
2920		}
2921	}
2922}
2923
2924/*
2925 * Sync the specified transaction group.  New blocks may be dirtied as
2926 * part of the process, so we iterate until it converges.
2927 */
2928void
2929spa_sync(spa_t *spa, uint64_t txg)
2930{
2931	dsl_pool_t *dp = spa->spa_dsl_pool;
2932	objset_t *mos = spa->spa_meta_objset;
2933	bplist_t *bpl = &spa->spa_sync_bplist;
2934	vdev_t *rvd = spa->spa_root_vdev;
2935	vdev_t *vd;
2936	dmu_tx_t *tx;
2937	int dirty_vdevs;
2938
2939	/*
2940	 * Lock out configuration changes.
2941	 */
2942	spa_config_enter(spa, RW_READER, FTAG);
2943
2944	spa->spa_syncing_txg = txg;
2945	spa->spa_sync_pass = 0;
2946
2947	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
2948
2949	tx = dmu_tx_create_assigned(dp, txg);
2950
2951	/*
2952	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
2953	 * set spa_deflate if we have no raid-z vdevs.
2954	 */
2955	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
2956	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
2957		int i;
2958
2959		for (i = 0; i < rvd->vdev_children; i++) {
2960			vd = rvd->vdev_child[i];
2961			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
2962				break;
2963		}
2964		if (i == rvd->vdev_children) {
2965			spa->spa_deflate = TRUE;
2966			VERIFY(0 == zap_add(spa->spa_meta_objset,
2967			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2968			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
2969		}
2970	}
2971
2972	/*
2973	 * If anything has changed in this txg, push the deferred frees
2974	 * from the previous txg.  If not, leave them alone so that we
2975	 * don't generate work on an otherwise idle system.
2976	 */
2977	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
2978	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
2979	    !txg_list_empty(&dp->dp_sync_tasks, txg))
2980		spa_sync_deferred_frees(spa, txg);
2981
2982	/*
2983	 * Iterate to convergence.
2984	 */
2985	do {
2986		spa->spa_sync_pass++;
2987
2988		spa_sync_config_object(spa, tx);
2989		spa_sync_spares(spa, tx);
2990		spa_errlog_sync(spa, txg);
2991		dsl_pool_sync(dp, txg);
2992
2993		dirty_vdevs = 0;
2994		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2995			vdev_sync(vd, txg);
2996			dirty_vdevs++;
2997		}
2998
2999		bplist_sync(bpl, tx);
3000	} while (dirty_vdevs);
3001
3002	bplist_close(bpl);
3003
3004	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
3005
3006	/*
3007	 * Rewrite the vdev configuration (which includes the uberblock)
3008	 * to commit the transaction group.
3009	 *
3010	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
3011	 * Otherwise, pick a random top-level vdev that's known to be
3012	 * visible in the config cache (see spa_vdev_add() for details).
3013	 * If the write fails, try the next vdev until we've tried them all.
3014	 */
3015	if (!list_is_empty(&spa->spa_dirty_list)) {
3016		VERIFY(vdev_config_sync(rvd, txg) == 0);
3017	} else {
3018		int children = rvd->vdev_children;
3019		int c0 = spa_get_random(children);
3020		int c;
3021
3022		for (c = 0; c < children; c++) {
3023			vd = rvd->vdev_child[(c0 + c) % children];
3024			if (vd->vdev_ms_array == 0)
3025				continue;
3026			if (vdev_config_sync(vd, txg) == 0)
3027				break;
3028		}
3029		if (c == children)
3030			VERIFY(vdev_config_sync(rvd, txg) == 0);
3031	}
3032
3033	dmu_tx_commit(tx);
3034
3035	/*
3036	 * Clear the dirty config list.
3037	 */
3038	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
3039		vdev_config_clean(vd);
3040
3041	/*
3042	 * Now that the new config has synced transactionally,
3043	 * let it become visible to the config cache.
3044	 */
3045	if (spa->spa_config_syncing != NULL) {
3046		spa_config_set(spa, spa->spa_config_syncing);
3047		spa->spa_config_txg = txg;
3048		spa->spa_config_syncing = NULL;
3049	}
3050
3051	/*
3052	 * Make a stable copy of the fully synced uberblock.
3053	 * We use this as the root for pool traversals.
3054	 */
3055	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
3056
3057	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
3058
3059	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
3060	spa->spa_traverse_wanted = 0;
3061	spa->spa_ubsync = spa->spa_uberblock;
3062	rw_exit(&spa->spa_traverse_lock);
3063
3064	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
3065
3066	/*
3067	 * Clean up the ZIL records for the synced txg.
3068	 */
3069	dsl_pool_zil_clean(dp);
3070
3071	/*
3072	 * Update usable space statistics.
3073	 */
3074	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
3075		vdev_sync_done(vd, txg);
3076
3077	/*
3078	 * It had better be the case that we didn't dirty anything
3079	 * since vdev_config_sync().
3080	 */
3081	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
3082	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
3083	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
3084	ASSERT(bpl->bpl_queue == NULL);
3085
3086	spa_config_exit(spa, FTAG);
3087
3088	/*
3089	 * If any async tasks have been requested, kick them off.
3090	 */
3091	spa_async_dispatch(spa);
3092}
3093
3094/*
3095 * Sync all pools.  We don't want to hold the namespace lock across these
3096 * operations, so we take a reference on the spa_t and drop the lock during the
3097 * sync.
3098 */
3099void
3100spa_sync_allpools(void)
3101{
3102	spa_t *spa = NULL;
3103	mutex_enter(&spa_namespace_lock);
3104	while ((spa = spa_next(spa)) != NULL) {
3105		if (spa_state(spa) != POOL_STATE_ACTIVE)
3106			continue;
3107		spa_open_ref(spa, FTAG);
3108		mutex_exit(&spa_namespace_lock);
3109		txg_wait_synced(spa_get_dsl(spa), 0);
3110		mutex_enter(&spa_namespace_lock);
3111		spa_close(spa, FTAG);
3112	}
3113	mutex_exit(&spa_namespace_lock);
3114}
3115
3116/*
3117 * ==========================================================================
3118 * Miscellaneous routines
3119 * ==========================================================================
3120 */
3121
3122/*
3123 * Remove all pools in the system.
3124 */
3125void
3126spa_evict_all(void)
3127{
3128	spa_t *spa;
3129
3130	/*
3131	 * Remove all cached state.  All pools should be closed now,
3132	 * so every spa in the AVL tree should be unreferenced.
3133	 */
3134	mutex_enter(&spa_namespace_lock);
3135	while ((spa = spa_next(NULL)) != NULL) {
3136		/*
3137		 * Stop async tasks.  The async thread may need to detach
3138		 * a device that's been replaced, which requires grabbing
3139		 * spa_namespace_lock, so we must drop it here.
3140		 */
3141		spa_open_ref(spa, FTAG);
3142		mutex_exit(&spa_namespace_lock);
3143		spa_async_suspend(spa);
3144		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
3145		mutex_enter(&spa_namespace_lock);
3146		spa_close(spa, FTAG);
3147
3148		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3149			spa_unload(spa);
3150			spa_deactivate(spa);
3151		}
3152		spa_remove(spa);
3153	}
3154	mutex_exit(&spa_namespace_lock);
3155}
3156
3157vdev_t *
3158spa_lookup_by_guid(spa_t *spa, uint64_t guid)
3159{
3160	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
3161}
3162
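/*
 * Upgrade the pool's on-disk format to the current ZFS_VERSION, dirty the
 * vdev configuration so new labels are written, and wait for the change
 * to sync out.
 */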
3163void
3164spa_upgrade(spa_t *spa)
3165{
3166	spa_config_enter(spa, RW_WRITER, FTAG);
3167
3168	/*
3169	 * This should only be called for a non-faulted pool, and since a
3170	 * future version would result in an unopenable pool, this shouldn't be
3171	 * possible.
3172	 */
3173	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
3174
3175	spa->spa_uberblock.ub_version = ZFS_VERSION;
3176	vdev_config_dirty(spa->spa_root_vdev);
3177
3178	spa_config_exit(spa, FTAG);
3179
3180	txg_wait_synced(spa_get_dsl(spa), 0);
3181}
3182
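/*
 * Return B_TRUE if 'guid' refers to one of this pool's active or pending
 * hot spares.
 */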
3183boolean_t
3184spa_has_spare(spa_t *spa, uint64_t guid)
3185{
3186	int i;
3187	uint64_t spareguid;
3188
3189	for (i = 0; i < spa->spa_nspares; i++)
3190		if (spa->spa_spares[i]->vdev_guid == guid)
3191			return (B_TRUE);
3192
3193	for (i = 0; i < spa->spa_pending_nspares; i++) {
3194		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
3195		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
3196		    spareguid == guid)
3197			return (B_TRUE);
3198	}
3199
3200	return (B_FALSE);
3201}
3202
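/*
 * Set pool properties by running spa_sync_props() as a DSL sync task.
 */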
3203int
3204spa_set_props(spa_t *spa, nvlist_t *nvp)
3205{
3206	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
3207	    spa, nvp, 3));
3208}
3209
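/*
 * Retrieve the pool properties stored in the MOS pool-props ZAP as an
 * nvlist, translating the bootfs object number back into a dataset name.
 */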
3210int
3211spa_get_props(spa_t *spa, nvlist_t **nvp)
3212{
3213	zap_cursor_t zc;
3214	zap_attribute_t za;
3215	objset_t *mos = spa->spa_meta_objset;
3216	zfs_source_t src;
3217	zfs_prop_t prop;
3218	nvlist_t *propval;
3219	uint64_t value;
3220	int err;
3221
3222	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3223
3224	mutex_enter(&spa->spa_props_lock);
3225	/* If no props object, then just return empty nvlist */
3226	if (spa->spa_pool_props_object == 0) {
3227		mutex_exit(&spa->spa_props_lock);
3228		return (0);
3229	}
3230
3231	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
3232	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
3233	    zap_cursor_advance(&zc)) {
3234
3235		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
3236			continue;
3237
3238		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3239		switch (za.za_integer_length) {
3240		case 8:
3241			if (zfs_prop_default_numeric(prop) ==
3242			    za.za_first_integer)
3243				src = ZFS_SRC_DEFAULT;
3244			else
3245				src = ZFS_SRC_LOCAL;
3246			value = za.za_first_integer;
3247
3248			if (prop == ZFS_PROP_BOOTFS) {
3249				dsl_pool_t *dp;
3250				dsl_dataset_t *ds = NULL;
3251				char strval[MAXPATHLEN];
3252
3253				dp = spa_get_dsl(spa);
3254				rw_enter(&dp->dp_config_rwlock, RW_READER);
3255				if ((err = dsl_dataset_open_obj(dp,
3256				    za.za_first_integer, NULL, DS_MODE_NONE,
3257				    FTAG, &ds)) != 0) {
3258					rw_exit(&dp->dp_config_rwlock);
3259					break;
3260				}
3261				dsl_dataset_name(ds, strval);
3262				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
3263				rw_exit(&dp->dp_config_rwlock);
3264
3265				VERIFY(nvlist_add_uint64(propval,
3266				    ZFS_PROP_SOURCE, src) == 0);
3267				VERIFY(nvlist_add_string(propval,
3268				    ZFS_PROP_VALUE, strval) == 0);
3269			} else {
3270				VERIFY(nvlist_add_uint64(propval,
3271				    ZFS_PROP_SOURCE, src) == 0);
3272				VERIFY(nvlist_add_uint64(propval,
3273				    ZFS_PROP_VALUE, value) == 0);
3274			}
3275			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
3276			    propval) == 0);
3277			break;
3278		}
3279		nvlist_free(propval);
3280	}
3281	zap_cursor_fini(&zc);
3282	mutex_exit(&spa->spa_props_lock);
3283	if (err && err != ENOENT) {
3284		nvlist_free(*nvp);
3285		return (err);
3286	}
3287
3288	return (0);
3289}
3290
3291/*
3292 * If the bootfs property value is dsobj, clear it.
3293 */
3294void
3295spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
3296{
3297	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
3298		VERIFY(zap_remove(spa->spa_meta_objset,
3299		    spa->spa_pool_props_object,
3300		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
3301		spa->spa_bootfs = 0;
3302	}
3303}
3304