spa.c revision 346676
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
26 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 * Copyright 2013 Saso Kiselkov. All rights reserved.
29 * Copyright (c) 2014 Integros [integros.com]
30 * Copyright 2016 Toomas Soome <tsoome@me.com>
31 * Copyright 2018 Joyent, Inc.
32 * Copyright (c) 2017 Datto Inc.
33 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
34 */
35
36/*
37 * SPA: Storage Pool Allocator
38 *
39 * This file contains all the routines used when modifying on-disk SPA state.
40 * This includes opening, importing, destroying, exporting a pool, and syncing a
41 * pool.
42 */
43
44#include <sys/zfs_context.h>
45#include <sys/fm/fs/zfs.h>
46#include <sys/spa_impl.h>
47#include <sys/zio.h>
48#include <sys/zio_checksum.h>
49#include <sys/dmu.h>
50#include <sys/dmu_tx.h>
51#include <sys/zap.h>
52#include <sys/zil.h>
53#include <sys/ddt.h>
54#include <sys/vdev_impl.h>
55#include <sys/vdev_removal.h>
56#include <sys/vdev_indirect_mapping.h>
57#include <sys/vdev_indirect_births.h>
58#include <sys/vdev_initialize.h>
59#include <sys/metaslab.h>
60#include <sys/metaslab_impl.h>
61#include <sys/uberblock_impl.h>
62#include <sys/txg.h>
63#include <sys/avl.h>
64#include <sys/bpobj.h>
65#include <sys/dmu_traverse.h>
66#include <sys/dmu_objset.h>
67#include <sys/unique.h>
68#include <sys/dsl_pool.h>
69#include <sys/dsl_dataset.h>
70#include <sys/dsl_dir.h>
71#include <sys/dsl_prop.h>
72#include <sys/dsl_synctask.h>
73#include <sys/fs/zfs.h>
74#include <sys/arc.h>
75#include <sys/callb.h>
76#include <sys/spa_boot.h>
77#include <sys/zfs_ioctl.h>
78#include <sys/dsl_scan.h>
79#include <sys/dmu_send.h>
80#include <sys/dsl_destroy.h>
81#include <sys/dsl_userhold.h>
82#include <sys/zfeature.h>
83#include <sys/zvol.h>
84#include <sys/trim_map.h>
85#include <sys/abd.h>
86
87#ifdef	_KERNEL
88#include <sys/callb.h>
89#include <sys/cpupart.h>
90#include <sys/zone.h>
91#endif	/* _KERNEL */
92
93#include "zfs_prop.h"
94#include "zfs_comutil.h"
95
96/* Check hostid on import? */
97static int check_hostid = 1;
98
99/*
100 * The interval, in seconds, at which failed configuration cache file writes
101 * should be retried.
102 */
103int zfs_ccw_retry_interval = 300;
104
105SYSCTL_DECL(_vfs_zfs);
106SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
107    "Check hostid on import?");
108TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
109SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
110    &zfs_ccw_retry_interval, 0,
111    "Configuration cache file write, retry after failure, interval (seconds)");
112
113typedef enum zti_modes {
114	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
115	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
116	ZTI_MODE_NULL,			/* don't create a taskq */
117	ZTI_NMODES
118} zti_modes_t;
119
120#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
121#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
122#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
123
124#define	ZTI_N(n)	ZTI_P(n, 1)
125#define	ZTI_ONE		ZTI_N(1)
126
127typedef struct zio_taskq_info {
128	zti_modes_t zti_mode;
129	uint_t zti_value;
130	uint_t zti_count;
131} zio_taskq_info_t;
132
133static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
134	"issue", "issue_high", "intr", "intr_high"
135};
136
137/*
138 * This table defines the taskq settings for each ZFS I/O type. When
139 * initializing a pool, we use this table to create an appropriately sized
140 * taskq. Some operations are low volume and therefore have a small, static
141 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
142 * macros. Other operations process a large amount of data; the ZTI_BATCH
143 * macro causes us to create a taskq oriented for throughput. Some operations
144 * are so high frequency and short-lived that the taskq itself can become a
145 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
146 * additional degree of parallelism specified by the number of threads per-
147 * taskq and the number of taskqs; when dispatching an event in this case, the
148 * particular taskq is chosen at random.
149 *
150 * The different taskq priorities are to handle the different contexts (issue
151 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
152 * need to be handled with minimum delay.
153 */
154const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
155	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
156	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
157	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
158	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
159	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
160	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
161	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
162};
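/*
 * Editor's illustration (not part of the original source): each row above is
 * consumed by spa_taskqs_init() below.  The READ row's ZTI_P(12, 8) in the
 * INTR column, for instance, asks for 8 discrete taskqs of 12 threads each,
 * so read-interrupt processing spreads its taskq locking across 8 queues,
 * roughly:
 *
 *	tqs->stqs_count = 8;
 *	for (uint_t i = 0; i < 8; i++)
 *		tqs->stqs_taskq[i] = taskq_create_proc(name, 12, pri,
 *		    50, INT_MAX, spa->spa_proc, 0);
 *
 * ZTI_BATCH (the WRITE row's ISSUE column) instead creates a single taskq
 * whose thread count is a percentage of the CPUs (zio_taskq_batch_pct) via
 * the TASKQ_THREADS_CPU_PCT flag.
 */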
163
164static void spa_sync_version(void *arg, dmu_tx_t *tx);
165static void spa_sync_props(void *arg, dmu_tx_t *tx);
166static boolean_t spa_has_active_shared_spare(spa_t *spa);
167static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
168static void spa_vdev_resilver_done(spa_t *spa);
169
170uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
171#ifdef PSRSET_BIND
172id_t		zio_taskq_psrset_bind = PS_NONE;
173#endif
174#ifdef SYSDC
175boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
176uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
177#endif
178
179boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
180extern int	zfs_sync_pass_deferred_free;
181
182/*
183 * Report any spa_load_verify errors found, but do not fail spa_load.
184 * This is used by zdb to analyze non-idle pools.
185 */
186boolean_t	spa_load_verify_dryrun = B_FALSE;
187
188/*
189 * This (illegal) pool name is used when temporarily importing a spa_t in order
190 * to get the vdev stats associated with the imported devices.
191 */
192#define	TRYIMPORT_NAME	"$import"
193
194/*
195 * For debugging purposes: print out vdev tree during pool import.
196 */
197int	spa_load_print_vdev_tree = B_FALSE;
198
199/*
200 * A non-zero value for zfs_max_missing_tvds means that we allow importing
201 * pools with missing top-level vdevs. This is strictly intended for advanced
202 * pool recovery cases since missing data is almost inevitable. Pools with
203 * missing devices can only be imported read-only for safety reasons, and their
204 * fail-mode will be automatically set to "continue".
205 *
206 * With 1 missing vdev we should be able to import the pool and mount all
207 * datasets. User data that was not modified after the missing device has been
208 * added should be recoverable. This means that snapshots created prior to the
209 * addition of that device should be completely intact.
210 *
211 * With 2 missing vdevs, some datasets may fail to mount since there are
212 * dataset statistics that are stored as regular metadata. Some data might be
213 * recoverable if those vdevs were added recently.
214 *
215 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
216 * may be missing entirely. Chances of data recovery are very low. Note that
217 * there are also risks of performing an inadvertent rewind as we might be
218 * missing all the vdevs with the latest uberblocks.
219 */
220uint64_t	zfs_max_missing_tvds = 0;
221
222/*
223 * The parameters below are similar to zfs_max_missing_tvds but are only
224 * intended for a preliminary open of the pool with an untrusted config which
225 * might be incomplete or out-dated.
226 *
227 * We are more tolerant for pools opened from a cachefile since we could have
228 * an out-dated cachefile where a device removal was not registered.
229 * We could have set the limit arbitrarily high but in the case where devices
230 * are really missing we would want to return the proper error codes; we chose
231 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
232 * and we get a chance to retrieve the trusted config.
233 */
234uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
235
236/*
237 * In the case where config was assembled by scanning device paths (/dev/dsks
238 * by default) we are less tolerant since all the existing devices should have
239 * been detected and we want spa_load to return the right error codes.
240 */
241uint64_t	zfs_max_missing_tvds_scan = 0;
242
243
244SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_print_vdev_tree, CTLFLAG_RWTUN,
245    &spa_load_print_vdev_tree, 0,
246    "print out vdev tree during pool import");
247SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds, CTLFLAG_RWTUN,
248    &zfs_max_missing_tvds, 0,
249    "allow importing pools with missing top-level vdevs");
250SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
251    &zfs_max_missing_tvds_cachefile, 0,
252    "allow importing pools with missing top-level vdevs in cache file");
253SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
254    &zfs_max_missing_tvds_scan, 0,
255    "allow importing pools with missing top-level vdevs during scan");
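/*
 * Editor's illustration (not from the original source; the pool name is a
 * placeholder): a one-missing-tvd recovery import on FreeBSD, subject to the
 * read-only and failmode constraints described above, would look roughly
 * like:
 *
 *	# sysctl vfs.zfs.max_missing_tvds=1
 *	# zpool import -o readonly=on tank
 */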
256
257/*
258 * Debugging aid that pauses spa_sync() towards the end.
259 */
260boolean_t	zfs_pause_spa_sync = B_FALSE;
261
262/*
263 * ==========================================================================
264 * SPA properties routines
265 * ==========================================================================
266 */
267
268/*
269 * Add a (source=src, propname=propval) list to an nvlist.
270 */
271static void
272spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
273    uint64_t intval, zprop_source_t src)
274{
275	const char *propname = zpool_prop_to_name(prop);
276	nvlist_t *propval;
277
278	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
279	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
280
281	if (strval != NULL)
282		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
283	else
284		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
285
286	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
287	nvlist_free(propval);
288}
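/*
 * Editor's sketch of the resulting layout (not part of the original source):
 * a call such as
 *
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * leaves *nvp with a nested entry of roughly this shape:
 *
 *	"capacity" -> {
 *		ZPROP_SOURCE -> ZPROP_SRC_NONE	(uint64)
 *		ZPROP_VALUE  -> 42		(uint64)
 *	}
 *
 * String-valued properties store ZPROP_VALUE as a string instead.
 */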
289
290/*
291 * Get property values from the spa configuration.
292 */
293static void
294spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
295{
296	vdev_t *rvd = spa->spa_root_vdev;
297	dsl_pool_t *pool = spa->spa_dsl_pool;
298	uint64_t size, alloc, cap, version;
299	zprop_source_t src = ZPROP_SRC_NONE;
300	spa_config_dirent_t *dp;
301	metaslab_class_t *mc = spa_normal_class(spa);
302
303	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
304
305	if (rvd != NULL) {
306		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
307		size = metaslab_class_get_space(spa_normal_class(spa));
308		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
309		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
310		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
311		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
312		    size - alloc, src);
313		spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
314		    spa->spa_checkpoint_info.sci_dspace, src);
315
316		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
317		    metaslab_class_fragmentation(mc), src);
318		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
319		    metaslab_class_expandable_space(mc), src);
320		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
321		    (spa_mode(spa) == FREAD), src);
322
323		cap = (size == 0) ? 0 : (alloc * 100 / size);
324		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
325
326		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
327		    ddt_get_pool_dedup_ratio(spa), src);
328
329		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
330		    rvd->vdev_state, src);
331
332		version = spa_version(spa);
333		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
334			src = ZPROP_SRC_DEFAULT;
335		else
336			src = ZPROP_SRC_LOCAL;
337		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
338	}
339
340	if (pool != NULL) {
341		/*
342		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
343		 * when opening pools created before this version, freedir will be NULL.
344		 */
345		if (pool->dp_free_dir != NULL) {
346			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
347			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
348			    src);
349		} else {
350			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
351			    NULL, 0, src);
352		}
353
354		if (pool->dp_leak_dir != NULL) {
355			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
356			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
357			    src);
358		} else {
359			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
360			    NULL, 0, src);
361		}
362	}
363
364	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
365
366	if (spa->spa_comment != NULL) {
367		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
368		    0, ZPROP_SRC_LOCAL);
369	}
370
371	if (spa->spa_root != NULL)
372		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
373		    0, ZPROP_SRC_LOCAL);
374
375	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
376		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
377		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
378	} else {
379		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
380		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
381	}
382
383	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
384		if (dp->scd_path == NULL) {
385			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
386			    "none", 0, ZPROP_SRC_LOCAL);
387		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
388			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
389			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
390		}
391	}
392}
393
394/*
395 * Get zpool property values.
396 */
397int
398spa_prop_get(spa_t *spa, nvlist_t **nvp)
399{
400	objset_t *mos = spa->spa_meta_objset;
401	zap_cursor_t zc;
402	zap_attribute_t za;
403	int err;
404
405	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
406
407	mutex_enter(&spa->spa_props_lock);
408
409	/*
410	 * Get properties from the spa config.
411	 */
412	spa_prop_get_config(spa, nvp);
413
414	/* If no pool property object, no more props to get. */
415	if (mos == NULL || spa->spa_pool_props_object == 0) {
416		mutex_exit(&spa->spa_props_lock);
417		return (0);
418	}
419
420	/*
421	 * Get properties from the MOS pool property object.
422	 */
423	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
424	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
425	    zap_cursor_advance(&zc)) {
426		uint64_t intval = 0;
427		char *strval = NULL;
428		zprop_source_t src = ZPROP_SRC_DEFAULT;
429		zpool_prop_t prop;
430
431		if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
432			continue;
433
434		switch (za.za_integer_length) {
435		case 8:
436			/* integer property */
437			if (za.za_first_integer !=
438			    zpool_prop_default_numeric(prop))
439				src = ZPROP_SRC_LOCAL;
440
441			if (prop == ZPOOL_PROP_BOOTFS) {
442				dsl_pool_t *dp;
443				dsl_dataset_t *ds = NULL;
444
445				dp = spa_get_dsl(spa);
446				dsl_pool_config_enter(dp, FTAG);
447				err = dsl_dataset_hold_obj(dp,
448				    za.za_first_integer, FTAG, &ds);
449				if (err != 0) {
450					dsl_pool_config_exit(dp, FTAG);
451					break;
452				}
453
454				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
455				    KM_SLEEP);
456				dsl_dataset_name(ds, strval);
457				dsl_dataset_rele(ds, FTAG);
458				dsl_pool_config_exit(dp, FTAG);
459			} else {
460				strval = NULL;
461				intval = za.za_first_integer;
462			}
463
464			spa_prop_add_list(*nvp, prop, strval, intval, src);
465
466			if (strval != NULL)
467				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
468
469			break;
470
471		case 1:
472			/* string property */
473			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
474			err = zap_lookup(mos, spa->spa_pool_props_object,
475			    za.za_name, 1, za.za_num_integers, strval);
476			if (err) {
477				kmem_free(strval, za.za_num_integers);
478				break;
479			}
480			spa_prop_add_list(*nvp, prop, strval, 0, src);
481			kmem_free(strval, za.za_num_integers);
482			break;
483
484		default:
485			break;
486		}
487	}
488	zap_cursor_fini(&zc);
489	mutex_exit(&spa->spa_props_lock);
490out:
491	if (err && err != ENOENT) {
492		nvlist_free(*nvp);
493		*nvp = NULL;
494		return (err);
495	}
496
497	return (0);
498}
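/*
 * Editor's note (assumption about the caller, which lives outside this
 * file): the usual consumer of spa_prop_get() is the pool-properties ioctl
 * path, roughly:
 *
 *	nvlist_t *nvp = NULL;
 *	error = spa_prop_get(spa, &nvp);
 *	if (error == 0)
 *		error = put_nvlist(zc, nvp);
 *	nvlist_free(nvp);
 *
 * On success *nvp is allocated here, so the caller owns and must free it.
 */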
499
500/*
501 * Validate the given pool properties nvlist and modify the list
502 * so that it contains the property values to be set.
503 */
504static int
505spa_prop_validate(spa_t *spa, nvlist_t *props)
506{
507	nvpair_t *elem;
508	int error = 0, reset_bootfs = 0;
509	uint64_t objnum = 0;
510	boolean_t has_feature = B_FALSE;
511
512	elem = NULL;
513	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
514		uint64_t intval;
515		char *strval, *slash, *check, *fname;
516		const char *propname = nvpair_name(elem);
517		zpool_prop_t prop = zpool_name_to_prop(propname);
518
519		switch (prop) {
520		case ZPOOL_PROP_INVAL:
521			if (!zpool_prop_feature(propname)) {
522				error = SET_ERROR(EINVAL);
523				break;
524			}
525
526			/*
527			 * Sanitize the input.
528			 */
529			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
530				error = SET_ERROR(EINVAL);
531				break;
532			}
533
534			if (nvpair_value_uint64(elem, &intval) != 0) {
535				error = SET_ERROR(EINVAL);
536				break;
537			}
538
539			if (intval != 0) {
540				error = SET_ERROR(EINVAL);
541				break;
542			}
543
544			fname = strchr(propname, '@') + 1;
545			if (zfeature_lookup_name(fname, NULL) != 0) {
546				error = SET_ERROR(EINVAL);
547				break;
548			}
549
550			has_feature = B_TRUE;
551			break;
552
553		case ZPOOL_PROP_VERSION:
554			error = nvpair_value_uint64(elem, &intval);
555			if (!error &&
556			    (intval < spa_version(spa) ||
557			    intval > SPA_VERSION_BEFORE_FEATURES ||
558			    has_feature))
559				error = SET_ERROR(EINVAL);
560			break;
561
562		case ZPOOL_PROP_DELEGATION:
563		case ZPOOL_PROP_AUTOREPLACE:
564		case ZPOOL_PROP_LISTSNAPS:
565		case ZPOOL_PROP_AUTOEXPAND:
566			error = nvpair_value_uint64(elem, &intval);
567			if (!error && intval > 1)
568				error = SET_ERROR(EINVAL);
569			break;
570
571		case ZPOOL_PROP_BOOTFS:
572			/*
573			 * If the pool version is less than SPA_VERSION_BOOTFS,
574			 * or the pool is still being created (version == 0),
575			 * the bootfs property cannot be set.
576			 */
577			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
578				error = SET_ERROR(ENOTSUP);
579				break;
580			}
581
582			/*
583			 * Make sure the vdev config is bootable
584			 */
585			if (!vdev_is_bootable(spa->spa_root_vdev)) {
586				error = SET_ERROR(ENOTSUP);
587				break;
588			}
589
590			reset_bootfs = 1;
591
592			error = nvpair_value_string(elem, &strval);
593
594			if (!error) {
595				objset_t *os;
596				uint64_t propval;
597
598				if (strval == NULL || strval[0] == '\0') {
599					objnum = zpool_prop_default_numeric(
600					    ZPOOL_PROP_BOOTFS);
601					break;
602				}
603
604				error = dmu_objset_hold(strval, FTAG, &os);
605				if (error != 0)
606					break;
607
608				/*
609				 * Must be ZPL, and its property settings
610				 * must be supported by GRUB (compression
611				 * is not gzip, and large blocks are not used).
612				 */
613
614				if (dmu_objset_type(os) != DMU_OST_ZFS) {
615					error = SET_ERROR(ENOTSUP);
616				} else if ((error =
617				    dsl_prop_get_int_ds(dmu_objset_ds(os),
618				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
619				    &propval)) == 0 &&
620				    !BOOTFS_COMPRESS_VALID(propval)) {
621					error = SET_ERROR(ENOTSUP);
622				} else {
623					objnum = dmu_objset_id(os);
624				}
625				dmu_objset_rele(os, FTAG);
626			}
627			break;
628
629		case ZPOOL_PROP_FAILUREMODE:
630			error = nvpair_value_uint64(elem, &intval);
631			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
632			    intval > ZIO_FAILURE_MODE_PANIC))
633				error = SET_ERROR(EINVAL);
634
635			/*
636			 * This is a special case which only occurs when
637			 * the pool has completely failed. This allows
638			 * the user to change the in-core failmode property
639			 * without syncing it out to disk (I/Os might
640			 * currently be blocked). We do this by returning
641			 * EIO to the caller (spa_prop_set) to trick it
642			 * into thinking we encountered a property validation
643			 * error.
644			 */
645			if (!error && spa_suspended(spa)) {
646				spa->spa_failmode = intval;
647				error = SET_ERROR(EIO);
648			}
649			break;
650
651		case ZPOOL_PROP_CACHEFILE:
652			if ((error = nvpair_value_string(elem, &strval)) != 0)
653				break;
654
655			if (strval[0] == '\0')
656				break;
657
658			if (strcmp(strval, "none") == 0)
659				break;
660
661			if (strval[0] != '/') {
662				error = SET_ERROR(EINVAL);
663				break;
664			}
665
666			slash = strrchr(strval, '/');
667			ASSERT(slash != NULL);
668
669			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
670			    strcmp(slash, "/..") == 0)
671				error = SET_ERROR(EINVAL);
672			break;
673
674		case ZPOOL_PROP_COMMENT:
675			if ((error = nvpair_value_string(elem, &strval)) != 0)
676				break;
677			for (check = strval; *check != '\0'; check++) {
678				/*
679				 * The kernel doesn't have an easy isprint()
680				 * check.  For this kernel check, we merely
681				 * check ASCII apart from DEL.  Fix this if
682				 * there is an easy-to-use kernel isprint().
683				 */
684				if (*check >= 0x7f) {
685					error = SET_ERROR(EINVAL);
686					break;
687				}
688			}
689			if (strlen(strval) > ZPROP_MAX_COMMENT)
690				error = E2BIG;
691			break;
692
693		case ZPOOL_PROP_DEDUPDITTO:
694			if (spa_version(spa) < SPA_VERSION_DEDUP)
695				error = SET_ERROR(ENOTSUP);
696			else
697				error = nvpair_value_uint64(elem, &intval);
698			if (error == 0 &&
699			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
700				error = SET_ERROR(EINVAL);
701			break;
702		}
703
704		if (error)
705			break;
706	}
707
708	if (!error && reset_bootfs) {
709		error = nvlist_remove(props,
710		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
711
712		if (!error) {
713			error = nvlist_add_uint64(props,
714			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
715		}
716	}
717
718	return (error);
719}
720
721void
722spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
723{
724	char *cachefile;
725	spa_config_dirent_t *dp;
726
727	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
728	    &cachefile) != 0)
729		return;
730
731	dp = kmem_alloc(sizeof (spa_config_dirent_t),
732	    KM_SLEEP);
733
734	if (cachefile[0] == '\0')
735		dp->scd_path = spa_strdup(spa_config_path);
736	else if (strcmp(cachefile, "none") == 0)
737		dp->scd_path = NULL;
738	else
739		dp->scd_path = spa_strdup(cachefile);
740
741	list_insert_head(&spa->spa_config_list, dp);
742	if (need_sync)
743		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
744}
745
746int
747spa_prop_set(spa_t *spa, nvlist_t *nvp)
748{
749	int error;
750	nvpair_t *elem = NULL;
751	boolean_t need_sync = B_FALSE;
752
753	if ((error = spa_prop_validate(spa, nvp)) != 0)
754		return (error);
755
756	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
757		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
758
759		if (prop == ZPOOL_PROP_CACHEFILE ||
760		    prop == ZPOOL_PROP_ALTROOT ||
761		    prop == ZPOOL_PROP_READONLY)
762			continue;
763
764		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
765			uint64_t ver;
766
767			if (prop == ZPOOL_PROP_VERSION) {
768				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
769			} else {
770				ASSERT(zpool_prop_feature(nvpair_name(elem)));
771				ver = SPA_VERSION_FEATURES;
772				need_sync = B_TRUE;
773			}
774
775			/* Save time if the version is already set. */
776			if (ver == spa_version(spa))
777				continue;
778
779			/*
780			 * In addition to the pool directory object, we might
781			 * create the pool properties object, the features for
782			 * read object, the features for write object, or the
783			 * feature descriptions object.
784			 */
785			error = dsl_sync_task(spa->spa_name, NULL,
786			    spa_sync_version, &ver,
787			    6, ZFS_SPACE_CHECK_RESERVED);
788			if (error)
789				return (error);
790			continue;
791		}
792
793		need_sync = B_TRUE;
794		break;
795	}
796
797	if (need_sync) {
798		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
799		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
800	}
801
802	return (0);
803}
804
805/*
806 * If the bootfs property value is dsobj, clear it.
807 */
808void
809spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
810{
811	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
812		VERIFY(zap_remove(spa->spa_meta_objset,
813		    spa->spa_pool_props_object,
814		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
815		spa->spa_bootfs = 0;
816	}
817}
818
819/*ARGSUSED*/
820static int
821spa_change_guid_check(void *arg, dmu_tx_t *tx)
822{
823	uint64_t *newguid = arg;
824	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
825	vdev_t *rvd = spa->spa_root_vdev;
826	uint64_t vdev_state;
827
828	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
829		int error = (spa_has_checkpoint(spa)) ?
830		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
831		return (SET_ERROR(error));
832	}
833
834	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
835	vdev_state = rvd->vdev_state;
836	spa_config_exit(spa, SCL_STATE, FTAG);
837
838	if (vdev_state != VDEV_STATE_HEALTHY)
839		return (SET_ERROR(ENXIO));
840
841	ASSERT3U(spa_guid(spa), !=, *newguid);
842
843	return (0);
844}
845
846static void
847spa_change_guid_sync(void *arg, dmu_tx_t *tx)
848{
849	uint64_t *newguid = arg;
850	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
851	uint64_t oldguid;
852	vdev_t *rvd = spa->spa_root_vdev;
853
854	oldguid = spa_guid(spa);
855
856	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
857	rvd->vdev_guid = *newguid;
858	rvd->vdev_guid_sum += (*newguid - oldguid);
859	vdev_config_dirty(rvd);
860	spa_config_exit(spa, SCL_STATE, FTAG);
861
862	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
863	    oldguid, *newguid);
864}
865
866/*
867 * Change the GUID for the pool.  This is done so that we can later
868 * re-import a pool built from a clone of our own vdevs.  We will modify
869 * the root vdev's guid, our own pool guid, and then mark all of our
870 * vdevs dirty.  Note that we must make sure that all our vdevs are
871 * online when we do this, or else any vdevs that weren't present
872 * would be orphaned from our pool.  We are also going to issue a
873 * sysevent to update any watchers.
874 */
875int
876spa_change_guid(spa_t *spa)
877{
878	int error;
879	uint64_t guid;
880
881	mutex_enter(&spa->spa_vdev_top_lock);
882	mutex_enter(&spa_namespace_lock);
883	guid = spa_generate_guid(NULL);
884
885	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
886	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
887
888	if (error == 0) {
889		spa_write_cachefile(spa, B_FALSE, B_TRUE);
890		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
891	}
892
893	mutex_exit(&spa_namespace_lock);
894	mutex_exit(&spa->spa_vdev_top_lock);
895
896	return (error);
897}
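/*
 * Editor's note (assumption; the caller is in zfs_ioctl.c, not this file):
 * the administrative entry point for the above is "zpool reguid <pool>",
 * which reaches spa_change_guid() roughly as:
 *
 *	error = spa_open(zc->zc_name, &spa, FTAG);
 *	if (error == 0) {
 *		error = spa_change_guid(spa);
 *		spa_close(spa, FTAG);
 *	}
 */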
898
899/*
900 * ==========================================================================
901 * SPA state manipulation (open/create/destroy/import/export)
902 * ==========================================================================
903 */
904
905static int
906spa_error_entry_compare(const void *a, const void *b)
907{
908	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
909	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
910	int ret;
911
912	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
913	    sizeof (zbookmark_phys_t));
914
915	return (AVL_ISIGN(ret));
916}
917
918/*
919 * Utility function which retrieves copies of the current logs and
920 * re-initializes them in the process.
921 */
922void
923spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
924{
925	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
926
927	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
928	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
929
930	avl_create(&spa->spa_errlist_scrub,
931	    spa_error_entry_compare, sizeof (spa_error_entry_t),
932	    offsetof(spa_error_entry_t, se_avl));
933	avl_create(&spa->spa_errlist_last,
934	    spa_error_entry_compare, sizeof (spa_error_entry_t),
935	    offsetof(spa_error_entry_t, se_avl));
936}
937
938static void
939spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
940{
941	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
942	enum zti_modes mode = ztip->zti_mode;
943	uint_t value = ztip->zti_value;
944	uint_t count = ztip->zti_count;
945	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
946	char name[32];
947	uint_t flags = 0;
948	boolean_t batch = B_FALSE;
949
950	if (mode == ZTI_MODE_NULL) {
951		tqs->stqs_count = 0;
952		tqs->stqs_taskq = NULL;
953		return;
954	}
955
956	ASSERT3U(count, >, 0);
957
958	tqs->stqs_count = count;
959	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
960
961	switch (mode) {
962	case ZTI_MODE_FIXED:
963		ASSERT3U(value, >=, 1);
964		value = MAX(value, 1);
965		break;
966
967	case ZTI_MODE_BATCH:
968		batch = B_TRUE;
969		flags |= TASKQ_THREADS_CPU_PCT;
970		value = zio_taskq_batch_pct;
971		break;
972
973	default:
974		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
975		    "spa_activate()",
976		    zio_type_name[t], zio_taskq_types[q], mode, value);
977		break;
978	}
979
980	for (uint_t i = 0; i < count; i++) {
981		taskq_t *tq;
982
983		if (count > 1) {
984			(void) snprintf(name, sizeof (name), "%s_%s_%u",
985			    zio_type_name[t], zio_taskq_types[q], i);
986		} else {
987			(void) snprintf(name, sizeof (name), "%s_%s",
988			    zio_type_name[t], zio_taskq_types[q]);
989		}
990
991#ifdef SYSDC
992		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
993			if (batch)
994				flags |= TASKQ_DC_BATCH;
995
996			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
997			    spa->spa_proc, zio_taskq_basedc, flags);
998		} else {
999#endif
1000			pri_t pri = maxclsyspri;
1001			/*
1002			 * The write issue taskq can be extremely CPU
1003			 * intensive.  Run it at slightly lower priority
1004			 * than the other taskqs.
1005			 * FreeBSD notes:
1006			 * - numerically higher priorities are lower priorities;
1007			 * - if priorities divided by four (RQ_PPQ) are equal
1008			 *   then a difference between them is insignificant.
1009			 */
1010			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
1011#ifdef illumos
1012				pri--;
1013#else
1014				pri += 4;
1015#endif
1016
1017			tq = taskq_create_proc(name, value, pri, 50,
1018			    INT_MAX, spa->spa_proc, flags);
1019#ifdef SYSDC
1020		}
1021#endif
1022
1023		tqs->stqs_taskq[i] = tq;
1024	}
1025}
1026
1027static void
1028spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1029{
1030	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1031
1032	if (tqs->stqs_taskq == NULL) {
1033		ASSERT0(tqs->stqs_count);
1034		return;
1035	}
1036
1037	for (uint_t i = 0; i < tqs->stqs_count; i++) {
1038		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1039		taskq_destroy(tqs->stqs_taskq[i]);
1040	}
1041
1042	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1043	tqs->stqs_taskq = NULL;
1044}
1045
1046/*
1047 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1048 * Note that a type may have multiple discrete taskqs to avoid lock contention
1049 * on the taskq itself. In that case we choose which taskq at random by using
1050 * the low bits of gethrtime().
1051 */
1052void
1053spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1054    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1055{
1056	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1057	taskq_t *tq;
1058
1059	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1060	ASSERT3U(tqs->stqs_count, !=, 0);
1061
1062	if (tqs->stqs_count == 1) {
1063		tq = tqs->stqs_taskq[0];
1064	} else {
1065#ifdef _KERNEL
1066		tq = tqs->stqs_taskq[(u_int)(sbinuptime() + curcpu) %
1067		    tqs->stqs_count];
1068#else
1069		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1070#endif
1071	}
1072
1073	taskq_dispatch_ent(tq, func, arg, flags, ent);
1074}
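/*
 * Editor's sketch of a typical caller (assumption; the real dispatch lives
 * in zio.c): the ZIO pipeline hands work to these taskqs along the lines of
 *
 *	spa_taskq_dispatch_ent(spa, zio->io_type, q,
 *	    (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
 *
 * using the taskq_ent_t pre-allocated inside the zio so that dispatch never
 * has to allocate memory and therefore cannot fail.
 */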
1075
1076static void
1077spa_create_zio_taskqs(spa_t *spa)
1078{
1079	for (int t = 0; t < ZIO_TYPES; t++) {
1080		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1081			spa_taskqs_init(spa, t, q);
1082		}
1083	}
1084}
1085
1086#ifdef _KERNEL
1087#ifdef SPA_PROCESS
1088static void
1089spa_thread(void *arg)
1090{
1091	callb_cpr_t cprinfo;
1092
1093	spa_t *spa = arg;
1094	user_t *pu = PTOU(curproc);
1095
1096	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1097	    spa->spa_name);
1098
1099	ASSERT(curproc != &p0);
1100	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1101	    "zpool-%s", spa->spa_name);
1102	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1103
1104#ifdef PSRSET_BIND
1105	/* bind this thread to the requested psrset */
1106	if (zio_taskq_psrset_bind != PS_NONE) {
1107		pool_lock();
1108		mutex_enter(&cpu_lock);
1109		mutex_enter(&pidlock);
1110		mutex_enter(&curproc->p_lock);
1111
1112		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1113		    0, NULL, NULL) == 0)  {
1114			curthread->t_bind_pset = zio_taskq_psrset_bind;
1115		} else {
1116			cmn_err(CE_WARN,
1117			    "Couldn't bind process for zfs pool \"%s\" to "
1118			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1119		}
1120
1121		mutex_exit(&curproc->p_lock);
1122		mutex_exit(&pidlock);
1123		mutex_exit(&cpu_lock);
1124		pool_unlock();
1125	}
1126#endif
1127
1128#ifdef SYSDC
1129	if (zio_taskq_sysdc) {
1130		sysdc_thread_enter(curthread, 100, 0);
1131	}
1132#endif
1133
1134	spa->spa_proc = curproc;
1135	spa->spa_did = curthread->t_did;
1136
1137	spa_create_zio_taskqs(spa);
1138
1139	mutex_enter(&spa->spa_proc_lock);
1140	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1141
1142	spa->spa_proc_state = SPA_PROC_ACTIVE;
1143	cv_broadcast(&spa->spa_proc_cv);
1144
1145	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1146	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1147		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1148	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1149
1150	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1151	spa->spa_proc_state = SPA_PROC_GONE;
1152	spa->spa_proc = &p0;
1153	cv_broadcast(&spa->spa_proc_cv);
1154	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
1155
1156	mutex_enter(&curproc->p_lock);
1157	lwp_exit();
1158}
1159#endif	/* SPA_PROCESS */
1160#endif
1161
1162/*
1163 * Activate an uninitialized pool.
1164 */
1165static void
1166spa_activate(spa_t *spa, int mode)
1167{
1168	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1169
1170	spa->spa_state = POOL_STATE_ACTIVE;
1171	spa->spa_mode = mode;
1172
1173	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1174	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1175
1176	/* Try to create a covering process */
1177	mutex_enter(&spa->spa_proc_lock);
1178	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1179	ASSERT(spa->spa_proc == &p0);
1180	spa->spa_did = 0;
1181
1182#ifdef SPA_PROCESS
1183	/* Only create a process if we're going to be around a while. */
1184	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1185		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1186		    NULL, 0) == 0) {
1187			spa->spa_proc_state = SPA_PROC_CREATED;
1188			while (spa->spa_proc_state == SPA_PROC_CREATED) {
1189				cv_wait(&spa->spa_proc_cv,
1190				    &spa->spa_proc_lock);
1191			}
1192			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1193			ASSERT(spa->spa_proc != &p0);
1194			ASSERT(spa->spa_did != 0);
1195		} else {
1196#ifdef _KERNEL
1197			cmn_err(CE_WARN,
1198			    "Couldn't create process for zfs pool \"%s\"\n",
1199			    spa->spa_name);
1200#endif
1201		}
1202	}
1203#endif	/* SPA_PROCESS */
1204	mutex_exit(&spa->spa_proc_lock);
1205
1206	/* If we didn't create a process, we need to create our taskqs. */
1207	ASSERT(spa->spa_proc == &p0);
1208	if (spa->spa_proc == &p0) {
1209		spa_create_zio_taskqs(spa);
1210	}
1211
1212	/*
1213	 * Start TRIM thread.
1214	 */
1215	trim_thread_create(spa);
1216
1217	for (size_t i = 0; i < TXG_SIZE; i++) {
1218		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1219		    ZIO_FLAG_CANFAIL);
1220	}
1221
1222	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1223	    offsetof(vdev_t, vdev_config_dirty_node));
1224	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1225	    offsetof(objset_t, os_evicting_node));
1226	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1227	    offsetof(vdev_t, vdev_state_dirty_node));
1228
1229	txg_list_create(&spa->spa_vdev_txg_list, spa,
1230	    offsetof(struct vdev, vdev_txg_node));
1231
1232	avl_create(&spa->spa_errlist_scrub,
1233	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1234	    offsetof(spa_error_entry_t, se_avl));
1235	avl_create(&spa->spa_errlist_last,
1236	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1237	    offsetof(spa_error_entry_t, se_avl));
1238}
1239
1240/*
1241 * Opposite of spa_activate().
1242 */
1243static void
1244spa_deactivate(spa_t *spa)
1245{
1246	ASSERT(spa->spa_sync_on == B_FALSE);
1247	ASSERT(spa->spa_dsl_pool == NULL);
1248	ASSERT(spa->spa_root_vdev == NULL);
1249	ASSERT(spa->spa_async_zio_root == NULL);
1250	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1251
1252	/*
1253	 * Stop TRIM thread in case spa_unload() wasn't called directly
1254	 * before spa_deactivate().
1255	 */
1256	trim_thread_destroy(spa);
1257
1258	spa_evicting_os_wait(spa);
1259
1260	txg_list_destroy(&spa->spa_vdev_txg_list);
1261
1262	list_destroy(&spa->spa_config_dirty_list);
1263	list_destroy(&spa->spa_evicting_os_list);
1264	list_destroy(&spa->spa_state_dirty_list);
1265
1266	for (int t = 0; t < ZIO_TYPES; t++) {
1267		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1268			spa_taskqs_fini(spa, t, q);
1269		}
1270	}
1271
1272	for (size_t i = 0; i < TXG_SIZE; i++) {
1273		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1274		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1275		spa->spa_txg_zio[i] = NULL;
1276	}
1277
1278	metaslab_class_destroy(spa->spa_normal_class);
1279	spa->spa_normal_class = NULL;
1280
1281	metaslab_class_destroy(spa->spa_log_class);
1282	spa->spa_log_class = NULL;
1283
1284	/*
1285	 * If this was part of an import or the open otherwise failed, we may
1286	 * still have errors left in the queues.  Empty them just in case.
1287	 */
1288	spa_errlog_drain(spa);
1289
1290	avl_destroy(&spa->spa_errlist_scrub);
1291	avl_destroy(&spa->spa_errlist_last);
1292
1293	spa->spa_state = POOL_STATE_UNINITIALIZED;
1294
1295	mutex_enter(&spa->spa_proc_lock);
1296	if (spa->spa_proc_state != SPA_PROC_NONE) {
1297		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1298		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1299		cv_broadcast(&spa->spa_proc_cv);
1300		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1301			ASSERT(spa->spa_proc != &p0);
1302			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1303		}
1304		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1305		spa->spa_proc_state = SPA_PROC_NONE;
1306	}
1307	ASSERT(spa->spa_proc == &p0);
1308	mutex_exit(&spa->spa_proc_lock);
1309
1310#ifdef SPA_PROCESS
1311	/*
1312	 * We want to make sure spa_thread() has actually exited the ZFS
1313	 * module, so that the module can't be unloaded out from underneath
1314	 * it.
1315	 */
1316	if (spa->spa_did != 0) {
1317		thread_join(spa->spa_did);
1318		spa->spa_did = 0;
1319	}
1320#endif	/* SPA_PROCESS */
1321}
1322
1323/*
1324 * Verify a pool configuration, and construct the vdev tree appropriately.  This
1325 * will create all the necessary vdevs in the appropriate layout, with each vdev
1326 * in the CLOSED state.  This will prep the pool before open/creation/import.
1327 * All vdev validation is done by the vdev_alloc() routine.
1328 */
1329static int
1330spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1331    uint_t id, int atype)
1332{
1333	nvlist_t **child;
1334	uint_t children;
1335	int error;
1336
1337	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1338		return (error);
1339
1340	if ((*vdp)->vdev_ops->vdev_op_leaf)
1341		return (0);
1342
1343	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1344	    &child, &children);
1345
1346	if (error == ENOENT)
1347		return (0);
1348
1349	if (error) {
1350		vdev_free(*vdp);
1351		*vdp = NULL;
1352		return (SET_ERROR(EINVAL));
1353	}
1354
1355	for (int c = 0; c < children; c++) {
1356		vdev_t *vd;
1357		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1358		    atype)) != 0) {
1359			vdev_free(*vdp);
1360			*vdp = NULL;
1361			return (error);
1362		}
1363	}
1364
1365	ASSERT(*vdp != NULL);
1366
1367	return (0);
1368}
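/*
 * Editor's illustration (not part of the original source; device paths are
 * placeholders): for a config nvlist describing a two-way mirror,
 * spa_config_parse() recurses once per ZPOOL_CONFIG_CHILDREN entry and
 * yields a tree of CLOSED vdevs:
 *
 *	root (vdev_root_ops)
 *	  mirror (vdev_mirror_ops)
 *	    disk /dev/ada1	(leaf; recursion stops at vdev_op_leaf)
 *	    disk /dev/ada2	(leaf)
 */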
1369
1370/*
1371 * Opposite of spa_load().
1372 */
1373static void
1374spa_unload(spa_t *spa)
1375{
1376	int i;
1377
1378	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1379
1380	spa_load_note(spa, "UNLOADING");
1381
1382	/*
1383	 * Stop TRIM thread.
1384	 */
1385	trim_thread_destroy(spa);
1386
1387	/*
1388	 * Stop async tasks.
1389	 */
1390	spa_async_suspend(spa);
1391
1392	if (spa->spa_root_vdev) {
1393		vdev_initialize_stop_all(spa->spa_root_vdev,
1394		    VDEV_INITIALIZE_ACTIVE);
1395	}
1396
1397	/*
1398	 * Stop syncing.
1399	 */
1400	if (spa->spa_sync_on) {
1401		txg_sync_stop(spa->spa_dsl_pool);
1402		spa->spa_sync_on = B_FALSE;
1403	}
1404
1405	/*
1406	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
1407	 * to call it earlier, before we wait for async i/o to complete.
1408	 * This ensures that there is no async metaslab prefetching, by
1409	 * calling taskq_wait(mg_taskq).
1410	 */
1411	if (spa->spa_root_vdev != NULL) {
1412		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1413		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
1414			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
1415		spa_config_exit(spa, SCL_ALL, spa);
1416	}
1417
1418	/*
1419	 * Wait for any outstanding async I/O to complete.
1420	 */
1421	if (spa->spa_async_zio_root != NULL) {
1422		for (int i = 0; i < max_ncpus; i++)
1423			(void) zio_wait(spa->spa_async_zio_root[i]);
1424		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1425		spa->spa_async_zio_root = NULL;
1426	}
1427
1428	if (spa->spa_vdev_removal != NULL) {
1429		spa_vdev_removal_destroy(spa->spa_vdev_removal);
1430		spa->spa_vdev_removal = NULL;
1431	}
1432
1433	if (spa->spa_condense_zthr != NULL) {
1434		ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
1435		zthr_destroy(spa->spa_condense_zthr);
1436		spa->spa_condense_zthr = NULL;
1437	}
1438
1439	if (spa->spa_checkpoint_discard_zthr != NULL) {
1440		ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
1441		zthr_destroy(spa->spa_checkpoint_discard_zthr);
1442		spa->spa_checkpoint_discard_zthr = NULL;
1443	}
1444
1445	spa_condense_fini(spa);
1446
1447	bpobj_close(&spa->spa_deferred_bpobj);
1448
1449	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1450
1451	/*
1452	 * Close all vdevs.
1453	 */
1454	if (spa->spa_root_vdev)
1455		vdev_free(spa->spa_root_vdev);
1456	ASSERT(spa->spa_root_vdev == NULL);
1457
1458	/*
1459	 * Close the dsl pool.
1460	 */
1461	if (spa->spa_dsl_pool) {
1462		dsl_pool_close(spa->spa_dsl_pool);
1463		spa->spa_dsl_pool = NULL;
1464		spa->spa_meta_objset = NULL;
1465	}
1466
1467	ddt_unload(spa);
1468
1469	/*
1470	 * Drop and purge level 2 cache
1471	 */
1472	spa_l2cache_drop(spa);
1473
1474	for (i = 0; i < spa->spa_spares.sav_count; i++)
1475		vdev_free(spa->spa_spares.sav_vdevs[i]);
1476	if (spa->spa_spares.sav_vdevs) {
1477		kmem_free(spa->spa_spares.sav_vdevs,
1478		    spa->spa_spares.sav_count * sizeof (void *));
1479		spa->spa_spares.sav_vdevs = NULL;
1480	}
1481	if (spa->spa_spares.sav_config) {
1482		nvlist_free(spa->spa_spares.sav_config);
1483		spa->spa_spares.sav_config = NULL;
1484	}
1485	spa->spa_spares.sav_count = 0;
1486
1487	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1488		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1489		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1490	}
1491	if (spa->spa_l2cache.sav_vdevs) {
1492		kmem_free(spa->spa_l2cache.sav_vdevs,
1493		    spa->spa_l2cache.sav_count * sizeof (void *));
1494		spa->spa_l2cache.sav_vdevs = NULL;
1495	}
1496	if (spa->spa_l2cache.sav_config) {
1497		nvlist_free(spa->spa_l2cache.sav_config);
1498		spa->spa_l2cache.sav_config = NULL;
1499	}
1500	spa->spa_l2cache.sav_count = 0;
1501
1502	spa->spa_async_suspended = 0;
1503
1504	spa->spa_indirect_vdevs_loaded = B_FALSE;
1505
1506	if (spa->spa_comment != NULL) {
1507		spa_strfree(spa->spa_comment);
1508		spa->spa_comment = NULL;
1509	}
1510
1511	spa_config_exit(spa, SCL_ALL, spa);
1512}
1513
1514/*
1515 * Load (or re-load) the current list of vdevs describing the active spares for
1516 * this pool.  When this is called, we have some form of basic information in
1517 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1518 * then re-generate a more complete list including status information.
1519 */
1520void
1521spa_load_spares(spa_t *spa)
1522{
1523	nvlist_t **spares;
1524	uint_t nspares;
1525	int i;
1526	vdev_t *vd, *tvd;
1527
1528#ifndef _KERNEL
1529	/*
1530	 * zdb opens both the current state of the pool and the
1531	 * checkpointed state (if present), with a different spa_t.
1532	 *
1533	 * As spare vdevs are shared among open pools, we skip loading
1534	 * them when we load the checkpointed state of the pool.
1535	 */
1536	if (!spa_writeable(spa))
1537		return;
1538#endif
1539
1540	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1541
1542	/*
1543	 * First, close and free any existing spare vdevs.
1544	 */
1545	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1546		vd = spa->spa_spares.sav_vdevs[i];
1547
1548		/* Undo the call to spa_activate() below */
1549		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1550		    B_FALSE)) != NULL && tvd->vdev_isspare)
1551			spa_spare_remove(tvd);
1552		vdev_close(vd);
1553		vdev_free(vd);
1554	}
1555
1556	if (spa->spa_spares.sav_vdevs)
1557		kmem_free(spa->spa_spares.sav_vdevs,
1558		    spa->spa_spares.sav_count * sizeof (void *));
1559
1560	if (spa->spa_spares.sav_config == NULL)
1561		nspares = 0;
1562	else
1563		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1564		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1565
1566	spa->spa_spares.sav_count = (int)nspares;
1567	spa->spa_spares.sav_vdevs = NULL;
1568
1569	if (nspares == 0)
1570		return;
1571
1572	/*
1573	 * Construct the array of vdevs, opening them to get status in the
1574	 * process.   For each spare, there are potentially two different vdev_t
1575	 * structures associated with it: one in the list of spares (used only
1576	 * for basic validation purposes) and one in the active vdev
1577	 * configuration (if it's spared in).  During this phase we open and
1578	 * validate each vdev on the spare list.  If the vdev also exists in the
1579	 * active configuration, then we also mark this vdev as an active spare.
1580	 */
1581	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1582	    KM_SLEEP);
1583	for (i = 0; i < spa->spa_spares.sav_count; i++) {
1584		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1585		    VDEV_ALLOC_SPARE) == 0);
1586		ASSERT(vd != NULL);
1587
1588		spa->spa_spares.sav_vdevs[i] = vd;
1589
1590		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1591		    B_FALSE)) != NULL) {
1592			if (!tvd->vdev_isspare)
1593				spa_spare_add(tvd);
1594
1595			/*
1596			 * We only mark the spare active if we were successfully
1597			 * able to load the vdev.  Otherwise, importing a pool
1598			 * with a bad active spare would result in strange
1599			 * behavior, because multiple pools would think the spare
1600			 * is actively in use.
1601			 *
1602			 * There is a vulnerability here to an equally bizarre
1603			 * circumstance, where a dead active spare is later
1604			 * brought back to life (onlined or otherwise).  Given
1605			 * the rarity of this scenario, and the extra complexity
1606			 * it adds, we ignore the possibility.
1607			 */
1608			if (!vdev_is_dead(tvd))
1609				spa_spare_activate(tvd);
1610		}
1611
1612		vd->vdev_top = vd;
1613		vd->vdev_aux = &spa->spa_spares;
1614
1615		if (vdev_open(vd) != 0)
1616			continue;
1617
1618		if (vdev_validate_aux(vd) == 0)
1619			spa_spare_add(vd);
1620	}
1621
1622	/*
1623	 * Recompute the stashed list of spares, with status information
1624	 * this time.
1625	 */
1626	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1627	    DATA_TYPE_NVLIST_ARRAY) == 0);
1628
1629	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1630	    KM_SLEEP);
1631	for (i = 0; i < spa->spa_spares.sav_count; i++)
1632		spares[i] = vdev_config_generate(spa,
1633		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1634	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1635	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1636	for (i = 0; i < spa->spa_spares.sav_count; i++)
1637		nvlist_free(spares[i]);
1638	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1639}
1640
1641/*
1642 * Load (or re-load) the current list of vdevs describing the active l2cache for
1643 * this pool.  When this is called, we have some form of basic information in
1644 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1645 * then re-generate a more complete list including status information.
1646 * Devices which are already active have their details maintained, and are
1647 * not re-opened.
1648 */
1649void
1650spa_load_l2cache(spa_t *spa)
1651{
1652	nvlist_t **l2cache;
1653	uint_t nl2cache;
1654	int i, j, oldnvdevs;
1655	uint64_t guid;
1656	vdev_t *vd, **oldvdevs, **newvdevs;
1657	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1658
1659#ifndef _KERNEL
1660	/*
1661	 * zdb opens both the current state of the pool and the
1662	 * checkpointed state (if present), with a different spa_t.
1663	 *
1664	 * As L2 caches are part of the ARC which is shared among open
1665	 * pools, we skip loading them when we load the checkpointed
1666	 * state of the pool.
1667	 */
1668	if (!spa_writeable(spa))
1669		return;
1670#endif
1671
1672	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1673
1674	if (sav->sav_config != NULL) {
1675		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1676		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1677		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1678	} else {
1679		nl2cache = 0;
1680		newvdevs = NULL;
1681	}
1682
1683	oldvdevs = sav->sav_vdevs;
1684	oldnvdevs = sav->sav_count;
1685	sav->sav_vdevs = NULL;
1686	sav->sav_count = 0;
1687
1688	/*
1689	 * Process new nvlist of vdevs.
1690	 */
1691	for (i = 0; i < nl2cache; i++) {
1692		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1693		    &guid) == 0);
1694
1695		newvdevs[i] = NULL;
1696		for (j = 0; j < oldnvdevs; j++) {
1697			vd = oldvdevs[j];
1698			if (vd != NULL && guid == vd->vdev_guid) {
1699				/*
1700				 * Retain previous vdev for add/remove ops.
1701				 */
1702				newvdevs[i] = vd;
1703				oldvdevs[j] = NULL;
1704				break;
1705			}
1706		}
1707
1708		if (newvdevs[i] == NULL) {
1709			/*
1710			 * Create new vdev
1711			 */
1712			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1713			    VDEV_ALLOC_L2CACHE) == 0);
1714			ASSERT(vd != NULL);
1715			newvdevs[i] = vd;
1716
1717			/*
1718			 * Commit this vdev as an l2cache device,
1719			 * even if it fails to open.
1720			 */
1721			spa_l2cache_add(vd);
1722
1723			vd->vdev_top = vd;
1724			vd->vdev_aux = sav;
1725
1726			spa_l2cache_activate(vd);
1727
1728			if (vdev_open(vd) != 0)
1729				continue;
1730
1731			(void) vdev_validate_aux(vd);
1732
1733			if (!vdev_is_dead(vd))
1734				l2arc_add_vdev(spa, vd);
1735		}
1736	}
1737
1738	/*
1739	 * Purge vdevs that were dropped
1740	 */
1741	for (i = 0; i < oldnvdevs; i++) {
1742		uint64_t pool;
1743
1744		vd = oldvdevs[i];
1745		if (vd != NULL) {
1746			ASSERT(vd->vdev_isl2cache);
1747
1748			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1749			    pool != 0ULL && l2arc_vdev_present(vd))
1750				l2arc_remove_vdev(vd);
1751			vdev_clear_stats(vd);
1752			vdev_free(vd);
1753		}
1754	}
1755
1756	if (oldvdevs)
1757		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1758
1759	if (sav->sav_config == NULL)
1760		goto out;
1761
1762	sav->sav_vdevs = newvdevs;
1763	sav->sav_count = (int)nl2cache;
1764
1765	/*
1766	 * Recompute the stashed list of l2cache devices, with status
1767	 * information this time.
1768	 */
1769	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1770	    DATA_TYPE_NVLIST_ARRAY) == 0);
1771
1772	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1773	for (i = 0; i < sav->sav_count; i++)
1774		l2cache[i] = vdev_config_generate(spa,
1775		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1776	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1777	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1778out:
1779	for (i = 0; i < sav->sav_count; i++)
1780		nvlist_free(l2cache[i]);
1781	if (sav->sav_count)
1782		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1783}
1784
1785static int
1786load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1787{
1788	dmu_buf_t *db;
1789	char *packed = NULL;
1790	size_t nvsize = 0;
1791	int error;
1792	*value = NULL;
1793
1794	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1795	if (error != 0)
1796		return (error);
1797
1798	nvsize = *(uint64_t *)db->db_data;
1799	dmu_buf_rele(db, FTAG);
1800
1801	packed = kmem_alloc(nvsize, KM_SLEEP);
1802	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1803	    DMU_READ_PREFETCH);
1804	if (error == 0)
1805		error = nvlist_unpack(packed, nvsize, value, 0);
1806	kmem_free(packed, nvsize);
1807
1808	return (error);
1809}
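/*
 * Editor's note on the storage layout consumed above (derived from the
 * code, not new information): such nvlist objects keep the packed size in
 * the dnode's bonus buffer and the nvlist_pack()ed bytes in the object
 * data:
 *
 *	bonus:	uint64_t nvsize
 *	data:	nvsize bytes, decoded with nvlist_unpack()
 */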
1810
1811/*
1812 * Concrete top-level vdevs that are not missing and are not logs. At every
1813 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
1814 */
1815static uint64_t
1816spa_healthy_core_tvds(spa_t *spa)
1817{
1818	vdev_t *rvd = spa->spa_root_vdev;
1819	uint64_t tvds = 0;
1820
1821	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1822		vdev_t *vd = rvd->vdev_child[i];
1823		if (vd->vdev_islog)
1824			continue;
1825		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1826			tvds++;
1827	}
1828
1829	return (tvds);
1830}
1831
1832/*
1833 * Checks to see if the given vdev could not be opened, in which case we post a
1834 * sysevent to notify the autoreplace code that the device has been removed.
1835 */
1836static void
1837spa_check_removed(vdev_t *vd)
1838{
1839	for (uint64_t c = 0; c < vd->vdev_children; c++)
1840		spa_check_removed(vd->vdev_child[c]);
1841
1842	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1843	    vdev_is_concrete(vd)) {
1844		zfs_post_autoreplace(vd->vdev_spa, vd);
1845		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1846	}
1847}
1848
1849static int
1850spa_check_for_missing_logs(spa_t *spa)
1851{
1852	vdev_t *rvd = spa->spa_root_vdev;
1853
1854	/*
1855	 * If we're doing a normal import, then build up any additional
1856	 * diagnostic information about missing log devices.
1857	 * We'll pass this up to the user for further processing.
1858	 */
1859	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1860		nvlist_t **child, *nv;
1861		uint64_t idx = 0;
1862
1863		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1864		    KM_SLEEP);
1865		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1866
1867		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1868			vdev_t *tvd = rvd->vdev_child[c];
1869
1870			/*
1871			 * We consider a device missing only if it failed to
1872			 * open (i.e. an offline or faulted device is not
1873			 * considered missing).
1874			 */
1875			if (tvd->vdev_islog &&
1876			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1877				child[idx++] = vdev_config_generate(spa, tvd,
1878				    B_FALSE, VDEV_CONFIG_MISSING);
1879			}
1880		}
1881
1882		if (idx > 0) {
1883			fnvlist_add_nvlist_array(nv,
1884			    ZPOOL_CONFIG_CHILDREN, child, idx);
1885			fnvlist_add_nvlist(spa->spa_load_info,
1886			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
1887
1888			for (uint64_t i = 0; i < idx; i++)
1889				nvlist_free(child[i]);
1890		}
1891		nvlist_free(nv);
1892		kmem_free(child, rvd->vdev_children * sizeof (char **));
1893
1894		if (idx > 0) {
1895			spa_load_failed(spa, "some log devices are missing");
1896			vdev_dbgmsg_print_tree(rvd, 2);
1897			return (SET_ERROR(ENXIO));
1898		}
1899	} else {
1900		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1901			vdev_t *tvd = rvd->vdev_child[c];
1902
1903			if (tvd->vdev_islog &&
1904			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1905				spa_set_log_state(spa, SPA_LOG_CLEAR);
1906				spa_load_note(spa, "some log devices are "
1907				    "missing, ZIL is dropped.");
1908				vdev_dbgmsg_print_tree(rvd, 2);
1909				break;
1910			}
1911		}
1912	}
1913
1914	return (0);
1915}
1916
1917/*
1918 * Check for missing log devices
1919 */
1920static boolean_t
1921spa_check_logs(spa_t *spa)
1922{
1923	boolean_t rv = B_FALSE;
1924	dsl_pool_t *dp = spa_get_dsl(spa);
1925
1926	switch (spa->spa_log_state) {
1927	case SPA_LOG_MISSING:
1928		/* need to recheck in case slog has been restored */
1929	case SPA_LOG_UNKNOWN:
1930		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1931		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1932		if (rv)
1933			spa_set_log_state(spa, SPA_LOG_MISSING);
1934		break;
1935	}
1936	return (rv);
1937}
1938
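/*
 * Passivate the metaslab group of every log top-level vdev so that no new
 * allocations are directed to the slogs.  Returns B_TRUE if any log device
 * was found.
 */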
1939static boolean_t
1940spa_passivate_log(spa_t *spa)
1941{
1942	vdev_t *rvd = spa->spa_root_vdev;
1943	boolean_t slog_found = B_FALSE;
1944
1945	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1946
1947	if (!spa_has_slogs(spa))
1948		return (B_FALSE);
1949
1950	for (int c = 0; c < rvd->vdev_children; c++) {
1951		vdev_t *tvd = rvd->vdev_child[c];
1952		metaslab_group_t *mg = tvd->vdev_mg;
1953
1954		if (tvd->vdev_islog) {
1955			metaslab_group_passivate(mg);
1956			slog_found = B_TRUE;
1957		}
1958	}
1959
1960	return (slog_found);
1961}
1962
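/*
 * Reactivate the metaslab group of every log top-level vdev, undoing
 * spa_passivate_log().
 */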
1963static void
1964spa_activate_log(spa_t *spa)
1965{
1966	vdev_t *rvd = spa->spa_root_vdev;
1967
1968	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1969
1970	for (int c = 0; c < rvd->vdev_children; c++) {
1971		vdev_t *tvd = rvd->vdev_child[c];
1972		metaslab_group_t *mg = tvd->vdev_mg;
1973
1974		if (tvd->vdev_islog)
1975			metaslab_group_activate(mg);
1976	}
1977}
1978
1979int
1980spa_reset_logs(spa_t *spa)
1981{
1982	int error;
1983
1984	error = dmu_objset_find(spa_name(spa), zil_reset,
1985	    NULL, DS_FIND_CHILDREN);
1986	if (error == 0) {
1987		/*
1988		 * We successfully offlined the log device, sync out the
1989		 * current txg so that the "stubby" block can be removed
1990		 * by zil_sync().
1991		 */
1992		txg_wait_synced(spa->spa_dsl_pool, 0);
1993	}
1994	return (error);
1995}
1996
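/*
 * Run spa_check_removed() over every device in an aux vdev list
 * (spares or l2cache).
 */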
1997static void
1998spa_aux_check_removed(spa_aux_vdev_t *sav)
1999{
2000	int i;
2001
2002	for (i = 0; i < sav->sav_count; i++)
2003		spa_check_removed(sav->sav_vdevs[i]);
2004}
2005
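/*
 * Invoked from zil_claim_log_block()'s i/o done callback (see
 * spa_ld_claim_log_blocks()); records the highest birth txg of any
 * successfully claimed block in spa_claim_max_txg.
 */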
2006void
2007spa_claim_notify(zio_t *zio)
2008{
2009	spa_t *spa = zio->io_spa;
2010
2011	if (zio->io_error)
2012		return;
2013
2014	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
2015	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
2016		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
2017	mutex_exit(&spa->spa_props_lock);
2018}
2019
2020typedef struct spa_load_error {
2021	uint64_t	sle_meta_count;
2022	uint64_t	sle_data_count;
2023} spa_load_error_t;
2024
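/*
 * Completion callback for the reads issued by spa_load_verify_cb():
 * classify any i/o error as a metadata or data error and release one
 * slot of the in-flight verify i/o limit.
 */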
2025static void
2026spa_load_verify_done(zio_t *zio)
2027{
2028	blkptr_t *bp = zio->io_bp;
2029	spa_load_error_t *sle = zio->io_private;
2030	dmu_object_type_t type = BP_GET_TYPE(bp);
2031	int error = zio->io_error;
2032	spa_t *spa = zio->io_spa;
2033
2034	abd_free(zio->io_abd);
2035	if (error) {
2036		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
2037		    type != DMU_OT_INTENT_LOG)
2038			atomic_inc_64(&sle->sle_meta_count);
2039		else
2040			atomic_inc_64(&sle->sle_data_count);
2041	}
2042
2043	mutex_enter(&spa->spa_scrub_lock);
2044	spa->spa_load_verify_ios--;
2045	cv_broadcast(&spa->spa_scrub_io_cv);
2046	mutex_exit(&spa->spa_scrub_lock);
2047}
2048
2049/*
2050 * Maximum number of concurrent scrub i/os to create while verifying
2051 * a pool during import.
2052 */
2053int spa_load_verify_maxinflight = 10000;
2054boolean_t spa_load_verify_metadata = B_TRUE;
2055boolean_t spa_load_verify_data = B_TRUE;
2056
2057SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
2058    &spa_load_verify_maxinflight, 0,
2059    "Maximum number of concurrent scrub I/Os to create while verifying a "
2060    "pool while importing it");
2061
2062SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
2063    &spa_load_verify_metadata, 0,
2064    "Check metadata on import?");
2065
2066SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
2067    &spa_load_verify_data, 0,
2068    "Check user data on import?");
2069
2070/*ARGSUSED*/
2071static int
2072spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2073    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2074{
2075	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2076		return (0);
2077	/*
2078	 * Note: normally this routine will not be called if
2079	 * spa_load_verify_metadata is not set.  However, it may be useful
2080	 * to manually set the flag after the traversal has begun.
2081	 */
2082	if (!spa_load_verify_metadata)
2083		return (0);
2084	if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
2085		return (0);
2086
2087	zio_t *rio = arg;
2088	size_t size = BP_GET_PSIZE(bp);
2089
2090	mutex_enter(&spa->spa_scrub_lock);
2091	while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
2092		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2093	spa->spa_load_verify_ios++;
2094	mutex_exit(&spa->spa_scrub_lock);
2095
2096	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2097	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2098	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2099	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2100	return (0);
2101}
2102
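/*
 * dmu_objset_find_dp() callback used by spa_load_verify() to fail the
 * load if any dataset name is too long.
 */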
2103/* ARGSUSED */
2104int
2105verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2106{
2107	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2108		return (SET_ERROR(ENAMETOOLONG));
2109
2110	return (0);
2111}
2112
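/*
 * Verify the pool while importing it: check that all dataset names fit,
 * optionally traverse the pool issuing speculative scrub reads, and then
 * compare the accumulated metadata/data error counts against the load
 * policy to decide whether the selected txg is acceptable.
 */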
2113static int
2114spa_load_verify(spa_t *spa)
2115{
2116	zio_t *rio;
2117	spa_load_error_t sle = { 0 };
2118	zpool_load_policy_t policy;
2119	boolean_t verify_ok = B_FALSE;
2120	int error = 0;
2121
2122	zpool_get_load_policy(spa->spa_config, &policy);
2123
2124	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
2125		return (0);
2126
2127	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2128	error = dmu_objset_find_dp(spa->spa_dsl_pool,
2129	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2130	    DS_FIND_CHILDREN);
2131	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2132	if (error != 0)
2133		return (error);
2134
2135	rio = zio_root(spa, NULL, &sle,
2136	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2137
2138	if (spa_load_verify_metadata) {
2139		if (spa->spa_extreme_rewind) {
2140			spa_load_note(spa, "performing a complete scan of the "
2141			    "pool since extreme rewind is on. This may take "
2142			    "a very long time.\n  (spa_load_verify_data=%u, "
2143			    "spa_load_verify_metadata=%u)",
2144			    spa_load_verify_data, spa_load_verify_metadata);
2145		}
2146		error = traverse_pool(spa, spa->spa_verify_min_txg,
2147		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2148		    spa_load_verify_cb, rio);
2149	}
2150
2151	(void) zio_wait(rio);
2152
2153	spa->spa_load_meta_errors = sle.sle_meta_count;
2154	spa->spa_load_data_errors = sle.sle_data_count;
2155
2156	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2157		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2158		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2159		    (u_longlong_t)sle.sle_data_count);
2160	}
2161
2162	if (spa_load_verify_dryrun ||
2163	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2164	    sle.sle_data_count <= policy.zlp_maxdata)) {
2165		int64_t loss = 0;
2166
2167		verify_ok = B_TRUE;
2168		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2169		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2170
2171		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2172		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2173		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2174		VERIFY(nvlist_add_int64(spa->spa_load_info,
2175		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2176		VERIFY(nvlist_add_uint64(spa->spa_load_info,
2177		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2178	} else {
2179		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2180	}
2181
2182	if (spa_load_verify_dryrun)
2183		return (0);
2184
2185	if (error) {
2186		if (error != ENXIO && error != EIO)
2187			error = SET_ERROR(EIO);
2188		return (error);
2189	}
2190
2191	return (verify_ok ? 0 : EIO);
2192}
2193
2194/*
2195 * Find a value in the pool props object.
2196 */
2197static void
2198spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2199{
2200	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2201	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2202}
2203
2204/*
2205 * Find a value in the pool directory object.
2206 */
2207static int
2208spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2209{
2210	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2211	    name, sizeof (uint64_t), 1, val);
2212
2213	if (error != 0 && (error != ENOENT || log_enoent)) {
2214		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2215		    "[error=%d]", name, error);
2216	}
2217
2218	return (error);
2219}
2220
2221static int
2222spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2223{
2224	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2225	return (SET_ERROR(err));
2226}
2227
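/*
 * Start the auxiliary threads that a writeable pool needs: the indirect
 * vdev condensing thread and the checkpoint discard zthr.
 */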
2228static void
2229spa_spawn_aux_threads(spa_t *spa)
2230{
2231	ASSERT(spa_writeable(spa));
2232
2233	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2234
2235	spa_start_indirect_condensing_thread(spa);
2236
2237	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2238	spa->spa_checkpoint_discard_zthr =
2239	    zthr_create(spa_checkpoint_discard_thread_check,
2240	    spa_checkpoint_discard_thread, spa);
2241}
2242
2243/*
2244 * Fix up config after a partly-completed split.  This is done with the
2245 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2246 * pool have that entry in their config, but only the splitting one contains
2247 * a list of all the guids of the vdevs that are being split off.
2248 *
2249 * This function determines what to do with that list: either rejoin
2250 * all the disks to the pool, or complete the splitting process.  To attempt
2251 * the rejoin, each disk that is offlined is marked online again, and
2252 * we do a reopen() call.  If the vdev label for every disk that was
2253 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2254 * then we call vdev_split() on each disk, and complete the split.
2255 *
2256 * Otherwise we leave the config alone, with all the vdevs in place in
2257 * the original pool.
2258 */
2259static void
2260spa_try_repair(spa_t *spa, nvlist_t *config)
2261{
2262	uint_t extracted;
2263	uint64_t *glist;
2264	uint_t i, gcount;
2265	nvlist_t *nvl;
2266	vdev_t **vd;
2267	boolean_t attempt_reopen;
2268
2269	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2270		return;
2271
2272	/* check that the config is complete */
2273	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2274	    &glist, &gcount) != 0)
2275		return;
2276
2277	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2278
2279	/* attempt to online all the vdevs & validate */
2280	attempt_reopen = B_TRUE;
2281	for (i = 0; i < gcount; i++) {
2282		if (glist[i] == 0)	/* vdev is hole */
2283			continue;
2284
2285		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2286		if (vd[i] == NULL) {
2287			/*
2288			 * Don't bother attempting to reopen the disks;
2289			 * just do the split.
2290			 */
2291			attempt_reopen = B_FALSE;
2292		} else {
2293			/* attempt to re-online it */
2294			vd[i]->vdev_offline = B_FALSE;
2295		}
2296	}
2297
2298	if (attempt_reopen) {
2299		vdev_reopen(spa->spa_root_vdev);
2300
2301		/* check each device to see what state it's in */
2302		for (extracted = 0, i = 0; i < gcount; i++) {
2303			if (vd[i] != NULL &&
2304			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2305				break;
2306			++extracted;
2307		}
2308	}
2309
2310	/*
2311	 * If every disk has been moved to the new pool, or if we never
2312	 * even attempted to look at them, then we split them off for
2313	 * good.
2314	 */
2315	if (!attempt_reopen || gcount == extracted) {
2316		for (i = 0; i < gcount; i++)
2317			if (vd[i] != NULL)
2318				vdev_split(vd[i]);
2319		vdev_reopen(spa->spa_root_vdev);
2320	}
2321
2322	kmem_free(vd, gcount * sizeof (vdev_t *));
2323}
2324
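/*
 * Load an existing storage pool, using the configuration provided in
 * spa->spa_config.  On failure (other than EBADF) an ereport is posted
 * describing why the load failed.
 */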
2325static int
2326spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2327{
2328	char *ereport = FM_EREPORT_ZFS_POOL;
2329	int error;
2330
2331	spa->spa_load_state = state;
2332
2333	gethrestime(&spa->spa_loaded_ts);
2334	error = spa_load_impl(spa, type, &ereport);
2335
2336	/*
2337	 * Don't count references from objsets that are already closed
2338	 * and are making their way through the eviction process.
2339	 */
2340	spa_evicting_os_wait(spa);
2341	spa->spa_minref = refcount_count(&spa->spa_refcount);
2342	if (error) {
2343		if (error != EEXIST) {
2344			spa->spa_loaded_ts.tv_sec = 0;
2345			spa->spa_loaded_ts.tv_nsec = 0;
2346		}
2347		if (error != EBADF) {
2348			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2349		}
2350	}
2351	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2352	spa->spa_ena = 0;
2353
2354	return (error);
2355}
2356
2357/*
2358 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2359 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2360 * spa's per-vdev ZAP list.
2361 */
2362static uint64_t
2363vdev_count_verify_zaps(vdev_t *vd)
2364{
2365	spa_t *spa = vd->vdev_spa;
2366	uint64_t total = 0;
2367	if (vd->vdev_top_zap != 0) {
2368		total++;
2369		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2370		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2371	}
2372	if (vd->vdev_leaf_zap != 0) {
2373		total++;
2374		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2375		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2376	}
2377
2378	for (uint64_t i = 0; i < vd->vdev_children; i++) {
2379		total += vdev_count_verify_zaps(vd->vdev_child[i]);
2380	}
2381
2382	return (total);
2383}
2384
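/*
 * Compare the hostid recorded in the MOS config with our own and refuse
 * to load the pool if they differ, since that indicates the pool was last
 * accessed by another system.
 */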
2385static int
2386spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2387{
2388	uint64_t hostid;
2389	char *hostname;
2390	uint64_t myhostid = 0;
2391
2392	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2393	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2394		hostname = fnvlist_lookup_string(mos_config,
2395		    ZPOOL_CONFIG_HOSTNAME);
2396
2397		myhostid = zone_get_hostid(NULL);
2398
2399		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2400			cmn_err(CE_WARN, "pool '%s' could not be "
2401			    "loaded as it was last accessed by "
2402			    "another system (host: %s hostid: 0x%llx). "
2403			    "See: http://illumos.org/msg/ZFS-8000-EY",
2404			    spa_name(spa), hostname, (u_longlong_t)hostid);
2405			spa_load_failed(spa, "hostid verification failed: pool "
2406			    "last accessed by host: %s (hostid: 0x%llx)",
2407			    hostname, (u_longlong_t)hostid);
2408			return (SET_ERROR(EBADF));
2409		}
2410	}
2411
2412	return (0);
2413}
2414
2415static int
2416spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2417{
2418	int error = 0;
2419	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2420	int parse;
2421	vdev_t *rvd;
2422	uint64_t pool_guid;
2423	char *comment;
2424
2425	/*
2426	 * Versioning wasn't explicitly added to the label until later, so if
2427	 * it's not present treat it as the initial version.
2428	 */
2429	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2430	    &spa->spa_ubsync.ub_version) != 0)
2431		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2432
2433	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2434		spa_load_failed(spa, "invalid config provided: '%s' missing",
2435		    ZPOOL_CONFIG_POOL_GUID);
2436		return (SET_ERROR(EINVAL));
2437	}
2438
2439	/*
2440	 * If we are doing an import, ensure that the pool is not already
2441	 * imported by checking if its pool guid already exists in the
2442	 * spa namespace.
2443	 *
2444	 * The only case in which we allow an already imported pool to be
2445	 * imported again is when the pool is checkpointed and we want to
2446	 * look at its checkpointed state from userland tools like zdb.
2447	 */
2448#ifdef _KERNEL
2449	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2450	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2451	    spa_guid_exists(pool_guid, 0)) {
2452#else
2453	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2454	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2455	    spa_guid_exists(pool_guid, 0) &&
2456	    !spa_importing_readonly_checkpoint(spa)) {
2457#endif
2458		spa_load_failed(spa, "a pool with guid %llu is already open",
2459		    (u_longlong_t)pool_guid);
2460		return (SET_ERROR(EEXIST));
2461	}
2462
2463	spa->spa_config_guid = pool_guid;
2464
2465	nvlist_free(spa->spa_load_info);
2466	spa->spa_load_info = fnvlist_alloc();
2467
2468	ASSERT(spa->spa_comment == NULL);
2469	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2470		spa->spa_comment = spa_strdup(comment);
2471
2472	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2473	    &spa->spa_config_txg);
2474
2475	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2476		spa->spa_config_splitting = fnvlist_dup(nvl);
2477
2478	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2479		spa_load_failed(spa, "invalid config provided: '%s' missing",
2480		    ZPOOL_CONFIG_VDEV_TREE);
2481		return (SET_ERROR(EINVAL));
2482	}
2483
2484	/*
2485	 * Create "The Godfather" zio to hold all async IOs
2486	 */
2487	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2488	    KM_SLEEP);
2489	for (int i = 0; i < max_ncpus; i++) {
2490		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2491		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2492		    ZIO_FLAG_GODFATHER);
2493	}
2494
2495	/*
2496	 * Parse the configuration into a vdev tree.  We explicitly set the
2497	 * value that will be returned by spa_version() since parsing the
2498	 * configuration requires knowing the version number.
2499	 */
2500	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2501	parse = (type == SPA_IMPORT_EXISTING ?
2502	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2503	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2504	spa_config_exit(spa, SCL_ALL, FTAG);
2505
2506	if (error != 0) {
2507		spa_load_failed(spa, "unable to parse config [error=%d]",
2508		    error);
2509		return (error);
2510	}
2511
2512	ASSERT(spa->spa_root_vdev == rvd);
2513	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2514	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2515
2516	if (type != SPA_IMPORT_ASSEMBLE) {
2517		ASSERT(spa_guid(spa) == pool_guid);
2518	}
2519
2520	return (0);
2521}
2522
2523/*
2524 * Recursively open all vdevs in the vdev tree. This function is called twice:
2525 * first with the untrusted config, then with the trusted config.
2526 */
2527static int
2528spa_ld_open_vdevs(spa_t *spa)
2529{
2530	int error = 0;
2531
2532	/*
2533	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
2534	 * missing/unopenable for the root vdev to be still considered openable.
2535	 * missing/unopenable for the root vdev to still be considered openable.
2536	if (spa->spa_trust_config) {
2537		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2538	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2539		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2540	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2541		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2542	} else {
2543		spa->spa_missing_tvds_allowed = 0;
2544	}
2545
2546	spa->spa_missing_tvds_allowed =
2547	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2548
2549	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2550	error = vdev_open(spa->spa_root_vdev);
2551	spa_config_exit(spa, SCL_ALL, FTAG);
2552
2553	if (spa->spa_missing_tvds != 0) {
2554		spa_load_note(spa, "vdev tree has %lld missing top-level "
2555		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2556		if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2557			/*
2558			 * Although theoretically we could allow users to open
2559			 * incomplete pools in RW mode, we'd need to add a lot
2560			 * of extra logic (e.g. adjust pool space to account
2561			 * for missing vdevs).
2562			 * This limitation also prevents users from accidentally
2563			 * opening the pool in RW mode during data recovery and
2564			 * damaging it further.
2565			 */
2566			spa_load_note(spa, "pools with missing top-level "
2567			    "vdevs can only be opened in read-only mode.");
2568			error = SET_ERROR(ENXIO);
2569		} else {
2570			spa_load_note(spa, "current settings allow for maximum "
2571			    "%lld missing top-level vdevs at this stage.",
2572			    (u_longlong_t)spa->spa_missing_tvds_allowed);
2573		}
2574	}
2575	if (error != 0) {
2576		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2577		    error);
2578	}
2579	if (spa->spa_missing_tvds != 0 || error != 0)
2580		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2581
2582	return (error);
2583}
2584
2585/*
2586 * We need to validate the vdev labels against the configuration that
2587 * we have in hand. This function is called twice: first with an untrusted
2588 * config, then with a trusted config. The validation is more strict when the
2589 * config is trusted.
2590 */
2591static int
2592spa_ld_validate_vdevs(spa_t *spa)
2593{
2594	int error = 0;
2595	vdev_t *rvd = spa->spa_root_vdev;
2596
2597	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2598	error = vdev_validate(rvd);
2599	spa_config_exit(spa, SCL_ALL, FTAG);
2600
2601	if (error != 0) {
2602		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2603		return (error);
2604	}
2605
2606	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2607		spa_load_failed(spa, "cannot open vdev tree after invalidating "
2608		    "some vdevs");
2609		vdev_dbgmsg_print_tree(rvd, 2);
2610		return (SET_ERROR(ENXIO));
2611	}
2612
2613	return (0);
2614}
2615
2616static void
2617spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
2618{
2619	spa->spa_state = POOL_STATE_ACTIVE;
2620	spa->spa_ubsync = spa->spa_uberblock;
2621	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2622	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2623	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2624	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2625	spa->spa_claim_max_txg = spa->spa_first_txg;
2626	spa->spa_prev_software_version = ub->ub_software_version;
2627}
2628
2629static int
2630spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2631{
2632	vdev_t *rvd = spa->spa_root_vdev;
2633	nvlist_t *label;
2634	uberblock_t *ub = &spa->spa_uberblock;
2635
2636	/*
2637	 * If we are opening the checkpointed state of the pool by
2638	 * rewinding to it, at this point we will have written the
2639	 * checkpointed uberblock to the vdev labels, so searching
2640	 * the labels will find the right uberblock.  However, if
2641	 * we are opening the checkpointed state read-only, we have
2642	 * not modified the labels. Therefore, we must ignore the
2643	 * labels and continue using the spa_uberblock that was set
2644	 * by spa_ld_checkpoint_rewind.
2645	 *
2646	 * Note that it would be fine to ignore the labels when
2647	 * rewinding (opening writeable) as well. However, if we
2648	 * crash just after writing the labels, we will end up
2649	 * searching the labels. Doing so in the common case means
2650	 * that this code path gets exercised normally, rather than
2651	 * just in the edge case.
2652	 */
2653	if (ub->ub_checkpoint_txg != 0 &&
2654	    spa_importing_readonly_checkpoint(spa)) {
2655		spa_ld_select_uberblock_done(spa, ub);
2656		return (0);
2657	}
2658
2659	/*
2660	 * Find the best uberblock.
2661	 */
2662	vdev_uberblock_load(rvd, ub, &label);
2663
2664	/*
2665	 * If we weren't able to find a single valid uberblock, return failure.
2666	 */
2667	if (ub->ub_txg == 0) {
2668		nvlist_free(label);
2669		spa_load_failed(spa, "no valid uberblock found");
2670		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2671	}
2672
2673	spa_load_note(spa, "using uberblock with txg=%llu",
2674	    (u_longlong_t)ub->ub_txg);
2675
2676	/*
2677	 * If the pool has an unsupported version we can't open it.
2678	 */
2679	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2680		nvlist_free(label);
2681		spa_load_failed(spa, "version %llu is not supported",
2682		    (u_longlong_t)ub->ub_version);
2683		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2684	}
2685
2686	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2687		nvlist_t *features;
2688
2689		/*
2690		 * If we weren't able to find what's necessary for reading the
2691		 * MOS in the label, return failure.
2692		 */
2693		if (label == NULL) {
2694			spa_load_failed(spa, "label config unavailable");
2695			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2696			    ENXIO));
2697		}
2698
2699		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
2700		    &features) != 0) {
2701			nvlist_free(label);
2702			spa_load_failed(spa, "invalid label: '%s' missing",
2703			    ZPOOL_CONFIG_FEATURES_FOR_READ);
2704			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2705			    ENXIO));
2706		}
2707
2708		/*
2709		 * Update our in-core representation with the definitive values
2710		 * from the label.
2711		 */
2712		nvlist_free(spa->spa_label_features);
2713		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2714	}
2715
2716	nvlist_free(label);
2717
2718	/*
2719	 * Look through entries in the label nvlist's features_for_read. If
2720	 * there is a feature listed there which we don't understand then we
2721	 * cannot open the pool.
2722	 */
2723	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2724		nvlist_t *unsup_feat;
2725
2726		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2727		    0);
2728
2729		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2730		    NULL); nvp != NULL;
2731		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2732			if (!zfeature_is_supported(nvpair_name(nvp))) {
2733				VERIFY(nvlist_add_string(unsup_feat,
2734				    nvpair_name(nvp), "") == 0);
2735			}
2736		}
2737
2738		if (!nvlist_empty(unsup_feat)) {
2739			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2740			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2741			nvlist_free(unsup_feat);
2742			spa_load_failed(spa, "some features are unsupported");
2743			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2744			    ENOTSUP));
2745		}
2746
2747		nvlist_free(unsup_feat);
2748	}
2749
2750	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2751		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2752		spa_try_repair(spa, spa->spa_config);
2753		spa_config_exit(spa, SCL_ALL, FTAG);
2754		nvlist_free(spa->spa_config_splitting);
2755		spa->spa_config_splitting = NULL;
2756	}
2757
2758	/*
2759	 * Initialize internal SPA structures.
2760	 */
2761	spa_ld_select_uberblock_done(spa, ub);
2762
2763	return (0);
2764}
2765
2766static int
2767spa_ld_open_rootbp(spa_t *spa)
2768{
2769	int error = 0;
2770	vdev_t *rvd = spa->spa_root_vdev;
2771
2772	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2773	if (error != 0) {
2774		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
2775		    "[error=%d]", error);
2776		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2777	}
2778	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2779
2780	return (0);
2781}
2782
2783static int
2784spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
2785    boolean_t reloading)
2786{
2787	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
2788	nvlist_t *nv, *mos_config, *policy;
2789	int error = 0, copy_error;
2790	uint64_t healthy_tvds, healthy_tvds_mos;
2791	uint64_t mos_config_txg;
2792
2793	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
2794	    != 0)
2795		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2796
2797	/*
2798	 * If we're assembling a pool from a split, the config provided is
2799	 * already trusted so there is nothing to do.
2800	 */
2801	if (type == SPA_IMPORT_ASSEMBLE)
2802		return (0);
2803
2804	healthy_tvds = spa_healthy_core_tvds(spa);
2805
2806	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
2807	    != 0) {
2808		spa_load_failed(spa, "unable to retrieve MOS config");
2809		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2810	}
2811
2812	/*
2813	 * If we are doing an open, the pool owner hasn't been verified yet,
2814	 * so do the verification here.
2815	 */
2816	if (spa->spa_load_state == SPA_LOAD_OPEN) {
2817		error = spa_verify_host(spa, mos_config);
2818		if (error != 0) {
2819			nvlist_free(mos_config);
2820			return (error);
2821		}
2822	}
2823
2824	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
2825
2826	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2827
2828	/*
2829	 * Build a new vdev tree from the trusted config
2830	 */
2831	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
2832
2833	/*
2834	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
2835	 * obtained by scanning /dev/dsk, then it will have the right vdev
2836	 * paths. We update the trusted MOS config with this information.
2837	 * We first try to copy the paths with vdev_copy_path_strict, which
2838	 * succeeds only when both configs have exactly the same vdev tree.
2839	 * If that fails, we fall back to a more flexible method that has a
2840	 * best effort policy.
2841	 */
2842	copy_error = vdev_copy_path_strict(rvd, mrvd);
2843	if (copy_error != 0 || spa_load_print_vdev_tree) {
2844		spa_load_note(spa, "provided vdev tree:");
2845		vdev_dbgmsg_print_tree(rvd, 2);
2846		spa_load_note(spa, "MOS vdev tree:");
2847		vdev_dbgmsg_print_tree(mrvd, 2);
2848	}
2849	if (copy_error != 0) {
2850		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
2851		    "back to vdev_copy_path_relaxed");
2852		vdev_copy_path_relaxed(rvd, mrvd);
2853	}
2854
2855	vdev_close(rvd);
2856	vdev_free(rvd);
2857	spa->spa_root_vdev = mrvd;
2858	rvd = mrvd;
2859	spa_config_exit(spa, SCL_ALL, FTAG);
2860
2861	/*
2862	 * We will use spa_config if we decide to reload the spa or if spa_load
2863	 * fails and we rewind. We must thus regenerate the config using the
2864	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
2865	 * pass settings on how to load the pool and is not stored in the MOS.
2866	 * We copy it over to our new, trusted config.
2867	 */
2868	mos_config_txg = fnvlist_lookup_uint64(mos_config,
2869	    ZPOOL_CONFIG_POOL_TXG);
2870	nvlist_free(mos_config);
2871	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
2872	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
2873	    &policy) == 0)
2874		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
2875	spa_config_set(spa, mos_config);
2876	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
2877
2878	/*
2879	 * Now that we have the config from the MOS, we should be more strict
2880	 * in checking blkptrs and can make assumptions about the consistency
2881	 * of the vdev tree. spa_trust_config must be set to true before opening
2882	 * vdevs in order for them to be writeable.
2883	 */
2884	spa->spa_trust_config = B_TRUE;
2885
2886	/*
2887	 * Open and validate the new vdev tree
2888	 */
2889	error = spa_ld_open_vdevs(spa);
2890	if (error != 0)
2891		return (error);
2892
2893	error = spa_ld_validate_vdevs(spa);
2894	if (error != 0)
2895		return (error);
2896
2897	if (copy_error != 0 || spa_load_print_vdev_tree) {
2898		spa_load_note(spa, "final vdev tree:");
2899		vdev_dbgmsg_print_tree(rvd, 2);
2900	}
2901
2902	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
2903	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
2904		/*
2905		 * Sanity check to make sure that we are indeed loading the
2906		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
2907		 * in the config provided and they happened to be the only ones
2908		 * to have the latest uberblock, we could involuntarily perform
2909		 * an extreme rewind.
2910		 */
2911		healthy_tvds_mos = spa_healthy_core_tvds(spa);
2912		if (healthy_tvds_mos - healthy_tvds >=
2913		    SPA_SYNC_MIN_VDEVS) {
2914			spa_load_note(spa, "config provided misses too many "
2915			    "top-level vdevs compared to MOS (%lld vs %lld). ",
2916			    (u_longlong_t)healthy_tvds,
2917			    (u_longlong_t)healthy_tvds_mos);
2918			spa_load_note(spa, "vdev tree:");
2919			vdev_dbgmsg_print_tree(rvd, 2);
2920			if (reloading) {
2921				spa_load_failed(spa, "config was already "
2922				    "provided from MOS. Aborting.");
2923				return (spa_vdev_err(rvd,
2924				    VDEV_AUX_CORRUPT_DATA, EIO));
2925			}
2926			spa_load_note(spa, "spa must be reloaded using MOS "
2927			    "config");
2928			return (SET_ERROR(EAGAIN));
2929		}
2930	}
2931
2932	error = spa_check_for_missing_logs(spa);
2933	if (error != 0)
2934		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2935
2936	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
2937		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
2938		    "guid sum (%llu != %llu)",
2939		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
2940		    (u_longlong_t)rvd->vdev_guid_sum);
2941		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2942		    ENXIO));
2943	}
2944
2945	return (0);
2946}
2947
2948static int
2949spa_ld_open_indirect_vdev_metadata(spa_t *spa)
2950{
2951	int error = 0;
2952	vdev_t *rvd = spa->spa_root_vdev;
2953
2954	/*
2955	 * Everything that we read before spa_remove_init() must be stored
2956	 * on concrete vdevs.  Therefore we do this as early as possible.
2957	 */
2958	error = spa_remove_init(spa);
2959	if (error != 0) {
2960		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
2961		    error);
2962		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2963	}
2964
2965	/*
2966	 * Retrieve information needed to condense indirect vdev mappings.
2967	 */
2968	error = spa_condense_init(spa);
2969	if (error != 0) {
2970		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
2971		    error);
2972		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
2973	}
2974
2975	return (0);
2976}
2977
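/*
 * Read the feature lists from the MOS and check them against the features
 * this version of ZFS supports, recording enabled and unsupported features
 * in spa_load_info.  *missing_feat_writep is set if the pool cannot be
 * opened for write because of an unsupported feature.
 */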
2978static int
2979spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
2980{
2981	int error = 0;
2982	vdev_t *rvd = spa->spa_root_vdev;
2983
2984	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2985		boolean_t missing_feat_read = B_FALSE;
2986		nvlist_t *unsup_feat, *enabled_feat;
2987
2988		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2989		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
2990			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2991		}
2992
2993		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2994		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
2995			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2996		}
2997
2998		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2999		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
3000			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3001		}
3002
3003		enabled_feat = fnvlist_alloc();
3004		unsup_feat = fnvlist_alloc();
3005
3006		if (!spa_features_check(spa, B_FALSE,
3007		    unsup_feat, enabled_feat))
3008			missing_feat_read = B_TRUE;
3009
3010		if (spa_writeable(spa) ||
3011		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
3012			if (!spa_features_check(spa, B_TRUE,
3013			    unsup_feat, enabled_feat)) {
3014				*missing_feat_writep = B_TRUE;
3015			}
3016		}
3017
3018		fnvlist_add_nvlist(spa->spa_load_info,
3019		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
3020
3021		if (!nvlist_empty(unsup_feat)) {
3022			fnvlist_add_nvlist(spa->spa_load_info,
3023			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
3024		}
3025
3026		fnvlist_free(enabled_feat);
3027		fnvlist_free(unsup_feat);
3028
3029		if (!missing_feat_read) {
3030			fnvlist_add_boolean(spa->spa_load_info,
3031			    ZPOOL_CONFIG_CAN_RDONLY);
3032		}
3033
3034		/*
3035		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
3036		 * twofold: to determine whether the pool is available for
3037		 * import in read-write mode and (if it is not) whether the
3038		 * pool is available for import in read-only mode. If the pool
3039		 * is available for import in read-write mode, it is displayed
3040		 * as available in userland; if it is not available for import
3041		 * in read-only mode, it is displayed as unavailable in
3042		 * userland. If the pool is available for import in read-only
3043		 * mode but not read-write mode, it is displayed as unavailable
3044		 * in userland with a special note that the pool is actually
3045		 * available for open in read-only mode.
3046		 *
3047		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
3048		 * missing a feature for write, we must first determine whether
3049		 * the pool can be opened read-only before returning to
3050		 * userland in order to know whether to display the
3051		 * abovementioned note.
3052		 */
3053		if (missing_feat_read || (*missing_feat_writep &&
3054		    spa_writeable(spa))) {
3055			spa_load_failed(spa, "pool uses unsupported features");
3056			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
3057			    ENOTSUP));
3058		}
3059
3060		/*
3061		 * Load refcounts for ZFS features from disk into an in-memory
3062		 * cache during SPA initialization.
3063		 */
3064		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
3065			uint64_t refcount;
3066
3067			error = feature_get_refcount_from_disk(spa,
3068			    &spa_feature_table[i], &refcount);
3069			if (error == 0) {
3070				spa->spa_feat_refcount_cache[i] = refcount;
3071			} else if (error == ENOTSUP) {
3072				spa->spa_feat_refcount_cache[i] =
3073				    SPA_FEATURE_DISABLED;
3074			} else {
3075				spa_load_failed(spa, "error getting refcount "
3076				    "for feature %s [error=%d]",
3077				    spa_feature_table[i].fi_guid, error);
3078				return (spa_vdev_err(rvd,
3079				    VDEV_AUX_CORRUPT_DATA, EIO));
3080			}
3081		}
3082	}
3083
3084	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
3085		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
3086		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
3087			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3088	}
3089
3090	return (0);
3091}
3092
3093static int
3094spa_ld_load_special_directories(spa_t *spa)
3095{
3096	int error = 0;
3097	vdev_t *rvd = spa->spa_root_vdev;
3098
3099	spa->spa_is_initializing = B_TRUE;
3100	error = dsl_pool_open(spa->spa_dsl_pool);
3101	spa->spa_is_initializing = B_FALSE;
3102	if (error != 0) {
3103		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
3104		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3105	}
3106
3107	return (0);
3108}
3109
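/*
 * Load assorted pool-wide state from the MOS: the checksum salt, the
 * deferred-frees bpobj, the error logs, the history object, the per-vdev
 * ZAP map and the pool properties object.
 */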
3110static int
3111spa_ld_get_props(spa_t *spa)
3112{
3113	int error = 0;
3114	uint64_t obj;
3115	vdev_t *rvd = spa->spa_root_vdev;
3116
3117	/* Grab the secret checksum salt from the MOS. */
3118	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3119	    DMU_POOL_CHECKSUM_SALT, 1,
3120	    sizeof (spa->spa_cksum_salt.zcs_bytes),
3121	    spa->spa_cksum_salt.zcs_bytes);
3122	if (error == ENOENT) {
3123		/* Generate a new salt for subsequent use */
3124		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3125		    sizeof (spa->spa_cksum_salt.zcs_bytes));
3126	} else if (error != 0) {
3127		spa_load_failed(spa, "unable to retrieve checksum salt from "
3128		    "MOS [error=%d]", error);
3129		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3130	}
3131
3132	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
3133		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3134	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
3135	if (error != 0) {
3136		spa_load_failed(spa, "error opening deferred-frees bpobj "
3137		    "[error=%d]", error);
3138		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3139	}
3140
3141	/*
3142	 * Load the bit that tells us to use the new accounting function
3143	 * (raid-z deflation).  If we have an older pool, this will not
3144	 * be present.
3145	 */
3146	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
3147	if (error != 0 && error != ENOENT)
3148		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3149
3150	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
3151	    &spa->spa_creation_version, B_FALSE);
3152	if (error != 0 && error != ENOENT)
3153		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3154
3155	/*
3156	 * Load the persistent error log.  If we have an older pool, this will
3157	 * not be present.
3158	 */
3159	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
3160	    B_FALSE);
3161	if (error != 0 && error != ENOENT)
3162		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3163
3164	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
3165	    &spa->spa_errlog_scrub, B_FALSE);
3166	if (error != 0 && error != ENOENT)
3167		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3168
3169	/*
3170	 * Load the history object.  If we have an older pool, this
3171	 * will not be present.
3172	 */
3173	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
3174	if (error != 0 && error != ENOENT)
3175		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3176
3177	/*
3178	 * Load the per-vdev ZAP map. If we have an older pool, this will not
3179	 * be present; in this case, defer its creation to a later time to
3180	 * avoid dirtying the MOS this early / out of sync context. See
3181	 * spa_sync_config_object.
3182	 */
3183
3184	/* The sentinel is only available in the MOS config. */
3185	nvlist_t *mos_config;
3186	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
3187		spa_load_failed(spa, "unable to retrieve MOS config");
3188		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3189	}
3190
3191	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
3192	    &spa->spa_all_vdev_zaps, B_FALSE);
3193
3194	if (error == ENOENT) {
3195		VERIFY(!nvlist_exists(mos_config,
3196		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
3197		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
3198		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3199	} else if (error != 0) {
3200		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3201	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
3202		/*
3203		 * An older version of ZFS overwrote the sentinel value, so
3204		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
3205		 * destruction to later; see spa_sync_config_object.
3206		 */
3207		spa->spa_avz_action = AVZ_ACTION_DESTROY;
3208		/*
3209		 * We're assuming that no vdevs have had their ZAPs created
3210		 * before this. Better be sure of it.
3211		 */
3212		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3213	}
3214	nvlist_free(mos_config);
3215
3216	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3217
3218	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3219	    B_FALSE);
3220	if (error && error != ENOENT)
3221		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3222
3223	if (error == 0) {
3224		uint64_t autoreplace;
3225
3226		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3227		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3228		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3229		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3230		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3231		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3232		    &spa->spa_dedup_ditto);
3233
3234		spa->spa_autoreplace = (autoreplace != 0);
3235	}
3236
3237	/*
3238	 * If we are importing a pool with missing top-level vdevs,
3239	 * we enforce that the pool doesn't panic or get suspended on
3240	 * error since the likelihood of missing data is extremely high.
3241	 */
3242	if (spa->spa_missing_tvds > 0 &&
3243	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3244	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3245		spa_load_note(spa, "forcing failmode to 'continue' "
3246		    "as some top level vdevs are missing");
3247		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3248	}
3249
3250	return (0);
3251}
3252
3253static int
3254spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3255{
3256	int error = 0;
3257	vdev_t *rvd = spa->spa_root_vdev;
3258
3259	/*
3260	 * If we're assembling the pool from the split-off vdevs of
3261	 * an existing pool, we don't want to attach the spares & cache
3262	 * devices.
3263	 */
3264
3265	/*
3266	 * Load any hot spares for this pool.
3267	 */
3268	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3269	    B_FALSE);
3270	if (error != 0 && error != ENOENT)
3271		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3272	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3273		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3274		if (load_nvlist(spa, spa->spa_spares.sav_object,
3275		    &spa->spa_spares.sav_config) != 0) {
3276			spa_load_failed(spa, "error loading spares nvlist");
3277			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3278		}
3279
3280		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3281		spa_load_spares(spa);
3282		spa_config_exit(spa, SCL_ALL, FTAG);
3283	} else if (error == 0) {
3284		spa->spa_spares.sav_sync = B_TRUE;
3285	}
3286
3287	/*
3288	 * Load any level 2 ARC devices for this pool.
3289	 */
3290	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3291	    &spa->spa_l2cache.sav_object, B_FALSE);
3292	if (error != 0 && error != ENOENT)
3293		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3294	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3295		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3296		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3297		    &spa->spa_l2cache.sav_config) != 0) {
3298			spa_load_failed(spa, "error loading l2cache nvlist");
3299			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3300		}
3301
3302		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3303		spa_load_l2cache(spa);
3304		spa_config_exit(spa, SCL_ALL, FTAG);
3305	} else if (error == 0) {
3306		spa->spa_l2cache.sav_sync = B_TRUE;
3307	}
3308
3309	return (0);
3310}
3311
3312static int
3313spa_ld_load_vdev_metadata(spa_t *spa)
3314{
3315	int error = 0;
3316	vdev_t *rvd = spa->spa_root_vdev;
3317
3318	/*
3319	 * If the 'autoreplace' property is set, then post a resource notifying
3320	 * the ZFS DE that it should not issue any faults for unopenable
3321	 * devices.  We also iterate over the vdevs, and post a sysevent for any
3322	 * unopenable vdevs so that the normal autoreplace handler can take
3323	 * over.
3324	 */
3325	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3326		spa_check_removed(spa->spa_root_vdev);
3327		/*
3328		 * For the import case, this is done in spa_import(), because
3329		 * at this point we're using the spare definitions from
3330		 * the MOS config, not necessarily from the userland config.
3331		 */
3332		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3333			spa_aux_check_removed(&spa->spa_spares);
3334			spa_aux_check_removed(&spa->spa_l2cache);
3335		}
3336	}
3337
3338	/*
3339	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3340	 */
3341	error = vdev_load(rvd);
3342	if (error != 0) {
3343		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3344		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3345	}
3346
3347	/*
3348	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3349	 */
3350	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3351	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3352	spa_config_exit(spa, SCL_ALL, FTAG);
3353
3354	return (0);
3355}
3356
3357static int
3358spa_ld_load_dedup_tables(spa_t *spa)
3359{
3360	int error = 0;
3361	vdev_t *rvd = spa->spa_root_vdev;
3362
3363	error = ddt_load(spa);
3364	if (error != 0) {
3365		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3366		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3367	}
3368
3369	return (0);
3370}
3371
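/*
 * Check the intent log chains.  If log devices are missing we either note
 * that the logs will be dropped (when top-level vdevs are already missing)
 * or fail the load with a log-replay ereport.
 */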
3372static int
3373spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3374{
3375	vdev_t *rvd = spa->spa_root_vdev;
3376
3377	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3378		boolean_t missing = spa_check_logs(spa);
3379		if (missing) {
3380			if (spa->spa_missing_tvds != 0) {
3381				spa_load_note(spa, "spa_check_logs failed "
3382				    "so dropping the logs");
3383			} else {
3384				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3385				spa_load_failed(spa, "spa_check_logs failed");
3386				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3387				    ENXIO));
3388			}
3389		}
3390	}
3391
3392	return (0);
3393}
3394
3395static int
3396spa_ld_verify_pool_data(spa_t *spa)
3397{
3398	int error = 0;
3399	vdev_t *rvd = spa->spa_root_vdev;
3400
3401	/*
3402	 * We've successfully opened the pool, verify that we're ready
3403	 * to start pushing transactions.
3404	 */
3405	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3406		error = spa_load_verify(spa);
3407		if (error != 0) {
3408			spa_load_failed(spa, "spa_load_verify failed "
3409			    "[error=%d]", error);
3410			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3411			    error));
3412		}
3413	}
3414
3415	return (0);
3416}
3417
3418static void
3419spa_ld_claim_log_blocks(spa_t *spa)
3420{
3421	dmu_tx_t *tx;
3422	dsl_pool_t *dp = spa_get_dsl(spa);
3423
3424	/*
3425	 * Claim log blocks that haven't been committed yet.
3426	 * This must all happen in a single txg.
3427	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3428	 * invoked from zil_claim_log_block()'s i/o done callback.
3429	 * Price of rollback is that we abandon the log.
3430	 */
3431	spa->spa_claiming = B_TRUE;
3432
3433	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3434	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3435	    zil_claim, tx, DS_FIND_CHILDREN);
3436	dmu_tx_commit(tx);
3437
3438	spa->spa_claiming = B_FALSE;
3439
3440	spa_set_log_state(spa, SPA_LOG_GOOD);
3441}
3442
3443static void
3444spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3445    boolean_t update_config_cache)
3446{
3447	vdev_t *rvd = spa->spa_root_vdev;
3448	int need_update = B_FALSE;
3449
3450	/*
3451	 * If the config cache is stale, or we have uninitialized
3452	 * metaslabs (see spa_vdev_add()), then update the config.
3453	 *
3454	 * If this is a verbatim import, trust the current
3455	 * in-core spa_config and update the disk labels.
3456	 */
3457	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
3458	    spa->spa_load_state == SPA_LOAD_IMPORT ||
3459	    spa->spa_load_state == SPA_LOAD_RECOVER ||
3460	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3461		need_update = B_TRUE;
3462
3463	for (int c = 0; c < rvd->vdev_children; c++)
3464		if (rvd->vdev_child[c]->vdev_ms_array == 0)
3465			need_update = B_TRUE;
3466
3467	/*
3468	 * Update the config cache asynchronously in case we're the
3469	 * root pool, in which case the config cache isn't writable yet.
3470	 */
3471	if (need_update)
3472		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3473}
3474
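/*
 * Tear the spa down and reactivate it in the same mode so the load can be
 * retried, preserving spa_async_suspended across the unload.
 */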
3475static void
3476spa_ld_prepare_for_reload(spa_t *spa)
3477{
3478	int mode = spa->spa_mode;
3479	int async_suspended = spa->spa_async_suspended;
3480
3481	spa_unload(spa);
3482	spa_deactivate(spa);
3483	spa_activate(spa, mode);
3484
3485	/*
3486	 * We save the value of spa_async_suspended as it gets reset to 0 by
3487	 * spa_unload(). We want to restore it to its original value before
3488	 * returning, as we might call spa_async_resume() later.
3489	 */
3490	spa->spa_async_suspended = async_suspended;
3491}
3492
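/*
 * If the pool has a checkpoint, record its txg and timestamp in the spa;
 * ENOENT from the lookup simply means the pool is not checkpointed.
 */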
3493static int
3494spa_ld_read_checkpoint_txg(spa_t *spa)
3495{
3496	uberblock_t checkpoint;
3497	int error = 0;
3498
3499	ASSERT0(spa->spa_checkpoint_txg);
3500	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3501
3502	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3503	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3504	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3505
3506	if (error == ENOENT)
3507		return (0);
3508
3509	if (error != 0)
3510		return (error);
3511
3512	ASSERT3U(checkpoint.ub_txg, !=, 0);
3513	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
3514	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
3515	spa->spa_checkpoint_txg = checkpoint.ub_txg;
3516	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
3517
3518	return (0);
3519}
3520
3521static int
3522spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
3523{
3524	int error = 0;
3525
3526	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3527	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3528
3529	/*
3530	 * Never trust the config that is provided unless we are assembling
3531	 * a pool following a split.
3532	 * This means don't trust blkptrs and the vdev tree in general. This
3533	 * also effectively puts the spa in read-only mode since
3534	 * spa_writeable() checks for spa_trust_config to be true.
3535	 * We will later load a trusted config from the MOS.
3536	 */
3537	if (type != SPA_IMPORT_ASSEMBLE)
3538		spa->spa_trust_config = B_FALSE;
3539
3540	/*
3541	 * Parse the config provided to create a vdev tree.
3542	 */
3543	error = spa_ld_parse_config(spa, type);
3544	if (error != 0)
3545		return (error);
3546
3547	/*
3548	 * Now that we have the vdev tree, try to open each vdev. This involves
3549	 * opening the underlying physical device, retrieving its geometry and
3550	 * probing the vdev with a dummy I/O. The state of each vdev will be set
3551	 * based on the success of those operations. After this we'll be ready
3552	 * to read from the vdevs.
3553	 */
3554	error = spa_ld_open_vdevs(spa);
3555	if (error != 0)
3556		return (error);
3557
3558	/*
3559	 * Read the label of each vdev and make sure that the GUIDs stored
3560	 * there match the GUIDs in the config provided.
3561	 * If we're assembling a new pool that's been split off from an
3562	 * existing pool, the labels haven't yet been updated so we skip
3563	 * validation for now.
3564	 */
3565	if (type != SPA_IMPORT_ASSEMBLE) {
3566		error = spa_ld_validate_vdevs(spa);
3567		if (error != 0)
3568			return (error);
3569	}
3570
3571	/*
3572	 * Read all vdev labels to find the best uberblock (i.e. latest,
3573	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
3574	 * get the list of features required to read blkptrs in the MOS from
3575	 * the vdev label with the best uberblock and verify that our version
3576	 * of zfs supports them all.
3577	 */
3578	error = spa_ld_select_uberblock(spa, type);
3579	if (error != 0)
3580		return (error);
3581
3582	/*
3583	 * Pass that uberblock to the dsl_pool layer which will open the root
3584	 * blkptr. This blkptr points to the latest version of the MOS and will
3585	 * allow us to read its contents.
3586	 */
3587	error = spa_ld_open_rootbp(spa);
3588	if (error != 0)
3589		return (error);
3590
3591	return (0);
3592}
3593
3594static int
3595spa_ld_checkpoint_rewind(spa_t *spa)
3596{
3597	uberblock_t checkpoint;
3598	int error = 0;
3599
3600	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3601	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3602
3603	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3604	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3605	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3606
3607	if (error != 0) {
3608		spa_load_failed(spa, "unable to retrieve checkpointed "
3609		    "uberblock from the MOS config [error=%d]", error);
3610
3611		if (error == ENOENT)
3612			error = ZFS_ERR_NO_CHECKPOINT;
3613
3614		return (error);
3615	}
3616
3617	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
3618	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
3619
3620	/*
3621	 * We need to update the txg and timestamp of the checkpointed
3622	 * uberblock to be higher than the latest one. This ensures that
3623	 * the checkpointed uberblock is selected if we were to close and
3624	 * reopen the pool right after we've written it in the vdev labels.
3625	 * (also see block comment in vdev_uberblock_compare)
3626	 */
3627	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
3628	checkpoint.ub_timestamp = gethrestime_sec();
3629
3630	/*
3631	 * Set current uberblock to be the checkpointed uberblock.
3632	 */
3633	spa->spa_uberblock = checkpoint;
3634
3635	/*
3636	 * If we are doing a normal rewind, then the pool is open for
3637	 * writing and we sync the "updated" checkpointed uberblock to
3638	 * disk. Once this is done, we've basically rewound the whole
3639	 * pool and there is no way back.
3640	 *
3641	 * There are cases when we don't want to attempt to sync the
3642	 * checkpointed uberblock to disk because we are opening a
3643	 * pool as read-only. Specifically, verifying the checkpointed
3644	 * state with zdb, and importing the checkpointed state to get
3645	 * a "preview" of its content.
3646	 */
3647	if (spa_writeable(spa)) {
3648		vdev_t *rvd = spa->spa_root_vdev;
3649
3650		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3651		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
3652		int svdcount = 0;
3653		int children = rvd->vdev_children;
3654		int c0 = spa_get_random(children);
3655
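		/*
		 * Starting from a random child, pick up to SPA_SYNC_MIN_VDEVS
		 * concrete, non-log top-level vdevs whose labels will receive
		 * the updated uberblock.
		 */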
3656		for (int c = 0; c < children; c++) {
3657			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
3658
3659			/* Stop when revisiting the first vdev */
3660			if (c > 0 && svd[0] == vd)
3661				break;
3662
3663			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
3664			    !vdev_is_concrete(vd))
3665				continue;
3666
3667			svd[svdcount++] = vd;
3668			if (svdcount == SPA_SYNC_MIN_VDEVS)
3669				break;
3670		}
3671		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
3672		if (error == 0)
3673			spa->spa_last_synced_guid = rvd->vdev_guid;
3674		spa_config_exit(spa, SCL_ALL, FTAG);
3675
3676		if (error != 0) {
3677			spa_load_failed(spa, "failed to write checkpointed "
3678			    "uberblock to the vdev labels [error=%d]", error);
3679			return (error);
3680		}
3681	}
3682
3683	return (0);
3684}
3685
3686static int
3687spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
3688    boolean_t *update_config_cache)
3689{
3690	int error;
3691
3692	/*
3693	 * Parse the config for pool, open and validate vdevs,
3694	 * select an uberblock, and use that uberblock to open
3695	 * the MOS.
3696	 */
3697	error = spa_ld_mos_init(spa, type);
3698	if (error != 0)
3699		return (error);
3700
3701	/*
3702	 * Retrieve the trusted config stored in the MOS and use it to create
3703	 * a new, exact version of the vdev tree, then reopen all vdevs.
3704	 */
3705	error = spa_ld_trusted_config(spa, type, B_FALSE);
3706	if (error == EAGAIN) {
3707		if (update_config_cache != NULL)
3708			*update_config_cache = B_TRUE;
3709
3710		/*
3711		 * Redo the loading process with the trusted config if it is
3712		 * too different from the untrusted config.
3713		 */
3714		spa_ld_prepare_for_reload(spa);
3715		spa_load_note(spa, "RELOADING");
3716		error = spa_ld_mos_init(spa, type);
3717		if (error != 0)
3718			return (error);
3719
3720		error = spa_ld_trusted_config(spa, type, B_TRUE);
3721		if (error != 0)
3722			return (error);
3723
3724	} else if (error != 0) {
3725		return (error);
3726	}
3727
3728	return (0);
3729}
3730
3731/*
3732 * Load an existing storage pool, using the config provided. This config
3733 * describes which vdevs are part of the pool and is later validated against
3734 * partial configs present in each vdev's label and an entire copy of the
3735 * config stored in the MOS.
3736 */
3737static int
3738spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
3739{
3740	int error = 0;
3741	boolean_t missing_feat_write = B_FALSE;
3742	boolean_t checkpoint_rewind =
3743	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3744	boolean_t update_config_cache = B_FALSE;
3745
3746	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3747	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3748
3749	spa_load_note(spa, "LOADING");
3750
3751	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
3752	if (error != 0)
3753		return (error);
3754
3755	/*
3756	 * If we are rewinding to the checkpoint then we need to repeat
3757	 * everything we've done so far in this function but this time
3758	 * selecting the checkpointed uberblock and using that to open
3759	 * the MOS.
3760	 */
3761	if (checkpoint_rewind) {
3762		/*
3763		 * If we are rewinding to the checkpoint, update the config
3764		 * cache anyway.
3765		 */
3766		update_config_cache = B_TRUE;
3767
3768		/*
3769		 * Extract the checkpointed uberblock from the current MOS
3770		 * and use this as the pool's uberblock from now on. If the
3771		 * pool is imported as writeable we also write the checkpoint
3772		 * uberblock to the labels, making the rewind permanent.
3773		 */
3774		error = spa_ld_checkpoint_rewind(spa);
3775		if (error != 0)
3776			return (error);
3777
3778		/*
3779		 * Redo the loading process again with the checkpointed
3780		 * uberblock.
3781		 */
3782		spa_ld_prepare_for_reload(spa);
3783		spa_load_note(spa, "LOADING checkpointed uberblock");
3784		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
3785		if (error != 0)
3786			return (error);
3787	}
3788
3789	/*
3790	 * Retrieve the checkpoint txg if the pool has a checkpoint.
3791	 */
3792	error = spa_ld_read_checkpoint_txg(spa);
3793	if (error != 0)
3794		return (error);
3795
3796	/*
3797	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
3798	 * from the pool and their contents were re-mapped to other vdevs. Note
3799	 * that everything that we read before this step must have been
3800	 * rewritten on concrete vdevs after the last device removal was
3801	 * initiated. Otherwise we could be reading from indirect vdevs before
3802	 * we have loaded their mappings.
3803	 */
3804	error = spa_ld_open_indirect_vdev_metadata(spa);
3805	if (error != 0)
3806		return (error);
3807
3808	/*
3809	 * Retrieve the full list of active features from the MOS and check if
3810	 * they are all supported.
3811	 */
3812	error = spa_ld_check_features(spa, &missing_feat_write);
3813	if (error != 0)
3814		return (error);
3815
3816	/*
3817	 * Load several special directories from the MOS needed by the dsl_pool
3818	 * layer.
3819	 */
3820	error = spa_ld_load_special_directories(spa);
3821	if (error != 0)
3822		return (error);
3823
3824	/*
3825	 * Retrieve pool properties from the MOS.
3826	 */
3827	error = spa_ld_get_props(spa);
3828	if (error != 0)
3829		return (error);
3830
3831	/*
3832	 * Retrieve the list of auxiliary devices - cache devices and spares -
3833	 * and open them.
3834	 */
3835	error = spa_ld_open_aux_vdevs(spa, type);
3836	if (error != 0)
3837		return (error);
3838
3839	/*
3840	 * Load the metadata for all vdevs. Also check if unopenable devices
3841	 * should be autoreplaced.
3842	 */
3843	error = spa_ld_load_vdev_metadata(spa);
3844	if (error != 0)
3845		return (error);
3846
3847	error = spa_ld_load_dedup_tables(spa);
3848	if (error != 0)
3849		return (error);
3850
3851	/*
3852	 * Verify the logs now to make sure we don't have any unexpected errors
3853	 * when we claim log blocks later.
3854	 */
3855	error = spa_ld_verify_logs(spa, type, ereport);
3856	if (error != 0)
3857		return (error);
3858
3859	if (missing_feat_write) {
3860		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
3861
3862		/*
3863		 * At this point, we know that we can open the pool in
3864		 * read-only mode but not read-write mode. We now have enough
3865		 * information and can return to userland.
3866		 */
3867		return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
3868		    ENOTSUP));
3869	}
3870
3871	/*
3872	 * Traverse the last txgs to make sure the pool was left in a safe
3873	 * state. When performing an extreme rewind, we verify the whole pool,
3874	 * which can take a very long time.
3875	 */
3876	error = spa_ld_verify_pool_data(spa);
3877	if (error != 0)
3878		return (error);
3879
3880	/*
3881	 * Calculate the deflated space for the pool. This must be done before
3882	 * we write anything to the pool because we'd need to update the space
3883	 * accounting using the deflated sizes.
3884	 */
3885	spa_update_dspace(spa);
3886
3887	/*
3888	 * We have now retrieved all the information we needed to open the
3889	 * pool. If we are importing the pool in read-write mode, a few
3890	 * additional steps must be performed to finish the import.
3891	 */
3892	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
3893	    spa->spa_load_max_txg == UINT64_MAX)) {
3894		uint64_t config_cache_txg = spa->spa_config_txg;
3895
3896		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
3897
3898		/*
3899		 * In case of a checkpoint rewind, log the original txg
3900		 * of the checkpointed uberblock.
3901		 */
3902		if (checkpoint_rewind) {
3903			spa_history_log_internal(spa, "checkpoint rewind",
3904			    NULL, "rewound state to txg=%llu",
3905			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
3906		}
3907
3908		/*
3909		 * Traverse the ZIL and claim all blocks.
3910		 */
3911		spa_ld_claim_log_blocks(spa);
3912
3913		/*
3914		 * Kick-off the syncing thread.
3915		 */
3916		spa->spa_sync_on = B_TRUE;
3917		txg_sync_start(spa->spa_dsl_pool);
3918
3919		/*
3920		 * Wait for all claims to sync.  We sync up to the highest
3921		 * claimed log block birth time so that claimed log blocks
3922		 * don't appear to be from the future.  spa_claim_max_txg
3923		 * will have been set for us by ZIL traversal operations
3924		 * performed above.
3925		 */
3926		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
3927
3928		/*
3929		 * Check if we need to request an update of the config. On the
3930		 * next sync, we would update the config stored in vdev labels
3931		 * and the cachefile (by default /etc/zfs/zpool.cache).
3932		 */
3933		spa_ld_check_for_config_update(spa, config_cache_txg,
3934		    update_config_cache);
3935
3936		/*
3937		 * Check all DTLs to see if anything needs resilvering.
3938		 */
3939		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
3940		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
3941			spa_async_request(spa, SPA_ASYNC_RESILVER);
3942
3943		/*
3944		 * Log the fact that we booted up (so that we can detect if
3945		 * we rebooted in the middle of an operation).
3946		 */
3947		spa_history_log_version(spa, "open");
3948
3949		/*
3950		 * Delete any inconsistent datasets.
3951		 */
3952		(void) dmu_objset_find(spa_name(spa),
3953		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
3954
3955		/*
3956		 * Clean up any stale temporary dataset userrefs.
3957		 */
3958		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
3959
3960		spa_restart_removal(spa);
3961
3962		spa_spawn_aux_threads(spa);
3963
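		/*
		 * Restart any vdev initialization that was previously in
		 * progress.
		 */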
3964		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3965		vdev_initialize_restart(spa->spa_root_vdev);
3966		spa_config_exit(spa, SCL_CONFIG, FTAG);
3967	}
3968
3969	spa_load_note(spa, "LOADED");
3970
3971	return (0);
3972}
3973
3974static int
3975spa_load_retry(spa_t *spa, spa_load_state_t state)
3976{
3977	int mode = spa->spa_mode;
3978
3979	spa_unload(spa);
3980	spa_deactivate(spa);
3981
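	/*
	 * Cap the next load attempt at one txg before the uberblock we just
	 * failed with, forcing selection of an earlier uberblock.
	 */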
3982	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
3983
3984	spa_activate(spa, mode);
3985	spa_async_suspend(spa);
3986
3987	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
3988	    (u_longlong_t)spa->spa_load_max_txg);
3989
3990	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
3991}
3992
3993/*
3994 * If spa_load() fails this function will try loading prior txg's. If
3995 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3996 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3997 * function will not rewind the pool and will return the same error as
3998 * spa_load().
3999 */
4000static int
4001spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
4002    int rewind_flags)
4003{
4004	nvlist_t *loadinfo = NULL;
4005	nvlist_t *config = NULL;
4006	int load_error, rewind_error;
4007	uint64_t safe_rewind_txg;
4008	uint64_t min_txg;
4009
4010	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
4011		spa->spa_load_max_txg = spa->spa_load_txg;
4012		spa_set_log_state(spa, SPA_LOG_CLEAR);
4013	} else {
4014		spa->spa_load_max_txg = max_request;
4015		if (max_request != UINT64_MAX)
4016			spa->spa_extreme_rewind = B_TRUE;
4017	}
4018
4019	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
4020	if (load_error == 0)
4021		return (0);
4022	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
4023		/*
4024		 * When attempting checkpoint-rewind on a pool with no
4025		 * checkpoint, we should not attempt to load uberblocks
4026		 * from previous txgs when spa_load fails.
4027		 */
4028		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
4029		return (load_error);
4030	}
4031
4032	if (spa->spa_root_vdev != NULL)
4033		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4034
4035	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
4036	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
4037
4038	if (rewind_flags & ZPOOL_NEVER_REWIND) {
4039		nvlist_free(config);
4040		return (load_error);
4041	}
4042
4043	if (state == SPA_LOAD_RECOVER) {
4044		/* Price of rolling back is discarding txgs, including log */
4045		spa_set_log_state(spa, SPA_LOG_CLEAR);
4046	} else {
4047		/*
4048		 * If we aren't rolling back save the load info from our first
4049		 * import attempt so that we can restore it after attempting
4050		 * to rewind.
4051		 */
4052		loadinfo = spa->spa_load_info;
4053		spa->spa_load_info = fnvlist_alloc();
4054	}
4055
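	/*
	 * Each retry caps the load at a progressively earlier txg.  Rewinding
	 * past the safe rewind window (TXG_DEFER_SIZE txgs before the last
	 * synced uberblock) requires an extreme rewind.
	 */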
4056	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
4057	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
4058	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
4059	    TXG_INITIAL : safe_rewind_txg;
4060
4061	/*
4062	 * Continue as long as we're finding errors, we're still within
4063	 * the acceptable rewind range, and we're still finding uberblocks
4064	 */
4065	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
4066	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
4067		if (spa->spa_load_max_txg < safe_rewind_txg)
4068			spa->spa_extreme_rewind = B_TRUE;
4069		rewind_error = spa_load_retry(spa, state);
4070	}
4071
4072	spa->spa_extreme_rewind = B_FALSE;
4073	spa->spa_load_max_txg = UINT64_MAX;
4074
4075	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
4076		spa_config_set(spa, config);
4077	else
4078		nvlist_free(config);
4079
4080	if (state == SPA_LOAD_RECOVER) {
4081		ASSERT3P(loadinfo, ==, NULL);
4082		return (rewind_error);
4083	} else {
4084		/* Store the rewind info as part of the initial load info */
4085		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
4086		    spa->spa_load_info);
4087
4088		/* Restore the initial load info */
4089		fnvlist_free(spa->spa_load_info);
4090		spa->spa_load_info = loadinfo;
4091
4092		return (load_error);
4093	}
4094}
4095
4096/*
4097 * Pool Open/Import
4098 *
4099 * The import case is identical to an open except that the configuration is sent
4100 * down from userland, instead of grabbed from the configuration cache.  For the
4101 * case of an open, the pool configuration will exist in the
4102 * POOL_STATE_UNINITIALIZED state.
4103 *
4104 * The stats information (gen/count/ustats) is used to gather vdev statistics at
4105 * The stats information (gen/count/ustats) is used to gather vdev statistics at
4106 * the same time we open the pool, without having to keep the spa_t around in
4107 * some ambiguous state.
4108static int
4109spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
4110    nvlist_t **config)
4111{
4112	spa_t *spa;
4113	spa_load_state_t state = SPA_LOAD_OPEN;
4114	int error;
4115	int locked = B_FALSE;
4116	int firstopen = B_FALSE;
4117
4118	*spapp = NULL;
4119
4120	/*
4121	 * As disgusting as this is, we need to support recursive calls to this
4122	 * function because dsl_dir_open() is called during spa_load(), and ends
4123	 * up calling spa_open() again.  The real fix is to figure out how to
4124	 * avoid dsl_dir_open() calling this in the first place.
4125	 */
4126	if (mutex_owner(&spa_namespace_lock) != curthread) {
4127		mutex_enter(&spa_namespace_lock);
4128		locked = B_TRUE;
4129	}
4130
4131	if ((spa = spa_lookup(pool)) == NULL) {
4132		if (locked)
4133			mutex_exit(&spa_namespace_lock);
4134		return (SET_ERROR(ENOENT));
4135	}
4136
4137	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
4138		zpool_load_policy_t policy;
4139
4140		firstopen = B_TRUE;
4141
4142		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
4143		    &policy);
4144		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
4145			state = SPA_LOAD_RECOVER;
4146
4147		spa_activate(spa, spa_mode_global);
4148
4149		if (state != SPA_LOAD_RECOVER)
4150			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4151		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4152
4153		zfs_dbgmsg("spa_open_common: opening %s", pool);
4154		error = spa_load_best(spa, state, policy.zlp_txg,
4155		    policy.zlp_rewind);
4156
4157		if (error == EBADF) {
4158			/*
4159			 * If vdev_validate() returns failure (indicated by
4160			 * EBADF), it means that one of the vdevs indicates that
4161			 * the pool has been exported or destroyed.  If this is
4162			 * the case, the config cache is out of sync and we
4163			 * should remove the pool from the namespace.
4164			 */
4165			spa_unload(spa);
4166			spa_deactivate(spa);
4167			spa_write_cachefile(spa, B_TRUE, B_TRUE);
4168			spa_remove(spa);
4169			if (locked)
4170				mutex_exit(&spa_namespace_lock);
4171			return (SET_ERROR(ENOENT));
4172		}
4173
4174		if (error) {
4175			/*
4176			 * We can't open the pool, but we still have useful
4177			 * information: the state of each vdev after the
4178			 * attempted vdev_open().  Return this to the user.
4179			 */
4180			if (config != NULL && spa->spa_config) {
4181				VERIFY(nvlist_dup(spa->spa_config, config,
4182				    KM_SLEEP) == 0);
4183				VERIFY(nvlist_add_nvlist(*config,
4184				    ZPOOL_CONFIG_LOAD_INFO,
4185				    spa->spa_load_info) == 0);
4186			}
4187			spa_unload(spa);
4188			spa_deactivate(spa);
4189			spa->spa_last_open_failed = error;
4190			if (locked)
4191				mutex_exit(&spa_namespace_lock);
4192			*spapp = NULL;
4193			return (error);
4194		}
4195	}
4196
4197	spa_open_ref(spa, tag);
4198
4199	if (config != NULL)
4200		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4201
4202	/*
4203	 * If we've recovered the pool, pass back any information we
4204	 * gathered while doing the load.
4205	 */
4206	if (state == SPA_LOAD_RECOVER) {
4207		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
4208		    spa->spa_load_info) == 0);
4209	}
4210
4211	if (locked) {
4212		spa->spa_last_open_failed = 0;
4213		spa->spa_last_ubsync_txg = 0;
4214		spa->spa_load_txg = 0;
4215		mutex_exit(&spa_namespace_lock);
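		/*
		 * On FreeBSD, create the zvol device nodes for this pool the
		 * first time it is successfully opened.
		 */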
4216#ifdef __FreeBSD__
4217#ifdef _KERNEL
4218		if (firstopen)
4219			zvol_create_minors(spa->spa_name);
4220#endif
4221#endif
4222	}
4223
4224	*spapp = spa;
4225
4226	return (0);
4227}
4228
4229int
4230spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
4231    nvlist_t **config)
4232{
4233	return (spa_open_common(name, spapp, tag, policy, config));
4234}
4235
4236int
4237spa_open(const char *name, spa_t **spapp, void *tag)
4238{
4239	return (spa_open_common(name, spapp, tag, NULL, NULL));
4240}
4241
4242/*
4243 * Lookup the given spa_t, incrementing the inject count in the process,
4244 * preventing it from being exported or destroyed.
4245 */
4246spa_t *
4247spa_inject_addref(char *name)
4248{
4249	spa_t *spa;
4250
4251	mutex_enter(&spa_namespace_lock);
4252	if ((spa = spa_lookup(name)) == NULL) {
4253		mutex_exit(&spa_namespace_lock);
4254		return (NULL);
4255	}
4256	spa->spa_inject_ref++;
4257	mutex_exit(&spa_namespace_lock);
4258
4259	return (spa);
4260}
4261
4262void
4263spa_inject_delref(spa_t *spa)
4264{
4265	mutex_enter(&spa_namespace_lock);
4266	spa->spa_inject_ref--;
4267	mutex_exit(&spa_namespace_lock);
4268}
4269
4270/*
4271 * Add spare device information to the nvlist.
4272 */
4273static void
4274spa_add_spares(spa_t *spa, nvlist_t *config)
4275{
4276	nvlist_t **spares;
4277	uint_t i, nspares;
4278	nvlist_t *nvroot;
4279	uint64_t guid;
4280	vdev_stat_t *vs;
4281	uint_t vsc;
4282	uint64_t pool;
4283
4284	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4285
4286	if (spa->spa_spares.sav_count == 0)
4287		return;
4288
4289	VERIFY(nvlist_lookup_nvlist(config,
4290	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4291	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4292	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4293	if (nspares != 0) {
4294		VERIFY(nvlist_add_nvlist_array(nvroot,
4295		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4296		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4297		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4298
4299		/*
4300		 * Go through and find any spares which have since been
4301		 * repurposed as an active spare.  If this is the case, update
4302		 * their status appropriately.
4303		 */
4304		for (i = 0; i < nspares; i++) {
4305			VERIFY(nvlist_lookup_uint64(spares[i],
4306			    ZPOOL_CONFIG_GUID, &guid) == 0);
4307			if (spa_spare_exists(guid, &pool, NULL) &&
4308			    pool != 0ULL) {
4309				VERIFY(nvlist_lookup_uint64_array(
4310				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
4311				    (uint64_t **)&vs, &vsc) == 0);
4312				vs->vs_state = VDEV_STATE_CANT_OPEN;
4313				vs->vs_aux = VDEV_AUX_SPARED;
4314			}
4315		}
4316	}
4317}
4318
4319/*
4320 * Add l2cache device information to the nvlist, including vdev stats.
4321 */
4322static void
4323spa_add_l2cache(spa_t *spa, nvlist_t *config)
4324{
4325	nvlist_t **l2cache;
4326	uint_t i, j, nl2cache;
4327	nvlist_t *nvroot;
4328	uint64_t guid;
4329	vdev_t *vd;
4330	vdev_stat_t *vs;
4331	uint_t vsc;
4332
4333	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4334
4335	if (spa->spa_l2cache.sav_count == 0)
4336		return;
4337
4338	VERIFY(nvlist_lookup_nvlist(config,
4339	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4340	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4341	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4342	if (nl2cache != 0) {
4343		VERIFY(nvlist_add_nvlist_array(nvroot,
4344		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4345		VERIFY(nvlist_lookup_nvlist_array(nvroot,
4346		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4347
4348		/*
4349		 * Update level 2 cache device stats.
4350		 */
4351
4352		for (i = 0; i < nl2cache; i++) {
4353			VERIFY(nvlist_lookup_uint64(l2cache[i],
4354			    ZPOOL_CONFIG_GUID, &guid) == 0);
4355
4356			vd = NULL;
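			/*
			 * Find the in-core l2cache vdev with a matching guid
			 * so its stats can be copied into this config entry.
			 */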
4357			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
4358				if (guid ==
4359				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
4360					vd = spa->spa_l2cache.sav_vdevs[j];
4361					break;
4362				}
4363			}
4364			ASSERT(vd != NULL);
4365
4366			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
4367			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
4368			    == 0);
4369			vdev_get_stats(vd, vs);
4370		}
4371	}
4372}
4373
4374static void
4375spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
4376{
4377	zap_cursor_t zc;
4378	zap_attribute_t za;
4379
4380	/* We may be unable to read features if pool is suspended. */
4381	if (spa_suspended(spa))
4382		return;
4383
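	/*
	 * Walk the for-read and for-write feature ZAP objects, recording each
	 * feature's reference count in the caller's nvlist.
	 */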
4384	if (spa->spa_feat_for_read_obj != 0) {
4385		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4386		    spa->spa_feat_for_read_obj);
4387		    zap_cursor_retrieve(&zc, &za) == 0;
4388		    zap_cursor_advance(&zc)) {
4389			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4390			    za.za_num_integers == 1);
4391			VERIFY0(nvlist_add_uint64(features, za.za_name,
4392			    za.za_first_integer));
4393		}
4394		zap_cursor_fini(&zc);
4395	}
4396
4397	if (spa->spa_feat_for_write_obj != 0) {
4398		for (zap_cursor_init(&zc, spa->spa_meta_objset,
4399		    spa->spa_feat_for_write_obj);
4400		    zap_cursor_retrieve(&zc, &za) == 0;
4401		    zap_cursor_advance(&zc)) {
4402			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4403			    za.za_num_integers == 1);
4404			VERIFY0(nvlist_add_uint64(features, za.za_name,
4405			    za.za_first_integer));
4406		}
4407		zap_cursor_fini(&zc);
4408	}
4409}
4410
4411static void
4412spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
4413{
4414	int i;
4415
4416	for (i = 0; i < SPA_FEATURES; i++) {
4417		zfeature_info_t feature = spa_feature_table[i];
4418		uint64_t refcount;
4419
4420		if (feature_get_refcount(spa, &feature, &refcount) != 0)
4421			continue;
4422
4423		VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
4424	}
4425}
4426
4427/*
4428 * Store a list of pool features and their reference counts in the
4429 * config.
4430 *
4431 * The first time this is called on a spa, allocate a new nvlist, fetch
4432 * the pool features and reference counts from disk, then save the list
4433 * in the spa. In subsequent calls on the same spa use the saved nvlist
4434 * and refresh its values from the cached reference counts.  This
4435 * ensures we don't block here on I/O on a suspended pool so 'zpool
4436 * clear' can resume the pool.
4437 */
4438static void
4439spa_add_feature_stats(spa_t *spa, nvlist_t *config)
4440{
4441	nvlist_t *features;
4442
4443	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4444
4445	mutex_enter(&spa->spa_feat_stats_lock);
4446	features = spa->spa_feat_stats;
4447
4448	if (features != NULL) {
4449		spa_feature_stats_from_cache(spa, features);
4450	} else {
4451		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
4452		spa->spa_feat_stats = features;
4453		spa_feature_stats_from_disk(spa, features);
4454	}
4455
4456	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
4457	    features));
4458
4459	mutex_exit(&spa->spa_feat_stats_lock);
4460}
4461
4462int
4463spa_get_stats(const char *name, nvlist_t **config,
4464    char *altroot, size_t buflen)
4465{
4466	int error;
4467	spa_t *spa;
4468
4469	*config = NULL;
4470	error = spa_open_common(name, &spa, FTAG, NULL, config);
4471
4472	if (spa != NULL) {
4473		/*
4474		 * This still leaves a window of inconsistency where the spares
4475		 * or l2cache devices could change and the config would be
4476		 * self-inconsistent.
4477		 */
4478		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4479
4480		if (*config != NULL) {
4481			uint64_t loadtimes[2];
4482
4483			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4484			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4485			VERIFY(nvlist_add_uint64_array(*config,
4486			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4487
4488			VERIFY(nvlist_add_uint64(*config,
4489			    ZPOOL_CONFIG_ERRCOUNT,
4490			    spa_get_errlog_size(spa)) == 0);
4491
4492			if (spa_suspended(spa))
4493				VERIFY(nvlist_add_uint64(*config,
4494				    ZPOOL_CONFIG_SUSPENDED,
4495				    spa->spa_failmode) == 0);
4496
4497			spa_add_spares(spa, *config);
4498			spa_add_l2cache(spa, *config);
4499			spa_add_feature_stats(spa, *config);
4500		}
4501	}
4502
4503	/*
4504	 * We want to get the alternate root even for faulted pools, so we cheat
4505	 * and call spa_lookup() directly.
4506	 */
4507	if (altroot) {
4508		if (spa == NULL) {
4509			mutex_enter(&spa_namespace_lock);
4510			spa = spa_lookup(name);
4511			if (spa)
4512				spa_altroot(spa, altroot, buflen);
4513			else
4514				altroot[0] = '\0';
4515			spa = NULL;
4516			mutex_exit(&spa_namespace_lock);
4517		} else {
4518			spa_altroot(spa, altroot, buflen);
4519		}
4520	}
4521
4522	if (spa != NULL) {
4523		spa_config_exit(spa, SCL_CONFIG, FTAG);
4524		spa_close(spa, FTAG);
4525	}
4526
4527	return (error);
4528}
4529
4530/*
4531 * Validate that the auxiliary device array is well formed.  We must have an
4532 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
4533 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4534 * specified, as long as they are well-formed.
4535 */
4536static int
4537spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4538    spa_aux_vdev_t *sav, const char *config, uint64_t version,
4539    vdev_labeltype_t label)
4540{
4541	nvlist_t **dev;
4542	uint_t i, ndev;
4543	vdev_t *vd;
4544	int error;
4545
4546	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4547
4548	/*
4549	 * It's acceptable to have no devs specified.
4550	 */
4551	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4552		return (0);
4553
4554	if (ndev == 0)
4555		return (SET_ERROR(EINVAL));
4556
4557	/*
4558	 * Make sure the pool is formatted with a version that supports this
4559	 * device type.
4560	 */
4561	if (spa_version(spa) < version)
4562		return (SET_ERROR(ENOTSUP));
4563
4564	/*
4565	 * Set the pending device list so we correctly handle device in-use
4566	 * checking.
4567	 */
4568	sav->sav_pending = dev;
4569	sav->sav_npending = ndev;
4570
4571	for (i = 0; i < ndev; i++) {
4572		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4573		    mode)) != 0)
4574			goto out;
4575
4576		if (!vd->vdev_ops->vdev_op_leaf) {
4577			vdev_free(vd);
4578			error = SET_ERROR(EINVAL);
4579			goto out;
4580		}
4581
4582		/*
4583		 * The L2ARC currently only supports disk devices in
4584		 * kernel context.  For user-level testing, we allow it.
4585		 */
4586#ifdef _KERNEL
4587		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
4588		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
4589			error = SET_ERROR(ENOTBLK);
4590			vdev_free(vd);
4591			goto out;
4592		}
4593#endif
4594		vd->vdev_top = vd;
4595
4596		if ((error = vdev_open(vd)) == 0 &&
4597		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
4598			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4599			    vd->vdev_guid) == 0);
4600		}
4601
4602		vdev_free(vd);
4603
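		/*
		 * On spare or l2cache import, tolerate devices that fail to
		 * open or label; otherwise propagate the error.
		 */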
4604		if (error &&
4605		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4606			goto out;
4607		else
4608			error = 0;
4609	}
4610
4611out:
4612	sav->sav_pending = NULL;
4613	sav->sav_npending = 0;
4614	return (error);
4615}
4616
4617static int
4618spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4619{
4620	int error;
4621
4622	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4623
4624	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4625	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4626	    VDEV_LABEL_SPARE)) != 0) {
4627		return (error);
4628	}
4629
4630	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4631	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4632	    VDEV_LABEL_L2CACHE));
4633}
4634
4635static void
4636spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4637    const char *config)
4638{
4639	int i;
4640
4641	if (sav->sav_config != NULL) {
4642		nvlist_t **olddevs;
4643		uint_t oldndevs;
4644		nvlist_t **newdevs;
4645
4646		/*
4647		 * Generate a new dev list by concatenating with the
4648		 * current dev list.
4649		 */
4650		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4651		    &olddevs, &oldndevs) == 0);
4652
4653		newdevs = kmem_alloc(sizeof (void *) *
4654		    (ndevs + oldndevs), KM_SLEEP);
4655		for (i = 0; i < oldndevs; i++)
4656			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4657			    KM_SLEEP) == 0);
4658		for (i = 0; i < ndevs; i++)
4659			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
4660			    KM_SLEEP) == 0);
4661
4662		VERIFY(nvlist_remove(sav->sav_config, config,
4663		    DATA_TYPE_NVLIST_ARRAY) == 0);
4664
4665		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
4666		    config, newdevs, ndevs + oldndevs) == 0);
4667		for (i = 0; i < oldndevs + ndevs; i++)
4668			nvlist_free(newdevs[i]);
4669		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
4670	} else {
4671		/*
4672		 * Generate a new dev list.
4673		 */
4674		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
4675		    KM_SLEEP) == 0);
4676		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
4677		    devs, ndevs) == 0);
4678	}
4679}
4680
4681/*
4682 * Stop and drop level 2 ARC devices
4683 */
4684void
4685spa_l2cache_drop(spa_t *spa)
4686{
4687	vdev_t *vd;
4688	int i;
4689	spa_aux_vdev_t *sav = &spa->spa_l2cache;
4690
4691	for (i = 0; i < sav->sav_count; i++) {
4692		uint64_t pool;
4693
4694		vd = sav->sav_vdevs[i];
4695		ASSERT(vd != NULL);
4696
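		/*
		 * Only remove the device from the L2ARC if it is still
		 * associated with a pool and the L2ARC knows about it.
		 */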
4697		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
4698		    pool != 0ULL && l2arc_vdev_present(vd))
4699			l2arc_remove_vdev(vd);
4700	}
4701}
4702
4703/*
4704 * Pool Creation
4705 */
4706int
4707spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
4708    nvlist_t *zplprops)
4709{
4710	spa_t *spa;
4711	char *altroot = NULL;
4712	vdev_t *rvd;
4713	dsl_pool_t *dp;
4714	dmu_tx_t *tx;
4715	int error = 0;
4716	uint64_t txg = TXG_INITIAL;
4717	nvlist_t **spares, **l2cache;
4718	uint_t nspares, nl2cache;
4719	uint64_t version, obj;
4720	boolean_t has_features;
4721	char *poolname;
4722	nvlist_t *nvl;
4723
4724	if (nvlist_lookup_string(props,
4725	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
4726		poolname = (char *)pool;
4727
4728	/*
4729	 * If this pool already exists, return failure.
4730	 */
4731	mutex_enter(&spa_namespace_lock);
4732	if (spa_lookup(poolname) != NULL) {
4733		mutex_exit(&spa_namespace_lock);
4734		return (SET_ERROR(EEXIST));
4735	}
4736
4737	/*
4738	 * Allocate a new spa_t structure.
4739	 */
4740	nvl = fnvlist_alloc();
4741	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
4742	(void) nvlist_lookup_string(props,
4743	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4744	spa = spa_add(poolname, nvl, altroot);
4745	fnvlist_free(nvl);
4746	spa_activate(spa, spa_mode_global);
4747
4748	if (props && (error = spa_prop_validate(spa, props))) {
4749		spa_deactivate(spa);
4750		spa_remove(spa);
4751		mutex_exit(&spa_namespace_lock);
4752		return (error);
4753	}
4754
4755	/*
4756	 * Temporary pool names should never be written to disk.
4757	 */
4758	if (poolname != pool)
4759		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
4760
4761	has_features = B_FALSE;
4762	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
4763	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
4764		if (zpool_prop_feature(nvpair_name(elem)))
4765			has_features = B_TRUE;
4766	}
4767
4768	if (has_features || nvlist_lookup_uint64(props,
4769	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
4770		version = SPA_VERSION;
4771	}
4772	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
4773
4774	spa->spa_first_txg = txg;
4775	spa->spa_uberblock.ub_txg = txg - 1;
4776	spa->spa_uberblock.ub_version = version;
4777	spa->spa_ubsync = spa->spa_uberblock;
4778	spa->spa_load_state = SPA_LOAD_CREATE;
4779	spa->spa_removing_phys.sr_state = DSS_NONE;
4780	spa->spa_removing_phys.sr_removing_vdev = -1;
4781	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
4782	spa->spa_indirect_vdevs_loaded = B_TRUE;
4783
4784	/*
4785	 * Create "The Godfather" zio to hold all async IOs
4786	 */
4787	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
4788	    KM_SLEEP);
4789	for (int i = 0; i < max_ncpus; i++) {
4790		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
4791		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4792		    ZIO_FLAG_GODFATHER);
4793	}
4794
4795	/*
4796	 * Create the root vdev.
4797	 */
4798	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4799
4800	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
4801
4802	ASSERT(error != 0 || rvd != NULL);
4803	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
4804
4805	if (error == 0 && !zfs_allocatable_devs(nvroot))
4806		error = SET_ERROR(EINVAL);
4807
4808	if (error == 0 &&
4809	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
4810	    (error = spa_validate_aux(spa, nvroot, txg,
4811	    VDEV_ALLOC_ADD)) == 0) {
4812		for (int c = 0; c < rvd->vdev_children; c++) {
4813			vdev_ashift_optimize(rvd->vdev_child[c]);
4814			vdev_metaslab_set_size(rvd->vdev_child[c]);
4815			vdev_expand(rvd->vdev_child[c], txg);
4816		}
4817	}
4818
4819	spa_config_exit(spa, SCL_ALL, FTAG);
4820
4821	if (error != 0) {
4822		spa_unload(spa);
4823		spa_deactivate(spa);
4824		spa_remove(spa);
4825		mutex_exit(&spa_namespace_lock);
4826		return (error);
4827	}
4828
4829	/*
4830	 * Get the list of spares, if specified.
4831	 */
4832	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4833	    &spares, &nspares) == 0) {
4834		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
4835		    KM_SLEEP) == 0);
4836		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4837		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4838		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4839		spa_load_spares(spa);
4840		spa_config_exit(spa, SCL_ALL, FTAG);
4841		spa->spa_spares.sav_sync = B_TRUE;
4842	}
4843
4844	/*
4845	 * Get the list of level 2 cache devices, if specified.
4846	 */
4847	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4848	    &l2cache, &nl2cache) == 0) {
4849		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4850		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4851		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4852		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4853		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4854		spa_load_l2cache(spa);
4855		spa_config_exit(spa, SCL_ALL, FTAG);
4856		spa->spa_l2cache.sav_sync = B_TRUE;
4857	}
4858
4859	spa->spa_is_initializing = B_TRUE;
4860	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
4861	spa->spa_meta_objset = dp->dp_meta_objset;
4862	spa->spa_is_initializing = B_FALSE;
4863
4864	/*
4865	 * Create DDTs (dedup tables).
4866	 */
4867	ddt_create(spa);
4868
4869	spa_update_dspace(spa);
4870
4871	tx = dmu_tx_create_assigned(dp, txg);
4872
4873	/*
4874	 * Create the pool config object.
4875	 */
4876	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
4877	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
4878	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
4879
4880	if (zap_add(spa->spa_meta_objset,
4881	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
4882	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
4883		cmn_err(CE_PANIC, "failed to add pool config");
4884	}
4885
4886	if (spa_version(spa) >= SPA_VERSION_FEATURES)
4887		spa_feature_create_zap_objects(spa, tx);
4888
4889	if (zap_add(spa->spa_meta_objset,
4890	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
4891	    sizeof (uint64_t), 1, &version, tx) != 0) {
4892		cmn_err(CE_PANIC, "failed to add pool version");
4893	}
4894
4895	/* Newly created pools with the right version are always deflated. */
4896	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
4897		spa->spa_deflate = TRUE;
4898		if (zap_add(spa->spa_meta_objset,
4899		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4900		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
4901			cmn_err(CE_PANIC, "failed to add deflate");
4902		}
4903	}
4904
4905	/*
4906	 * Create the deferred-free bpobj.  Turn off compression
4907	 * because sync-to-convergence takes longer if the blocksize
4908	 * keeps changing.
4909	 */
4910	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
4911	dmu_object_set_compress(spa->spa_meta_objset, obj,
4912	    ZIO_COMPRESS_OFF, tx);
4913	if (zap_add(spa->spa_meta_objset,
4914	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
4915	    sizeof (uint64_t), 1, &obj, tx) != 0) {
4916		cmn_err(CE_PANIC, "failed to add bpobj");
4917	}
4918	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
4919	    spa->spa_meta_objset, obj));
4920
4921	/*
4922	 * Create the pool's history object.
4923	 */
4924	if (version >= SPA_VERSION_ZPOOL_HISTORY)
4925		spa_history_create_obj(spa, tx);
4926
4927	/*
4928	 * Generate some random noise for salted checksums to operate on.
4929	 */
4930	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4931	    sizeof (spa->spa_cksum_salt.zcs_bytes));
4932
4933	/*
4934	 * Set pool properties.
4935	 */
4936	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
4937	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4938	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
4939	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
4940
4941	if (props != NULL) {
4942		spa_configfile_set(spa, props, B_FALSE);
4943		spa_sync_props(props, tx);
4944	}
4945
4946	dmu_tx_commit(tx);
4947
4948	spa->spa_sync_on = B_TRUE;
4949	txg_sync_start(spa->spa_dsl_pool);
4950
4951	/*
4952	 * We explicitly wait for the first transaction to complete so that our
4953	 * bean counters are appropriately updated.
4954	 */
4955	txg_wait_synced(spa->spa_dsl_pool, txg);
4956
4957	spa_spawn_aux_threads(spa);
4958
4959	spa_write_cachefile(spa, B_FALSE, B_TRUE);
4960	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
4961
4962	spa_history_log_version(spa, "create");
4963
4964	/*
4965	 * Don't count references from objsets that are already closed
4966	 * and are making their way through the eviction process.
4967	 */
4968	spa_evicting_os_wait(spa);
4969	spa->spa_minref = refcount_count(&spa->spa_refcount);
4970	spa->spa_load_state = SPA_LOAD_NONE;
4971
4972	mutex_exit(&spa_namespace_lock);
4973
4974	return (0);
4975}
4976
4977#ifdef _KERNEL
4978#ifdef illumos
4979/*
4980 * Get the root pool information from the root disk, then import the root pool
4981 * during system boot.
4982 */
4983extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
4984
4985static nvlist_t *
4986spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
4987{
4988	nvlist_t *config;
4989	nvlist_t *nvtop, *nvroot;
4990	uint64_t pgid;
4991
4992	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
4993		return (NULL);
4994
4995	/*
4996	 * Add this top-level vdev to the child array.
4997	 */
4998	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4999	    &nvtop) == 0);
5000	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5001	    &pgid) == 0);
5002	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
5003
5004	/*
5005	 * Put this pool's top-level vdevs into a root vdev.
5006	 */
5007	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5008	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
5009	    VDEV_TYPE_ROOT) == 0);
5010	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
5011	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
5012	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
5013	    &nvtop, 1) == 0);
5014
5015	/*
5016	 * Replace the existing vdev_tree with the new root vdev in
5017	 * this pool's configuration (remove the old, add the new).
5018	 */
5019	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
5020	nvlist_free(nvroot);
5021	return (config);
5022}
5023
5024/*
5025 * Walk the vdev tree and see if we can find a device with "better"
5026 * configuration. A configuration is "better" if the label on that
5027 * device has a more recent txg.
5028 */
5029static void
5030spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
5031{
5032	for (int c = 0; c < vd->vdev_children; c++)
5033		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
5034
5035	if (vd->vdev_ops->vdev_op_leaf) {
5036		nvlist_t *label;
5037		uint64_t label_txg;
5038
5039		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
5040		    &label) != 0)
5041			return;
5042
5043		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
5044		    &label_txg) == 0);
5045
5046		/*
5047		 * Do we have a better boot device?
5048		 */
5049		if (label_txg > *txg) {
5050			*txg = label_txg;
5051			*avd = vd;
5052		}
5053		nvlist_free(label);
5054	}
5055}
5056
5057/*
5058 * Import a root pool.
5059 *
5060 * For x86, devpath_list will consist of the devid and/or physpath name of
5061 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
5062 * The GRUB "findroot" command will return the vdev we should boot from.
5063 *
5064 * For SPARC, devpath_list consists of the physpath name of the booting device,
5065 * no matter whether the root pool is a single-device pool or a mirrored pool.
5066 * e.g.
5067 *	"/pci@1f,0/ide@d/disk@0,0:a"
5068 */
5069int
5070spa_import_rootpool(char *devpath, char *devid)
5071{
5072	spa_t *spa;
5073	vdev_t *rvd, *bvd, *avd = NULL;
5074	nvlist_t *config, *nvtop;
5075	uint64_t guid, txg;
5076	char *pname;
5077	int error;
5078
5079	/*
5080	 * Read the label from the boot device and generate a configuration.
5081	 */
5082	config = spa_generate_rootconf(devpath, devid, &guid);
5083#if defined(_OBP) && defined(_KERNEL)
5084	if (config == NULL) {
5085		if (strstr(devpath, "/iscsi/ssd") != NULL) {
5086			/* iscsi boot */
5087			get_iscsi_bootpath_phy(devpath);
5088			config = spa_generate_rootconf(devpath, devid, &guid);
5089		}
5090	}
5091#endif
5092	if (config == NULL) {
5093		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
5094		    devpath);
5095		return (SET_ERROR(EIO));
5096	}
5097
5098	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
5099	    &pname) == 0);
5100	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
5101
5102	mutex_enter(&spa_namespace_lock);
5103	if ((spa = spa_lookup(pname)) != NULL) {
5104		/*
5105		 * Remove the existing root pool from the namespace so that we
5106		 * can replace it with the correct config we just read in.
5107		 */
5108		spa_remove(spa);
5109	}
5110
5111	spa = spa_add(pname, config, NULL);
5112	spa->spa_is_root = B_TRUE;
5113	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
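	/*
	 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
	 * via spa_version().
	 */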
5114	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
5115	    &spa->spa_ubsync.ub_version) != 0)
5116		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
5117
5118	/*
5119	 * Build up a vdev tree based on the boot device's label config.
5120	 */
5121	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5122	    &nvtop) == 0);
5123	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5124	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
5125	    VDEV_ALLOC_ROOTPOOL);
5126	spa_config_exit(spa, SCL_ALL, FTAG);
5127	if (error) {
5128		mutex_exit(&spa_namespace_lock);
5129		nvlist_free(config);
5130		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
5131		    pname);
5132		return (error);
5133	}
5134
5135	/*
5136	 * Get the boot vdev.
5137	 */
5138	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
5139		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
5140		    (u_longlong_t)guid);
5141		error = SET_ERROR(ENOENT);
5142		goto out;
5143	}
5144
5145	/*
5146	 * Determine if there is a better boot device.
5147	 */
5148	avd = bvd;
5149	spa_alt_rootvdev(rvd, &avd, &txg);
5150	if (avd != bvd) {
5151		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
5152		    "try booting from '%s'", avd->vdev_path);
5153		error = SET_ERROR(EINVAL);
5154		goto out;
5155	}
5156
5157	/*
5158	 * If the boot device is part of a spare vdev then ensure that
5159	 * we're booting off the active spare.
5160	 */
5161	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
5162	    !bvd->vdev_isspare) {
5163		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
5164		    "try booting from '%s'",
5165		    bvd->vdev_parent->
5166		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
5167		error = SET_ERROR(EINVAL);
5168		goto out;
5169	}
5170
5171	error = 0;
5172out:
5173	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5174	vdev_free(rvd);
5175	spa_config_exit(spa, SCL_ALL, FTAG);
5176	mutex_exit(&spa_namespace_lock);
5177
5178	nvlist_free(config);
5179	return (error);
5180}
5181
5182#else	/* !illumos */
5183
5184extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
5185    uint64_t *count);
5186
5187static nvlist_t *
5188spa_generate_rootconf(const char *name)
5189{
5190	nvlist_t **configs, **tops;
5191	nvlist_t *config;
5192	nvlist_t *best_cfg, *nvtop, *nvroot;
5193	uint64_t *holes;
5194	uint64_t best_txg;
5195	uint64_t nchildren;
5196	uint64_t pgid;
5197	uint64_t count;
5198	uint64_t i;
5199	uint_t   nholes;
5200
5201	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
5202		return (NULL);
5203
5204	ASSERT3U(count, !=, 0);
5205	best_txg = 0;
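	/* The label with the highest txg holds the most recent configuration. */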
5206	for (i = 0; i < count; i++) {
5207		uint64_t txg;
5208
5209		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
5210		    &txg) == 0);
5211		if (txg > best_txg) {
5212			best_txg = txg;
5213			best_cfg = configs[i];
5214		}
5215	}
5216
5217	nchildren = 1;
5218	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
5219	holes = NULL;
5220	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
5221	    &holes, &nholes);
5222
5223	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
5224	for (i = 0; i < nchildren; i++) {
5225		if (i >= count)
5226			break;
5227		if (configs[i] == NULL)
5228			continue;
5229		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
5230		    &nvtop) == 0);
5231		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
5232	}
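	/*
	 * Recorded holes without a config get explicit hole vdev entries;
	 * any remaining children with no label found are marked as missing.
	 */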
5233	for (i = 0; holes != NULL && i < nholes; i++) {
5234		if (i >= nchildren)
5235			continue;
5236		if (tops[holes[i]] != NULL)
5237			continue;
5238		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
5239		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
5240		    VDEV_TYPE_HOLE) == 0);
5241		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
5242		    holes[i]) == 0);
5243		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
5244		    0) == 0);
5245	}
5246	for (i = 0; i < nchildren; i++) {
5247		if (tops[i] != NULL)
5248			continue;
5249		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
5250		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
5251		    VDEV_TYPE_MISSING) == 0);
5252		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
5253		    i) == 0);
5254		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
5255		    0) == 0);
5256	}
5257
5258	/*
5259	 * Create pool config based on the best vdev config.
5260	 */
5261	nvlist_dup(best_cfg, &config, KM_SLEEP);
5262
5263	/*
5264	 * Put this pool's top-level vdevs into a root vdev.
5265	 */
5266	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5267	    &pgid) == 0);
5268	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5269	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
5270	    VDEV_TYPE_ROOT) == 0);
5271	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
5272	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
5273	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
5274	    tops, nchildren) == 0);
5275
5276	/*
5277	 * Replace the existing vdev_tree with the new root vdev in
5278	 * this pool's configuration (remove the old, add the new).
5279	 */
5280	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
5281
5282	/*
5283	 * Drop vdev config elements that should not be present at pool level.
5284	 */
5285	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
5286	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
5287
5288	for (i = 0; i < count; i++)
5289		nvlist_free(configs[i]);
5290	kmem_free(configs, count * sizeof(void *));
5291	for (i = 0; i < nchildren; i++)
5292		nvlist_free(tops[i]);
5293	kmem_free(tops, nchildren * sizeof(void *));
5294	nvlist_free(nvroot);
5295	return (config);
5296}
5297
5298int
5299spa_import_rootpool(const char *name)
5300{
5301	spa_t *spa;
5302	vdev_t *rvd, *bvd, *avd = NULL;
5303	nvlist_t *config, *nvtop;
5304	uint64_t txg;
5305	char *pname;
5306	int error;
5307
5308	/*
5309	 * Read the label from the boot device and generate a configuration.
5310	 */
5311	config = spa_generate_rootconf(name);
5312
5313	mutex_enter(&spa_namespace_lock);
5314	if (config != NULL) {
5315		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
5316		    &pname) == 0 && strcmp(name, pname) == 0);
5317		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
5318		    == 0);
5319
5320		if ((spa = spa_lookup(pname)) != NULL) {
5321			/*
5322			 * The pool could already be imported,
5323			 * e.g., after reboot -r.
5324			 */
5325			if (spa->spa_state == POOL_STATE_ACTIVE) {
5326				mutex_exit(&spa_namespace_lock);
5327				nvlist_free(config);
5328				return (0);
5329			}
5330
5331			/*
5332			 * Remove the existing root pool from the namespace so
5333			 * that we can replace it with the correct config
5334			 * we just read in.
5335			 */
5336			spa_remove(spa);
5337		}
5338		spa = spa_add(pname, config, NULL);
5339
5340		/*
5341		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
5342		 * via spa_version().
5343		 */
5344		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
5345		    &spa->spa_ubsync.ub_version) != 0)
5346			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
5347	} else if ((spa = spa_lookup(name)) == NULL) {
5348		mutex_exit(&spa_namespace_lock);
5349		nvlist_free(config);
5350		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
5351		    name);
5352		return (EIO);
5353	} else {
5354		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
5355	}
5356	spa->spa_is_root = B_TRUE;
5357	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
5358
5359	/*
5360	 * Build up a vdev tree based on the boot device's label config.
5361	 */
5362	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5363	    &nvtop) == 0);
5364	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5365	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
5366	    VDEV_ALLOC_ROOTPOOL);
5367	spa_config_exit(spa, SCL_ALL, FTAG);
5368	if (error) {
5369		mutex_exit(&spa_namespace_lock);
5370		nvlist_free(config);
5371		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
5372		    pname);
5373		return (error);
5374	}
5375
5376	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5377	vdev_free(rvd);
5378	spa_config_exit(spa, SCL_ALL, FTAG);
5379	mutex_exit(&spa_namespace_lock);
5380
5381	nvlist_free(config);
5382	return (0);
5383}
5384
5385#endif	/* illumos */
5386#endif	/* _KERNEL */
5387
5388/*
5389 * Import a non-root pool into the system.
5390 */
5391int
5392spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
5393{
5394	spa_t *spa;
5395	char *altroot = NULL;
5396	spa_load_state_t state = SPA_LOAD_IMPORT;
5397	zpool_load_policy_t policy;
5398	uint64_t mode = spa_mode_global;
5399	uint64_t readonly = B_FALSE;
5400	int error;
5401	nvlist_t *nvroot;
5402	nvlist_t **spares, **l2cache;
5403	uint_t nspares, nl2cache;
5404
5405	/*
5406	 * If a pool with this name exists, return failure.
5407	 */
5408	mutex_enter(&spa_namespace_lock);
5409	if (spa_lookup(pool) != NULL) {
5410		mutex_exit(&spa_namespace_lock);
5411		return (SET_ERROR(EEXIST));
5412	}
5413
5414	/*
5415	 * Create and initialize the spa structure.
5416	 */
5417	(void) nvlist_lookup_string(props,
5418	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5419	(void) nvlist_lookup_uint64(props,
5420	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
5421	if (readonly)
5422		mode = FREAD;
5423	spa = spa_add(pool, config, altroot);
5424	spa->spa_import_flags = flags;
5425
5426	/*
5427	 * Verbatim import - Take a pool and insert it into the namespace
5428	 * as if it had been loaded at boot.
5429	 */
5430	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
5431		if (props != NULL)
5432			spa_configfile_set(spa, props, B_FALSE);
5433
5434		spa_write_cachefile(spa, B_FALSE, B_TRUE);
5435		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5436		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
5437		mutex_exit(&spa_namespace_lock);
5438		return (0);
5439	}
5440
5441	spa_activate(spa, mode);
5442
5443	/*
5444	 * Don't start async tasks until we know everything is healthy.
5445	 */
5446	spa_async_suspend(spa);
5447
5448	zpool_get_load_policy(config, &policy);
5449	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5450		state = SPA_LOAD_RECOVER;
5451
5452	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
5453
5454	if (state != SPA_LOAD_RECOVER) {
5455		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5456		zfs_dbgmsg("spa_import: importing %s", pool);
5457	} else {
5458		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
5459		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
5460	}
5461	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
5462
5463	/*
5464	 * Propagate anything learned while loading the pool and pass it
5465	 * back to the caller (e.g. rewind info, missing devices, etc).
5466	 */
5467	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5468	    spa->spa_load_info) == 0);
5469
5470	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5471	/*
5472	 * Toss any existing sparelist, as it doesn't have any validity
5473	 * anymore, and conflicts with spa_has_spare().
5474	 */
5475	if (spa->spa_spares.sav_config) {
5476		nvlist_free(spa->spa_spares.sav_config);
5477		spa->spa_spares.sav_config = NULL;
5478		spa_load_spares(spa);
5479	}
5480	if (spa->spa_l2cache.sav_config) {
5481		nvlist_free(spa->spa_l2cache.sav_config);
5482		spa->spa_l2cache.sav_config = NULL;
5483		spa_load_l2cache(spa);
5484	}
5485
5486	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5487	    &nvroot) == 0);
5488	if (error == 0)
5489		error = spa_validate_aux(spa, nvroot, -1ULL,
5490		    VDEV_ALLOC_SPARE);
5491	if (error == 0)
5492		error = spa_validate_aux(spa, nvroot, -1ULL,
5493		    VDEV_ALLOC_L2CACHE);
5494	spa_config_exit(spa, SCL_ALL, FTAG);
5495
5496	if (props != NULL)
5497		spa_configfile_set(spa, props, B_FALSE);
5498
5499	if (error != 0 || (props && spa_writeable(spa) &&
5500	    (error = spa_prop_set(spa, props)))) {
5501		spa_unload(spa);
5502		spa_deactivate(spa);
5503		spa_remove(spa);
5504		mutex_exit(&spa_namespace_lock);
5505		return (error);
5506	}
5507
5508	spa_async_resume(spa);
5509
5510	/*
5511	 * Override any spares and level 2 cache devices as specified by
5512	 * the user, as these may have correct device names/devids, etc.
5513	 */
5514	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5515	    &spares, &nspares) == 0) {
5516		if (spa->spa_spares.sav_config)
5517			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
5518			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
5519		else
5520			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
5521			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5522		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5523		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5524		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5525		spa_load_spares(spa);
5526		spa_config_exit(spa, SCL_ALL, FTAG);
5527		spa->spa_spares.sav_sync = B_TRUE;
5528	}
5529	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5530	    &l2cache, &nl2cache) == 0) {
5531		if (spa->spa_l2cache.sav_config)
5532			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
5533			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
5534		else
5535			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5536			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5537		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5538		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5539		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5540		spa_load_l2cache(spa);
5541		spa_config_exit(spa, SCL_ALL, FTAG);
5542		spa->spa_l2cache.sav_sync = B_TRUE;
5543	}
5544
5545	/*
5546	 * Check for any removed devices.
5547	 */
5548	if (spa->spa_autoreplace) {
5549		spa_aux_check_removed(&spa->spa_spares);
5550		spa_aux_check_removed(&spa->spa_l2cache);
5551	}
5552
5553	if (spa_writeable(spa)) {
5554		/*
5555		 * Update the config cache to include the newly-imported pool.
5556		 */
5557		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5558	}
5559
5560	/*
5561	 * It's possible that the pool was expanded while it was exported.
5562	 * We kick off an async task to handle this for us.
5563	 */
5564	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5565
5566	spa_history_log_version(spa, "import");
5567
5568	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5569
5570	mutex_exit(&spa_namespace_lock);
5571
5572#ifdef __FreeBSD__
5573#ifdef _KERNEL
5574	zvol_create_minors(pool);
5575#endif
5576#endif
5577	return (0);
5578}
5579
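/*
 * Probe the configuration in 'tryconfig' without actually importing the
 * pool: the pool is loaded read-only under the reserved TRYIMPORT_NAME,
 * a refreshed config (including load info, bootfs, spares, and l2cache)
 * is generated for the caller, and the transient spa_t is torn down again.
 */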
5580nvlist_t *
5581spa_tryimport(nvlist_t *tryconfig)
5582{
5583	nvlist_t *config = NULL;
5584	char *poolname, *cachefile;
5585	spa_t *spa;
5586	uint64_t state;
5587	int error;
5588	zpool_load_policy_t policy;
5589
5590	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
5591		return (NULL);
5592
5593	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
5594		return (NULL);
5595
5596	/*
5597	 * Create and initialize the spa structure.
5598	 */
5599	mutex_enter(&spa_namespace_lock);
5600	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
5601	spa_activate(spa, FREAD);
5602
5603	/*
5604	 * Rewind pool if a max txg was provided.
5605	 */
5606	zpool_get_load_policy(spa->spa_config, &policy);
5607	if (policy.zlp_txg != UINT64_MAX) {
5608		spa->spa_load_max_txg = policy.zlp_txg;
5609		spa->spa_extreme_rewind = B_TRUE;
5610		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
5611		    poolname, (longlong_t)policy.zlp_txg);
5612	} else {
5613		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
5614	}
5615
5616	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
5617	    == 0) {
5618		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
5619		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5620	} else {
5621		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
5622	}
5623
5624	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
5625
5626	/*
5627	 * If 'tryconfig' was at least parsable, return the current config.
5628	 */
5629	if (spa->spa_root_vdev != NULL) {
5630		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5631		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
5632		    poolname) == 0);
5633		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5634		    state) == 0);
5635		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
5636		    spa->spa_uberblock.ub_timestamp) == 0);
5637		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5638		    spa->spa_load_info) == 0);
5639
5640		/*
5641		 * If the bootfs property exists on this pool then we
5642		 * copy it out so that external consumers can tell which
5643		 * pools are bootable.
5644		 */
5645		if ((!error || error == EEXIST) && spa->spa_bootfs) {
5646			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5647
5648			/*
5649			 * We have to play games with the name since the
5650			 * pool was opened as TRYIMPORT_NAME.
5651			 */
5652			if (dsl_dsobj_to_dsname(spa_name(spa),
5653			    spa->spa_bootfs, tmpname) == 0) {
5654				char *cp;
5655				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5656
5657				cp = strchr(tmpname, '/');
5658				if (cp == NULL) {
5659					(void) strlcpy(dsname, tmpname,
5660					    MAXPATHLEN);
5661				} else {
5662					(void) snprintf(dsname, MAXPATHLEN,
5663					    "%s/%s", poolname, ++cp);
5664				}
5665				VERIFY(nvlist_add_string(config,
5666				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
5667				kmem_free(dsname, MAXPATHLEN);
5668			}
5669			kmem_free(tmpname, MAXPATHLEN);
5670		}
5671
5672		/*
5673		 * Add the list of hot spares and level 2 cache devices.
5674		 */
5675		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5676		spa_add_spares(spa, config);
5677		spa_add_l2cache(spa, config);
5678		spa_config_exit(spa, SCL_CONFIG, FTAG);
5679	}
5680
5681	spa_unload(spa);
5682	spa_deactivate(spa);
5683	spa_remove(spa);
5684	mutex_exit(&spa_namespace_lock);
5685
5686	return (config);
5687}
5688
5689/*
5690 * Pool export/destroy
5691 *
5692 * The act of destroying or exporting a pool is very simple.  We make sure there
5693 * is no more pending I/O and any references to the pool are gone.  Then, we
5694 * update the pool state and sync all the labels to disk, removing the
5695 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
5696 * we don't sync the labels or remove the configuration cache.
5697 */
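/*
 * spa_export_common() below implements all three variants; 'new_state'
 * selects the behavior: POOL_STATE_DESTROYED for destroy, POOL_STATE_EXPORTED
 * for export, and POOL_STATE_UNINITIALIZED for a reset, which unloads the
 * pool but leaves its namespace entry in place.
 */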
5698static int
5699spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5700    boolean_t force, boolean_t hardforce)
5701{
5702	spa_t *spa;
5703
5704	if (oldconfig)
5705		*oldconfig = NULL;
5706
5707	if (!(spa_mode_global & FWRITE))
5708		return (SET_ERROR(EROFS));
5709
5710	mutex_enter(&spa_namespace_lock);
5711	if ((spa = spa_lookup(pool)) == NULL) {
5712		mutex_exit(&spa_namespace_lock);
5713		return (SET_ERROR(ENOENT));
5714	}
5715
5716	/*
5717	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
5718	 * reacquire the namespace lock, and see if we can export.
5719	 */
5720	spa_open_ref(spa, FTAG);
5721	mutex_exit(&spa_namespace_lock);
5722	spa_async_suspend(spa);
5723	mutex_enter(&spa_namespace_lock);
5724	spa_close(spa, FTAG);
5725
5726	/*
5727	 * The pool will be in core if it's openable,
5728	 * in which case we can modify its state.
5729	 */
5730	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5731
5732		/*
5733		 * Objsets may be open only because they're dirty, so we
5734		 * have to force the pool to sync before checking spa_refcnt.
5735		 */
5736		txg_wait_synced(spa->spa_dsl_pool, 0);
5737		spa_evicting_os_wait(spa);
5738
5739		/*
5740		 * A pool cannot be exported or destroyed if there are active
5741		 * references.  If we are resetting a pool, allow references by
5742		 * fault injection handlers.
5743		 */
5744		if (!spa_refcount_zero(spa) ||
5745		    (spa->spa_inject_ref != 0 &&
5746		    new_state != POOL_STATE_UNINITIALIZED)) {
5747			spa_async_resume(spa);
5748			mutex_exit(&spa_namespace_lock);
5749			return (SET_ERROR(EBUSY));
5750		}
5751
5752		/*
5753		 * A pool cannot be exported if it has an active shared spare.
5754		 * This is to prevent other pools stealing the active spare
5755		 * from an exported pool. At the user's discretion, such a pool
5756		 * can still be forcibly exported.
5757		 */
5758		if (!force && new_state == POOL_STATE_EXPORTED &&
5759		    spa_has_active_shared_spare(spa)) {
5760			spa_async_resume(spa);
5761			mutex_exit(&spa_namespace_lock);
5762			return (SET_ERROR(EXDEV));
5763		}
5764
5765		/*
5766		 * We're about to export or destroy this pool. Make sure
5767		 * we stop all initialization activity here before we
5768		 * set the spa_final_txg. This will ensure that all
5769		 * dirty data resulting from the initialization is
5770		 * committed to disk before we unload the pool.
5771		 */
5772		if (spa->spa_root_vdev != NULL) {
5773			vdev_initialize_stop_all(spa->spa_root_vdev,
5774			    VDEV_INITIALIZE_ACTIVE);
5775		}
5776
5777		/*
5778		 * We want this to be reflected on every label,
5779		 * so mark them all dirty.  spa_unload() will do the
5780		 * final sync that pushes these changes out.
5781		 */
5782		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5783			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5784			spa->spa_state = new_state;
5785			spa->spa_final_txg = spa_last_synced_txg(spa) +
5786			    TXG_DEFER_SIZE + 1;
5787			vdev_config_dirty(spa->spa_root_vdev);
5788			spa_config_exit(spa, SCL_ALL, FTAG);
5789		}
5790	}
5791
5792	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
5793
5794	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5795		spa_unload(spa);
5796		spa_deactivate(spa);
5797	}
5798
5799	if (oldconfig && spa->spa_config)
5800		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
5801
5802	if (new_state != POOL_STATE_UNINITIALIZED) {
5803		if (!hardforce)
5804			spa_write_cachefile(spa, B_TRUE, B_TRUE);
5805		spa_remove(spa);
5806	}
5807	mutex_exit(&spa_namespace_lock);
5808
5809	return (0);
5810}
5811
5812/*
5813 * Destroy a storage pool.
5814 */
5815int
5816spa_destroy(char *pool)
5817{
5818	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
5819	    B_FALSE, B_FALSE));
5820}
5821
5822/*
5823 * Export a storage pool.
5824 */
5825int
5826spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
5827    boolean_t hardforce)
5828{
5829	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
5830	    force, hardforce));
5831}
5832
5833/*
5834 * Similar to spa_export(), this unloads the spa_t without actually removing it
5835 * from the namespace in any way.
5836 */
5837int
5838spa_reset(char *pool)
5839{
5840	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
5841	    B_FALSE, B_FALSE));
5842}
5843
5844/*
5845 * ==========================================================================
5846 * Device manipulation
5847 * ==========================================================================
5848 */
5849
5850/*
5851 * Add a device to a storage pool.
5852 */
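/*
 * 'nvroot' is an nvlist describing the new top-level vdevs and/or any
 * spare and l2cache devices to add; at least one of the three must be
 * present or the call fails with EINVAL.
 */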
5853int
5854spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
5855{
5856	uint64_t txg, id;
5857	int error;
5858	vdev_t *rvd = spa->spa_root_vdev;
5859	vdev_t *vd, *tvd;
5860	nvlist_t **spares, **l2cache;
5861	uint_t nspares, nl2cache;
5862
5863	ASSERT(spa_writeable(spa));
5864
5865	txg = spa_vdev_enter(spa);
5866
5867	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
5868	    VDEV_ALLOC_ADD)) != 0)
5869		return (spa_vdev_exit(spa, NULL, txg, error));
5870
5871	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
5872
5873	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
5874	    &nspares) != 0)
5875		nspares = 0;
5876
5877	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
5878	    &nl2cache) != 0)
5879		nl2cache = 0;
5880
5881	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
5882		return (spa_vdev_exit(spa, vd, txg, EINVAL));
5883
5884	if (vd->vdev_children != 0 &&
5885	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
5886		return (spa_vdev_exit(spa, vd, txg, error));
5887
5888	/*
5889	 * We must validate the spares and l2cache devices after checking the
5890	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
5891	 */
5892	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
5893		return (spa_vdev_exit(spa, vd, txg, error));
5894
5895	/*
5896	 * If we are in the middle of a device removal, we can only add
5897	 * devices which match the existing devices in the pool.
5898	 * If we are in the middle of a removal, or have some indirect
5899	 * vdevs, we cannot add raidz toplevels.
5900	 */
5901	if (spa->spa_vdev_removal != NULL ||
5902	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
5903		for (int c = 0; c < vd->vdev_children; c++) {
5904			tvd = vd->vdev_child[c];
5905			if (spa->spa_vdev_removal != NULL &&
5906			    tvd->vdev_ashift != spa->spa_max_ashift) {
5907				return (spa_vdev_exit(spa, vd, txg, EINVAL));
5908			}
5909			/* Fail if top level vdev is raidz */
5910			if (tvd->vdev_ops == &vdev_raidz_ops) {
5911				return (spa_vdev_exit(spa, vd, txg, EINVAL));
5912			}
5913			/*
5914			 * Need the top level mirror to be
5915			 * a mirror of leaf vdevs only
5916			 */
5917			if (tvd->vdev_ops == &vdev_mirror_ops) {
5918				for (uint64_t cid = 0;
5919				    cid < tvd->vdev_children; cid++) {
5920					vdev_t *cvd = tvd->vdev_child[cid];
5921					if (!cvd->vdev_ops->vdev_op_leaf) {
5922						return (spa_vdev_exit(spa, vd,
5923						    txg, EINVAL));
5924					}
5925				}
5926			}
5927		}
5928	}
5929
5930	for (int c = 0; c < vd->vdev_children; c++) {
5931
5932		/*
5933		 * Set the vdev id to the first hole, if one exists.
5934		 */
5935		for (id = 0; id < rvd->vdev_children; id++) {
5936			if (rvd->vdev_child[id]->vdev_ishole) {
5937				vdev_free(rvd->vdev_child[id]);
5938				break;
5939			}
5940		}
5941		tvd = vd->vdev_child[c];
5942		vdev_remove_child(vd, tvd);
5943		tvd->vdev_id = id;
5944		vdev_add_child(rvd, tvd);
5945		vdev_config_dirty(tvd);
5946	}
5947
5948	if (nspares != 0) {
5949		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
5950		    ZPOOL_CONFIG_SPARES);
5951		spa_load_spares(spa);
5952		spa->spa_spares.sav_sync = B_TRUE;
5953	}
5954
5955	if (nl2cache != 0) {
5956		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
5957		    ZPOOL_CONFIG_L2CACHE);
5958		spa_load_l2cache(spa);
5959		spa->spa_l2cache.sav_sync = B_TRUE;
5960	}
5961
5962	/*
5963	 * We have to be careful when adding new vdevs to an existing pool.
5964	 * If other threads start allocating from these vdevs before we
5965	 * sync the config cache, and we lose power, then upon reboot we may
5966	 * fail to open the pool because there are DVAs that the config cache
5967	 * can't translate.  Therefore, we first add the vdevs without
5968	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
5969	 * and then let spa_config_update() initialize the new metaslabs.
5970	 *
5971	 * spa_load() checks for added-but-not-initialized vdevs, so that
5972	 * if we lose power at any point in this sequence, the remaining
5973	 * steps will be completed the next time we load the pool.
5974	 */
5975	(void) spa_vdev_exit(spa, vd, txg, 0);
5976
5977	mutex_enter(&spa_namespace_lock);
5978	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5979	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
5980	mutex_exit(&spa_namespace_lock);
5981
5982	return (0);
5983}
5984
5985/*
5986 * Attach a device to a mirror.  The arguments are the path to any device
5987 * in the mirror, and the nvroot for the new device.  If the path specifies
5988 * a device that is not mirrored, we automatically insert the mirror vdev.
5989 *
5990 * If 'replacing' is specified, the new device is intended to replace the
5991 * existing device; in this case the two devices are made into their own
5992 * mirror using the 'replacing' vdev, which is functionally identical to
5993 * the mirror vdev (it actually reuses all the same ops) but has a few
5994 * extra rules: you can't attach to it after it's been created, and upon
5995 * completion of resilvering, the first disk (the one being replaced)
5996 * is automatically detached.
5997 */
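/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * attach and replace administrative operations both funnel into this one
 * function and differ only in the 'replacing' argument, e.g.
 *
 *	error = spa_vdev_attach(spa, oldvd_guid, nvroot, B_FALSE);	(attach)
 *	error = spa_vdev_attach(spa, oldvd_guid, nvroot, B_TRUE);	(replace)
 */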
5998int
5999spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
6000{
6001	uint64_t txg, dtl_max_txg;
6002	vdev_t *rvd = spa->spa_root_vdev;
6003	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
6004	vdev_ops_t *pvops;
6005	char *oldvdpath, *newvdpath;
6006	int newvd_isspare;
6007	int error;
6008
6009	ASSERT(spa_writeable(spa));
6010
6011	txg = spa_vdev_enter(spa);
6012
6013	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
6014
6015	ASSERT(MUTEX_HELD(&spa_namespace_lock));
6016	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6017		error = (spa_has_checkpoint(spa)) ?
6018		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6019		return (spa_vdev_exit(spa, NULL, txg, error));
6020	}
6021
6022	if (spa->spa_vdev_removal != NULL)
6023		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
6024
6025	if (oldvd == NULL)
6026		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
6027
6028	if (!oldvd->vdev_ops->vdev_op_leaf)
6029		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6030
6031	pvd = oldvd->vdev_parent;
6032
6033	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
6034	    VDEV_ALLOC_ATTACH)) != 0)
6035		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6036
6037	if (newrootvd->vdev_children != 1)
6038		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6039
6040	newvd = newrootvd->vdev_child[0];
6041
6042	if (!newvd->vdev_ops->vdev_op_leaf)
6043		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
6044
6045	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
6046		return (spa_vdev_exit(spa, newrootvd, txg, error));
6047
6048	/*
6049	 * Spares can't replace logs
6050	 */
6051	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
6052		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6053
6054	if (!replacing) {
6055		/*
6056		 * For attach, the only allowable parent is a mirror or the root
6057		 * vdev.
6058		 */
6059		if (pvd->vdev_ops != &vdev_mirror_ops &&
6060		    pvd->vdev_ops != &vdev_root_ops)
6061			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6062
6063		pvops = &vdev_mirror_ops;
6064	} else {
6065		/*
6066		 * Active hot spares can only be replaced by inactive hot
6067		 * spares.
6068		 */
6069		if (pvd->vdev_ops == &vdev_spare_ops &&
6070		    oldvd->vdev_isspare &&
6071		    !spa_has_spare(spa, newvd->vdev_guid))
6072			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6073
6074		/*
6075		 * If the source is a hot spare, and the parent isn't already a
6076		 * spare, then we want to create a new hot spare.  Otherwise, we
6077		 * want to create a replacing vdev.  The user is not allowed to
6078		 * attach to a spared vdev child unless the 'isspare' state is
6079		 * the same (spare replaces spare, non-spare replaces
6080		 * non-spare).
6081		 */
6082		if (pvd->vdev_ops == &vdev_replacing_ops &&
6083		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
6084			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6085		} else if (pvd->vdev_ops == &vdev_spare_ops &&
6086		    newvd->vdev_isspare != oldvd->vdev_isspare) {
6087			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
6088		}
6089
6090		if (newvd->vdev_isspare)
6091			pvops = &vdev_spare_ops;
6092		else
6093			pvops = &vdev_replacing_ops;
6094	}
6095
6096	/*
6097	 * Make sure the new device is big enough.
6098	 */
6099	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
6100		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
6101
6102	/*
6103	 * The new device cannot have a higher alignment requirement
6104	 * than the top-level vdev.
6105	 */
6106	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
6107		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
6108
6109	/*
6110	 * If this is an in-place replacement, update oldvd's path and devid
6111	 * to make it distinguishable from newvd, and unopenable from now on.
6112	 */
6113	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
6114		spa_strfree(oldvd->vdev_path);
6115		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
6116		    KM_SLEEP);
6117		(void) sprintf(oldvd->vdev_path, "%s/%s",
6118		    newvd->vdev_path, "old");
6119		if (oldvd->vdev_devid != NULL) {
6120			spa_strfree(oldvd->vdev_devid);
6121			oldvd->vdev_devid = NULL;
6122		}
6123	}
6124
6125	/* mark the device being resilvered */
6126	newvd->vdev_resilver_txg = txg;
6127
6128	/*
6129	 * If the parent is not a mirror, or if we're replacing, insert the new
6130	 * mirror/replacing/spare vdev above oldvd.
6131	 */
6132	if (pvd->vdev_ops != pvops)
6133		pvd = vdev_add_parent(oldvd, pvops);
6134
6135	ASSERT(pvd->vdev_top->vdev_parent == rvd);
6136	ASSERT(pvd->vdev_ops == pvops);
6137	ASSERT(oldvd->vdev_parent == pvd);
6138
6139	/*
6140	 * Extract the new device from its root and add it to pvd.
6141	 */
6142	vdev_remove_child(newrootvd, newvd);
6143	newvd->vdev_id = pvd->vdev_children;
6144	newvd->vdev_crtxg = oldvd->vdev_crtxg;
6145	vdev_add_child(pvd, newvd);
6146
6147	tvd = newvd->vdev_top;
6148	ASSERT(pvd->vdev_top == tvd);
6149	ASSERT(tvd->vdev_parent == rvd);
6150
6151	vdev_config_dirty(tvd);
6152
6153	/*
6154	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
6155	 * for any dmu_sync-ed blocks.  It will propagate upward when
6156	 * spa_vdev_exit() calls vdev_dtl_reassess().
6157	 */
6158	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
6159
6160	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
6161	    dtl_max_txg - TXG_INITIAL);
6162
6163	if (newvd->vdev_isspare) {
6164		spa_spare_activate(newvd);
6165		spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
6166	}
6167
6168	oldvdpath = spa_strdup(oldvd->vdev_path);
6169	newvdpath = spa_strdup(newvd->vdev_path);
6170	newvd_isspare = newvd->vdev_isspare;
6171
6172	/*
6173	 * Mark newvd's DTL dirty in this txg.
6174	 */
6175	vdev_dirty(tvd, VDD_DTL, newvd, txg);
6176
6177	/*
6178	 * Schedule the resilver to restart in the future. We do this to
6179	 * ensure that dmu_sync-ed blocks have been stitched into the
6180	 * respective datasets.
6181	 */
6182	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
6183
6184	if (spa->spa_bootfs)
6185		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
6186
6187	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
6188
6189	/*
6190	 * Commit the config
6191	 */
6192	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
6193
6194	spa_history_log_internal(spa, "vdev attach", NULL,
6195	    "%s vdev=%s %s vdev=%s",
6196	    replacing && newvd_isspare ? "spare in" :
6197	    replacing ? "replace" : "attach", newvdpath,
6198	    replacing ? "for" : "to", oldvdpath);
6199
6200	spa_strfree(oldvdpath);
6201	spa_strfree(newvdpath);
6202
6203	return (0);
6204}
6205
6206/*
6207 * Detach a device from a mirror or replacing vdev.
6208 *
6209 * If 'replace_done' is specified, only detach if the parent
6210 * is a replacing vdev.
6211 */
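/*
 * On a successful detach the leaf's labels are erased (best effort, via
 * the VDEV_LABEL_REMOVE pass below) so the disk can be reused elsewhere,
 * and if the detach releases a shared hot spare it is also removed from
 * every other pool that had it configured.
 */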
6212int
6213spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
6214{
6215	uint64_t txg;
6216	int error;
6217	vdev_t *rvd = spa->spa_root_vdev;
6218	vdev_t *vd, *pvd, *cvd, *tvd;
6219	boolean_t unspare = B_FALSE;
6220	uint64_t unspare_guid = 0;
6221	char *vdpath;
6222
6223	ASSERT(spa_writeable(spa));
6224
6225	txg = spa_vdev_enter(spa);
6226
6227	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
6228
6229	/*
6230	 * Besides being called directly from the userland through the
6231	 * ioctl interface, spa_vdev_detach() can be potentially called
6232	 * at the end of spa_vdev_resilver_done().
6233	 *
6234	 * In the regular case, when we have a checkpoint this shouldn't
6235	 * happen as we never empty the DTLs of a vdev during the scrub
6236	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
6237	 * should never get here when we have a checkpoint.
6238	 *
6239	 * That said, even in the case where we checkpoint the pool exactly
6240	 * as spa_vdev_resilver_done() calls this function, everything
6241	 * should be fine as the resilver will return right away.
6242	 */
6243	ASSERT(MUTEX_HELD(&spa_namespace_lock));
6244	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6245		error = (spa_has_checkpoint(spa)) ?
6246		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6247		return (spa_vdev_exit(spa, NULL, txg, error));
6248	}
6249
6250	if (vd == NULL)
6251		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
6252
6253	if (!vd->vdev_ops->vdev_op_leaf)
6254		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6255
6256	pvd = vd->vdev_parent;
6257
6258	/*
6259	 * If the parent/child relationship is not as expected, don't do it.
6260	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
6261	 * vdev that's replacing B with C.  The user's intent in replacing
6262	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
6263	 * the replace by detaching C, the expected behavior is to end up
6264	 * M(A,B).  But suppose that right after deciding to detach C,
6265	 * the replacement of B completes.  We would have M(A,C), and then
6266	 * ask to detach C, which would leave us with just A -- not what
6267	 * the user wanted.  To prevent this, we make sure that the
6268	 * parent/child relationship hasn't changed -- in this example,
6269	 * that C's parent is still the replacing vdev R.
6270	 */
6271	if (pvd->vdev_guid != pguid && pguid != 0)
6272		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
6273
6274	/*
6275	 * Only 'replacing' or 'spare' vdevs can be replaced.
6276	 */
6277	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
6278	    pvd->vdev_ops != &vdev_spare_ops)
6279		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6280
6281	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
6282	    spa_version(spa) >= SPA_VERSION_SPARES);
6283
6284	/*
6285	 * Only mirror, replacing, and spare vdevs support detach.
6286	 */
6287	if (pvd->vdev_ops != &vdev_replacing_ops &&
6288	    pvd->vdev_ops != &vdev_mirror_ops &&
6289	    pvd->vdev_ops != &vdev_spare_ops)
6290		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
6291
6292	/*
6293	 * If this device has the only valid copy of some data,
6294	 * we cannot safely detach it.
6295	 */
6296	if (vdev_dtl_required(vd))
6297		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
6298
6299	ASSERT(pvd->vdev_children >= 2);
6300
6301	/*
6302	 * If we are detaching the second disk from a replacing vdev, then
6303	 * check to see if we changed the original vdev's path to have "/old"
6304	 * at the end in spa_vdev_attach().  If so, undo that change now.
6305	 */
6306	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
6307	    vd->vdev_path != NULL) {
6308		size_t len = strlen(vd->vdev_path);
6309
6310		for (int c = 0; c < pvd->vdev_children; c++) {
6311			cvd = pvd->vdev_child[c];
6312
6313			if (cvd == vd || cvd->vdev_path == NULL)
6314				continue;
6315
6316			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
6317			    strcmp(cvd->vdev_path + len, "/old") == 0) {
6318				spa_strfree(cvd->vdev_path);
6319				cvd->vdev_path = spa_strdup(vd->vdev_path);
6320				break;
6321			}
6322		}
6323	}
6324
6325	/*
6326	 * If we are detaching the original disk from a spare, then it implies
6327	 * that the spare should become a real disk, and be removed from the
6328	 * active spare list for the pool.
6329	 */
6330	if (pvd->vdev_ops == &vdev_spare_ops &&
6331	    vd->vdev_id == 0 &&
6332	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
6333		unspare = B_TRUE;
6334
6335	/*
6336	 * Erase the disk labels so the disk can be used for other things.
6337	 * This must be done after all other error cases are handled,
6338	 * but before we disembowel vd (so we can still do I/O to it).
6339	 * But if we can't do it, don't treat the error as fatal --
6340	 * it may be that the unwritability of the disk is the reason
6341	 * it's being detached!
6342	 */
6343	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
6344
6345	/*
6346	 * Remove vd from its parent and compact the parent's children.
6347	 */
6348	vdev_remove_child(pvd, vd);
6349	vdev_compact_children(pvd);
6350
6351	/*
6352	 * Remember one of the remaining children so we can get tvd below.
6353	 */
6354	cvd = pvd->vdev_child[pvd->vdev_children - 1];
6355
6356	/*
6357	 * If we need to remove the remaining child from the list of hot spares,
6358	 * do it now, marking the vdev as no longer a spare in the process.
6359	 * We must do this before vdev_remove_parent(), because that can
6360	 * change the GUID if it creates a new toplevel GUID.  For a similar
6361	 * reason, we must remove the spare now, in the same txg as the detach;
6362	 * otherwise someone could attach a new sibling, change the GUID, and
6363	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
6364	 */
6365	if (unspare) {
6366		ASSERT(cvd->vdev_isspare);
6367		spa_spare_remove(cvd);
6368		unspare_guid = cvd->vdev_guid;
6369		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
6370		cvd->vdev_unspare = B_TRUE;
6371	}
6372
6373	/*
6374	 * If the parent mirror/replacing vdev only has one child,
6375	 * the parent is no longer needed.  Remove it from the tree.
6376	 */
6377	if (pvd->vdev_children == 1) {
6378		if (pvd->vdev_ops == &vdev_spare_ops)
6379			cvd->vdev_unspare = B_FALSE;
6380		vdev_remove_parent(cvd);
6381	}
6382
6383
6384	/*
6385	 * We don't set tvd until now because the parent we just removed
6386	 * may have been the previous top-level vdev.
6387	 */
6388	tvd = cvd->vdev_top;
6389	ASSERT(tvd->vdev_parent == rvd);
6390
6391	/*
6392	 * Reevaluate the parent vdev state.
6393	 */
6394	vdev_propagate_state(cvd);
6395
6396	/*
6397	 * If the 'autoexpand' property is set on the pool then automatically
6398	 * try to expand the size of the pool. For example if the device we
6399	 * just detached was smaller than the others, it may be possible to
6400	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
6401	 * first so that we can obtain the updated sizes of the leaf vdevs.
6402	 */
6403	if (spa->spa_autoexpand) {
6404		vdev_reopen(tvd);
6405		vdev_expand(tvd, txg);
6406	}
6407
6408	vdev_config_dirty(tvd);
6409
6410	/*
6411	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
6412	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
6413	 * But first make sure we're not on any *other* txg's DTL list, to
6414	 * prevent vd from being accessed after it's freed.
6415	 */
6416	vdpath = spa_strdup(vd->vdev_path);
6417	for (int t = 0; t < TXG_SIZE; t++)
6418		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
6419	vd->vdev_detached = B_TRUE;
6420	vdev_dirty(tvd, VDD_DTL, vd, txg);
6421
6422	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
6423
6424	/* hang on to the spa before we release the lock */
6425	spa_open_ref(spa, FTAG);
6426
6427	error = spa_vdev_exit(spa, vd, txg, 0);
6428
6429	spa_history_log_internal(spa, "detach", NULL,
6430	    "vdev=%s", vdpath);
6431	spa_strfree(vdpath);
6432
6433	/*
6434	 * If this was the removal of the original device in a hot spare vdev,
6435	 * then we want to go through and remove the device from the hot spare
6436	 * list of every other pool.
6437	 */
6438	if (unspare) {
6439		spa_t *altspa = NULL;
6440
6441		mutex_enter(&spa_namespace_lock);
6442		while ((altspa = spa_next(altspa)) != NULL) {
6443			if (altspa->spa_state != POOL_STATE_ACTIVE ||
6444			    altspa == spa)
6445				continue;
6446
6447			spa_open_ref(altspa, FTAG);
6448			mutex_exit(&spa_namespace_lock);
6449			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
6450			mutex_enter(&spa_namespace_lock);
6451			spa_close(altspa, FTAG);
6452		}
6453		mutex_exit(&spa_namespace_lock);
6454
6455		/* search the rest of the vdevs for spares to remove */
6456		spa_vdev_resilver_done(spa);
6457	}
6458
6459	/* all done with the spa; OK to release */
6460	mutex_enter(&spa_namespace_lock);
6461	spa_close(spa, FTAG);
6462	mutex_exit(&spa_namespace_lock);
6463
6464	return (error);
6465}
6466
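/*
 * Start, cancel, or suspend the initialization (pattern-writing over
 * unallocated space) of the leaf vdev identified by 'guid', according to
 * cmd_type (POOL_INITIALIZE_DO, _CANCEL, or _SUSPEND).
 */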
6467int
6468spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
6469{
6470	/*
6471	 * We hold the namespace lock through the whole function
6472	 * to prevent any changes to the pool while we're starting or
6473	 * stopping initialization. The config and state locks are held so that
6474	 * we can properly assess the vdev state before we commit to
6475	 * the initializing operation.
6476	 */
6477	mutex_enter(&spa_namespace_lock);
6478	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6479
6480	/* Look up vdev and ensure it's a leaf. */
6481	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
6482	if (vd == NULL || vd->vdev_detached) {
6483		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6484		mutex_exit(&spa_namespace_lock);
6485		return (SET_ERROR(ENODEV));
6486	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
6487		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6488		mutex_exit(&spa_namespace_lock);
6489		return (SET_ERROR(EINVAL));
6490	} else if (!vdev_writeable(vd)) {
6491		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6492		mutex_exit(&spa_namespace_lock);
6493		return (SET_ERROR(EROFS));
6494	}
6495	mutex_enter(&vd->vdev_initialize_lock);
6496	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6497
6498	/*
6499	 * When we activate an initialize action we check to see
6500	 * if the vdev_initialize_thread is NULL. We do this instead
6501	 * of using the vdev_initialize_state since there might be
6502	 * a previous initialization process which has completed but
6503	 * whose thread has not yet exited.
6504	 */
6505	if (cmd_type == POOL_INITIALIZE_DO &&
6506	    (vd->vdev_initialize_thread != NULL ||
6507	    vd->vdev_top->vdev_removing)) {
6508		mutex_exit(&vd->vdev_initialize_lock);
6509		mutex_exit(&spa_namespace_lock);
6510		return (SET_ERROR(EBUSY));
6511	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
6512	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
6513	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
6514		mutex_exit(&vd->vdev_initialize_lock);
6515		mutex_exit(&spa_namespace_lock);
6516		return (SET_ERROR(ESRCH));
6517	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
6518	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
6519		mutex_exit(&vd->vdev_initialize_lock);
6520		mutex_exit(&spa_namespace_lock);
6521		return (SET_ERROR(ESRCH));
6522	}
6523
6524	switch (cmd_type) {
6525	case POOL_INITIALIZE_DO:
6526		vdev_initialize(vd);
6527		break;
6528	case POOL_INITIALIZE_CANCEL:
6529		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
6530		break;
6531	case POOL_INITIALIZE_SUSPEND:
6532		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
6533		break;
6534	default:
6535		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
6536	}
6537	mutex_exit(&vd->vdev_initialize_lock);
6538
6539	/* Sync out the initializing state */
6540	txg_wait_synced(spa->spa_dsl_pool, 0);
6541	mutex_exit(&spa_namespace_lock);
6542
6543	return (0);
6544}
6545
6546
6547/*
6548 * Split a set of devices from their mirrors, and create a new pool from them.
6549 */
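/*
 * The caller-supplied 'config' must select one healthy, writeable leaf
 * from each top-level mirror (log and hole vdevs are skipped); the
 * validation loop below enforces this before any disks are offlined.
 */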
6550int
6551spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
6552    nvlist_t *props, boolean_t exp)
6553{
6554	int error = 0;
6555	uint64_t txg, *glist;
6556	spa_t *newspa;
6557	uint_t c, children, lastlog;
6558	nvlist_t **child, *nvl, *tmp;
6559	dmu_tx_t *tx;
6560	char *altroot = NULL;
6561	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
6562	boolean_t activate_slog;
6563
6564	ASSERT(spa_writeable(spa));
6565
6566	txg = spa_vdev_enter(spa);
6567
6568	ASSERT(MUTEX_HELD(&spa_namespace_lock));
6569	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6570		error = (spa_has_checkpoint(spa)) ?
6571		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6572		return (spa_vdev_exit(spa, NULL, txg, error));
6573	}
6574
6575	/* clear the log and flush everything up to now */
6576	activate_slog = spa_passivate_log(spa);
6577	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6578	error = spa_reset_logs(spa);
6579	txg = spa_vdev_config_enter(spa);
6580
6581	if (activate_slog)
6582		spa_activate_log(spa);
6583
6584	if (error != 0)
6585		return (spa_vdev_exit(spa, NULL, txg, error));
6586
6587	/* check new spa name before going any further */
6588	if (spa_lookup(newname) != NULL)
6589		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
6590
6591	/*
6592	 * scan through all the children to ensure they're all mirrors
6593	 */
6594	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
6595	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
6596	    &children) != 0)
6597		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6598
6599	/* first, check to ensure we've got the right child count */
6600	rvd = spa->spa_root_vdev;
6601	lastlog = 0;
6602	for (c = 0; c < rvd->vdev_children; c++) {
6603		vdev_t *vd = rvd->vdev_child[c];
6604
6605		/* don't count the holes & logs as children */
6606		if (vd->vdev_islog || !vdev_is_concrete(vd)) {
6607			if (lastlog == 0)
6608				lastlog = c;
6609			continue;
6610		}
6611
6612		lastlog = 0;
6613	}
6614	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
6615		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6616
6617	/* next, ensure no spare or cache devices are part of the split */
6618	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
6619	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
6620		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6621
6622	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
6623	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
6624
6625	/* then, loop over each vdev and validate it */
6626	for (c = 0; c < children; c++) {
6627		uint64_t is_hole = 0;
6628
6629		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
6630		    &is_hole);
6631
6632		if (is_hole != 0) {
6633			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
6634			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
6635				continue;
6636			} else {
6637				error = SET_ERROR(EINVAL);
6638				break;
6639			}
6640		}
6641
6642		/* which disk is going to be split? */
6643		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
6644		    &glist[c]) != 0) {
6645			error = SET_ERROR(EINVAL);
6646			break;
6647		}
6648
6649		/* look it up in the spa */
6650		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
6651		if (vml[c] == NULL) {
6652			error = SET_ERROR(ENODEV);
6653			break;
6654		}
6655
6656		/* make sure there's nothing stopping the split */
6657		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
6658		    vml[c]->vdev_islog ||
6659		    !vdev_is_concrete(vml[c]) ||
6660		    vml[c]->vdev_isspare ||
6661		    vml[c]->vdev_isl2cache ||
6662		    !vdev_writeable(vml[c]) ||
6663		    vml[c]->vdev_children != 0 ||
6664		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
6665		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
6666			error = SET_ERROR(EINVAL);
6667			break;
6668		}
6669
6670		if (vdev_dtl_required(vml[c])) {
6671			error = SET_ERROR(EBUSY);
6672			break;
6673		}
6674
6675		/* we need certain info from the top level */
6676		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
6677		    vml[c]->vdev_top->vdev_ms_array) == 0);
6678		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
6679		    vml[c]->vdev_top->vdev_ms_shift) == 0);
6680		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
6681		    vml[c]->vdev_top->vdev_asize) == 0);
6682		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
6683		    vml[c]->vdev_top->vdev_ashift) == 0);
6684
6685		/* transfer per-vdev ZAPs */
6686		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
6687		VERIFY0(nvlist_add_uint64(child[c],
6688		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
6689
6690		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
6691		VERIFY0(nvlist_add_uint64(child[c],
6692		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
6693		    vml[c]->vdev_parent->vdev_top_zap));
6694	}
6695
6696	if (error != 0) {
6697		kmem_free(vml, children * sizeof (vdev_t *));
6698		kmem_free(glist, children * sizeof (uint64_t));
6699		return (spa_vdev_exit(spa, NULL, txg, error));
6700	}
6701
6702	/* stop writers from using the disks */
6703	for (c = 0; c < children; c++) {
6704		if (vml[c] != NULL)
6705			vml[c]->vdev_offline = B_TRUE;
6706	}
6707	vdev_reopen(spa->spa_root_vdev);
6708
6709	/*
6710	 * Temporarily record the splitting vdevs in the spa config.  This
6711	 * will disappear once the config is regenerated.
6712	 */
6713	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6714	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
6715	    glist, children) == 0);
6716	kmem_free(glist, children * sizeof (uint64_t));
6717
6718	mutex_enter(&spa->spa_props_lock);
6719	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
6720	    nvl) == 0);
6721	mutex_exit(&spa->spa_props_lock);
6722	spa->spa_config_splitting = nvl;
6723	vdev_config_dirty(spa->spa_root_vdev);
6724
6725	/* configure and create the new pool */
6726	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
6727	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
6728	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
6729	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6730	    spa_version(spa)) == 0);
6731	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
6732	    spa->spa_config_txg) == 0);
6733	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
6734	    spa_generate_guid(NULL)) == 0);
6735	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
6736	(void) nvlist_lookup_string(props,
6737	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
6738
6739	/* add the new pool to the namespace */
6740	newspa = spa_add(newname, config, altroot);
6741	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
6742	newspa->spa_config_txg = spa->spa_config_txg;
6743	spa_set_log_state(newspa, SPA_LOG_CLEAR);
6744
6745	/* release the spa config lock, retaining the namespace lock */
6746	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6747
6748	if (zio_injection_enabled)
6749		zio_handle_panic_injection(spa, FTAG, 1);
6750
6751	spa_activate(newspa, spa_mode_global);
6752	spa_async_suspend(newspa);
6753
6754	for (c = 0; c < children; c++) {
6755		if (vml[c] != NULL) {
6756			/*
6757			 * Temporarily stop the initializing activity. We set
6758			 * the state to ACTIVE so that we know to resume
6759			 * the initializing once the split has completed.
6760			 */
6761			mutex_enter(&vml[c]->vdev_initialize_lock);
6762			vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
6763			mutex_exit(&vml[c]->vdev_initialize_lock);
6764		}
6765	}
6766
6767#ifndef illumos
6768	/* mark that we are creating new spa by splitting */
6769	newspa->spa_splitting_newspa = B_TRUE;
6770#endif
6771	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
6772
6773	/* create the new pool from the disks of the original pool */
6774	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
6775#ifndef illumos
6776	newspa->spa_splitting_newspa = B_FALSE;
6777#endif
6778	if (error)
6779		goto out;
6780
6781	/* if that worked, generate a real config for the new pool */
6782	if (newspa->spa_root_vdev != NULL) {
6783		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
6784		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
6785		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
6786		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
6787		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
6788		    B_TRUE));
6789	}
6790
6791	/* set the props */
6792	if (props != NULL) {
6793		spa_configfile_set(newspa, props, B_FALSE);
6794		error = spa_prop_set(newspa, props);
6795		if (error)
6796			goto out;
6797	}
6798
6799	/* flush everything */
6800	txg = spa_vdev_config_enter(newspa);
6801	vdev_config_dirty(newspa->spa_root_vdev);
6802	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
6803
6804	if (zio_injection_enabled)
6805		zio_handle_panic_injection(spa, FTAG, 2);
6806
6807	spa_async_resume(newspa);
6808
6809	/* finally, update the original pool's config */
6810	txg = spa_vdev_config_enter(spa);
6811	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
6812	error = dmu_tx_assign(tx, TXG_WAIT);
6813	if (error != 0)
6814		dmu_tx_abort(tx);
6815	for (c = 0; c < children; c++) {
6816		if (vml[c] != NULL) {
6817			vdev_split(vml[c]);
6818			if (error == 0)
6819				spa_history_log_internal(spa, "detach", tx,
6820				    "vdev=%s", vml[c]->vdev_path);
6821
6822			vdev_free(vml[c]);
6823		}
6824	}
6825	spa->spa_avz_action = AVZ_ACTION_REBUILD;
6826	vdev_config_dirty(spa->spa_root_vdev);
6827	spa->spa_config_splitting = NULL;
6828	nvlist_free(nvl);
6829	if (error == 0)
6830		dmu_tx_commit(tx);
6831	(void) spa_vdev_exit(spa, NULL, txg, 0);
6832
6833	if (zio_injection_enabled)
6834		zio_handle_panic_injection(spa, FTAG, 3);
6835
6836	/* split is complete; log a history record */
6837	spa_history_log_internal(newspa, "split", NULL,
6838	    "from pool %s", spa_name(spa));
6839
6840	kmem_free(vml, children * sizeof (vdev_t *));
6841
6842	/* if we're not going to mount the filesystems in userland, export */
6843	if (exp)
6844		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
6845		    B_FALSE, B_FALSE);
6846
6847	return (error);
6848
6849out:
6850	spa_unload(newspa);
6851	spa_deactivate(newspa);
6852	spa_remove(newspa);
6853
6854	txg = spa_vdev_config_enter(spa);
6855
6856	/* re-online all offlined disks */
6857	for (c = 0; c < children; c++) {
6858		if (vml[c] != NULL)
6859			vml[c]->vdev_offline = B_FALSE;
6860	}
6861
6862	/* restart initializing disks as necessary */
6863	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
6864
6865	vdev_reopen(spa->spa_root_vdev);
6866
6867	nvlist_free(spa->spa_config_splitting);
6868	spa->spa_config_splitting = NULL;
6869	(void) spa_vdev_exit(spa, NULL, txg, error);
6870
6871	kmem_free(vml, children * sizeof (vdev_t *));
6872	return (error);
6873}
6874
6875/*
6876 * Find any device that's done replacing, or a vdev marked 'unspare' that's
6877 * currently spared, so we can detach it.
6878 */
6879static vdev_t *
6880spa_vdev_resilver_done_hunt(vdev_t *vd)
6881{
6882	vdev_t *newvd, *oldvd;
6883
6884	for (int c = 0; c < vd->vdev_children; c++) {
6885		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
6886		if (oldvd != NULL)
6887			return (oldvd);
6888	}
6889
6890	/*
6891	 * Check for a completed replacement.  We always consider the first
6892	 * vdev in the list to be the oldest vdev, and the last one to be
6893	 * the newest (see spa_vdev_attach() for how that works).  In
6894	 * the case where the newest vdev is faulted, we will not automatically
6895	 * remove it after a resilver completes.  This is OK as it will require
6896	 * user intervention to determine which disk the admin wishes to keep.
6897	 */
6898	if (vd->vdev_ops == &vdev_replacing_ops) {
6899		ASSERT(vd->vdev_children > 1);
6900
6901		newvd = vd->vdev_child[vd->vdev_children - 1];
6902		oldvd = vd->vdev_child[0];
6903
6904		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
6905		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6906		    !vdev_dtl_required(oldvd))
6907			return (oldvd);
6908	}
6909
6910	/*
6911	 * Check for a completed resilver with the 'unspare' flag set.
6912	 * Also potentially update faulted state.
6913	 */
6914	if (vd->vdev_ops == &vdev_spare_ops) {
6915		vdev_t *first = vd->vdev_child[0];
6916		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
6917
6918		if (last->vdev_unspare) {
6919			oldvd = first;
6920			newvd = last;
6921		} else if (first->vdev_unspare) {
6922			oldvd = last;
6923			newvd = first;
6924		} else {
6925			oldvd = NULL;
6926		}
6927
6928		if (oldvd != NULL &&
6929		    vdev_dtl_empty(newvd, DTL_MISSING) &&
6930		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6931		    !vdev_dtl_required(oldvd))
6932			return (oldvd);
6933
6934		vdev_propagate_state(vd);
6935
6936		/*
6937		 * If there are more than two spares attached to a disk,
6938		 * and those spares are not required, then we want to
6939		 * attempt to free them up now so that they can be used
6940		 * by other pools.  Once we're back down to a single
6941		 * disk+spare, we stop removing them.
6942		 */
6943		if (vd->vdev_children > 2) {
6944			newvd = vd->vdev_child[1];
6945
6946			if (newvd->vdev_isspare && last->vdev_isspare &&
6947			    vdev_dtl_empty(last, DTL_MISSING) &&
6948			    vdev_dtl_empty(last, DTL_OUTAGE) &&
6949			    !vdev_dtl_required(newvd))
6950				return (newvd);
6951		}
6952	}
6953
6954	return (NULL);
6955}
6956
6957static void
6958spa_vdev_resilver_done(spa_t *spa)
6959{
6960	vdev_t *vd, *pvd, *ppvd;
6961	uint64_t guid, sguid, pguid, ppguid;
6962
6963	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6964
6965	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
6966		pvd = vd->vdev_parent;
6967		ppvd = pvd->vdev_parent;
6968		guid = vd->vdev_guid;
6969		pguid = pvd->vdev_guid;
6970		ppguid = ppvd->vdev_guid;
6971		sguid = 0;
6972		/*
6973		 * If we have just finished replacing a hot spared device, then
6974		 * we need to detach the parent's first child (the original hot
6975		 * spare) as well.
6976		 */
6977		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
6978		    ppvd->vdev_children == 2) {
6979			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
6980			sguid = ppvd->vdev_child[1]->vdev_guid;
6981		}
6982		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
6983
6984		spa_config_exit(spa, SCL_ALL, FTAG);
6985		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
6986			return;
6987		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
6988			return;
6989		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6990	}
6991
6992	spa_config_exit(spa, SCL_ALL, FTAG);
6993}
6994
6995/*
6996 * Update the stored path or FRU for this vdev.
6997 */
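/*
 * Shared by spa_vdev_setpath() and spa_vdev_setfru() below; 'ispath'
 * selects which field is updated, and the vdev is only marked for a
 * config sync when the stored value actually changes.
 */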
6998int
6999spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
7000    boolean_t ispath)
7001{
7002	vdev_t *vd;
7003	boolean_t sync = B_FALSE;
7004
7005	ASSERT(spa_writeable(spa));
7006
7007	spa_vdev_state_enter(spa, SCL_ALL);
7008
7009	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
7010		return (spa_vdev_state_exit(spa, NULL, ENOENT));
7011
7012	if (!vd->vdev_ops->vdev_op_leaf)
7013		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
7014
7015	if (ispath) {
7016		if (strcmp(value, vd->vdev_path) != 0) {
7017			spa_strfree(vd->vdev_path);
7018			vd->vdev_path = spa_strdup(value);
7019			sync = B_TRUE;
7020		}
7021	} else {
7022		if (vd->vdev_fru == NULL) {
7023			vd->vdev_fru = spa_strdup(value);
7024			sync = B_TRUE;
7025		} else if (strcmp(value, vd->vdev_fru) != 0) {
7026			spa_strfree(vd->vdev_fru);
7027			vd->vdev_fru = spa_strdup(value);
7028			sync = B_TRUE;
7029		}
7030	}
7031
7032	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
7033}
7034
7035int
7036spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
7037{
7038	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
7039}
7040
7041int
7042spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
7043{
7044	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
7045}
7046
7047/*
7048 * ==========================================================================
7049 * SPA Scanning
7050 * ==========================================================================
7051 */
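/*
 * Pause or resume an in-progress scrub; not permitted while a resilver
 * is running.
 */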
7052int
7053spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
7054{
7055	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
7056
7057	if (dsl_scan_resilvering(spa->spa_dsl_pool))
7058		return (SET_ERROR(EBUSY));
7059
7060	return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
7061}
7062
7063int
7064spa_scan_stop(spa_t *spa)
7065{
7066	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
7067	if (dsl_scan_resilvering(spa->spa_dsl_pool))
7068		return (SET_ERROR(EBUSY));
7069	return (dsl_scan_cancel(spa->spa_dsl_pool));
7070}
7071
7072int
7073spa_scan(spa_t *spa, pool_scan_func_t func)
7074{
7075	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
7076
7077	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
7078		return (SET_ERROR(ENOTSUP));
7079
7080	/*
7081	 * If a resilver was requested, but there is no DTL on a
7082	 * writeable leaf device, we have nothing to do.
7083	 */
7084	if (func == POOL_SCAN_RESILVER &&
7085	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
7086		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
7087		return (0);
7088	}
7089
7090	return (dsl_scan(spa->spa_dsl_pool, func));
7091}
7092
7093/*
7094 * ==========================================================================
7095 * SPA async task processing
7096 * ==========================================================================
7097 */
7098
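/*
 * Walk the vdev tree rooted at 'vd' and transition any vdev flagged with
 * vdev_remove_wanted to the REMOVED state, clearing its error counters
 * and posting a removal event to userland.
 */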
7099static void
7100spa_async_remove(spa_t *spa, vdev_t *vd)
7101{
7102	if (vd->vdev_remove_wanted) {
7103		vd->vdev_remove_wanted = B_FALSE;
7104		vd->vdev_delayed_close = B_FALSE;
7105		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
7106
7107		/*
7108		 * We want to clear the stats, but we don't want to do a full
7109		 * vdev_clear() as that will cause us to throw away
7110		 * degraded/faulted state as well as attempt to reopen the
7111		 * device, all of which is a waste.
7112		 */
7113		vd->vdev_stat.vs_read_errors = 0;
7114		vd->vdev_stat.vs_write_errors = 0;
7115		vd->vdev_stat.vs_checksum_errors = 0;
7116
7117		vdev_state_dirty(vd->vdev_top);
7118		/* Tell userspace that the vdev is gone. */
7119		zfs_post_remove(spa, vd);
7120	}
7121
7122	for (int c = 0; c < vd->vdev_children; c++)
7123		spa_async_remove(spa, vd->vdev_child[c]);
7124}
7125
7126static void
7127spa_async_probe(spa_t *spa, vdev_t *vd)
7128{
7129	if (vd->vdev_probe_wanted) {
7130		vd->vdev_probe_wanted = B_FALSE;
7131		vdev_reopen(vd);	/* vdev_open() does the actual probe */
7132	}
7133
7134	for (int c = 0; c < vd->vdev_children; c++)
7135		spa_async_probe(spa, vd->vdev_child[c]);
7136}
7137
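/*
 * For each leaf vdev under 'vd', post an autoexpand sysevent so that the
 * platform's device-management agents can grow the underlying device;
 * this is a no-op when the pool's autoexpand property is off.
 */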
7138static void
7139spa_async_autoexpand(spa_t *spa, vdev_t *vd)
7140{
7141	sysevent_id_t eid;
7142	nvlist_t *attr;
7143	char *physpath;
7144
7145	if (!spa->spa_autoexpand)
7146		return;
7147
7148	for (int c = 0; c < vd->vdev_children; c++) {
7149		vdev_t *cvd = vd->vdev_child[c];
7150		spa_async_autoexpand(spa, cvd);
7151	}
7152
7153	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
7154		return;
7155
7156	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
7157	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
7158
7159	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
7160	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
7161
7162	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
7163	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
7164
7165	nvlist_free(attr);
7166	kmem_free(physpath, MAXPATHLEN);
7167}
7168
7169static void
7170spa_async_thread(void *arg)
7171{
7172	spa_t *spa = (spa_t *)arg;
7173	int tasks;
7174
7175	ASSERT(spa->spa_sync_on);
7176
7177	mutex_enter(&spa->spa_async_lock);
7178	tasks = spa->spa_async_tasks;
7179	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
7180	mutex_exit(&spa->spa_async_lock);
7181
7182	/*
7183	 * See if the config needs to be updated.
7184	 */
7185	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
7186		uint64_t old_space, new_space;
7187
7188		mutex_enter(&spa_namespace_lock);
7189		old_space = metaslab_class_get_space(spa_normal_class(spa));
7190		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
7191		new_space = metaslab_class_get_space(spa_normal_class(spa));
7192		mutex_exit(&spa_namespace_lock);
7193
7194		/*
7195		 * If the pool grew as a result of the config update,
7196		 * then log an internal history event.
7197		 */
7198		if (new_space != old_space) {
7199			spa_history_log_internal(spa, "vdev online", NULL,
7200			    "pool '%s' size: %llu(+%llu)",
7201			    spa_name(spa), new_space, new_space - old_space);
7202		}
7203	}
7204
7205	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
7206		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7207		spa_async_autoexpand(spa, spa->spa_root_vdev);
7208		spa_config_exit(spa, SCL_CONFIG, FTAG);
7209	}
7210
7211	/*
7212	 * See if any devices need to be probed.
7213	 */
7214	if (tasks & SPA_ASYNC_PROBE) {
7215		spa_vdev_state_enter(spa, SCL_NONE);
7216		spa_async_probe(spa, spa->spa_root_vdev);
7217		(void) spa_vdev_state_exit(spa, NULL, 0);
7218	}
7219
7220	/*
7221	 * If any devices are done replacing, detach them.
7222	 */
7223	if (tasks & SPA_ASYNC_RESILVER_DONE)
7224		spa_vdev_resilver_done(spa);
7225
7226	/*
7227	 * Kick off a resilver.
7228	 */
7229	if (tasks & SPA_ASYNC_RESILVER)
7230		dsl_resilver_restart(spa->spa_dsl_pool, 0);
7231
7232	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
7233		mutex_enter(&spa_namespace_lock);
7234		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7235		vdev_initialize_restart(spa->spa_root_vdev);
7236		spa_config_exit(spa, SCL_CONFIG, FTAG);
7237		mutex_exit(&spa_namespace_lock);
7238	}
7239
7240	/*
7241	 * Let the world know that we're done.
7242	 */
7243	mutex_enter(&spa->spa_async_lock);
7244	spa->spa_async_thread = NULL;
7245	cv_broadcast(&spa->spa_async_cv);
7246	mutex_exit(&spa->spa_async_lock);
7247	thread_exit();
7248}
7249
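/*
 * Companion worker that handles only SPA_ASYNC_REMOVE: it marks removed
 * devices, including spares and l2cache devices, as REMOVED.  Before exiting
 * it re-checks spa_async_tasks so that a removal request which raced in
 * while it was running is not lost.
 */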
7250static void
7251spa_async_thread_vd(void *arg)
7252{
7253	spa_t *spa = arg;
7254	int tasks;
7255
7256	mutex_enter(&spa->spa_async_lock);
7257	tasks = spa->spa_async_tasks;
7258retry:
7259	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
7260	mutex_exit(&spa->spa_async_lock);
7261
7262	/*
7263	 * See if any devices need to be marked REMOVED.
7264	 */
7265	if (tasks & SPA_ASYNC_REMOVE) {
7266		spa_vdev_state_enter(spa, SCL_NONE);
7267		spa_async_remove(spa, spa->spa_root_vdev);
7268		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
7269			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
7270		for (int i = 0; i < spa->spa_spares.sav_count; i++)
7271			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
7272		(void) spa_vdev_state_exit(spa, NULL, 0);
7273	}
7274
7275	/*
7276	 * Let the world know that we're done.
7277	 */
7278	mutex_enter(&spa->spa_async_lock);
7279	tasks = spa->spa_async_tasks;
7280	if ((tasks & SPA_ASYNC_REMOVE) != 0)
7281		goto retry;
7282	spa->spa_async_thread_vd = NULL;
7283	cv_broadcast(&spa->spa_async_cv);
7284	mutex_exit(&spa->spa_async_lock);
7285	thread_exit();
7286}
7287
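/*
 * Suspend async task processing: bump the suspend count, wait for both async
 * worker threads to finish, suspend the device-removal thread, and cancel
 * the indirect-condense and checkpoint-discard zthrs.  Each call must be
 * balanced by a later spa_async_resume(); this is used, for example, around
 * pool unload and export.
 */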
7288void
7289spa_async_suspend(spa_t *spa)
7290{
7291	mutex_enter(&spa->spa_async_lock);
7292	spa->spa_async_suspended++;
7293	while (spa->spa_async_thread != NULL ||
7294	    spa->spa_async_thread_vd != NULL)
7295		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
7296	mutex_exit(&spa->spa_async_lock);
7297
7298	spa_vdev_remove_suspend(spa);
7299
7300	zthr_t *condense_thread = spa->spa_condense_zthr;
7301	if (condense_thread != NULL && zthr_isrunning(condense_thread))
7302		VERIFY0(zthr_cancel(condense_thread));
7303
7304	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
7305	if (discard_thread != NULL && zthr_isrunning(discard_thread))
7306		VERIFY0(zthr_cancel(discard_thread));
7307}
7308
7309void
7310spa_async_resume(spa_t *spa)
7311{
7312	mutex_enter(&spa->spa_async_lock);
7313	ASSERT(spa->spa_async_suspended != 0);
7314	spa->spa_async_suspended--;
7315	mutex_exit(&spa->spa_async_lock);
7316	spa_restart_removal(spa);
7317
7318	zthr_t *condense_thread = spa->spa_condense_zthr;
7319	if (condense_thread != NULL && !zthr_isrunning(condense_thread))
7320		zthr_resume(condense_thread);
7321
7322	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
7323	if (discard_thread != NULL && !zthr_isrunning(discard_thread))
7324		zthr_resume(discard_thread);
7325}
7326
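/*
 * Return B_TRUE if spa_async_thread() has work to do.  SPA_ASYNC_REMOVE is
 * excluded (it belongs to the vd worker), and a pending config update is not
 * reported until zfs_ccw_retry_interval seconds have elapsed since the last
 * failed config cache write, so that a failing write is not retried in a
 * tight loop.
 */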
7327static boolean_t
7328spa_async_tasks_pending(spa_t *spa)
7329{
7330	uint_t non_config_tasks;
7331	uint_t config_task;
7332	boolean_t config_task_suspended;
7333
7334	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
7335	    SPA_ASYNC_REMOVE);
7336	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
7337	if (spa->spa_ccw_fail_time == 0) {
7338		config_task_suspended = B_FALSE;
7339	} else {
7340		config_task_suspended =
7341		    (gethrtime() - spa->spa_ccw_fail_time) <
7342		    (zfs_ccw_retry_interval * NANOSEC);
7343	}
7344
7345	return (non_config_tasks || (config_task && !config_task_suspended));
7346}
7347
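/*
 * Create spa_async_thread() if there is eligible work, async processing is
 * not suspended, no worker is already running, and rootdir has been set up.
 * spa_async_dispatch_vd() below does the same for the SPA_ASYNC_REMOVE
 * worker.
 */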
7348static void
7349spa_async_dispatch(spa_t *spa)
7350{
7351	mutex_enter(&spa->spa_async_lock);
7352	if (spa_async_tasks_pending(spa) &&
7353	    !spa->spa_async_suspended &&
7354	    spa->spa_async_thread == NULL &&
7355	    rootdir != NULL)
7356		spa->spa_async_thread = thread_create(NULL, 0,
7357		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
7358	mutex_exit(&spa->spa_async_lock);
7359}
7360
7361static void
7362spa_async_dispatch_vd(spa_t *spa)
7363{
7364	mutex_enter(&spa->spa_async_lock);
7365	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
7366	    !spa->spa_async_suspended &&
7367	    spa->spa_async_thread_vd == NULL &&
7368	    rootdir != NULL)
7369		spa->spa_async_thread_vd = thread_create(NULL, 0,
7370		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
7371	mutex_exit(&spa->spa_async_lock);
7372}
7373
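/*
 * Record a request for asynchronous work.  Only the SPA_ASYNC_REMOVE worker
 * is dispatched from here; the remaining tasks are picked up by
 * spa_async_dispatch(), which is run from spa_sync().  A typical caller
 * looks like:
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 *
 * as spa_scan() above does when a requested resilver has nothing to do.
 */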
7374void
7375spa_async_request(spa_t *spa, int task)
7376{
7377	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
7378	mutex_enter(&spa->spa_async_lock);
7379	spa->spa_async_tasks |= task;
7380	mutex_exit(&spa->spa_async_lock);
7381	spa_async_dispatch_vd(spa);
7382}
7383
7384/*
7385 * ==========================================================================
7386 * SPA syncing routines
7387 * ==========================================================================
7388 */
7389
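/*
 * Callbacks used while syncing frees: bpobj_enqueue_cb() defers a block
 * pointer by appending it to the bpobj passed in 'arg', while
 * spa_free_sync_cb() frees it immediately by issuing a zio_free_sync()
 * under the parent zio passed in 'arg'.
 */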
7390static int
7391bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
7392{
7393	bpobj_t *bpo = arg;
7394	bpobj_enqueue(bpo, bp, tx);
7395	return (0);
7396}
7397
7398static int
7399spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
7400{
7401	zio_t *zio = arg;
7402
7403	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
7404	    BP_GET_PSIZE(bp), zio->io_flags));
7405	return (0);
7406}
7407
7408/*
7409 * Note: this simple function is not inlined to make it easier to dtrace the
7410 * amount of time spent syncing frees.
7411 */
7412static void
7413spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
7414{
7415	zio_t *zio = zio_root(spa, NULL, NULL, 0);
7416	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
7417	VERIFY(zio_wait(zio) == 0);
7418}
7419
7420/*
7421 * Note: this simple function is not inlined to make it easier to dtrace the
7422 * amount of time spent syncing deferred frees.
7423 */
7424static void
7425spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
7426{
7427	zio_t *zio = zio_root(spa, NULL, NULL, 0);
7428	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
7429	    spa_free_sync_cb, zio, tx), ==, 0);
7430	VERIFY0(zio_wait(zio));
7431}
7432
7433
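/*
 * Pack 'nv' with XDR encoding and write it into MOS object 'obj', rounding
 * the buffer up to a multiple of SPA_CONFIG_BLOCKSIZE, and record the packed
 * size in the object's bonus buffer.  Used for the pool config and the
 * spare/l2cache nvlists.
 */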
7434static void
7435spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
7436{
7437	char *packed = NULL;
7438	size_t bufsize;
7439	size_t nvsize = 0;
7440	dmu_buf_t *db;
7441
7442	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
7443
7444	/*
7445	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
7446	 * information.  This avoids the dmu_buf_will_dirty() path and
7447	 * saves us a pre-read to get data we don't actually care about.
7448	 */
7449	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
7450	packed = kmem_alloc(bufsize, KM_SLEEP);
7451
7452	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
7453	    KM_SLEEP) == 0);
7454	bzero(packed + nvsize, bufsize - nvsize);
7455
7456	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
7457
7458	kmem_free(packed, bufsize);
7459
7460	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
7461	dmu_buf_will_dirty(db, tx);
7462	*(uint64_t *)db->db_data = nvsize;
7463	dmu_buf_rele(db, FTAG);
7464}
7465
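/*
 * Sync the nvlist describing an auxiliary vdev class (spares or l2cache) to
 * its packed-nvlist object in the MOS, creating that object and its pool
 * directory entry ('entry') on first use.  'config' names the nvlist-array
 * key (e.g. ZPOOL_CONFIG_SPARES) under which the per-vdev configs are
 * stored.
 */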
7466static void
7467spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
7468    const char *config, const char *entry)
7469{
7470	nvlist_t *nvroot;
7471	nvlist_t **list;
7472	int i;
7473
7474	if (!sav->sav_sync)
7475		return;
7476
7477	/*
7478	 * Update the MOS nvlist describing the list of available devices.
7479	 * spa_validate_aux() will have already made sure this nvlist is
7480	 * valid and the vdevs are labeled appropriately.
7481	 */
7482	if (sav->sav_object == 0) {
7483		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
7484		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
7485		    sizeof (uint64_t), tx);
7486		VERIFY(zap_update(spa->spa_meta_objset,
7487		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
7488		    &sav->sav_object, tx) == 0);
7489	}
7490
7491	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
7492	if (sav->sav_count == 0) {
7493		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
7494	} else {
7495		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
7496		for (i = 0; i < sav->sav_count; i++)
7497			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
7498			    B_FALSE, VDEV_CONFIG_L2CACHE);
7499		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
7500		    sav->sav_count) == 0);
7501		for (i = 0; i < sav->sav_count; i++)
7502			nvlist_free(list[i]);
7503		kmem_free(list, sav->sav_count * sizeof (void *));
7504	}
7505
7506	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
7507	nvlist_free(nvroot);
7508
7509	sav->sav_sync = B_FALSE;
7510}
7511
7512/*
7513 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
7514 * The all-vdev ZAP must be empty.
7515 */
7516static void
7517spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
7518{
7519	spa_t *spa = vd->vdev_spa;
7520	if (vd->vdev_top_zap != 0) {
7521		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7522		    vd->vdev_top_zap, tx));
7523	}
7524	if (vd->vdev_leaf_zap != 0) {
7525		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7526		    vd->vdev_leaf_zap, tx));
7527	}
7528	for (uint64_t i = 0; i < vd->vdev_children; i++) {
7529		spa_avz_build(vd->vdev_child[i], avz, tx);
7530	}
7531}
7532
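/*
 * Sync the pool configuration to its MOS object.  Along the way, reconcile
 * the all-vdev ZAP (AVZ) according to spa_avz_action: rebuild it from the
 * per-vdev ZAPs, destroy it entirely, or create it if it does not yet exist,
 * and then create ZAPs for any vdevs that are still missing them.
 */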
7533static void
7534spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
7535{
7536	nvlist_t *config;
7537
7538	/*
7539	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
7540	 * its config may not be dirty but we still need to build per-vdev ZAPs.
7541	 * Similarly, if the pool is being assembled (e.g. after a split), we
7542	 * need to rebuild the AVZ although the config may not be dirty.
7543	 */
7544	if (list_is_empty(&spa->spa_config_dirty_list) &&
7545	    spa->spa_avz_action == AVZ_ACTION_NONE)
7546		return;
7547
7548	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7549
7550	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
7551	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
7552	    spa->spa_all_vdev_zaps != 0);
7553
7554	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
7555		/* Make and build the new AVZ */
7556		uint64_t new_avz = zap_create(spa->spa_meta_objset,
7557		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
7558		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
7559
7560		/* Diff old AVZ with new one */
7561		zap_cursor_t zc;
7562		zap_attribute_t za;
7563
7564		for (zap_cursor_init(&zc, spa->spa_meta_objset,
7565		    spa->spa_all_vdev_zaps);
7566		    zap_cursor_retrieve(&zc, &za) == 0;
7567		    zap_cursor_advance(&zc)) {
7568			uint64_t vdzap = za.za_first_integer;
7569			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
7570			    vdzap) == ENOENT) {
7571				/*
7572				 * ZAP is listed in old AVZ but not in new one;
7573				 * destroy it
7574				 */
7575				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
7576				    tx));
7577			}
7578		}
7579
7580		zap_cursor_fini(&zc);
7581
7582		/* Destroy the old AVZ */
7583		VERIFY0(zap_destroy(spa->spa_meta_objset,
7584		    spa->spa_all_vdev_zaps, tx));
7585
7586		/* Replace the old AVZ in the dir obj with the new one */
7587		VERIFY0(zap_update(spa->spa_meta_objset,
7588		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
7589		    sizeof (new_avz), 1, &new_avz, tx));
7590
7591		spa->spa_all_vdev_zaps = new_avz;
7592	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
7593		zap_cursor_t zc;
7594		zap_attribute_t za;
7595
7596		/* Walk through the AVZ and destroy all listed ZAPs */
7597		for (zap_cursor_init(&zc, spa->spa_meta_objset,
7598		    spa->spa_all_vdev_zaps);
7599		    zap_cursor_retrieve(&zc, &za) == 0;
7600		    zap_cursor_advance(&zc)) {
7601			uint64_t zap = za.za_first_integer;
7602			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
7603		}
7604
7605		zap_cursor_fini(&zc);
7606
7607		/* Destroy and unlink the AVZ itself */
7608		VERIFY0(zap_destroy(spa->spa_meta_objset,
7609		    spa->spa_all_vdev_zaps, tx));
7610		VERIFY0(zap_remove(spa->spa_meta_objset,
7611		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
7612		spa->spa_all_vdev_zaps = 0;
7613	}
7614
7615	if (spa->spa_all_vdev_zaps == 0) {
7616		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
7617		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
7618		    DMU_POOL_VDEV_ZAP_MAP, tx);
7619	}
7620	spa->spa_avz_action = AVZ_ACTION_NONE;
7621
7622	/* Create ZAPs for vdevs that don't have them. */
7623	vdev_construct_zaps(spa->spa_root_vdev, tx);
7624
7625	config = spa_config_generate(spa, spa->spa_root_vdev,
7626	    dmu_tx_get_txg(tx), B_FALSE);
7627
7628	/*
7629	 * If we're upgrading the spa version then make sure that
7630	 * the config object gets updated with the correct version.
7631	 */
7632	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
7633		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
7634		    spa->spa_uberblock.ub_version);
7635
7636	spa_config_exit(spa, SCL_STATE, FTAG);
7637
7638	nvlist_free(spa->spa_config_syncing);
7639	spa->spa_config_syncing = config;
7640
7641	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
7642}
7643
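/*
 * Sync-task callback that commits a pool version upgrade: it stores the new
 * version in the in-core uberblock and dirties the vdev configuration so the
 * labels are rewritten.  It is typically dispatched through dsl_sync_task()
 * from the property-set and upgrade paths rather than called directly.
 */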
7644static void
7645spa_sync_version(void *arg, dmu_tx_t *tx)
7646{
7647	uint64_t *versionp = arg;
7648	uint64_t version = *versionp;
7649	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7650
7651	/*
7652	 * Setting the version is special cased when first creating the pool.
7653	 */
7654	ASSERT(tx->tx_txg != TXG_INITIAL);
7655
7656	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
7657	ASSERT(version >= spa_version(spa));
7658
7659	spa->spa_uberblock.ub_version = version;
7660	vdev_config_dirty(spa->spa_root_vdev);
7661	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
7662}
7663
7664/*
7665 * Set zpool properties.
7666 */
7667static void
7668spa_sync_props(void *arg, dmu_tx_t *tx)
7669{
7670	nvlist_t *nvp = arg;
7671	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7672	objset_t *mos = spa->spa_meta_objset;
7673	nvpair_t *elem = NULL;
7674
7675	mutex_enter(&spa->spa_props_lock);
7676
7677	while ((elem = nvlist_next_nvpair(nvp, elem))) {
7678		uint64_t intval;
7679		char *strval, *fname;
7680		zpool_prop_t prop;
7681		const char *propname;
7682		zprop_type_t proptype;
7683		spa_feature_t fid;
7684
7685		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
7686		case ZPOOL_PROP_INVAL:
7687			/*
7688			 * We checked this earlier in spa_prop_validate().
7689			 */
7690			ASSERT(zpool_prop_feature(nvpair_name(elem)));
7691
7692			fname = strchr(nvpair_name(elem), '@') + 1;
7693			VERIFY0(zfeature_lookup_name(fname, &fid));
7694
7695			spa_feature_enable(spa, fid, tx);
7696			spa_history_log_internal(spa, "set", tx,
7697			    "%s=enabled", nvpair_name(elem));
7698			break;
7699
7700		case ZPOOL_PROP_VERSION:
7701			intval = fnvpair_value_uint64(elem);
7702			/*
7703			 * The version is synced separately before other
7704			 * properties and should be correct by now.
7705			 */
7706			ASSERT3U(spa_version(spa), >=, intval);
7707			break;
7708
7709		case ZPOOL_PROP_ALTROOT:
7710			/*
7711			 * 'altroot' is a non-persistent property. It should
7712			 * have been set temporarily at creation or import time.
7713			 */
7714			ASSERT(spa->spa_root != NULL);
7715			break;
7716
7717		case ZPOOL_PROP_READONLY:
7718		case ZPOOL_PROP_CACHEFILE:
7719			/*
7720			 * 'readonly' and 'cachefile' are also non-persistent
7721			 * properties.
7722			 */
7723			break;
7724		case ZPOOL_PROP_COMMENT:
7725			strval = fnvpair_value_string(elem);
7726			if (spa->spa_comment != NULL)
7727				spa_strfree(spa->spa_comment);
7728			spa->spa_comment = spa_strdup(strval);
7729			/*
7730			 * We need to dirty the configuration on all the vdevs
7731			 * so that their labels get updated.  It's unnecessary
7732			 * to do this for pool creation since the vdev's
7733			 * configuration has already been dirtied.
7734			 */
7735			if (tx->tx_txg != TXG_INITIAL)
7736				vdev_config_dirty(spa->spa_root_vdev);
7737			spa_history_log_internal(spa, "set", tx,
7738			    "%s=%s", nvpair_name(elem), strval);
7739			break;
7740		default:
7741			/*
7742			 * Set pool property values in the poolprops mos object.
7743			 */
7744			if (spa->spa_pool_props_object == 0) {
7745				spa->spa_pool_props_object =
7746				    zap_create_link(mos, DMU_OT_POOL_PROPS,
7747				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
7748				    tx);
7749			}
7750
7751			/* normalize the property name */
7752			propname = zpool_prop_to_name(prop);
7753			proptype = zpool_prop_get_type(prop);
7754
7755			if (nvpair_type(elem) == DATA_TYPE_STRING) {
7756				ASSERT(proptype == PROP_TYPE_STRING);
7757				strval = fnvpair_value_string(elem);
7758				VERIFY0(zap_update(mos,
7759				    spa->spa_pool_props_object, propname,
7760				    1, strlen(strval) + 1, strval, tx));
7761				spa_history_log_internal(spa, "set", tx,
7762				    "%s=%s", nvpair_name(elem), strval);
7763			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
7764				intval = fnvpair_value_uint64(elem);
7765
7766				if (proptype == PROP_TYPE_INDEX) {
7767					const char *unused;
7768					VERIFY0(zpool_prop_index_to_string(
7769					    prop, intval, &unused));
7770				}
7771				VERIFY0(zap_update(mos,
7772				    spa->spa_pool_props_object, propname,
7773				    8, 1, &intval, tx));
7774				spa_history_log_internal(spa, "set", tx,
7775				    "%s=%lld", nvpair_name(elem), intval);
7776			} else {
7777				ASSERT(0); /* not allowed */
7778			}
7779
7780			switch (prop) {
7781			case ZPOOL_PROP_DELEGATION:
7782				spa->spa_delegation = intval;
7783				break;
7784			case ZPOOL_PROP_BOOTFS:
7785				spa->spa_bootfs = intval;
7786				break;
7787			case ZPOOL_PROP_FAILUREMODE:
7788				spa->spa_failmode = intval;
7789				break;
7790			case ZPOOL_PROP_AUTOEXPAND:
7791				spa->spa_autoexpand = intval;
7792				if (tx->tx_txg != TXG_INITIAL)
7793					spa_async_request(spa,
7794					    SPA_ASYNC_AUTOEXPAND);
7795				break;
7796			case ZPOOL_PROP_DEDUPDITTO:
7797				spa->spa_dedup_ditto = intval;
7798				break;
7799			default:
7800				break;
7801			}
7802		}
7803
7804	}
7805
7806	mutex_exit(&spa->spa_props_lock);
7807}
7808
7809/*
7810 * Perform one-time upgrade on-disk changes.  spa_version() does not
7811 * reflect the new version this txg, so there must be no changes this
7812 * txg to anything that the upgrade code depends on after it executes.
7813 * Therefore this must be called after dsl_pool_sync() does the sync
7814 * tasks.
7815 */
7816static void
7817spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
7818{
7819	dsl_pool_t *dp = spa->spa_dsl_pool;
7820
7821	ASSERT(spa->spa_sync_pass == 1);
7822
7823	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
7824
7825	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
7826	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
7827		dsl_pool_create_origin(dp, tx);
7828
7829		/* Keeping the origin open increases spa_minref */
7830		spa->spa_minref += 3;
7831	}
7832
7833	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
7834	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
7835		dsl_pool_upgrade_clones(dp, tx);
7836	}
7837
7838	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
7839	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
7840		dsl_pool_upgrade_dir_clones(dp, tx);
7841
7842		/* Keeping the freedir open increases spa_minref */
7843		spa->spa_minref += 3;
7844	}
7845
7846	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
7847	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7848		spa_feature_create_zap_objects(spa, tx);
7849	}
7850
7851	/*
7852	 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
7853	 * when the ability to use lz4 compression for metadata was added.
7854	 * Old pools that have this feature enabled must be upgraded to have
7855	 * this feature active.
7856	 */
7857	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7858		boolean_t lz4_en = spa_feature_is_enabled(spa,
7859		    SPA_FEATURE_LZ4_COMPRESS);
7860		boolean_t lz4_ac = spa_feature_is_active(spa,
7861		    SPA_FEATURE_LZ4_COMPRESS);
7862
7863		if (lz4_en && !lz4_ac)
7864			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
7865	}
7866
7867	/*
7868	 * If we haven't written the salt, do so now.  Note that the
7869	 * feature may not be activated yet, but that's fine since
7870	 * the presence of this ZAP entry is backwards compatible.
7871	 */
7872	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7873	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
7874		VERIFY0(zap_add(spa->spa_meta_objset,
7875		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
7876		    sizeof (spa->spa_cksum_salt.zcs_bytes),
7877		    spa->spa_cksum_salt.zcs_bytes, tx));
7878	}
7879
7880	rrw_exit(&dp->dp_config_rwlock, FTAG);
7881}
7882
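/*
 * Debug-time sanity checks on a top-level vdev's indirect/removal state
 * before syncing a txg: an indirect vdev must have its mapping and births
 * objects, an obsolete space map implies a non-empty mapping, and the
 * in-core obsolete segment tree must be empty when the sync starts.
 */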
7883static void
7884vdev_indirect_state_sync_verify(vdev_t *vd)
7885{
7886	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
7887	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
7888
7889	if (vd->vdev_ops == &vdev_indirect_ops) {
7890		ASSERT(vim != NULL);
7891		ASSERT(vib != NULL);
7892	}
7893
7894	if (vdev_obsolete_sm_object(vd) != 0) {
7895		ASSERT(vd->vdev_obsolete_sm != NULL);
7896		ASSERT(vd->vdev_removing ||
7897		    vd->vdev_ops == &vdev_indirect_ops);
7898		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
7899		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
7900
7901		ASSERT3U(vdev_obsolete_sm_object(vd), ==,
7902		    space_map_object(vd->vdev_obsolete_sm));
7903		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
7904		    space_map_allocated(vd->vdev_obsolete_sm));
7905	}
7906	ASSERT(vd->vdev_obsolete_segments != NULL);
7907
7908	/*
7909	 * Since frees / remaps to an indirect vdev can only
7910	 * happen in syncing context, the obsolete segments
7911	 * tree must be empty when we start syncing.
7912	 */
7913	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
7914}
7915
7916/*
7917 * Sync the specified transaction group.  New blocks may be dirtied as
7918 * part of the process, so we iterate until it converges.
7919 */
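/*
 * In outline: latch the txg, convert pending vdev state changes into config
 * changes, then loop until the MOS stops being dirtied; each pass syncs the
 * config and aux-dev objects, the DSL pool, frees (immediate or deferred
 * depending on the pass), DDT and scan state, and any dirty vdevs.  Finally
 * the vdev labels and uberblock are rewritten (retrying through
 * zio_suspend()/zio_resume_wait() on failure), the config cache is updated,
 * and per-txg cleanup runs.
 */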
7920void
7921spa_sync(spa_t *spa, uint64_t txg)
7922{
7923	dsl_pool_t *dp = spa->spa_dsl_pool;
7924	objset_t *mos = spa->spa_meta_objset;
7925	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
7926	vdev_t *rvd = spa->spa_root_vdev;
7927	vdev_t *vd;
7928	dmu_tx_t *tx;
7929	int error;
7930	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
7931	    zfs_vdev_queue_depth_pct / 100;
7932
7933	VERIFY(spa_writeable(spa));
7934
7935	/*
7936	 * Wait for i/os issued in open context that need to complete
7937	 * before this txg syncs.
7938	 */
7939	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
7940	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
7941	    ZIO_FLAG_CANFAIL);
7942
7943	/*
7944	 * Lock out configuration changes.
7945	 */
7946	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7947
7948	spa->spa_syncing_txg = txg;
7949	spa->spa_sync_pass = 0;
7950
7951	for (int i = 0; i < spa->spa_alloc_count; i++) {
7952		mutex_enter(&spa->spa_alloc_locks[i]);
7953		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
7954		mutex_exit(&spa->spa_alloc_locks[i]);
7955	}
7956
7957	/*
7958	 * If there are any pending vdev state changes, convert them
7959	 * into config changes that go out with this transaction group.
7960	 */
7961	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7962	while (list_head(&spa->spa_state_dirty_list) != NULL) {
7963		/*
7964		 * We need the write lock here because, for aux vdevs,
7965		 * calling vdev_config_dirty() modifies sav_config.
7966		 * This is ugly and will become unnecessary when we
7967		 * eliminate the aux vdev wart by integrating all vdevs
7968		 * into the root vdev tree.
7969		 */
7970		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7971		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
7972		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
7973			vdev_state_clean(vd);
7974			vdev_config_dirty(vd);
7975		}
7976		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7977		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7978	}
7979	spa_config_exit(spa, SCL_STATE, FTAG);
7980
7981	tx = dmu_tx_create_assigned(dp, txg);
7982
7983	spa->spa_sync_starttime = gethrtime();
7984#ifdef illumos
7985	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
7986	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
7987#else	/* !illumos */
7988#ifdef _KERNEL
7989	callout_schedule(&spa->spa_deadman_cycid,
7990	    hz * spa->spa_deadman_synctime / NANOSEC);
7991#endif
7992#endif	/* illumos */
7993
7994	/*
7995	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
7996	 * set spa_deflate if we have no raid-z vdevs.
7997	 */
7998	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
7999	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
8000		int i;
8001
8002		for (i = 0; i < rvd->vdev_children; i++) {
8003			vd = rvd->vdev_child[i];
8004			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
8005				break;
8006		}
8007		if (i == rvd->vdev_children) {
8008			spa->spa_deflate = TRUE;
8009			VERIFY(0 == zap_add(spa->spa_meta_objset,
8010			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
8011			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
8012		}
8013	}
8014
8015	/*
8016	 * Set the top-level vdev's max queue depth. Evaluate each
8017	 * top-level's async write queue depth in case it changed.
8018	 * The max queue depth will not change in the middle of syncing
8019	 * out this txg.
8020	 */
8021	uint64_t slots_per_allocator = 0;
8022	for (int c = 0; c < rvd->vdev_children; c++) {
8023		vdev_t *tvd = rvd->vdev_child[c];
8024		metaslab_group_t *mg = tvd->vdev_mg;
8025
8026		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
8027		    !metaslab_group_initialized(mg))
8028			continue;
8029
8030		/*
8031		 * It is safe to do a lock-free check here because only async
8032		 * allocations look at mg_max_alloc_queue_depth, and async
8033		 * allocations all happen from spa_sync().
8034		 */
8035		for (int i = 0; i < spa->spa_alloc_count; i++)
8036			ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
8037		mg->mg_max_alloc_queue_depth = max_queue_depth;
8038
8039		for (int i = 0; i < spa->spa_alloc_count; i++) {
8040			mg->mg_cur_max_alloc_queue_depth[i] =
8041			    zfs_vdev_def_queue_depth;
8042		}
8043		slots_per_allocator += zfs_vdev_def_queue_depth;
8044	}
8045	metaslab_class_t *mc = spa_normal_class(spa);
8046	for (int i = 0; i < spa->spa_alloc_count; i++) {
8047		ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
8048		mc->mc_alloc_max_slots[i] = slots_per_allocator;
8049	}
8050	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
8051
8052	for (int c = 0; c < rvd->vdev_children; c++) {
8053		vdev_t *vd = rvd->vdev_child[c];
8054		vdev_indirect_state_sync_verify(vd);
8055
8056		if (vdev_indirect_should_condense(vd)) {
8057			spa_condense_indirect_start_sync(vd, tx);
8058			break;
8059		}
8060	}
8061
8062	/*
8063	 * Iterate to convergence.
8064	 */
8065	do {
8066		int pass = ++spa->spa_sync_pass;
8067
8068		spa_sync_config_object(spa, tx);
8069		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
8070		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
8071		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
8072		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
8073		spa_errlog_sync(spa, txg);
8074		dsl_pool_sync(dp, txg);
8075
8076		if (pass < zfs_sync_pass_deferred_free) {
8077			spa_sync_frees(spa, free_bpl, tx);
8078		} else {
8079			/*
8080			 * We can not defer frees in pass 1, because
8081			 * we sync the deferred frees later in pass 1.
8082			 */
8083			ASSERT3U(pass, >, 1);
8084			bplist_iterate(free_bpl, bpobj_enqueue_cb,
8085			    &spa->spa_deferred_bpobj, tx);
8086		}
8087
8088		ddt_sync(spa, txg);
8089		dsl_scan_sync(dp, tx);
8090
8091		if (spa->spa_vdev_removal != NULL)
8092			svr_sync(spa, tx);
8093
8094		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
8095		    != NULL)
8096			vdev_sync(vd, txg);
8097
8098		if (pass == 1) {
8099			spa_sync_upgrades(spa, tx);
8100			ASSERT3U(txg, >=,
8101			    spa->spa_uberblock.ub_rootbp.blk_birth);
8102			/*
8103			 * Note: We need to check if the MOS is dirty
8104			 * because we could have marked the MOS dirty
8105			 * without updating the uberblock (e.g. if we
8106			 * have sync tasks but no dirty user data).  We
8107			 * need to check the uberblock's rootbp because
8108			 * it is updated if we have synced out dirty
8109			 * data (though in this case the MOS will most
8110			 * likely also be dirty due to second order
8111			 * effects, we don't want to rely on that here).
8112			 */
8113			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
8114			    !dmu_objset_is_dirty(mos, txg)) {
8115				/*
8116				 * Nothing changed on the first pass,
8117				 * therefore this TXG is a no-op.  Avoid
8118				 * syncing deferred frees, so that we
8119				 * can keep this TXG as a no-op.
8120				 */
8121				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
8122				    txg));
8123				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
8124				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
8125				ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
8126				    txg));
8127				break;
8128			}
8129			spa_sync_deferred_frees(spa, tx);
8130		}
8131
8132	} while (dmu_objset_is_dirty(mos, txg));
8133
8134	if (!list_is_empty(&spa->spa_config_dirty_list)) {
8135		/*
8136		 * Make sure that the number of ZAPs for all the vdevs matches
8137		 * the number of ZAPs in the per-vdev ZAP list.  This check is
8138		 * only done if the config is dirty; otherwise there may be
8139		 * outstanding AVZ operations that weren't completed in
8140		 * spa_sync_config_object.
8141		 */
8142		uint64_t all_vdev_zap_entry_count;
8143		ASSERT0(zap_count(spa->spa_meta_objset,
8144		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
8145		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
8146		    all_vdev_zap_entry_count);
8147	}
8148
8149	if (spa->spa_vdev_removal != NULL) {
8150		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
8151	}
8152
8153	/*
8154	 * Rewrite the vdev configuration (which includes the uberblock)
8155	 * to commit the transaction group.
8156	 *
8157	 * If there are no dirty vdevs, we sync the uberblock to a few
8158	 * random top-level vdevs that are known to be visible in the
8159	 * config cache (see spa_vdev_add() for a complete description).
8160	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
8161	 */
8162	for (;;) {
8163		/*
8164		 * We hold SCL_STATE to prevent vdev open/close/etc.
8165		 * while we're attempting to write the vdev labels.
8166		 */
8167		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8168
8169		if (list_is_empty(&spa->spa_config_dirty_list)) {
8170			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
8171			int svdcount = 0;
8172			int children = rvd->vdev_children;
8173			int c0 = spa_get_random(children);
8174
8175			for (int c = 0; c < children; c++) {
8176				vd = rvd->vdev_child[(c0 + c) % children];
8177
8178				/* Stop when revisiting the first vdev */
8179				if (c > 0 && svd[0] == vd)
8180					break;
8181
8182				if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
8183				    !vdev_is_concrete(vd))
8184					continue;
8185
8186				svd[svdcount++] = vd;
8187				if (svdcount == SPA_SYNC_MIN_VDEVS)
8188					break;
8189			}
8190			error = vdev_config_sync(svd, svdcount, txg);
8191		} else {
8192			error = vdev_config_sync(rvd->vdev_child,
8193			    rvd->vdev_children, txg);
8194		}
8195
8196		if (error == 0)
8197			spa->spa_last_synced_guid = rvd->vdev_guid;
8198
8199		spa_config_exit(spa, SCL_STATE, FTAG);
8200
8201		if (error == 0)
8202			break;
8203		zio_suspend(spa, NULL);
8204		zio_resume_wait(spa);
8205	}
8206	dmu_tx_commit(tx);
8207
8208#ifdef illumos
8209	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
8210#else	/* !illumos */
8211#ifdef _KERNEL
8212	callout_drain(&spa->spa_deadman_cycid);
8213#endif
8214#endif	/* illumos */
8215
8216	/*
8217	 * Clear the dirty config list.
8218	 */
8219	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
8220		vdev_config_clean(vd);
8221
8222	/*
8223	 * Now that the new config has synced transactionally,
8224	 * let it become visible to the config cache.
8225	 */
8226	if (spa->spa_config_syncing != NULL) {
8227		spa_config_set(spa, spa->spa_config_syncing);
8228		spa->spa_config_txg = txg;
8229		spa->spa_config_syncing = NULL;
8230	}
8231
8232	dsl_pool_sync_done(dp, txg);
8233
8234	for (int i = 0; i < spa->spa_alloc_count; i++) {
8235		mutex_enter(&spa->spa_alloc_locks[i]);
8236		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
8237		mutex_exit(&spa->spa_alloc_locks[i]);
8238	}
8239
8240	/*
8241	 * Update usable space statistics.
8242	 */
8243	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
8244	    != NULL)
8245		vdev_sync_done(vd, txg);
8246
8247	spa_update_dspace(spa);
8248
8249	/*
8250	 * It had better be the case that we didn't dirty anything
8251	 * since vdev_config_sync().
8252	 */
8253	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
8254	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
8255	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
8256
8257	while (zfs_pause_spa_sync)
8258		delay(1);
8259
8260	spa->spa_sync_pass = 0;
8261
8262	/*
8263	 * Update the last synced uberblock here. We want to do this at
8264	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
8265	 * will be guaranteed that all the processing associated with
8266	 * that txg has been completed.
8267	 */
8268	spa->spa_ubsync = spa->spa_uberblock;
8269	spa_config_exit(spa, SCL_CONFIG, FTAG);
8270
8271	spa_handle_ignored_writes(spa);
8272
8273	/*
8274	 * If any async tasks have been requested, kick them off.
8275	 */
8276	spa_async_dispatch(spa);
8277	spa_async_dispatch_vd(spa);
8278}
8279
8280/*
8281 * Sync all pools.  We don't want to hold the namespace lock across these
8282 * operations, so we take a reference on the spa_t and drop the lock during the
8283 * sync.
8284 */
8285void
8286spa_sync_allpools(void)
8287{
8288	spa_t *spa = NULL;
8289	mutex_enter(&spa_namespace_lock);
8290	while ((spa = spa_next(spa)) != NULL) {
8291		if (spa_state(spa) != POOL_STATE_ACTIVE ||
8292		    !spa_writeable(spa) || spa_suspended(spa))
8293			continue;
8294		spa_open_ref(spa, FTAG);
8295		mutex_exit(&spa_namespace_lock);
8296		txg_wait_synced(spa_get_dsl(spa), 0);
8297		mutex_enter(&spa_namespace_lock);
8298		spa_close(spa, FTAG);
8299	}
8300	mutex_exit(&spa_namespace_lock);
8301}
8302
8303/*
8304 * ==========================================================================
8305 * Miscellaneous routines
8306 * ==========================================================================
8307 */
8308
8309/*
8310 * Remove all pools in the system.
8311 */
8312void
8313spa_evict_all(void)
8314{
8315	spa_t *spa;
8316
8317	/*
8318	 * Remove all cached state.  All pools should be closed now,
8319	 * so every spa in the AVL tree should be unreferenced.
8320	 */
8321	mutex_enter(&spa_namespace_lock);
8322	while ((spa = spa_next(NULL)) != NULL) {
8323		/*
8324		 * Stop async tasks.  The async thread may need to detach
8325		 * a device that's been replaced, which requires grabbing
8326		 * spa_namespace_lock, so we must drop it here.
8327		 */
8328		spa_open_ref(spa, FTAG);
8329		mutex_exit(&spa_namespace_lock);
8330		spa_async_suspend(spa);
8331		mutex_enter(&spa_namespace_lock);
8332		spa_close(spa, FTAG);
8333
8334		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
8335			spa_unload(spa);
8336			spa_deactivate(spa);
8337		}
8338		spa_remove(spa);
8339	}
8340	mutex_exit(&spa_namespace_lock);
8341}
8342
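/*
 * Find the vdev with the given guid in the pool's vdev tree; if 'aux' is
 * set, also search the l2cache and spare aux vdev lists.  Returns NULL if no
 * such vdev exists.
 */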
8343vdev_t *
8344spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
8345{
8346	vdev_t *vd;
8347	int i;
8348
8349	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
8350		return (vd);
8351
8352	if (aux) {
8353		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
8354			vd = spa->spa_l2cache.sav_vdevs[i];
8355			if (vd->vdev_guid == guid)
8356				return (vd);
8357		}
8358
8359		for (i = 0; i < spa->spa_spares.sav_count; i++) {
8360			vd = spa->spa_spares.sav_vdevs[i];
8361			if (vd->vdev_guid == guid)
8362				return (vd);
8363		}
8364	}
8365
8366	return (NULL);
8367}
8368
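/*
 * Raise the pool's on-disk version to 'version', dirty the vdev
 * configuration so that new labels are written, and wait for the change to
 * sync out.
 */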
8369void
8370spa_upgrade(spa_t *spa, uint64_t version)
8371{
8372	ASSERT(spa_writeable(spa));
8373
8374	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
8375
8376	/*
8377	 * This should only be called for a non-faulted pool, and since a
8378	 * future version would result in an unopenable pool, this shouldn't be
8379	 * possible.
8380	 */
8381	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
8382	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
8383
8384	spa->spa_uberblock.ub_version = version;
8385	vdev_config_dirty(spa->spa_root_vdev);
8386
8387	spa_config_exit(spa, SCL_ALL, FTAG);
8388
8389	txg_wait_synced(spa_get_dsl(spa), 0);
8390}
8391
8392boolean_t
8393spa_has_spare(spa_t *spa, uint64_t guid)
8394{
8395	int i;
8396	uint64_t spareguid;
8397	spa_aux_vdev_t *sav = &spa->spa_spares;
8398
8399	for (i = 0; i < sav->sav_count; i++)
8400		if (sav->sav_vdevs[i]->vdev_guid == guid)
8401			return (B_TRUE);
8402
8403	for (i = 0; i < sav->sav_npending; i++) {
8404		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
8405		    &spareguid) == 0 && spareguid == guid)
8406			return (B_TRUE);
8407	}
8408
8409	return (B_FALSE);
8410}
8411
8412/*
8413 * Check if a pool has an active shared spare device.
8414 * Note: an active spare's reference count is 2 (spare + replacing vdev).
8415 */
8416static boolean_t
8417spa_has_active_shared_spare(spa_t *spa)
8418{
8419	int i, refcnt;
8420	uint64_t pool;
8421	spa_aux_vdev_t *sav = &spa->spa_spares;
8422
8423	for (i = 0; i < sav->sav_count; i++) {
8424		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
8425		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
8426		    refcnt > 2)
8427			return (B_TRUE);
8428	}
8429
8430	return (B_FALSE);
8431}
8432
8433sysevent_t *
8434spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8435{
8436	sysevent_t		*ev = NULL;
8437#ifdef _KERNEL
8438	sysevent_attr_list_t	*attr = NULL;
8439	sysevent_value_t	value;
8440
8441	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
8442	    SE_SLEEP);
8443	ASSERT(ev != NULL);
8444
8445	value.value_type = SE_DATA_TYPE_STRING;
8446	value.value.sv_string = spa_name(spa);
8447	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
8448		goto done;
8449
8450	value.value_type = SE_DATA_TYPE_UINT64;
8451	value.value.sv_uint64 = spa_guid(spa);
8452	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
8453		goto done;
8454
8455	if (vd) {
8456		value.value_type = SE_DATA_TYPE_UINT64;
8457		value.value.sv_uint64 = vd->vdev_guid;
8458		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
8459		    SE_SLEEP) != 0)
8460			goto done;
8461
8462		if (vd->vdev_path) {
8463			value.value_type = SE_DATA_TYPE_STRING;
8464			value.value.sv_string = vd->vdev_path;
8465			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
8466			    &value, SE_SLEEP) != 0)
8467				goto done;
8468		}
8469	}
8470
8471	if (hist_nvl != NULL) {
8472		fnvlist_merge((nvlist_t *)attr, hist_nvl);
8473	}
8474
8475	if (sysevent_attach_attributes(ev, attr) != 0)
8476		goto done;
8477	attr = NULL;
8478
8479done:
8480	if (attr)
8481		sysevent_free_attr(attr);
8482
8483#endif
8484	return (ev);
8485}
8486
8487void
8488spa_event_post(sysevent_t *ev)
8489{
8490#ifdef _KERNEL
8491	sysevent_id_t		eid;
8492
8493	(void) log_sysevent(ev, SE_SLEEP, &eid);
8494	sysevent_free(ev);
8495#endif
8496}
8497
8498void
8499spa_event_discard(sysevent_t *ev)
8500{
8501#ifdef _KERNEL
8502	sysevent_free(ev);
8503#endif
8504}
8505
8506/*
8507 * Post a sysevent corresponding to the given event.  The 'name' must be one of
8508 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
8509 * filled in from the spa and (optionally) the vdev and history nvl.  This
8510 * doesn't do anything in the userland libzpool, as we don't want consumers to
8511 * misinterpret ztest or zdb as real changes.
8512 */
8513void
8514spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8515{
8516	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
8517}
8518