spa.c revision 204073
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * This file contains all the routines used when modifying on-disk SPA state.
29 * This includes opening, importing, destroying, exporting a pool, and syncing a
30 * pool.
31 */
32
33#include <sys/zfs_context.h>
34#include <sys/fm/fs/zfs.h>
35#include <sys/spa_impl.h>
36#include <sys/zio.h>
37#include <sys/zio_checksum.h>
38#include <sys/zio_compress.h>
39#include <sys/dmu.h>
40#include <sys/dmu_tx.h>
41#include <sys/zap.h>
42#include <sys/zil.h>
43#include <sys/vdev_impl.h>
44#include <sys/metaslab.h>
45#include <sys/uberblock_impl.h>
46#include <sys/txg.h>
47#include <sys/avl.h>
48#include <sys/dmu_traverse.h>
49#include <sys/dmu_objset.h>
50#include <sys/unique.h>
51#include <sys/dsl_pool.h>
52#include <sys/dsl_dataset.h>
53#include <sys/dsl_dir.h>
54#include <sys/dsl_prop.h>
55#include <sys/dsl_synctask.h>
56#include <sys/fs/zfs.h>
57#include <sys/arc.h>
58#include <sys/callb.h>
59#include <sys/sunddi.h>
60#include <sys/spa_boot.h>
61
62#include "zfs_prop.h"
63#include "zfs_comutil.h"
64
65/* Check hostid on import? */
66static int check_hostid = 1;
67
68SYSCTL_DECL(_vfs_zfs);
69TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
70SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
71    "Check hostid on import?");
72
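/*
 * Thread counts for each zio taskq, indexed by zio type and by taskq
 * type (issue vs. interrupt).  Used by spa_activate() when creating
 * the per-pool I/O taskqs.
 */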
73int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
74	/*	ISSUE	INTR					*/
75	{	1,	1	},	/* ZIO_TYPE_NULL	*/
76	{	1,	8	},	/* ZIO_TYPE_READ	*/
77	{	8,	1	},	/* ZIO_TYPE_WRITE	*/
78	{	1,	1	},	/* ZIO_TYPE_FREE	*/
79	{	1,	1	},	/* ZIO_TYPE_CLAIM	*/
80	{	1,	1	},	/* ZIO_TYPE_IOCTL	*/
81};
82
83static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
84static boolean_t spa_has_active_shared_spare(spa_t *spa);
85
86/*
87 * ==========================================================================
88 * SPA properties routines
89 * ==========================================================================
90 */
91
92/*
93 * Add a (source=src, propname=propval) list to an nvlist.
94 */
95static void
96spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
97    uint64_t intval, zprop_source_t src)
98{
99	const char *propname = zpool_prop_to_name(prop);
100	nvlist_t *propval;
101
102	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
103	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
104
105	if (strval != NULL)
106		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
107	else
108		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
109
110	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
111	nvlist_free(propval);
112}
113
114/*
115 * Get property values from the spa configuration.
116 */
117static void
118spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
119{
120	uint64_t size = spa_get_space(spa);
121	uint64_t used = spa_get_alloc(spa);
122	uint64_t cap, version;
123	zprop_source_t src = ZPROP_SRC_NONE;
124	spa_config_dirent_t *dp;
125
126	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
127
128	/*
129	 * readonly properties
130	 */
131	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
132	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
133	spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
134	spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src);
135
136	cap = (size == 0) ? 0 : (used * 100 / size);
137	spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
138
139	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
140	spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
141	    spa->spa_root_vdev->vdev_state, src);
142
143	/*
144	 * settable properties that are not stored in the pool property object.
145	 */
146	version = spa_version(spa);
147	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
148		src = ZPROP_SRC_DEFAULT;
149	else
150		src = ZPROP_SRC_LOCAL;
151	spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
152
153	if (spa->spa_root != NULL)
154		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
155		    0, ZPROP_SRC_LOCAL);
156
157	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
158		if (dp->scd_path == NULL) {
159			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
160			    "none", 0, ZPROP_SRC_LOCAL);
161		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
162			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
163			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
164		}
165	}
166}
167
168/*
169 * Get zpool property values.
170 */
171int
172spa_prop_get(spa_t *spa, nvlist_t **nvp)
173{
174	zap_cursor_t zc;
175	zap_attribute_t za;
176	objset_t *mos = spa->spa_meta_objset;
177	int err;
178
179	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
180
181	mutex_enter(&spa->spa_props_lock);
182
183	/*
184	 * Get properties from the spa config.
185	 */
186	spa_prop_get_config(spa, nvp);
187
188	/* If no pool property object, there are no more props to get. */
189	if (spa->spa_pool_props_object == 0) {
190		mutex_exit(&spa->spa_props_lock);
191		return (0);
192	}
193
194	/*
195	 * Get properties from the MOS pool property object.
196	 */
197	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
198	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
199	    zap_cursor_advance(&zc)) {
200		uint64_t intval = 0;
201		char *strval = NULL;
202		zprop_source_t src = ZPROP_SRC_DEFAULT;
203		zpool_prop_t prop;
204
205		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
206			continue;
207
208		switch (za.za_integer_length) {
209		case 8:
210			/* integer property */
211			if (za.za_first_integer !=
212			    zpool_prop_default_numeric(prop))
213				src = ZPROP_SRC_LOCAL;
214
215			if (prop == ZPOOL_PROP_BOOTFS) {
216				dsl_pool_t *dp;
217				dsl_dataset_t *ds = NULL;
218
219				dp = spa_get_dsl(spa);
220				rw_enter(&dp->dp_config_rwlock, RW_READER);
221				if (err = dsl_dataset_hold_obj(dp,
222				    za.za_first_integer, FTAG, &ds)) {
223					rw_exit(&dp->dp_config_rwlock);
224					break;
225				}
226
227				strval = kmem_alloc(
228				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
229				    KM_SLEEP);
230				dsl_dataset_name(ds, strval);
231				dsl_dataset_rele(ds, FTAG);
232				rw_exit(&dp->dp_config_rwlock);
233			} else {
234				strval = NULL;
235				intval = za.za_first_integer;
236			}
237
238			spa_prop_add_list(*nvp, prop, strval, intval, src);
239
240			if (strval != NULL)
241				kmem_free(strval,
242				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
243
244			break;
245
246		case 1:
247			/* string property */
248			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
249			err = zap_lookup(mos, spa->spa_pool_props_object,
250			    za.za_name, 1, za.za_num_integers, strval);
251			if (err) {
252				kmem_free(strval, za.za_num_integers);
253				break;
254			}
255			spa_prop_add_list(*nvp, prop, strval, 0, src);
256			kmem_free(strval, za.za_num_integers);
257			break;
258
259		default:
260			break;
261		}
262	}
263	zap_cursor_fini(&zc);
264	mutex_exit(&spa->spa_props_lock);
265out:
266	if (err && err != ENOENT) {
267		nvlist_free(*nvp);
268		*nvp = NULL;
269		return (err);
270	}
271
272	return (0);
273}
274
275/*
276 * Validate the given pool properties nvlist and modify the list
277 * for the property values to be set.
278 */
279static int
280spa_prop_validate(spa_t *spa, nvlist_t *props)
281{
282	nvpair_t *elem;
283	int error = 0, reset_bootfs = 0;
284	uint64_t objnum;
285
286	elem = NULL;
287	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
288		zpool_prop_t prop;
289		char *propname, *strval;
290		uint64_t intval;
291		objset_t *os;
292		char *slash;
293
294		propname = nvpair_name(elem);
295
296		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
297			return (EINVAL);
298
299		switch (prop) {
300		case ZPOOL_PROP_VERSION:
301			error = nvpair_value_uint64(elem, &intval);
302			if (!error &&
303			    (intval < spa_version(spa) || intval > SPA_VERSION))
304				error = EINVAL;
305			break;
306
307		case ZPOOL_PROP_DELEGATION:
308		case ZPOOL_PROP_AUTOREPLACE:
309		case ZPOOL_PROP_LISTSNAPS:
310			error = nvpair_value_uint64(elem, &intval);
311			if (!error && intval > 1)
312				error = EINVAL;
313			break;
314
315		case ZPOOL_PROP_BOOTFS:
316			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
317				error = ENOTSUP;
318				break;
319			}
320
321			/*
322			 * Make sure the vdev config is bootable
323			 */
324			if (!vdev_is_bootable(spa->spa_root_vdev)) {
325				error = ENOTSUP;
326				break;
327			}
328
329			reset_bootfs = 1;
330
331			error = nvpair_value_string(elem, &strval);
332
333			if (!error) {
334				uint64_t compress;
335
336				if (strval == NULL || strval[0] == '\0') {
337					objnum = zpool_prop_default_numeric(
338					    ZPOOL_PROP_BOOTFS);
339					break;
340				}
341
342				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
343				    DS_MODE_USER | DS_MODE_READONLY, &os))
344					break;
345
346				/* We don't support gzip bootable datasets */
347				if ((error = dsl_prop_get_integer(strval,
348				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
349				    &compress, NULL)) == 0 &&
350				    !BOOTFS_COMPRESS_VALID(compress)) {
351					error = ENOTSUP;
352				} else {
353					objnum = dmu_objset_id(os);
354				}
355				dmu_objset_close(os);
356			}
357			break;
358
359		case ZPOOL_PROP_FAILUREMODE:
360			error = nvpair_value_uint64(elem, &intval);
361			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
362			    intval > ZIO_FAILURE_MODE_PANIC))
363				error = EINVAL;
364
365			/*
366			 * This is a special case which only occurs when
367			 * the pool has completely failed. This allows
368			 * the user to change the in-core failmode property
369			 * without syncing it out to disk (I/Os might
370			 * currently be blocked). We do this by returning
371			 * EIO to the caller (spa_prop_set) to trick it
372			 * into thinking we encountered a property validation
373			 * error.
374			 */
375			if (!error && spa_suspended(spa)) {
376				spa->spa_failmode = intval;
377				error = EIO;
378			}
379			break;
380
381		case ZPOOL_PROP_CACHEFILE:
382			if ((error = nvpair_value_string(elem, &strval)) != 0)
383				break;
384
385			if (strval[0] == '\0')
386				break;
387
388			if (strcmp(strval, "none") == 0)
389				break;
390
391			if (strval[0] != '/') {
392				error = EINVAL;
393				break;
394			}
395
396			slash = strrchr(strval, '/');
397			ASSERT(slash != NULL);
398
399			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
400			    strcmp(slash, "/..") == 0)
401				error = EINVAL;
402			break;
403		}
404
405		if (error)
406			break;
407	}
408
409	if (!error && reset_bootfs) {
410		error = nvlist_remove(props,
411		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
412
413		if (!error) {
414			error = nvlist_add_uint64(props,
415			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
416		}
417	}
418
419	return (error);
420}
421
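/*
 * Validate the requested pool properties and, if they pass, schedule a
 * sync task to apply them.
 */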
422int
423spa_prop_set(spa_t *spa, nvlist_t *nvp)
424{
425	int error;
426
427	if ((error = spa_prop_validate(spa, nvp)) != 0)
428		return (error);
429
430	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
431	    spa, nvp, 3));
432}
433
434/*
435 * If the bootfs property value is dsobj, clear it.
436 */
437void
438spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
439{
440	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
441		VERIFY(zap_remove(spa->spa_meta_objset,
442		    spa->spa_pool_props_object,
443		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
444		spa->spa_bootfs = 0;
445	}
446}
447
448/*
449 * ==========================================================================
450 * SPA state manipulation (open/create/destroy/import/export)
451 * ==========================================================================
452 */
453
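/*
 * Comparison function for the error-list AVL trees; orders entries by
 * their bookmarks.
 */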
454static int
455spa_error_entry_compare(const void *a, const void *b)
456{
457	spa_error_entry_t *sa = (spa_error_entry_t *)a;
458	spa_error_entry_t *sb = (spa_error_entry_t *)b;
459	int ret;
460
461	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
462	    sizeof (zbookmark_t));
463
464	if (ret < 0)
465		return (-1);
466	else if (ret > 0)
467		return (1);
468	else
469		return (0);
470}
471
472/*
473 * Utility function which retrieves copies of the current logs and
474 * re-initializes them in the process.
475 */
476void
477spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
478{
479	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
480
481	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
482	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
483
484	avl_create(&spa->spa_errlist_scrub,
485	    spa_error_entry_compare, sizeof (spa_error_entry_t),
486	    offsetof(spa_error_entry_t, se_avl));
487	avl_create(&spa->spa_errlist_last,
488	    spa_error_entry_compare, sizeof (spa_error_entry_t),
489	    offsetof(spa_error_entry_t, se_avl));
490}
491
492/*
493 * Activate an uninitialized pool.
494 */
495static void
496spa_activate(spa_t *spa)
497{
498
499	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
500
501	spa->spa_state = POOL_STATE_ACTIVE;
502
503	spa->spa_normal_class = metaslab_class_create();
504	spa->spa_log_class = metaslab_class_create();
505
506	for (int t = 0; t < ZIO_TYPES; t++) {
507		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
508			spa->spa_zio_taskq[t][q] = taskq_create("spa_zio",
509			    zio_taskq_threads[t][q], maxclsyspri, 50,
510			    INT_MAX, TASKQ_PREPOPULATE);
511		}
512	}
513
514	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
515	    offsetof(vdev_t, vdev_config_dirty_node));
516	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
517	    offsetof(vdev_t, vdev_state_dirty_node));
518
519	txg_list_create(&spa->spa_vdev_txg_list,
520	    offsetof(struct vdev, vdev_txg_node));
521
522	avl_create(&spa->spa_errlist_scrub,
523	    spa_error_entry_compare, sizeof (spa_error_entry_t),
524	    offsetof(spa_error_entry_t, se_avl));
525	avl_create(&spa->spa_errlist_last,
526	    spa_error_entry_compare, sizeof (spa_error_entry_t),
527	    offsetof(spa_error_entry_t, se_avl));
528}
529
530/*
531 * Opposite of spa_activate().
532 */
533static void
534spa_deactivate(spa_t *spa)
535{
536	ASSERT(spa->spa_sync_on == B_FALSE);
537	ASSERT(spa->spa_dsl_pool == NULL);
538	ASSERT(spa->spa_root_vdev == NULL);
539
540	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
541
542	txg_list_destroy(&spa->spa_vdev_txg_list);
543
544	list_destroy(&spa->spa_config_dirty_list);
545	list_destroy(&spa->spa_state_dirty_list);
546
547	for (int t = 0; t < ZIO_TYPES; t++) {
548		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
549			taskq_destroy(spa->spa_zio_taskq[t][q]);
550			spa->spa_zio_taskq[t][q] = NULL;
551		}
552	}
553
554	metaslab_class_destroy(spa->spa_normal_class);
555	spa->spa_normal_class = NULL;
556
557	metaslab_class_destroy(spa->spa_log_class);
558	spa->spa_log_class = NULL;
559
560	/*
561	 * If this was part of an import or the open otherwise failed, we may
562	 * still have errors left in the queues.  Empty them just in case.
563	 */
564	spa_errlog_drain(spa);
565
566	avl_destroy(&spa->spa_errlist_scrub);
567	avl_destroy(&spa->spa_errlist_last);
568
569	spa->spa_state = POOL_STATE_UNINITIALIZED;
570}
571
572/*
573 * Verify a pool configuration, and construct the vdev tree appropriately.  This
574 * will create all the necessary vdevs in the appropriate layout, with each vdev
575 * in the CLOSED state.  This will prep the pool before open/creation/import.
576 * All vdev validation is done by the vdev_alloc() routine.
577 */
578static int
579spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
580    uint_t id, int atype)
581{
582	nvlist_t **child;
583	uint_t c, children;
584	int error;
585
586	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
587		return (error);
588
589	if ((*vdp)->vdev_ops->vdev_op_leaf)
590		return (0);
591
592	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
593	    &child, &children);
594
595	if (error == ENOENT)
596		return (0);
597
598	if (error) {
599		vdev_free(*vdp);
600		*vdp = NULL;
601		return (EINVAL);
602	}
603
604	for (c = 0; c < children; c++) {
605		vdev_t *vd;
606		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
607		    atype)) != 0) {
608			vdev_free(*vdp);
609			*vdp = NULL;
610			return (error);
611		}
612	}
613
614	ASSERT(*vdp != NULL);
615
616	return (0);
617}
618
619/*
620 * Opposite of spa_load().
621 */
622static void
623spa_unload(spa_t *spa)
624{
625	int i;
626
627	ASSERT(MUTEX_HELD(&spa_namespace_lock));
628
629	/*
630	 * Stop async tasks.
631	 */
632	spa_async_suspend(spa);
633
634	/*
635	 * Stop syncing.
636	 */
637	if (spa->spa_sync_on) {
638		txg_sync_stop(spa->spa_dsl_pool);
639		spa->spa_sync_on = B_FALSE;
640	}
641
642	/*
643	 * Wait for any outstanding async I/O to complete.
644	 */
645	mutex_enter(&spa->spa_async_root_lock);
646	while (spa->spa_async_root_count != 0)
647		cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock);
648	mutex_exit(&spa->spa_async_root_lock);
649
650	/*
651	 * Drop and purge level 2 cache
652	 */
653	spa_l2cache_drop(spa);
654
655	/*
656	 * Close the dsl pool.
657	 */
658	if (spa->spa_dsl_pool) {
659		dsl_pool_close(spa->spa_dsl_pool);
660		spa->spa_dsl_pool = NULL;
661	}
662
663	/*
664	 * Close all vdevs.
665	 */
666	if (spa->spa_root_vdev)
667		vdev_free(spa->spa_root_vdev);
668	ASSERT(spa->spa_root_vdev == NULL);
669
670	for (i = 0; i < spa->spa_spares.sav_count; i++)
671		vdev_free(spa->spa_spares.sav_vdevs[i]);
672	if (spa->spa_spares.sav_vdevs) {
673		kmem_free(spa->spa_spares.sav_vdevs,
674		    spa->spa_spares.sav_count * sizeof (void *));
675		spa->spa_spares.sav_vdevs = NULL;
676	}
677	if (spa->spa_spares.sav_config) {
678		nvlist_free(spa->spa_spares.sav_config);
679		spa->spa_spares.sav_config = NULL;
680	}
681	spa->spa_spares.sav_count = 0;
682
683	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
684		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
685	if (spa->spa_l2cache.sav_vdevs) {
686		kmem_free(spa->spa_l2cache.sav_vdevs,
687		    spa->spa_l2cache.sav_count * sizeof (void *));
688		spa->spa_l2cache.sav_vdevs = NULL;
689	}
690	if (spa->spa_l2cache.sav_config) {
691		nvlist_free(spa->spa_l2cache.sav_config);
692		spa->spa_l2cache.sav_config = NULL;
693	}
694	spa->spa_l2cache.sav_count = 0;
695
696	spa->spa_async_suspended = 0;
697}
698
699/*
700 * Load (or re-load) the current list of vdevs describing the active spares for
701 * this pool.  When this is called, we have some form of basic information in
702 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
703 * then re-generate a more complete list including status information.
704 */
705static void
706spa_load_spares(spa_t *spa)
707{
708	nvlist_t **spares;
709	uint_t nspares;
710	int i;
711	vdev_t *vd, *tvd;
712
713	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
714
715	/*
716	 * First, close and free any existing spare vdevs.
717	 */
718	for (i = 0; i < spa->spa_spares.sav_count; i++) {
719		vd = spa->spa_spares.sav_vdevs[i];
720
721		/* Undo the call to spa_activate() below */
722		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
723		    B_FALSE)) != NULL && tvd->vdev_isspare)
724			spa_spare_remove(tvd);
725		vdev_close(vd);
726		vdev_free(vd);
727	}
728
729	if (spa->spa_spares.sav_vdevs)
730		kmem_free(spa->spa_spares.sav_vdevs,
731		    spa->spa_spares.sav_count * sizeof (void *));
732
733	if (spa->spa_spares.sav_config == NULL)
734		nspares = 0;
735	else
736		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
737		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
738
739	spa->spa_spares.sav_count = (int)nspares;
740	spa->spa_spares.sav_vdevs = NULL;
741
742	if (nspares == 0)
743		return;
744
745	/*
746	 * Construct the array of vdevs, opening them to get status in the
747	 * process.  For each spare, there are potentially two different vdev_t
748	 * structures associated with it: one in the list of spares (used only
749	 * for basic validation purposes) and one in the active vdev
750	 * configuration (if it's spared in).  During this phase we open and
751	 * validate each vdev on the spare list.  If the vdev also exists in the
752	 * active configuration, then we also mark this vdev as an active spare.
753	 */
754	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
755	    KM_SLEEP);
756	for (i = 0; i < spa->spa_spares.sav_count; i++) {
757		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
758		    VDEV_ALLOC_SPARE) == 0);
759		ASSERT(vd != NULL);
760
761		spa->spa_spares.sav_vdevs[i] = vd;
762
763		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
764		    B_FALSE)) != NULL) {
765			if (!tvd->vdev_isspare)
766				spa_spare_add(tvd);
767
768			/*
769			 * We only mark the spare active if we were successfully
770			 * able to load the vdev.  Otherwise, importing a pool
771			 * with a bad active spare would result in strange
772			 * behavior, because multiple pools would think the spare
773			 * is actively in use.
774			 *
775			 * There is a vulnerability here to an equally bizarre
776			 * circumstance, where a dead active spare is later
777			 * brought back to life (onlined or otherwise).  Given
778			 * the rarity of this scenario, and the extra complexity
779			 * it adds, we ignore the possibility.
780			 */
781			if (!vdev_is_dead(tvd))
782				spa_spare_activate(tvd);
783		}
784
785		vd->vdev_top = vd;
786
787		if (vdev_open(vd) != 0)
788			continue;
789
790		if (vdev_validate_aux(vd) == 0)
791			spa_spare_add(vd);
792	}
793
794	/*
795	 * Recompute the stashed list of spares, with status information
796	 * this time.
797	 */
798	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
799	    DATA_TYPE_NVLIST_ARRAY) == 0);
800
801	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
802	    KM_SLEEP);
803	for (i = 0; i < spa->spa_spares.sav_count; i++)
804		spares[i] = vdev_config_generate(spa,
805		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
806	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
807	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
808	for (i = 0; i < spa->spa_spares.sav_count; i++)
809		nvlist_free(spares[i]);
810	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
811}
812
813/*
814 * Load (or re-load) the current list of vdevs describing the active l2cache for
815 * this pool.  When this is called, we have some form of basic information in
816 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
817 * then re-generate a more complete list including status information.
818 * Devices which are already active have their details maintained, and are
819 * not re-opened.
820 */
821static void
822spa_load_l2cache(spa_t *spa)
823{
824	nvlist_t **l2cache;
825	uint_t nl2cache;
826	int i, j, oldnvdevs;
827	uint64_t guid, size;
828	vdev_t *vd, **oldvdevs, **newvdevs;
829	spa_aux_vdev_t *sav = &spa->spa_l2cache;
830
831	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
832
833	if (sav->sav_config != NULL) {
834		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
835		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
836		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
837	} else {
838		nl2cache = 0;
839	}
840
841	oldvdevs = sav->sav_vdevs;
842	oldnvdevs = sav->sav_count;
843	sav->sav_vdevs = NULL;
844	sav->sav_count = 0;
845
846	/*
847	 * Process new nvlist of vdevs.
848	 */
849	for (i = 0; i < nl2cache; i++) {
850		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
851		    &guid) == 0);
852
853		newvdevs[i] = NULL;
854		for (j = 0; j < oldnvdevs; j++) {
855			vd = oldvdevs[j];
856			if (vd != NULL && guid == vd->vdev_guid) {
857				/*
858				 * Retain previous vdev for add/remove ops.
859				 */
860				newvdevs[i] = vd;
861				oldvdevs[j] = NULL;
862				break;
863			}
864		}
865
866		if (newvdevs[i] == NULL) {
867			/*
868			 * Create new vdev
869			 */
870			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
871			    VDEV_ALLOC_L2CACHE) == 0);
872			ASSERT(vd != NULL);
873			newvdevs[i] = vd;
874
875			/*
876			 * Commit this vdev as an l2cache device,
877			 * even if it fails to open.
878			 */
879			spa_l2cache_add(vd);
880
881			vd->vdev_top = vd;
882			vd->vdev_aux = sav;
883
884			spa_l2cache_activate(vd);
885
886			if (vdev_open(vd) != 0)
887				continue;
888
889			(void) vdev_validate_aux(vd);
890
891			if (!vdev_is_dead(vd)) {
892				size = vdev_get_rsize(vd);
893				l2arc_add_vdev(spa, vd,
894				    VDEV_LABEL_START_SIZE,
895				    size - VDEV_LABEL_START_SIZE);
896			}
897		}
898	}
899
900	/*
901	 * Purge vdevs that were dropped
902	 */
903	for (i = 0; i < oldnvdevs; i++) {
904		uint64_t pool;
905
906		vd = oldvdevs[i];
907		if (vd != NULL) {
908			if ((spa_mode & FWRITE) &&
909			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
910			    pool != 0ULL &&
911			    l2arc_vdev_present(vd)) {
912				l2arc_remove_vdev(vd);
913			}
914			(void) vdev_close(vd);
915			spa_l2cache_remove(vd);
916		}
917	}
918
919	if (oldvdevs)
920		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
921
922	if (sav->sav_config == NULL)
923		goto out;
924
925	sav->sav_vdevs = newvdevs;
926	sav->sav_count = (int)nl2cache;
927
928	/*
929	 * Recompute the stashed list of l2cache devices, with status
930	 * information this time.
931	 */
932	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
933	    DATA_TYPE_NVLIST_ARRAY) == 0);
934
935	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
936	for (i = 0; i < sav->sav_count; i++)
937		l2cache[i] = vdev_config_generate(spa,
938		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
939	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
940	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
941out:
942	for (i = 0; i < sav->sav_count; i++)
943		nvlist_free(l2cache[i]);
944	if (sav->sav_count)
945		kmem_free(l2cache, sav->sav_count * sizeof (void *));
946}
947
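/*
 * Read a packed nvlist from the given MOS object and unpack it into
 * '*value'.  The packed size is stored in the object's bonus buffer.
 */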
948static int
949load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
950{
951	dmu_buf_t *db;
952	char *packed = NULL;
953	size_t nvsize = 0;
954	int error;
955	*value = NULL;
956
957	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
958	nvsize = *(uint64_t *)db->db_data;
959	dmu_buf_rele(db, FTAG);
960
961	packed = kmem_alloc(nvsize, KM_SLEEP);
962	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
963	if (error == 0)
964		error = nvlist_unpack(packed, nvsize, value, 0);
965	kmem_free(packed, nvsize);
966
967	return (error);
968}
969
970/*
971 * Checks to see if the given vdev could not be opened, in which case we post a
972 * sysevent to notify the autoreplace code that the device has been removed.
973 */
974static void
975spa_check_removed(vdev_t *vd)
976{
977	int c;
978
979	for (c = 0; c < vd->vdev_children; c++)
980		spa_check_removed(vd->vdev_child[c]);
981
982	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
983		zfs_post_autoreplace(vd->vdev_spa, vd);
984		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
985	}
986}
987
988/*
989 * Check for missing log devices
990 */
991int
992spa_check_logs(spa_t *spa)
993{
994	switch (spa->spa_log_state) {
995	case SPA_LOG_MISSING:
996		/* need to recheck in case slog has been restored */
997	case SPA_LOG_UNKNOWN:
998		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
999		    DS_FIND_CHILDREN)) {
1000			spa->spa_log_state = SPA_LOG_MISSING;
1001			return (1);
1002		}
1003		break;
1004
1005	case SPA_LOG_CLEAR:
1006		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
1007		    DS_FIND_CHILDREN);
1008		break;
1009	}
1010	spa->spa_log_state = SPA_LOG_GOOD;
1011	return (0);
1012}
1013
1014/*
1015 * Load an existing storage pool, using the pool's builtin spa_config as a
1016 * source of configuration information.
1017 */
1018static int
1019spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1020{
1021	int error = 0;
1022	nvlist_t *nvroot = NULL;
1023	vdev_t *rvd;
1024	uberblock_t *ub = &spa->spa_uberblock;
1025	uint64_t config_cache_txg = spa->spa_config_txg;
1026	uint64_t pool_guid;
1027	uint64_t version;
1028	uint64_t autoreplace = 0;
1029	char *ereport = FM_EREPORT_ZFS_POOL;
1030
1031	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1032
1033	spa->spa_load_state = state;
1034
1035	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1036	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1037		error = EINVAL;
1038		goto out;
1039	}
1040
1041	/*
1042	 * Versioning wasn't explicitly added to the label until later, so if
1043	 * it's not present treat it as the initial version.
1044	 */
1045	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1046		version = SPA_VERSION_INITIAL;
1047
1048	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1049	    &spa->spa_config_txg);
1050
1051	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1052	    spa_guid_exists(pool_guid, 0)) {
1053		error = EEXIST;
1054		goto out;
1055	}
1056
1057	spa->spa_load_guid = pool_guid;
1058
1059	/*
1060	 * Parse the configuration into a vdev tree.  We explicitly set the
1061	 * value that will be returned by spa_version() since parsing the
1062	 * configuration requires knowing the version number.
1063	 */
1064	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1065	spa->spa_ubsync.ub_version = version;
1066	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1067	spa_config_exit(spa, SCL_ALL, FTAG);
1068
1069	if (error != 0)
1070		goto out;
1071
1072	ASSERT(spa->spa_root_vdev == rvd);
1073	ASSERT(spa_guid(spa) == pool_guid);
1074
1075	/*
1076	 * Try to open all vdevs, loading each label in the process.
1077	 */
1078	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1079	error = vdev_open(rvd);
1080	spa_config_exit(spa, SCL_ALL, FTAG);
1081	if (error != 0)
1082		goto out;
1083
1084	/*
1085	 * Validate the labels for all leaf vdevs.  We need to grab the config
1086	 * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
1087	 */
1088	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1089	error = vdev_validate(rvd);
1090	spa_config_exit(spa, SCL_ALL, FTAG);
1091
1092	if (error != 0)
1093		goto out;
1094
1095	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1096		error = ENXIO;
1097		goto out;
1098	}
1099
1100	/*
1101	 * Find the best uberblock.
1102	 */
1103	vdev_uberblock_load(NULL, rvd, ub);
1104
1105	/*
1106	 * If we weren't able to find a single valid uberblock, return failure.
1107	 */
1108	if (ub->ub_txg == 0) {
1109		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1110		    VDEV_AUX_CORRUPT_DATA);
1111		error = ENXIO;
1112		goto out;
1113	}
1114
1115	/*
1116	 * If the pool is newer than the code, we can't open it.
1117	 */
1118	if (ub->ub_version > SPA_VERSION) {
1119		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1120		    VDEV_AUX_VERSION_NEWER);
1121		error = ENOTSUP;
1122		goto out;
1123	}
1124
1125	/*
1126	 * If the vdev guid sum doesn't match the uberblock, we have an
1127	 * incomplete configuration.
1128	 */
1129	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1130		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1131		    VDEV_AUX_BAD_GUID_SUM);
1132		error = ENXIO;
1133		goto out;
1134	}
1135
1136	/*
1137	 * Initialize internal SPA structures.
1138	 */
1139	spa->spa_state = POOL_STATE_ACTIVE;
1140	spa->spa_ubsync = spa->spa_uberblock;
1141	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1142	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1143	if (error) {
1144		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1145		    VDEV_AUX_CORRUPT_DATA);
1146		goto out;
1147	}
1148	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1149
1150	if (zap_lookup(spa->spa_meta_objset,
1151	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1152	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1153		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1154		    VDEV_AUX_CORRUPT_DATA);
1155		error = EIO;
1156		goto out;
1157	}
1158
1159	if (!mosconfig) {
1160		nvlist_t *newconfig;
1161		uint64_t hostid;
1162
1163		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1164			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1165			    VDEV_AUX_CORRUPT_DATA);
1166			error = EIO;
1167			goto out;
1168		}
1169
1170		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
1171		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1172			char *hostname;
1173			unsigned long myhostid = 0;
1174
1175			VERIFY(nvlist_lookup_string(newconfig,
1176			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1177
1178			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1179			if (check_hostid && hostid != 0 && myhostid != 0 &&
1180			    (unsigned long)hostid != myhostid) {
1181				cmn_err(CE_WARN, "pool '%s' could not be "
1182				    "loaded as it was last accessed by "
1183				    "another system (host: %s hostid: 0x%lx). "
1184				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1185				    spa_name(spa), hostname,
1186				    (unsigned long)hostid);
1187				error = EBADF;
1188				goto out;
1189			}
1190		}
1191
1192		spa_config_set(spa, newconfig);
1193		spa_unload(spa);
1194		spa_deactivate(spa);
1195		spa_activate(spa);
1196
1197		return (spa_load(spa, newconfig, state, B_TRUE));
1198	}
1199
1200	if (zap_lookup(spa->spa_meta_objset,
1201	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1202	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1203		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1204		    VDEV_AUX_CORRUPT_DATA);
1205		error = EIO;
1206		goto out;
1207	}
1208
1209	/*
1210	 * Load the bit that tells us to use the new accounting function
1211	 * (raid-z deflation).  If we have an older pool, this will not
1212	 * be present.
1213	 */
1214	error = zap_lookup(spa->spa_meta_objset,
1215	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1216	    sizeof (uint64_t), 1, &spa->spa_deflate);
1217	if (error != 0 && error != ENOENT) {
1218		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1219		    VDEV_AUX_CORRUPT_DATA);
1220		error = EIO;
1221		goto out;
1222	}
1223
1224	/*
1225	 * Load the persistent error log.  If we have an older pool, this will
1226	 * not be present.
1227	 */
1228	error = zap_lookup(spa->spa_meta_objset,
1229	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1230	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1231	if (error != 0 && error != ENOENT) {
1232		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1233		    VDEV_AUX_CORRUPT_DATA);
1234		error = EIO;
1235		goto out;
1236	}
1237
1238	error = zap_lookup(spa->spa_meta_objset,
1239	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1240	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1241	if (error != 0 && error != ENOENT) {
1242		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1243		    VDEV_AUX_CORRUPT_DATA);
1244		error = EIO;
1245		goto out;
1246	}
1247
1248	/*
1249	 * Load the history object.  If we have an older pool, this
1250	 * will not be present.
1251	 */
1252	error = zap_lookup(spa->spa_meta_objset,
1253	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1254	    sizeof (uint64_t), 1, &spa->spa_history);
1255	if (error != 0 && error != ENOENT) {
1256		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1257		    VDEV_AUX_CORRUPT_DATA);
1258		error = EIO;
1259		goto out;
1260	}
1261
1262	/*
1263	 * Load any hot spares for this pool.
1264	 */
1265	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1266	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1267	if (error != 0 && error != ENOENT) {
1268		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1269		    VDEV_AUX_CORRUPT_DATA);
1270		error = EIO;
1271		goto out;
1272	}
1273	if (error == 0) {
1274		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1275		if (load_nvlist(spa, spa->spa_spares.sav_object,
1276		    &spa->spa_spares.sav_config) != 0) {
1277			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1278			    VDEV_AUX_CORRUPT_DATA);
1279			error = EIO;
1280			goto out;
1281		}
1282
1283		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1284		spa_load_spares(spa);
1285		spa_config_exit(spa, SCL_ALL, FTAG);
1286	}
1287
1288	/*
1289	 * Load any level 2 ARC devices for this pool.
1290	 */
1291	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1292	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1293	    &spa->spa_l2cache.sav_object);
1294	if (error != 0 && error != ENOENT) {
1295		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1296		    VDEV_AUX_CORRUPT_DATA);
1297		error = EIO;
1298		goto out;
1299	}
1300	if (error == 0) {
1301		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1302		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1303		    &spa->spa_l2cache.sav_config) != 0) {
1304			vdev_set_state(rvd, B_TRUE,
1305			    VDEV_STATE_CANT_OPEN,
1306			    VDEV_AUX_CORRUPT_DATA);
1307			error = EIO;
1308			goto out;
1309		}
1310
1311		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1312		spa_load_l2cache(spa);
1313		spa_config_exit(spa, SCL_ALL, FTAG);
1314	}
1315
1316	if (spa_check_logs(spa)) {
1317		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1318		    VDEV_AUX_BAD_LOG);
1319		error = ENXIO;
1320		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
1321		goto out;
1322	}
1323
1324
1325	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1326
1327	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1328	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1329
1330	if (error && error != ENOENT) {
1331		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1332		    VDEV_AUX_CORRUPT_DATA);
1333		error = EIO;
1334		goto out;
1335	}
1336
1337	if (error == 0) {
1338		(void) zap_lookup(spa->spa_meta_objset,
1339		    spa->spa_pool_props_object,
1340		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1341		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1342		(void) zap_lookup(spa->spa_meta_objset,
1343		    spa->spa_pool_props_object,
1344		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1345		    sizeof (uint64_t), 1, &autoreplace);
1346		(void) zap_lookup(spa->spa_meta_objset,
1347		    spa->spa_pool_props_object,
1348		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1349		    sizeof (uint64_t), 1, &spa->spa_delegation);
1350		(void) zap_lookup(spa->spa_meta_objset,
1351		    spa->spa_pool_props_object,
1352		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1353		    sizeof (uint64_t), 1, &spa->spa_failmode);
1354	}
1355
1356	/*
1357	 * If the 'autoreplace' property is set, then post a resource notifying
1358	 * the ZFS DE that it should not issue any faults for unopenable
1359	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1360	 * unopenable vdevs so that the normal autoreplace handler can take
1361	 * over.
1362	 */
1363	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1364		spa_check_removed(spa->spa_root_vdev);
1365
1366	/*
1367	 * Load the vdev state for all toplevel vdevs.
1368	 */
1369	vdev_load(rvd);
1370
1371	/*
1372	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1373	 */
1374	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1375	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1376	spa_config_exit(spa, SCL_ALL, FTAG);
1377
1378	/*
1379	 * Check the state of the root vdev.  If it can't be opened, it
1380	 * indicates one or more toplevel vdevs are faulted.
1381	 */
1382	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1383		error = ENXIO;
1384		goto out;
1385	}
1386
1387	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
1388		dmu_tx_t *tx;
1389		int need_update = B_FALSE;
1390		int c;
1391
1392		/*
1393		 * Claim log blocks that haven't been committed yet.
1394		 * This must all happen in a single txg.
1395		 */
1396		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1397		    spa_first_txg(spa));
1398		(void) dmu_objset_find(spa_name(spa),
1399		    zil_claim, tx, DS_FIND_CHILDREN);
1400		dmu_tx_commit(tx);
1401
1402		spa->spa_sync_on = B_TRUE;
1403		txg_sync_start(spa->spa_dsl_pool);
1404
1405		/*
1406		 * Wait for all claims to sync.
1407		 */
1408		txg_wait_synced(spa->spa_dsl_pool, 0);
1409
1410		/*
1411		 * If the config cache is stale, or we have uninitialized
1412		 * metaslabs (see spa_vdev_add()), then update the config.
1413		 */
1414		if (config_cache_txg != spa->spa_config_txg ||
1415		    state == SPA_LOAD_IMPORT)
1416			need_update = B_TRUE;
1417
1418		for (c = 0; c < rvd->vdev_children; c++)
1419			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1420				need_update = B_TRUE;
1421
1422		/*
1423		 * Update the config cache asynchronously in case we're the
1424		 * root pool, in which case the config cache isn't writable yet.
1425		 */
1426		if (need_update)
1427			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1428	}
1429
1430	error = 0;
1431out:
1432	spa->spa_minref = refcount_count(&spa->spa_refcount);
1433	if (error && error != EBADF)
1434		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1435	spa->spa_load_state = SPA_LOAD_NONE;
1436	spa->spa_ena = 0;
1437
1438	return (error);
1439}
1440
1441/*
1442 * Pool Open/Import
1443 *
1444 * The import case is identical to an open except that the configuration is sent
1445 * down from userland, instead of grabbed from the configuration cache.  For the
1446 * case of an open, the pool configuration will exist in the
1447 * POOL_STATE_UNINITIALIZED state.
1448 *
1449 * The stats information (gen/count/ustats) is used to gather vdev statistics at
1450 * the same time we open the pool, without having to keep around the spa_t in some
1451 * ambiguous state.
1452 */
1453static int
1454spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1455{
1456	spa_t *spa;
1457	int error;
1458	int locked = B_FALSE;
1459
1460	*spapp = NULL;
1461
1462	/*
1463	 * As disgusting as this is, we need to support recursive calls to this
1464	 * function because dsl_dir_open() is called during spa_load(), and ends
1465	 * up calling spa_open() again.  The real fix is to figure out how to
1466	 * avoid dsl_dir_open() calling this in the first place.
1467	 */
1468	if (mutex_owner(&spa_namespace_lock) != curthread) {
1469		mutex_enter(&spa_namespace_lock);
1470		locked = B_TRUE;
1471	}
1472
1473	if ((spa = spa_lookup(pool)) == NULL) {
1474		if (locked)
1475			mutex_exit(&spa_namespace_lock);
1476		return (ENOENT);
1477	}
1478	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1479
1480		spa_activate(spa);
1481
1482		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1483
1484		if (error == EBADF) {
1485			/*
1486			 * If vdev_validate() returns failure (indicated by
1487			 * EBADF), one of the vdevs in the pool indicates that
1488			 * the pool has been exported or destroyed.  If
1489			 * this is the case, the config cache is out of sync and
1490			 * we should remove the pool from the namespace.
1491			 */
1492			spa_unload(spa);
1493			spa_deactivate(spa);
1494			spa_config_sync(spa, B_TRUE, B_TRUE);
1495			spa_remove(spa);
1496			if (locked)
1497				mutex_exit(&spa_namespace_lock);
1498			return (ENOENT);
1499		}
1500
1501		if (error) {
1502			/*
1503			 * We can't open the pool, but we still have useful
1504			 * information: the state of each vdev after the
1505			 * attempted vdev_open().  Return this to the user.
1506			 */
1507			if (config != NULL && spa->spa_root_vdev != NULL)
1508				*config = spa_config_generate(spa, NULL, -1ULL,
1509				    B_TRUE);
1510			spa_unload(spa);
1511			spa_deactivate(spa);
1512			spa->spa_last_open_failed = B_TRUE;
1513			if (locked)
1514				mutex_exit(&spa_namespace_lock);
1515			*spapp = NULL;
1516			return (error);
1517		} else {
1518			spa->spa_last_open_failed = B_FALSE;
1519		}
1520	}
1521
1522	spa_open_ref(spa, tag);
1523
1524	if (locked)
1525		mutex_exit(&spa_namespace_lock);
1526
1527	*spapp = spa;
1528
1529	if (config != NULL)
1530		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1531
1532	return (0);
1533}
1534
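/*
 * Open a pool by name; a thin wrapper around spa_open_common() that
 * does not return the pool configuration.
 */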
1535int
1536spa_open(const char *name, spa_t **spapp, void *tag)
1537{
1538	return (spa_open_common(name, spapp, tag, NULL));
1539}
1540
1541/*
1542 * Lookup the given spa_t, incrementing the inject count in the process,
1543 * preventing it from being exported or destroyed.
1544 */
1545spa_t *
1546spa_inject_addref(char *name)
1547{
1548	spa_t *spa;
1549
1550	mutex_enter(&spa_namespace_lock);
1551	if ((spa = spa_lookup(name)) == NULL) {
1552		mutex_exit(&spa_namespace_lock);
1553		return (NULL);
1554	}
1555	spa->spa_inject_ref++;
1556	mutex_exit(&spa_namespace_lock);
1557
1558	return (spa);
1559}
1560
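/*
 * Drop the injection reference taken by spa_inject_addref().
 */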
1561void
1562spa_inject_delref(spa_t *spa)
1563{
1564	mutex_enter(&spa_namespace_lock);
1565	spa->spa_inject_ref--;
1566	mutex_exit(&spa_namespace_lock);
1567}
1568
1569/*
1570 * Add spares device information to the nvlist.
1571 */
1572static void
1573spa_add_spares(spa_t *spa, nvlist_t *config)
1574{
1575	nvlist_t **spares;
1576	uint_t i, nspares;
1577	nvlist_t *nvroot;
1578	uint64_t guid;
1579	vdev_stat_t *vs;
1580	uint_t vsc;
1581	uint64_t pool;
1582
1583	if (spa->spa_spares.sav_count == 0)
1584		return;
1585
1586	VERIFY(nvlist_lookup_nvlist(config,
1587	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1588	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1589	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1590	if (nspares != 0) {
1591		VERIFY(nvlist_add_nvlist_array(nvroot,
1592		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1593		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1594		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1595
1596		/*
1597		 * Go through and find any spares which have since been
1598		 * repurposed as an active spare.  If this is the case, update
1599		 * their status appropriately.
1600		 */
1601		for (i = 0; i < nspares; i++) {
1602			VERIFY(nvlist_lookup_uint64(spares[i],
1603			    ZPOOL_CONFIG_GUID, &guid) == 0);
1604			if (spa_spare_exists(guid, &pool, NULL) &&
1605			    pool != 0ULL) {
1606				VERIFY(nvlist_lookup_uint64_array(
1607				    spares[i], ZPOOL_CONFIG_STATS,
1608				    (uint64_t **)&vs, &vsc) == 0);
1609				vs->vs_state = VDEV_STATE_CANT_OPEN;
1610				vs->vs_aux = VDEV_AUX_SPARED;
1611			}
1612		}
1613	}
1614}
1615
1616/*
1617 * Add l2cache device information to the nvlist, including vdev stats.
1618 */
1619static void
1620spa_add_l2cache(spa_t *spa, nvlist_t *config)
1621{
1622	nvlist_t **l2cache;
1623	uint_t i, j, nl2cache;
1624	nvlist_t *nvroot;
1625	uint64_t guid;
1626	vdev_t *vd;
1627	vdev_stat_t *vs;
1628	uint_t vsc;
1629
1630	if (spa->spa_l2cache.sav_count == 0)
1631		return;
1632
1633	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1634
1635	VERIFY(nvlist_lookup_nvlist(config,
1636	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1637	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1638	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1639	if (nl2cache != 0) {
1640		VERIFY(nvlist_add_nvlist_array(nvroot,
1641		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1642		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1643		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1644
1645		/*
1646		 * Update level 2 cache device stats.
1647		 */
1648
1649		for (i = 0; i < nl2cache; i++) {
1650			VERIFY(nvlist_lookup_uint64(l2cache[i],
1651			    ZPOOL_CONFIG_GUID, &guid) == 0);
1652
1653			vd = NULL;
1654			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1655				if (guid ==
1656				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1657					vd = spa->spa_l2cache.sav_vdevs[j];
1658					break;
1659				}
1660			}
1661			ASSERT(vd != NULL);
1662
1663			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1664			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1665			vdev_get_stats(vd, vs);
1666		}
1667	}
1668
1669	spa_config_exit(spa, SCL_CONFIG, FTAG);
1670}
1671
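/*
 * Open the named pool and return its configuration, augmented with error
 * counts, spare and l2cache information, and the alternate root (if any).
 */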
1672int
1673spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1674{
1675	int error;
1676	spa_t *spa;
1677
1678	*config = NULL;
1679	error = spa_open_common(name, &spa, FTAG, config);
1680
1681	if (spa && *config != NULL) {
1682		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1683		    spa_get_errlog_size(spa)) == 0);
1684
1685		if (spa_suspended(spa))
1686			VERIFY(nvlist_add_uint64(*config,
1687			    ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0);
1688
1689		spa_add_spares(spa, *config);
1690		spa_add_l2cache(spa, *config);
1691	}
1692
1693	/*
1694	 * We want to get the alternate root even for faulted pools, so we cheat
1695	 * and call spa_lookup() directly.
1696	 */
1697	if (altroot) {
1698		if (spa == NULL) {
1699			mutex_enter(&spa_namespace_lock);
1700			spa = spa_lookup(name);
1701			if (spa)
1702				spa_altroot(spa, altroot, buflen);
1703			else
1704				altroot[0] = '\0';
1705			spa = NULL;
1706			mutex_exit(&spa_namespace_lock);
1707		} else {
1708			spa_altroot(spa, altroot, buflen);
1709		}
1710	}
1711
1712	if (spa != NULL)
1713		spa_close(spa, FTAG);
1714
1715	return (error);
1716}
1717
1718/*
1719 * Validate that the auxiliary device array is well formed.  We must have an
1720 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
1721 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1722 * specified, as long as they are well-formed.
1723 */
1724static int
1725spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1726    spa_aux_vdev_t *sav, const char *config, uint64_t version,
1727    vdev_labeltype_t label)
1728{
1729	nvlist_t **dev;
1730	uint_t i, ndev;
1731	vdev_t *vd;
1732	int error;
1733
1734	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1735
1736	/*
1737	 * It's acceptable to have no devs specified.
1738	 */
1739	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1740		return (0);
1741
1742	if (ndev == 0)
1743		return (EINVAL);
1744
1745	/*
1746	 * Make sure the pool is formatted with a version that supports this
1747	 * device type.
1748	 */
1749	if (spa_version(spa) < version)
1750		return (ENOTSUP);
1751
1752	/*
1753	 * Set the pending device list so we correctly handle device in-use
1754	 * checking.
1755	 */
1756	sav->sav_pending = dev;
1757	sav->sav_npending = ndev;
1758
1759	for (i = 0; i < ndev; i++) {
1760		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1761		    mode)) != 0)
1762			goto out;
1763
1764		if (!vd->vdev_ops->vdev_op_leaf) {
1765			vdev_free(vd);
1766			error = EINVAL;
1767			goto out;
1768		}
1769
1770		/*
1771		 * The L2ARC currently only supports disk devices in
1772		 * kernel context.  For user-level testing, we allow it.
1773		 */
1774#ifdef _KERNEL
1775		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1776		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1777			error = ENOTBLK;
1778			goto out;
1779		}
1780#endif
1781		vd->vdev_top = vd;
1782
1783		if ((error = vdev_open(vd)) == 0 &&
1784		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1785			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1786			    vd->vdev_guid) == 0);
1787		}
1788
1789		vdev_free(vd);
1790
1791		if (error &&
1792		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1793			goto out;
1794		else
1795			error = 0;
1796	}
1797
1798out:
1799	sav->sav_pending = NULL;
1800	sav->sav_npending = 0;
1801	return (error);
1802}
1803
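/*
 * Validate both the spare and l2cache device arrays in 'nvroot'.
 */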
1804static int
1805spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1806{
1807	int error;
1808
1809	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1810
1811	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1812	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1813	    VDEV_LABEL_SPARE)) != 0) {
1814		return (error);
1815	}
1816
1817	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1818	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1819	    VDEV_LABEL_L2CACHE));
1820}
1821
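/*
 * Merge the given devices into the saved auxiliary vdev configuration,
 * creating it if it does not yet exist.
 */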
1822static void
1823spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1824    const char *config)
1825{
1826	int i;
1827
1828	if (sav->sav_config != NULL) {
1829		nvlist_t **olddevs;
1830		uint_t oldndevs;
1831		nvlist_t **newdevs;
1832
1833		/*
1834		 * Generate a new dev list by concatenating it with the
1835		 * current dev list.
1836		 */
1837		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
1838		    &olddevs, &oldndevs) == 0);
1839
1840		newdevs = kmem_alloc(sizeof (void *) *
1841		    (ndevs + oldndevs), KM_SLEEP);
1842		for (i = 0; i < oldndevs; i++)
1843			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
1844			    KM_SLEEP) == 0);
1845		for (i = 0; i < ndevs; i++)
1846			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
1847			    KM_SLEEP) == 0);
1848
1849		VERIFY(nvlist_remove(sav->sav_config, config,
1850		    DATA_TYPE_NVLIST_ARRAY) == 0);
1851
1852		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1853		    config, newdevs, ndevs + oldndevs) == 0);
1854		for (i = 0; i < oldndevs + ndevs; i++)
1855			nvlist_free(newdevs[i]);
1856		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
1857	} else {
1858		/*
1859		 * Generate a new dev list.
1860		 */
1861		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
1862		    KM_SLEEP) == 0);
1863		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
1864		    devs, ndevs) == 0);
1865	}
1866}
1867
1868/*
1869 * Stop and drop level 2 ARC devices
1870 */
1871void
1872spa_l2cache_drop(spa_t *spa)
1873{
1874	vdev_t *vd;
1875	int i;
1876	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1877
1878	for (i = 0; i < sav->sav_count; i++) {
1879		uint64_t pool;
1880
1881		vd = sav->sav_vdevs[i];
1882		ASSERT(vd != NULL);
1883
1884		if ((spa_mode & FWRITE) &&
1885		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
1886		    l2arc_vdev_present(vd)) {
1887			l2arc_remove_vdev(vd);
1888		}
1889		if (vd->vdev_isl2cache)
1890			spa_l2cache_remove(vd);
1891		vdev_clear_stats(vd);
1892		(void) vdev_close(vd);
1893	}
1894}
1895
1896/*
1897 * Pool Creation
1898 */
1899int
1900spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
1901    const char *history_str, nvlist_t *zplprops)
1902{
1903	spa_t *spa;
1904	char *altroot = NULL;
1905	vdev_t *rvd;
1906	dsl_pool_t *dp;
1907	dmu_tx_t *tx;
1908	int c, error = 0;
1909	uint64_t txg = TXG_INITIAL;
1910	nvlist_t **spares, **l2cache;
1911	uint_t nspares, nl2cache;
1912	uint64_t version;
1913
1914	/*
1915	 * If this pool already exists, return failure.
1916	 */
1917	mutex_enter(&spa_namespace_lock);
1918	if (spa_lookup(pool) != NULL) {
1919		mutex_exit(&spa_namespace_lock);
1920		return (EEXIST);
1921	}
1922
1923	/*
1924	 * Allocate a new spa_t structure.
1925	 */
1926	(void) nvlist_lookup_string(props,
1927	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
1928	spa = spa_add(pool, altroot);
1929	spa_activate(spa);
1930
1931	spa->spa_uberblock.ub_txg = txg - 1;
1932
1933	if (props && (error = spa_prop_validate(spa, props))) {
1934		spa_unload(spa);
1935		spa_deactivate(spa);
1936		spa_remove(spa);
1937		mutex_exit(&spa_namespace_lock);
1938		return (error);
1939	}
1940
1941	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
1942	    &version) != 0)
1943		version = SPA_VERSION;
1944	ASSERT(version <= SPA_VERSION);
1945	spa->spa_uberblock.ub_version = version;
1946	spa->spa_ubsync = spa->spa_uberblock;
1947
1948	/*
1949	 * Create the root vdev.
1950	 */
1951	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1952
1953	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1954
1955	ASSERT(error != 0 || rvd != NULL);
1956	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1957
1958	if (error == 0 && !zfs_allocatable_devs(nvroot))
1959		error = EINVAL;
1960
1961	if (error == 0 &&
1962	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1963	    (error = spa_validate_aux(spa, nvroot, txg,
1964	    VDEV_ALLOC_ADD)) == 0) {
1965		for (c = 0; c < rvd->vdev_children; c++)
1966			vdev_init(rvd->vdev_child[c], txg);
1967		vdev_config_dirty(rvd);
1968	}
1969
1970	spa_config_exit(spa, SCL_ALL, FTAG);
1971
1972	if (error != 0) {
1973		spa_unload(spa);
1974		spa_deactivate(spa);
1975		spa_remove(spa);
1976		mutex_exit(&spa_namespace_lock);
1977		return (error);
1978	}
1979
1980	/*
1981	 * Get the list of spares, if specified.
1982	 */
1983	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1984	    &spares, &nspares) == 0) {
1985		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
1986		    KM_SLEEP) == 0);
1987		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1988		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1989		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1990		spa_load_spares(spa);
1991		spa_config_exit(spa, SCL_ALL, FTAG);
1992		spa->spa_spares.sav_sync = B_TRUE;
1993	}
1994
1995	/*
1996	 * Get the list of level 2 cache devices, if specified.
1997	 */
1998	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1999	    &l2cache, &nl2cache) == 0) {
2000		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2001		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2002		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2003		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2004		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2005		spa_load_l2cache(spa);
2006		spa_config_exit(spa, SCL_ALL, FTAG);
2007		spa->spa_l2cache.sav_sync = B_TRUE;
2008	}
2009
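	/*
	 * Create the DSL pool and the meta-objset (MOS) that will hold all
	 * pool-wide metadata for this new pool.
	 */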
2010	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
2011	spa->spa_meta_objset = dp->dp_meta_objset;
2012
2013	tx = dmu_tx_create_assigned(dp, txg);
2014
2015	/*
2016	 * Create the pool config object.
2017	 */
2018	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
2019	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
2020	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2021
2022	if (zap_add(spa->spa_meta_objset,
2023	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2024	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2025		cmn_err(CE_PANIC, "failed to add pool config");
2026	}
2027
2028	/* Newly created pools with the right version are always deflated. */
2029	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2030		spa->spa_deflate = TRUE;
2031		if (zap_add(spa->spa_meta_objset,
2032		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2033		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2034			cmn_err(CE_PANIC, "failed to add deflate");
2035		}
2036	}
2037
2038	/*
2039	 * Create the deferred-free bplist object.  Turn off compression
2040	 * because sync-to-convergence takes longer if the blocksize
2041	 * keeps changing.
2042	 */
2043	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2044	    1 << 14, tx);
2045	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2046	    ZIO_COMPRESS_OFF, tx);
2047
2048	if (zap_add(spa->spa_meta_objset,
2049	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2050	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2051		cmn_err(CE_PANIC, "failed to add bplist");
2052	}
2053
2054	/*
2055	 * Create the pool's history object.
2056	 */
2057	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2058		spa_history_create_obj(spa, tx);
2059
2060	/*
2061	 * Set pool properties.
2062	 */
2063	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2064	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2065	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2066	if (props)
2067		spa_sync_props(spa, props, CRED(), tx);
2068
2069	dmu_tx_commit(tx);
2070
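	/*
	 * Enable syncing and start the txg sync thread for this pool.
	 */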
2071	spa->spa_sync_on = B_TRUE;
2072	txg_sync_start(spa->spa_dsl_pool);
2073
2074	/*
2075	 * We explicitly wait for the first transaction to complete so that our
2076	 * bean counters are appropriately updated.
2077	 */
2078	txg_wait_synced(spa->spa_dsl_pool, txg);
2079
2080	spa_config_sync(spa, B_FALSE, B_TRUE);
2081
2082	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2083		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2084
2085	mutex_exit(&spa_namespace_lock);
2086
2087	spa->spa_minref = refcount_count(&spa->spa_refcount);
2088
2089	return (0);
2090}
2091
2092/*
2093 * Import the given pool into the system.  We set up the necessary spa_t and
2094 * then call spa_load() to do the dirty work.
2095 */
2096static int
2097spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props,
2098    boolean_t isroot, boolean_t allowfaulted)
2099{
2100	spa_t *spa;
2101	char *altroot = NULL;
2102	int error, loaderr;
2103	nvlist_t *nvroot;
2104	nvlist_t **spares, **l2cache;
2105	uint_t nspares, nl2cache;
2106
2107	/*
2108	 * If a pool with this name exists, return failure.
2109	 */
2110	mutex_enter(&spa_namespace_lock);
2111	if (spa_lookup(pool) != NULL) {
2112		mutex_exit(&spa_namespace_lock);
2113		return (EEXIST);
2114	}
2115
2116	/*
2117	 * Create and initialize the spa structure.
2118	 */
2119	(void) nvlist_lookup_string(props,
2120	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2121	spa = spa_add(pool, altroot);
2122	spa_activate(spa);
2123
2124	if (allowfaulted)
2125		spa->spa_import_faulted = B_TRUE;
2126	spa->spa_is_root = isroot;
2127
2128	/*
2129	 * Pass off the heavy lifting to spa_load().
2130	 * Pass TRUE for mosconfig (unless this is a root pool) because
2131	 * the user-supplied config is actually the one to trust when
2132	 * doing an import.
2133	 */
2134	loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot);
2135
2136	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2137	/*
2138	 * Toss any existing sparelist, as it doesn't have any validity anymore,
2139	 * and conflicts with spa_has_spare().
2140	 */
2141	if (!isroot && spa->spa_spares.sav_config) {
2142		nvlist_free(spa->spa_spares.sav_config);
2143		spa->spa_spares.sav_config = NULL;
2144		spa_load_spares(spa);
2145	}
2146	if (!isroot && spa->spa_l2cache.sav_config) {
2147		nvlist_free(spa->spa_l2cache.sav_config);
2148		spa->spa_l2cache.sav_config = NULL;
2149		spa_load_l2cache(spa);
2150	}
2151
2152	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2153	    &nvroot) == 0);
2154	if (error == 0)
2155		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
2156	if (error == 0)
2157		error = spa_validate_aux(spa, nvroot, -1ULL,
2158		    VDEV_ALLOC_L2CACHE);
2159	spa_config_exit(spa, SCL_ALL, FTAG);
2160
2161	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
2162		if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
2163			/*
2164			 * If we failed to load the pool, but 'allowfaulted' is
2165			 * set, then manually set the config as if the config
2166			 * passed in was specified in the cache file.
2167			 */
2168			error = 0;
2169			spa->spa_import_faulted = B_FALSE;
2170			if (spa->spa_config == NULL)
2171				spa->spa_config = spa_config_generate(spa,
2172				    NULL, -1ULL, B_TRUE);
2173			spa_unload(spa);
2174			spa_deactivate(spa);
2175			spa_config_sync(spa, B_FALSE, B_TRUE);
2176		} else {
2177			spa_unload(spa);
2178			spa_deactivate(spa);
2179			spa_remove(spa);
2180		}
2181		mutex_exit(&spa_namespace_lock);
2182		return (error);
2183	}
2184
2185	/*
2186	 * Override any spares and level 2 cache devices as specified by
2187	 * the user, as these may have correct device names/devids, etc.
2188	 */
2189	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2190	    &spares, &nspares) == 0) {
2191		if (spa->spa_spares.sav_config)
2192			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2193			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2194		else
2195			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2196			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2197		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2198		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2199		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2200		spa_load_spares(spa);
2201		spa_config_exit(spa, SCL_ALL, FTAG);
2202		spa->spa_spares.sav_sync = B_TRUE;
2203	}
2204	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2205	    &l2cache, &nl2cache) == 0) {
2206		if (spa->spa_l2cache.sav_config)
2207			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2208			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2209		else
2210			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2211			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2212		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2213		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2214		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2215		spa_load_l2cache(spa);
2216		spa_config_exit(spa, SCL_ALL, FTAG);
2217		spa->spa_l2cache.sav_sync = B_TRUE;
2218	}
2219
2220	if (spa_mode & FWRITE) {
2221		/*
2222		 * Update the config cache to include the newly-imported pool.
2223		 */
2224		spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
2225	}
2226
2227	spa->spa_import_faulted = B_FALSE;
2228	mutex_exit(&spa_namespace_lock);
2229
2230	return (0);
2231}
2232
2233#if defined(sun)
2234#ifdef _KERNEL
2235/*
2236 * Build a "root" vdev for a top level vdev read in from a rootpool
2237 * device label.
2238 */
2239static void
2240spa_build_rootpool_config(nvlist_t *config)
2241{
2242	nvlist_t *nvtop, *nvroot;
2243	uint64_t pgid;
2244
2245	/*
2246	 * Add this top-level vdev to the child array.
2247	 */
2248	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop)
2249	    == 0);
2250	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid)
2251	    == 0);
2252
2253	/*
2254	 * Put this pool's top-level vdevs into a root vdev.
2255	 */
2256	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2257	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT)
2258	    == 0);
2259	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
2260	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
2261	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
2262	    &nvtop, 1) == 0);
2263
2264	/*
2265	 * Replace the existing vdev_tree with the new root vdev in
2266	 * this pool's configuration (remove the old, add the new).
2267	 */
2268	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
2269	nvlist_free(nvroot);
2270}
2271
2272/*
2273 * Get the root pool information from the root disk, then import the root pool
2274 * during the system boot up time.
2275 */
2276extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
2277
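/*
 * Read the vdev label from the given boot device and, if requested, hand
 * back its pool configuration along with the txg recorded in that label.
 */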
2278int
2279spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf,
2280    uint64_t *besttxg)
2281{
2282	nvlist_t *config;
2283	uint64_t txg;
2284	int error;
2285
2286	if (error = vdev_disk_read_rootlabel(devpath, devid, &config))
2287		return (error);
2288
2289	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
2290
2291	if (bestconf != NULL)
2292		*bestconf = config;
2293	else
2294		nvlist_free(config);
2295	*besttxg = txg;
2296	return (0);
2297}
2298
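/*
 * A root vdev is only considered valid if its label does not mark it
 * offline, faulted, or removed.
 */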
2299boolean_t
2300spa_rootdev_validate(nvlist_t *nv)
2301{
2302	uint64_t ival;
2303
2304	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
2305	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
2306	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
2307		return (B_FALSE);
2308
2309	return (B_TRUE);
2310}
2311
2312
2313/*
2314 * Given the boot device's physical path or devid, check if the device
2315 * is in a valid state.  If so, return the configuration from the vdev
2316 * label.
2317 */
2318int
2319spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf)
2320{
2321	nvlist_t *conf = NULL;
2322	uint64_t txg = 0;
2323	nvlist_t *nvtop, **child;
2324	char *type;
2325	char *bootpath = NULL;
2326	uint_t children, c;
2327	char *tmp;
2328	int error;
2329
2330	if (devpath && ((tmp = strchr(devpath, ' ')) != NULL))
2331		*tmp = '\0';
2332	if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) {
2333		cmn_err(CE_NOTE, "error reading device label");
2334		return (error);
2335	}
2336	if (txg == 0) {
2337		cmn_err(CE_NOTE, "this device is detached");
2338		nvlist_free(conf);
2339		return (EINVAL);
2340	}
2341
2342	VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE,
2343	    &nvtop) == 0);
2344	VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0);
2345
2346	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
2347		if (spa_rootdev_validate(nvtop)) {
2348			goto out;
2349		} else {
2350			nvlist_free(conf);
2351			return (EINVAL);
2352		}
2353	}
2354
2355	ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0);
2356
2357	VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN,
2358	    &child, &children) == 0);
2359
2360	/*
2361	 * Go through the vdevs in the mirror to see if the given device
2362	 * has the most recent txg. Only the device with the most
2363	 * recent txg has valid information and should be booted.
2364	 */
2365	for (c = 0; c < children; c++) {
2366		char *cdevid, *cpath;
2367		uint64_t tmptxg;
2368
2369		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH,
2370		    &cpath) != 0)
2371			return (EINVAL);
2372		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID,
2373		    &cdevid) != 0)
2374			return (EINVAL);
2375		if ((spa_check_rootconf(cpath, cdevid, NULL,
2376		    &tmptxg) == 0) && (tmptxg > txg)) {
2377			txg = tmptxg;
2378			VERIFY(nvlist_lookup_string(child[c],
2379			    ZPOOL_CONFIG_PATH, &bootpath) == 0);
2380		}
2381	}
2382
2383	/* Does the best device match the one we've booted from? */
2384	if (bootpath) {
2385		cmn_err(CE_NOTE, "try booting from '%s'", bootpath);
2386		return (EINVAL);
2387	}
2388out:
2389	*bestconf = conf;
2390	return (0);
2391}
2392
2393/*
2394 * Import a root pool.
2395 *
2396 * For x86, devpath_list will consist of the devid and/or physpath name of
2397 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
2398 * The GRUB "findroot" command will return the vdev we should boot from.
2399 *
2400 * For SPARC, devpath_list consists of the physpath name of the booting
2401 * device, whether the root pool is a single-device pool or a mirrored pool.
2402 * e.g.
2403 *	"/pci@1f,0/ide@d/disk@0,0:a"
2404 */
2405int
2406spa_import_rootpool(char *devpath, char *devid)
2407{
2408	nvlist_t *conf = NULL;
2409	char *pname;
2410	int error;
2411
2412	/*
2413	 * Get the vdev pathname and configuration from the most
2414	 * recently updated vdev (highest txg).
2415	 */
2416	if (error = spa_get_rootconf(devpath, devid, &conf))
2417		goto msg_out;
2418
2419	/*
2420	 * Add type "root" vdev to the config.
2421	 */
2422	spa_build_rootpool_config(conf);
2423
2424	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0);
2425
2426	/*
2427	 * We specify 'allowfaulted' for this to be treated like spa_open()
2428	 * instead of spa_import().  This prevents us from marking vdevs as
2429	 * persistently unavailable, and generates FMA ereports as if it were a
2430	 * pool open, not import.
2431	 */
2432	error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE);
2433	if (error == EEXIST)
2434		error = 0;
2435
2436	nvlist_free(conf);
2437	return (error);
2438
2439msg_out:
2440	cmn_err(CE_NOTE, "\n"
2441	    "  ***************************************************  \n"
2442	    "  *  This device is not bootable!                   *  \n"
2443	    "  *  It is either offlined or detached or faulted.  *  \n"
2444	    "  *  Please try to boot from a different device.    *  \n"
2445	    "  ***************************************************  ");
2446
2447	return (error);
2448}
2449#endif
2450#endif
2451
2452/*
2453 * Import a non-root pool into the system.
2454 */
2455int
2456spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2457{
2458	return (spa_import_common(pool, config, props, B_FALSE, B_FALSE));
2459}
2460
2461int
2462spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props)
2463{
2464	return (spa_import_common(pool, config, props, B_FALSE, B_TRUE));
2465}
2466
2467
2468/*
2469 * This (illegal) pool name is used when temporarily importing a spa_t in order
2470 * to get the vdev stats associated with the imported devices.
2471 */
2472#define	TRYIMPORT_NAME	"$import"
2473
2474nvlist_t *
2475spa_tryimport(nvlist_t *tryconfig)
2476{
2477	nvlist_t *config = NULL;
2478	char *poolname;
2479	spa_t *spa;
2480	uint64_t state;
2481
2482	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2483		return (NULL);
2484
2485	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2486		return (NULL);
2487
2488	/*
2489	 * Create and initialize the spa structure.
2490	 */
2491	mutex_enter(&spa_namespace_lock);
2492	spa = spa_add(TRYIMPORT_NAME, NULL);
2493	spa_activate(spa);
2494
2495	/*
2496	 * Pass off the heavy lifting to spa_load().
2497	 * Pass TRUE for mosconfig because the user-supplied config
2498	 * is actually the one to trust when doing an import.
2499	 */
2500	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2501
2502	/*
2503	 * If 'tryconfig' was at least parsable, return the current config.
2504	 */
2505	if (spa->spa_root_vdev != NULL) {
2506		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2507		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2508		    poolname) == 0);
2509		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2510		    state) == 0);
2511		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2512		    spa->spa_uberblock.ub_timestamp) == 0);
2513
2514		/*
2515		 * If the bootfs property exists on this pool then we
2516		 * copy it out so that external consumers can tell which
2517		 * pools are bootable.
2518		 */
2519		if (spa->spa_bootfs) {
2520			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2521
2522			/*
2523			 * We have to play games with the name since the
2524			 * pool was opened as TRYIMPORT_NAME.
2525			 */
2526			if (dsl_dsobj_to_dsname(spa_name(spa),
2527			    spa->spa_bootfs, tmpname) == 0) {
2528				char *cp;
2529				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2530
2531				cp = strchr(tmpname, '/');
2532				if (cp == NULL) {
2533					(void) strlcpy(dsname, tmpname,
2534					    MAXPATHLEN);
2535				} else {
2536					(void) snprintf(dsname, MAXPATHLEN,
2537					    "%s/%s", poolname, ++cp);
2538				}
2539				VERIFY(nvlist_add_string(config,
2540				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
2541				kmem_free(dsname, MAXPATHLEN);
2542			}
2543			kmem_free(tmpname, MAXPATHLEN);
2544		}
2545
2546		/*
2547		 * Add the list of hot spares and level 2 cache devices.
2548		 */
2549		spa_add_spares(spa, config);
2550		spa_add_l2cache(spa, config);
2551	}
2552
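	/*
	 * We're done with the temporary spa_t; tear it down and return the
	 * generated config (if any) to the caller.
	 */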
2553	spa_unload(spa);
2554	spa_deactivate(spa);
2555	spa_remove(spa);
2556	mutex_exit(&spa_namespace_lock);
2557
2558	return (config);
2559}
2560
2561/*
2562 * Pool export/destroy
2563 *
2564 * The act of destroying or exporting a pool is very simple.  We make sure there
2565 * is no more pending I/O and any references to the pool are gone.  Then, we
2566 * update the pool state and sync all the labels to disk, removing the
2567 * configuration from the cache afterwards.
2568 */
2569static int
2570spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
2571    boolean_t force)
2572{
2573	spa_t *spa;
2574
2575	if (oldconfig)
2576		*oldconfig = NULL;
2577
2578	if (!(spa_mode & FWRITE))
2579		return (EROFS);
2580
2581	mutex_enter(&spa_namespace_lock);
2582	if ((spa = spa_lookup(pool)) == NULL) {
2583		mutex_exit(&spa_namespace_lock);
2584		return (ENOENT);
2585	}
2586
2587	/*
2588	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2589	 * reacquire the namespace lock, and see if we can export.
2590	 */
2591	spa_open_ref(spa, FTAG);
2592	mutex_exit(&spa_namespace_lock);
2593	spa_async_suspend(spa);
2594	mutex_enter(&spa_namespace_lock);
2595	spa_close(spa, FTAG);
2596
2597	/*
2598	 * The pool will be in core if it's openable,
2599	 * in which case we can modify its state.
2600	 */
2601	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2602		/*
2603		 * Objsets may be open only because they're dirty, so we
2604		 * have to force it to sync before checking spa_refcnt.
2605		 */
2606		txg_wait_synced(spa->spa_dsl_pool, 0);
2607
2608		/*
2609		 * A pool cannot be exported or destroyed if there are active
2610		 * references.  If we are resetting a pool, allow references by
2611		 * fault injection handlers.
2612		 */
2613		if (!spa_refcount_zero(spa) ||
2614		    (spa->spa_inject_ref != 0 &&
2615		    new_state != POOL_STATE_UNINITIALIZED)) {
2616			spa_async_resume(spa);
2617			mutex_exit(&spa_namespace_lock);
2618			return (EBUSY);
2619		}
2620
2621		/*
2622		 * A pool cannot be exported if it has an active shared spare.
2623		 * This is to prevent other pools from stealing the active spare
2624		 * from an exported pool.  The user may still force the export
2625		 * if desired.
2626		 */
2627		if (!force && new_state == POOL_STATE_EXPORTED &&
2628		    spa_has_active_shared_spare(spa)) {
2629			spa_async_resume(spa);
2630			mutex_exit(&spa_namespace_lock);
2631			return (EXDEV);
2632		}
2633
2634		/*
2635		 * We want this to be reflected on every label,
2636		 * so mark them all dirty.  spa_unload() will do the
2637		 * final sync that pushes these changes out.
2638		 */
2639		if (new_state != POOL_STATE_UNINITIALIZED) {
2640			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2641			spa->spa_state = new_state;
2642			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2643			vdev_config_dirty(spa->spa_root_vdev);
2644			spa_config_exit(spa, SCL_ALL, FTAG);
2645		}
2646	}
2647
2648	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2649
2650	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2651		spa_unload(spa);
2652		spa_deactivate(spa);
2653	}
2654
2655	if (oldconfig && spa->spa_config)
2656		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2657
2658	if (new_state != POOL_STATE_UNINITIALIZED) {
2659		spa_config_sync(spa, B_TRUE, B_TRUE);
2660		spa_remove(spa);
2661	}
2662	mutex_exit(&spa_namespace_lock);
2663
2664	return (0);
2665}
2666
2667/*
2668 * Destroy a storage pool.
2669 */
2670int
2671spa_destroy(char *pool)
2672{
2673	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE));
2674}
2675
2676/*
2677 * Export a storage pool.
2678 */
2679int
2680spa_export(char *pool, nvlist_t **oldconfig, boolean_t force)
2681{
2682	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force));
2683}
2684
2685/*
2686 * Similar to spa_export(), this unloads the spa_t without actually removing it
2687 * from the namespace in any way.
2688 */
2689int
2690spa_reset(char *pool)
2691{
2692	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
2693	    B_FALSE));
2694}
2695
2696/*
2697 * ==========================================================================
2698 * Device manipulation
2699 * ==========================================================================
2700 */
2701
2702/*
2703 * Add a device to a storage pool.
2704 */
2705int
2706spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2707{
2708	uint64_t txg;
2709	int c, error;
2710	vdev_t *rvd = spa->spa_root_vdev;
2711	vdev_t *vd, *tvd;
2712	nvlist_t **spares, **l2cache;
2713	uint_t nspares, nl2cache;
2714
2715	txg = spa_vdev_enter(spa);
2716
2717	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2718	    VDEV_ALLOC_ADD)) != 0)
2719		return (spa_vdev_exit(spa, NULL, txg, error));
2720
2721	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
2722
2723	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2724	    &nspares) != 0)
2725		nspares = 0;
2726
2727	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2728	    &nl2cache) != 0)
2729		nl2cache = 0;
2730
2731	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
2732		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2733
2734	if (vd->vdev_children != 0 &&
2735	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
2736		return (spa_vdev_exit(spa, vd, txg, error));
2737
2738	/*
2739	 * We must validate the spares and l2cache devices after checking the
2740	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2741	 */
2742	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
2743		return (spa_vdev_exit(spa, vd, txg, error));
2744
2745	/*
2746	 * Transfer each new top-level vdev from vd to rvd.
2747	 */
2748	for (c = 0; c < vd->vdev_children; c++) {
2749		tvd = vd->vdev_child[c];
2750		vdev_remove_child(vd, tvd);
2751		tvd->vdev_id = rvd->vdev_children;
2752		vdev_add_child(rvd, tvd);
2753		vdev_config_dirty(tvd);
2754	}
2755
2756	if (nspares != 0) {
2757		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2758		    ZPOOL_CONFIG_SPARES);
2759		spa_load_spares(spa);
2760		spa->spa_spares.sav_sync = B_TRUE;
2761	}
2762
2763	if (nl2cache != 0) {
2764		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2765		    ZPOOL_CONFIG_L2CACHE);
2766		spa_load_l2cache(spa);
2767		spa->spa_l2cache.sav_sync = B_TRUE;
2768	}
2769
2770	/*
2771	 * We have to be careful when adding new vdevs to an existing pool.
2772	 * If other threads start allocating from these vdevs before we
2773	 * sync the config cache, and we lose power, then upon reboot we may
2774	 * fail to open the pool because there are DVAs that the config cache
2775	 * can't translate.  Therefore, we first add the vdevs without
2776	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2777	 * and then let spa_config_update() initialize the new metaslabs.
2778	 *
2779	 * spa_load() checks for added-but-not-initialized vdevs, so that
2780	 * if we lose power at any point in this sequence, the remaining
2781	 * steps will be completed the next time we load the pool.
2782	 */
2783	(void) spa_vdev_exit(spa, vd, txg, 0);
2784
2785	mutex_enter(&spa_namespace_lock);
2786	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2787	mutex_exit(&spa_namespace_lock);
2788
2789	return (0);
2790}
2791
2792/*
2793 * Attach a device to a mirror.  The arguments are the path to any device
2794 * in the mirror, and the nvroot for the new device.  If the path specifies
2795 * a device that is not mirrored, we automatically insert the mirror vdev.
2796 *
2797 * If 'replacing' is specified, the new device is intended to replace the
2798 * existing device; in this case the two devices are made into their own
2799 * mirror using the 'replacing' vdev, which is functionally identical to
2800 * the mirror vdev (it actually reuses all the same ops) but has a few
2801 * extra rules: you can't attach to it after it's been created, and upon
2802 * completion of resilvering, the first disk (the one being replaced)
2803 * is automatically detached.
2804 */
2805int
2806spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2807{
2808	uint64_t txg, open_txg;
2809	vdev_t *rvd = spa->spa_root_vdev;
2810	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
2811	vdev_ops_t *pvops;
2812	dmu_tx_t *tx;
2813	char *oldvdpath, *newvdpath;
2814	int newvd_isspare;
2815	int error;
2816
2817	txg = spa_vdev_enter(spa);
2818
2819	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
2820
2821	if (oldvd == NULL)
2822		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2823
2824	if (!oldvd->vdev_ops->vdev_op_leaf)
2825		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2826
2827	pvd = oldvd->vdev_parent;
2828
2829	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
2830	    VDEV_ALLOC_ADD)) != 0)
2831		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
2832
2833	if (newrootvd->vdev_children != 1)
2834		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2835
2836	newvd = newrootvd->vdev_child[0];
2837
2838	if (!newvd->vdev_ops->vdev_op_leaf)
2839		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2840
2841	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
2842		return (spa_vdev_exit(spa, newrootvd, txg, error));
2843
2844	/*
2845	 * Spares can't replace logs
2846	 */
2847	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
2848		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2849
2850	if (!replacing) {
2851		/*
2852		 * For attach, the only allowable parent is a mirror or the root
2853		 * vdev.
2854		 */
2855		if (pvd->vdev_ops != &vdev_mirror_ops &&
2856		    pvd->vdev_ops != &vdev_root_ops)
2857			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2858
2859		pvops = &vdev_mirror_ops;
2860	} else {
2861		/*
2862		 * Active hot spares can only be replaced by inactive hot
2863		 * spares.
2864		 */
2865		if (pvd->vdev_ops == &vdev_spare_ops &&
2866		    pvd->vdev_child[1] == oldvd &&
2867		    !spa_has_spare(spa, newvd->vdev_guid))
2868			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2869
2870		/*
2871		 * If the source is a hot spare, and the parent isn't already a
2872		 * spare, then we want to create a new hot spare.  Otherwise, we
2873		 * want to create a replacing vdev.  The user is not allowed to
2874		 * attach to a spared vdev child unless the 'isspare' state is
2875		 * the same (spare replaces spare, non-spare replaces
2876		 * non-spare).
2877		 */
2878		if (pvd->vdev_ops == &vdev_replacing_ops)
2879			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2880		else if (pvd->vdev_ops == &vdev_spare_ops &&
2881		    newvd->vdev_isspare != oldvd->vdev_isspare)
2882			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2883		else if (pvd->vdev_ops != &vdev_spare_ops &&
2884		    newvd->vdev_isspare)
2885			pvops = &vdev_spare_ops;
2886		else
2887			pvops = &vdev_replacing_ops;
2888	}
2889
2890	/*
2891	 * Compare the new device size with the replaceable/attachable
2892	 * device size.
2893	 */
2894	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
2895		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
2896
2897	/*
2898	 * The new device cannot have a higher alignment requirement
2899	 * than the top-level vdev.
2900	 */
2901	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
2902		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
2903
2904	/*
2905	 * If this is an in-place replacement, update oldvd's path and devid
2906	 * to make it distinguishable from newvd, and unopenable from now on.
2907	 */
2908	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
2909		spa_strfree(oldvd->vdev_path);
2910		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
2911		    KM_SLEEP);
2912		(void) sprintf(oldvd->vdev_path, "%s/%s",
2913		    newvd->vdev_path, "old");
2914		if (oldvd->vdev_devid != NULL) {
2915			spa_strfree(oldvd->vdev_devid);
2916			oldvd->vdev_devid = NULL;
2917		}
2918	}
2919
2920	/*
2921	 * If the parent is not a mirror, or if we're replacing, insert the new
2922	 * mirror/replacing/spare vdev above oldvd.
2923	 */
2924	if (pvd->vdev_ops != pvops)
2925		pvd = vdev_add_parent(oldvd, pvops);
2926
2927	ASSERT(pvd->vdev_top->vdev_parent == rvd);
2928	ASSERT(pvd->vdev_ops == pvops);
2929	ASSERT(oldvd->vdev_parent == pvd);
2930
2931	/*
2932	 * Extract the new device from its root and add it to pvd.
2933	 */
2934	vdev_remove_child(newrootvd, newvd);
2935	newvd->vdev_id = pvd->vdev_children;
2936	vdev_add_child(pvd, newvd);
2937
2938	/*
2939	 * If newvd is smaller than oldvd, but larger than its rsize,
2940	 * the addition of newvd may have decreased our parent's asize.
2941	 */
2942	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
2943
2944	tvd = newvd->vdev_top;
2945	ASSERT(pvd->vdev_top == tvd);
2946	ASSERT(tvd->vdev_parent == rvd);
2947
2948	vdev_config_dirty(tvd);
2949
2950	/*
2951	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
2952	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
2953	 */
2954	open_txg = txg + TXG_CONCURRENT_STATES - 1;
2955
2956	mutex_enter(&newvd->vdev_dtl_lock);
2957	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
2958	    open_txg - TXG_INITIAL + 1);
2959	mutex_exit(&newvd->vdev_dtl_lock);
2960
2961	if (newvd->vdev_isspare)
2962		spa_spare_activate(newvd);
2963	oldvdpath = spa_strdup(oldvd->vdev_path);
2964	newvdpath = spa_strdup(newvd->vdev_path);
2965	newvd_isspare = newvd->vdev_isspare;
2966
2967	/*
2968	 * Mark newvd's DTL dirty in this txg.
2969	 */
2970	vdev_dirty(tvd, VDD_DTL, newvd, txg);
2971
2972	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
2973
2974	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2975	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
2976		spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx,
2977		    CRED(),  "%s vdev=%s %s vdev=%s",
2978		    replacing && newvd_isspare ? "spare in" :
2979		    replacing ? "replace" : "attach", newvdpath,
2980		    replacing ? "for" : "to", oldvdpath);
2981		dmu_tx_commit(tx);
2982	} else {
2983		dmu_tx_abort(tx);
2984	}
2985
2986	spa_strfree(oldvdpath);
2987	spa_strfree(newvdpath);
2988
2989	/*
2990	 * Kick off a resilver to update newvd.
2991	 */
2992	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
2993
2994	return (0);
2995}
2996
2997/*
2998 * Detach a device from a mirror or replacing vdev.
2999 * If 'replace_done' is specified, only detach if the parent
3000 * is a replacing vdev.
3001 */
3002int
3003spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
3004{
3005	uint64_t txg;
3006	int c, t, error;
3007	vdev_t *rvd = spa->spa_root_vdev;
3008	vdev_t *vd, *pvd, *cvd, *tvd;
3009	boolean_t unspare = B_FALSE;
3010	uint64_t unspare_guid;
3011	size_t len;
3012
3013	txg = spa_vdev_enter(spa);
3014
3015	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3016
3017	if (vd == NULL)
3018		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3019
3020	if (!vd->vdev_ops->vdev_op_leaf)
3021		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3022
3023	pvd = vd->vdev_parent;
3024
3025	/*
3026	 * If replace_done is specified, only remove this device if it's
3027	 * the first child of a replacing vdev.  For the 'spare' vdev, either
3028	 * disk can be removed.
3029	 */
3030	if (replace_done) {
3031		if (pvd->vdev_ops == &vdev_replacing_ops) {
3032			if (vd->vdev_id != 0)
3033				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3034		} else if (pvd->vdev_ops != &vdev_spare_ops) {
3035			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3036		}
3037	}
3038
3039	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
3040	    spa_version(spa) >= SPA_VERSION_SPARES);
3041
3042	/*
3043	 * Only mirror, replacing, and spare vdevs support detach.
3044	 */
3045	if (pvd->vdev_ops != &vdev_replacing_ops &&
3046	    pvd->vdev_ops != &vdev_mirror_ops &&
3047	    pvd->vdev_ops != &vdev_spare_ops)
3048		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3049
3050	/*
3051	 * If there's only one replica, you can't detach it.
3052	 */
3053	if (pvd->vdev_children <= 1)
3054		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3055
3056	/*
3057	 * If all siblings have non-empty DTLs, this device may have the only
3058	 * valid copy of the data, which means we cannot safely detach it.
3059	 *
3060	 * XXX -- as in the vdev_offline() case, we really want a more
3061	 * precise DTL check.
3062	 */
3063	for (c = 0; c < pvd->vdev_children; c++) {
3064		uint64_t dirty;
3065
3066		cvd = pvd->vdev_child[c];
3067		if (cvd == vd)
3068			continue;
3069		if (vdev_is_dead(cvd))
3070			continue;
3071		mutex_enter(&cvd->vdev_dtl_lock);
3072		dirty = cvd->vdev_dtl_map.sm_space |
3073		    cvd->vdev_dtl_scrub.sm_space;
3074		mutex_exit(&cvd->vdev_dtl_lock);
3075		if (!dirty)
3076			break;
3077	}
3078
3079	if (c == pvd->vdev_children)
3080		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3081
3082	/*
3083	 * If we are detaching the second disk from a replacing vdev, then
3084	 * check to see if we changed the original vdev's path to have "/old"
3085	 * at the end in spa_vdev_attach().  If so, undo that change now.
3086	 */
3087	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
3088	    pvd->vdev_child[0]->vdev_path != NULL &&
3089	    pvd->vdev_child[1]->vdev_path != NULL) {
3090		ASSERT(pvd->vdev_child[1] == vd);
3091		cvd = pvd->vdev_child[0];
3092		len = strlen(vd->vdev_path);
3093		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
3094		    strcmp(cvd->vdev_path + len, "/old") == 0) {
3095			spa_strfree(cvd->vdev_path);
3096			cvd->vdev_path = spa_strdup(vd->vdev_path);
3097		}
3098	}
3099
3100	/*
3101	 * If we are detaching the original disk from a spare, then it implies
3102	 * that the spare should become a real disk, and be removed from the
3103	 * active spare list for the pool.
3104	 */
3105	if (pvd->vdev_ops == &vdev_spare_ops &&
3106	    vd->vdev_id == 0)
3107		unspare = B_TRUE;
3108
3109	/*
3110	 * Erase the disk labels so the disk can be used for other things.
3111	 * This must be done after all other error cases are handled,
3112	 * but before we disembowel vd (so we can still do I/O to it).
3113	 * But if we can't do it, don't treat the error as fatal --
3114	 * it may be that the unwritability of the disk is the reason
3115	 * it's being detached!
3116	 */
3117	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3118
3119	/*
3120	 * Remove vd from its parent and compact the parent's children.
3121	 */
3122	vdev_remove_child(pvd, vd);
3123	vdev_compact_children(pvd);
3124
3125	/*
3126	 * Remember one of the remaining children so we can get tvd below.
3127	 */
3128	cvd = pvd->vdev_child[0];
3129
3130	/*
3131	 * If we need to remove the remaining child from the list of hot spares,
3132	 * do it now, marking the vdev as no longer a spare in the process.  We
3133	 * must do this before vdev_remove_parent(), because that can change the
3134	 * GUID if it creates a new toplevel GUID.
3135	 */
3136	if (unspare) {
3137		ASSERT(cvd->vdev_isspare);
3138		spa_spare_remove(cvd);
3139		unspare_guid = cvd->vdev_guid;
3140	}
3141
3142	/*
3143	 * If the parent mirror/replacing vdev only has one child,
3144	 * the parent is no longer needed.  Remove it from the tree.
3145	 */
3146	if (pvd->vdev_children == 1)
3147		vdev_remove_parent(cvd);
3148
3149	/*
3150	 * We don't set tvd until now because the parent we just removed
3151	 * may have been the previous top-level vdev.
3152	 */
3153	tvd = cvd->vdev_top;
3154	ASSERT(tvd->vdev_parent == rvd);
3155
3156	/*
3157	 * Reevaluate the parent vdev state.
3158	 */
3159	vdev_propagate_state(cvd);
3160
3161	/*
3162	 * If the device we just detached was smaller than the others, it may be
3163	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
3164	 * can't fail because the existing metaslabs are already in core, so
3165	 * there's nothing to read from disk.
3166	 */
3167	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
3168
3169	vdev_config_dirty(tvd);
3170
3171	/*
3172	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
3173	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
3174	 * But first make sure we're not on any *other* txg's DTL list, to
3175	 * prevent vd from being accessed after it's freed.
3176	 */
3177	for (t = 0; t < TXG_SIZE; t++)
3178		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
3179	vd->vdev_detached = B_TRUE;
3180	vdev_dirty(tvd, VDD_DTL, vd, txg);
3181
3182	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
3183
3184	error = spa_vdev_exit(spa, vd, txg, 0);
3185
3186	/*
3187	 * If this was the removal of the original device in a hot spare vdev,
3188	 * then we want to go through and remove the device from the hot spare
3189	 * list of every other pool.
3190	 */
3191	if (unspare) {
3192		spa = NULL;
3193		mutex_enter(&spa_namespace_lock);
3194		while ((spa = spa_next(spa)) != NULL) {
3195			if (spa->spa_state != POOL_STATE_ACTIVE)
3196				continue;
3197			spa_open_ref(spa, FTAG);
3198			mutex_exit(&spa_namespace_lock);
3199			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3200			mutex_enter(&spa_namespace_lock);
3201			spa_close(spa, FTAG);
3202		}
3203		mutex_exit(&spa_namespace_lock);
3204	}
3205
3206	return (error);
3207}
3208
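/*
 * Find the nvlist in the given array whose ZPOOL_CONFIG_GUID matches
 * target_guid; return NULL if there is no match.
 */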
3209static nvlist_t *
3210spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
3211{
3212	for (int i = 0; i < count; i++) {
3213		uint64_t guid;
3214
3215		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
3216		    &guid) == 0);
3217
3218		if (guid == target_guid)
3219			return (nvpp[i]);
3220	}
3221
3222	return (NULL);
3223}
3224
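/*
 * Rebuild the named aux device array (spares or l2cache) in 'config',
 * omitting 'dev_to_remove'.
 */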
3225static void
3226spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
3227	nvlist_t *dev_to_remove)
3228{
3229	nvlist_t **newdev = NULL;
3230
3231	if (count > 1)
3232		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
3233
3234	for (int i = 0, j = 0; i < count; i++) {
3235		if (dev[i] == dev_to_remove)
3236			continue;
3237		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
3238	}
3239
3240	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
3241	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
3242
3243	for (int i = 0; i < count - 1; i++)
3244		nvlist_free(newdev[i]);
3245
3246	if (count > 1)
3247		kmem_free(newdev, (count - 1) * sizeof (void *));
3248}
3249
3250/*
3251 * Remove a device from the pool.  Currently, this supports removing only hot
3252 * spares and level 2 ARC devices.
3253 */
3254int
3255spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3256{
3257	vdev_t *vd;
3258	nvlist_t **spares, **l2cache, *nv;
3259	uint_t nspares, nl2cache;
3260	uint64_t txg;
3261	int error = 0;
3262
3263	txg = spa_vdev_enter(spa);
3264
3265	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3266
3267	if (spa->spa_spares.sav_vdevs != NULL &&
3268	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3269	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
3270	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
3271		/*
3272		 * Only remove the hot spare if it's not currently in use
3273		 * in this pool.
3274		 */
3275		if (vd == NULL || unspare) {
3276			spa_vdev_remove_aux(spa->spa_spares.sav_config,
3277			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
3278			spa_load_spares(spa);
3279			spa->spa_spares.sav_sync = B_TRUE;
3280		} else {
3281			error = EBUSY;
3282		}
3283	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
3284	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3285	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
3286	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
3287		/*
3288		 * Cache devices can always be removed.
3289		 */
3290		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
3291		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
3292		spa_load_l2cache(spa);
3293		spa->spa_l2cache.sav_sync = B_TRUE;
3294	} else if (vd != NULL) {
3295		/*
3296		 * Normal vdevs cannot be removed (yet).
3297		 */
3298		error = ENOTSUP;
3299	} else {
3300		/*
3301		 * There is no vdev of any kind with the specified guid.
3302		 */
3303		error = ENOENT;
3304	}
3305
3306	return (spa_vdev_exit(spa, NULL, txg, error));
3307}
3308
3309/*
3310 * Find any device that's done replacing, or a vdev marked 'unspare' that's
3311 * currently spared, so we can detach it.
3312 */
3313static vdev_t *
3314spa_vdev_resilver_done_hunt(vdev_t *vd)
3315{
3316	vdev_t *newvd, *oldvd;
3317	int c;
3318
3319	for (c = 0; c < vd->vdev_children; c++) {
3320		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3321		if (oldvd != NULL)
3322			return (oldvd);
3323	}
3324
3325	/*
3326	 * Check for a completed replacement.
3327	 */
3328	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3329		oldvd = vd->vdev_child[0];
3330		newvd = vd->vdev_child[1];
3331
3332		mutex_enter(&newvd->vdev_dtl_lock);
3333		if (newvd->vdev_dtl_map.sm_space == 0 &&
3334		    newvd->vdev_dtl_scrub.sm_space == 0) {
3335			mutex_exit(&newvd->vdev_dtl_lock);
3336			return (oldvd);
3337		}
3338		mutex_exit(&newvd->vdev_dtl_lock);
3339	}
3340
3341	/*
3342	 * Check for a completed resilver with the 'unspare' flag set.
3343	 */
3344	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3345		newvd = vd->vdev_child[0];
3346		oldvd = vd->vdev_child[1];
3347
3348		mutex_enter(&newvd->vdev_dtl_lock);
3349		if (newvd->vdev_unspare &&
3350		    newvd->vdev_dtl_map.sm_space == 0 &&
3351		    newvd->vdev_dtl_scrub.sm_space == 0) {
3352			newvd->vdev_unspare = 0;
3353			mutex_exit(&newvd->vdev_dtl_lock);
3354			return (oldvd);
3355		}
3356		mutex_exit(&newvd->vdev_dtl_lock);
3357	}
3358
3359	return (NULL);
3360}
3361
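/*
 * Detach any devices whose resilver has completed.  If a hot-spared device
 * has just finished being replaced, detach the original spare as well.
 */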
3362static void
3363spa_vdev_resilver_done(spa_t *spa)
3364{
3365	vdev_t *vd;
3366	vdev_t *pvd;
3367	uint64_t guid;
3368	uint64_t pguid = 0;
3369
3370	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3371
3372	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3373		guid = vd->vdev_guid;
3374		/*
3375		 * If we have just finished replacing a hot spared device, then
3376		 * we need to detach the parent's first child (the original hot
3377		 * spare) as well.
3378		 */
3379		pvd = vd->vdev_parent;
3380		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3381		    pvd->vdev_id == 0) {
3382			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3383			ASSERT(pvd->vdev_parent->vdev_children == 2);
3384			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
3385		}
3386		spa_config_exit(spa, SCL_CONFIG, FTAG);
3387		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
3388			return;
3389		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
3390			return;
3391		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3392	}
3393
3394	spa_config_exit(spa, SCL_CONFIG, FTAG);
3395}
3396
3397/*
3398 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
3399 * on spa_vdev_enter/exit() to synchronize the labels and cache.
3400 */
3401int
3402spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3403{
3404	vdev_t *vd;
3405	uint64_t txg;
3406
3407	txg = spa_vdev_enter(spa);
3408
3409	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) {
3410		/*
3411		 * Determine if this is a reference to a hot spare device.  If
3412		 * it is, update the path manually as there is no associated
3413		 * vdev_t that can be synced to disk.
3414		 */
3415		nvlist_t **spares;
3416		uint_t i, nspares;
3417
3418		if (spa->spa_spares.sav_config != NULL) {
3419			VERIFY(nvlist_lookup_nvlist_array(
3420			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
3421			    &spares, &nspares) == 0);
3422			for (i = 0; i < nspares; i++) {
3423				uint64_t theguid;
3424				VERIFY(nvlist_lookup_uint64(spares[i],
3425				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3426				if (theguid == guid) {
3427					VERIFY(nvlist_add_string(spares[i],
3428					    ZPOOL_CONFIG_PATH, newpath) == 0);
3429					spa_load_spares(spa);
3430					spa->spa_spares.sav_sync = B_TRUE;
3431					return (spa_vdev_exit(spa, NULL, txg,
3432					    0));
3433				}
3434			}
3435		}
3436
3437		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3438	}
3439
3440	if (!vd->vdev_ops->vdev_op_leaf)
3441		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3442
3443	spa_strfree(vd->vdev_path);
3444	vd->vdev_path = spa_strdup(newpath);
3445
3446	vdev_config_dirty(vd->vdev_top);
3447
3448	return (spa_vdev_exit(spa, NULL, txg, 0));
3449}
3450
3451/*
3452 * ==========================================================================
3453 * SPA Scrubbing
3454 * ==========================================================================
3455 */
3456
3457int
3458spa_scrub(spa_t *spa, pool_scrub_type_t type)
3459{
3460	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
3461
3462	if ((uint_t)type >= POOL_SCRUB_TYPES)
3463		return (ENOTSUP);
3464
3465	/*
3466	 * If a resilver was requested, but there is no DTL on a
3467	 * writeable leaf device, we have nothing to do.
3468	 */
3469	if (type == POOL_SCRUB_RESILVER &&
3470	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
3471		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3472		return (0);
3473	}
3474
3475	if (type == POOL_SCRUB_EVERYTHING &&
3476	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
3477	    spa->spa_dsl_pool->dp_scrub_isresilver)
3478		return (EBUSY);
3479
3480	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
3481		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
3482	} else if (type == POOL_SCRUB_NONE) {
3483		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
3484	} else {
3485		return (EINVAL);
3486	}
3487}
3488
3489/*
3490 * ==========================================================================
3491 * SPA async task processing
3492 * ==========================================================================
3493 */
3494
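/*
 * Walk the vdev tree and transition any vdev with vdev_remove_wanted set
 * to the REMOVED state, clearing its error counts along the way.
 */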
3495static void
3496spa_async_remove(spa_t *spa, vdev_t *vd)
3497{
3498	if (vd->vdev_remove_wanted) {
3499		vd->vdev_remove_wanted = 0;
3500		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
3501		vdev_clear(spa, vd);
3502		vdev_state_dirty(vd->vdev_top);
3503	}
3504
3505	for (int c = 0; c < vd->vdev_children; c++)
3506		spa_async_remove(spa, vd->vdev_child[c]);
3507}
3508
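/*
 * Walk the vdev tree and reopen any vdev with vdev_probe_wanted set;
 * vdev_open() performs the actual probe.
 */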
3509static void
3510spa_async_probe(spa_t *spa, vdev_t *vd)
3511{
3512	if (vd->vdev_probe_wanted) {
3513		vd->vdev_probe_wanted = 0;
3514		vdev_reopen(vd);	/* vdev_open() does the actual probe */
3515	}
3516
3517	for (int c = 0; c < vd->vdev_children; c++)
3518		spa_async_probe(spa, vd->vdev_child[c]);
3519}
3520
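/*
 * Worker thread that services the task bits recorded in spa_async_tasks.
 * At most one such thread exists per pool; it handles the pending tasks
 * and then exits.
 */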
3521static void
3522spa_async_thread(void *arg)
3523{
3524	spa_t *spa = arg;
3525	int tasks;
3526
3527	ASSERT(spa->spa_sync_on);
3528
3529	mutex_enter(&spa->spa_async_lock);
3530	tasks = spa->spa_async_tasks;
3531	spa->spa_async_tasks = 0;
3532	mutex_exit(&spa->spa_async_lock);
3533
3534	/*
3535	 * See if the config needs to be updated.
3536	 */
3537	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3538		mutex_enter(&spa_namespace_lock);
3539		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3540		mutex_exit(&spa_namespace_lock);
3541	}
3542
3543	/*
3544	 * See if any devices need to be marked REMOVED.
3545	 */
3546	if (tasks & SPA_ASYNC_REMOVE) {
3547		spa_vdev_state_enter(spa);
3548		spa_async_remove(spa, spa->spa_root_vdev);
3549		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
3550			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
3551		for (int i = 0; i < spa->spa_spares.sav_count; i++)
3552			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
3553		(void) spa_vdev_state_exit(spa, NULL, 0);
3554	}
3555
3556	/*
3557	 * See if any devices need to be probed.
3558	 */
3559	if (tasks & SPA_ASYNC_PROBE) {
3560		spa_vdev_state_enter(spa);
3561		spa_async_probe(spa, spa->spa_root_vdev);
3562		(void) spa_vdev_state_exit(spa, NULL, 0);
3563	}
3564
3565	/*
3566	 * If any devices are done replacing, detach them.
3567	 */
3568	if (tasks & SPA_ASYNC_RESILVER_DONE)
3569		spa_vdev_resilver_done(spa);
3570
3571	/*
3572	 * Kick off a resilver.
3573	 */
3574	if (tasks & SPA_ASYNC_RESILVER)
3575		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
3576
3577	/*
3578	 * Let the world know that we're done.
3579	 */
3580	mutex_enter(&spa->spa_async_lock);
3581	spa->spa_async_thread = NULL;
3582	cv_broadcast(&spa->spa_async_cv);
3583	mutex_exit(&spa->spa_async_lock);
3584	thread_exit();
3585}
3586
3587void
3588spa_async_suspend(spa_t *spa)
3589{
3590	mutex_enter(&spa->spa_async_lock);
3591	spa->spa_async_suspended++;
3592	while (spa->spa_async_thread != NULL)
3593		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3594	mutex_exit(&spa->spa_async_lock);
3595}
3596
3597void
3598spa_async_resume(spa_t *spa)
3599{
3600	mutex_enter(&spa->spa_async_lock);
3601	ASSERT(spa->spa_async_suspended != 0);
3602	spa->spa_async_suspended--;
3603	mutex_exit(&spa->spa_async_lock);
3604}
3605
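/*
 * Start the async thread if there are pending tasks, async processing is
 * not suspended, and the root filesystem is writable.
 */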
3606static void
3607spa_async_dispatch(spa_t *spa)
3608{
3609	mutex_enter(&spa->spa_async_lock);
3610	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3611	    spa->spa_async_thread == NULL &&
3612	    rootdir != NULL && !vn_is_readonly(rootdir))
3613		spa->spa_async_thread = thread_create(NULL, 0,
3614		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3615	mutex_exit(&spa->spa_async_lock);
3616}
3617
3618void
3619spa_async_request(spa_t *spa, int task)
3620{
3621	mutex_enter(&spa->spa_async_lock);
3622	spa->spa_async_tasks |= task;
3623	mutex_exit(&spa->spa_async_lock);
3624}
3625
3626/*
3627 * ==========================================================================
3628 * SPA syncing routines
3629 * ==========================================================================
3630 */
3631
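/*
 * Issue frees for every block on the deferred-free bplist, then vacate
 * the bplist in this txg.
 */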
3632static void
3633spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3634{
3635	bplist_t *bpl = &spa->spa_sync_bplist;
3636	dmu_tx_t *tx;
3637	blkptr_t blk;
3638	uint64_t itor = 0;
3639	zio_t *zio;
3640	int error;
3641	uint8_t c = 1;
3642
3643	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
3644
3645	while (bplist_iterate(bpl, &itor, &blk) == 0) {
3646		ASSERT(blk.blk_birth < txg);
3647		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
3648		    ZIO_FLAG_MUSTSUCCEED));
3649	}
3650
3651	error = zio_wait(zio);
3652	ASSERT3U(error, ==, 0);
3653
3654	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3655	bplist_vacate(bpl, tx);
3656
3657	/*
3658	 * Pre-dirty the first block so we sync to convergence faster.
3659	 * (Usually only the first block is needed.)
3660	 */
3661	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3662	dmu_tx_commit(tx);
3663}
3664
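/*
 * Pack the given nvlist and write it to the specified MOS object,
 * recording the packed size in the object's bonus buffer.
 */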
3665static void
3666spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3667{
3668	char *packed = NULL;
3669	size_t bufsize;
3670	size_t nvsize = 0;
3671	dmu_buf_t *db;
3672
3673	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3674
3675	/*
3676	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
3677	 * information.  This avoids the dbuf_will_dirty() path and
3678	 * saves us a pre-read to get data we don't actually care about.
3679	 */
3680	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
3681	packed = kmem_alloc(bufsize, KM_SLEEP);
3682
3683	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3684	    KM_SLEEP) == 0);
3685	bzero(packed + nvsize, bufsize - nvsize);
3686
3687	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
3688
3689	kmem_free(packed, bufsize);
3690
3691	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3692	dmu_buf_will_dirty(db, tx);
3693	*(uint64_t *)db->db_data = nvsize;
3694	dmu_buf_rele(db, FTAG);
3695}
3696
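/*
 * Sync an auxiliary vdev list (spares or l2cache) to its packed-nvlist
 * object in the MOS, creating that object on first use.
 */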
3697static void
3698spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3699    const char *config, const char *entry)
3700{
3701	nvlist_t *nvroot;
3702	nvlist_t **list;
3703	int i;
3704
3705	if (!sav->sav_sync)
3706		return;
3707
3708	/*
3709	 * Update the MOS nvlist describing the list of available devices.
3710	 * spa_validate_aux() will have already made sure this nvlist is
3711	 * valid and the vdevs are labeled appropriately.
3712	 */
3713	if (sav->sav_object == 0) {
3714		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3715		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3716		    sizeof (uint64_t), tx);
3717		VERIFY(zap_update(spa->spa_meta_objset,
3718		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3719		    &sav->sav_object, tx) == 0);
3720	}
3721
3722	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3723	if (sav->sav_count == 0) {
3724		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3725	} else {
3726		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3727		for (i = 0; i < sav->sav_count; i++)
3728			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3729			    B_FALSE, B_FALSE, B_TRUE);
3730		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3731		    sav->sav_count) == 0);
3732		for (i = 0; i < sav->sav_count; i++)
3733			nvlist_free(list[i]);
3734		kmem_free(list, sav->sav_count * sizeof (void *));
3735	}
3736
3737	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3738	nvlist_free(nvroot);
3739
3740	sav->sav_sync = B_FALSE;
3741}
3742
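/*
 * If any vdev configs are dirty, regenerate the pool config nvlist and
 * write it to the config object in the MOS.  The in-flight config is kept
 * in spa_config_syncing until the sync completes.
 */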
3743static void
3744spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3745{
3746	nvlist_t *config;
3747
3748	if (list_is_empty(&spa->spa_config_dirty_list))
3749		return;
3750
3751	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3752
3753	config = spa_config_generate(spa, spa->spa_root_vdev,
3754	    dmu_tx_get_txg(tx), B_FALSE);
3755
3756	spa_config_exit(spa, SCL_STATE, FTAG);
3757
3758	if (spa->spa_config_syncing)
3759		nvlist_free(spa->spa_config_syncing);
3760	spa->spa_config_syncing = config;
3761
3762	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3763}
3764
3765/*
3766 * Set zpool properties.
3767 */
3768static void
3769spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3770{
3771	spa_t *spa = arg1;
3772	objset_t *mos = spa->spa_meta_objset;
3773	nvlist_t *nvp = arg2;
3774	nvpair_t *elem;
3775	uint64_t intval;
3776	char *strval;
3777	zpool_prop_t prop;
3778	const char *propname;
3779	zprop_type_t proptype;
3780	spa_config_dirent_t *dp;
3781
3782	mutex_enter(&spa->spa_props_lock);
3783
3784	elem = NULL;
3785	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3786		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3787		case ZPOOL_PROP_VERSION:
3788			/*
3789			 * Only set version for non-zpool-creation cases
3790			 * (set/import). spa_create() needs special care
3791			 * for version setting.
3792			 */
3793			if (tx->tx_txg != TXG_INITIAL) {
3794				VERIFY(nvpair_value_uint64(elem,
3795				    &intval) == 0);
3796				ASSERT(intval <= SPA_VERSION);
3797				ASSERT(intval >= spa_version(spa));
3798				spa->spa_uberblock.ub_version = intval;
3799				vdev_config_dirty(spa->spa_root_vdev);
3800			}
3801			break;
3802
3803		case ZPOOL_PROP_ALTROOT:
3804			/*
3805			 * 'altroot' is a non-persistent property. It should
3806			 * have been set temporarily at creation or import time.
3807			 */
3808			ASSERT(spa->spa_root != NULL);
3809			break;
3810
3811		case ZPOOL_PROP_CACHEFILE:
3812			/*
3813			 * 'cachefile' is a non-persistent property, but we
3814			 * note an async request so that the config cache
3815			 * gets updated.
3816			 */
3817			VERIFY(nvpair_value_string(elem, &strval) == 0);
3818
3819			dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
3820
3821			if (strval[0] == '\0')
3822				dp->scd_path = spa_strdup(spa_config_path);
3823			else if (strcmp(strval, "none") == 0)
3824				dp->scd_path = NULL;
3825			else
3826				dp->scd_path = spa_strdup(strval);
3827
3828			list_insert_head(&spa->spa_config_list, dp);
3829			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3830			break;
3831		default:
3832			/*
3833			 * Set pool property values in the poolprops mos object.
3834			 */
3835			if (spa->spa_pool_props_object == 0) {
3836				/* Create the pool props object if needed. */
3837
3838				VERIFY((spa->spa_pool_props_object =
3839				    zap_create(mos, DMU_OT_POOL_PROPS,
3840				    DMU_OT_NONE, 0, tx)) > 0);
3841
3842				VERIFY(zap_update(mos,
3843				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3844				    8, 1, &spa->spa_pool_props_object, tx)
3845				    == 0);
3846			}
3847
3848			/* normalize the property name */
3849			propname = zpool_prop_to_name(prop);
3850			proptype = zpool_prop_get_type(prop);
3851
3852			if (nvpair_type(elem) == DATA_TYPE_STRING) {
3853				ASSERT(proptype == PROP_TYPE_STRING);
3854				VERIFY(nvpair_value_string(elem, &strval) == 0);
3855				VERIFY(zap_update(mos,
3856				    spa->spa_pool_props_object, propname,
3857				    1, strlen(strval) + 1, strval, tx) == 0);
3858
3859			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3860				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3861
3862				if (proptype == PROP_TYPE_INDEX) {
3863					const char *unused;
3864					VERIFY(zpool_prop_index_to_string(
3865					    prop, intval, &unused) == 0);
3866				}
3867				VERIFY(zap_update(mos,
3868				    spa->spa_pool_props_object, propname,
3869				    8, 1, &intval, tx) == 0);
3870			} else {
3871				ASSERT(0); /* not allowed */
3872			}
3873
3874			switch (prop) {
3875			case ZPOOL_PROP_DELEGATION:
3876				spa->spa_delegation = intval;
3877				break;
3878			case ZPOOL_PROP_BOOTFS:
3879				spa->spa_bootfs = intval;
3880				break;
3881			case ZPOOL_PROP_FAILUREMODE:
3882				spa->spa_failmode = intval;
3883				break;
3884			default:
3885				break;
3886			}
3887		}
3888
3889		/* log internal history if this is not a zpool create */
3890		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
3891		    tx->tx_txg != TXG_INITIAL) {
3892			spa_history_internal_log(LOG_POOL_PROPSET,
3893			    spa, tx, cr, "%s %lld %s",
3894			    nvpair_name(elem), intval, spa_name(spa));
3895		}
3896	}
3897
3898	mutex_exit(&spa->spa_props_lock);
3899}
3900
3901/*
3902 * Sync the specified transaction group.  New blocks may be dirtied as
3903 * part of the process, so we iterate until it converges.
3904 */
3905void
3906spa_sync(spa_t *spa, uint64_t txg)
3907{
3908	dsl_pool_t *dp = spa->spa_dsl_pool;
3909	objset_t *mos = spa->spa_meta_objset;
3910	bplist_t *bpl = &spa->spa_sync_bplist;
3911	vdev_t *rvd = spa->spa_root_vdev;
3912	vdev_t *vd;
3913	dmu_tx_t *tx;
3914	int dirty_vdevs;
3915	int error;
3916
3917	/*
3918	 * Lock out configuration changes.
3919	 */
3920	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3921
3922	spa->spa_syncing_txg = txg;
3923	spa->spa_sync_pass = 0;
3924
3925	/*
3926	 * If there are any pending vdev state changes, convert them
3927	 * into config changes that go out with this transaction group.
3928	 */
3929	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3930	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
3931		vdev_state_clean(vd);
3932		vdev_config_dirty(vd);
3933	}
3934	spa_config_exit(spa, SCL_STATE, FTAG);
3935
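	/*
	 * Open the pool's sync bplist for this txg; it is flushed by
	 * bplist_sync() on each pass and closed once the sync converges.
	 */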
3936	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
3937
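	/*
	 * Create a transaction bound to this txg for the MOS updates that
	 * spa_sync() performs directly (config, properties, aux devices).
	 */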
3938	tx = dmu_tx_create_assigned(dp, txg);
3939
3940	/*
3941	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
3942	 * set spa_deflate if we have no raid-z vdevs.
3943	 */
3944	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
3945	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
3946		int i;
3947
3948		for (i = 0; i < rvd->vdev_children; i++) {
3949			vd = rvd->vdev_child[i];
3950			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
3951				break;
3952		}
3953		if (i == rvd->vdev_children) {
3954			spa->spa_deflate = TRUE;
3955			VERIFY(0 == zap_add(spa->spa_meta_objset,
3956			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3957			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
3958		}
3959	}
3960
3961	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
3962	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
3963		dsl_pool_create_origin(dp, tx);
3964
3965		/* Keeping the origin open increases spa_minref */
3966		spa->spa_minref += 3;
3967	}
3968
3969	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
3970	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
3971		dsl_pool_upgrade_clones(dp, tx);
3972	}
3973
3974	/*
3975	 * If anything has changed in this txg, push the deferred frees
3976	 * from the previous txg.  If not, leave them alone so that we
3977	 * don't generate work on an otherwise idle system.
3978	 */
3979	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
3980	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
3981	    !txg_list_empty(&dp->dp_sync_tasks, txg))
3982		spa_sync_deferred_frees(spa, txg);
3983
3984	/*
3985	 * Iterate to convergence.
3986	 */
3987	do {
3988		spa->spa_sync_pass++;
3989
3990		spa_sync_config_object(spa, tx);
3991		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
3992		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
3993		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
3994		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
3995		spa_errlog_sync(spa, txg);
3996		dsl_pool_sync(dp, txg);
3997
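		/*
		 * Sync every vdev dirtied in this txg.  Syncing may dirty
		 * additional vdevs, which is why the enclosing loop runs
		 * until no dirty vdevs remain.
		 */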
3998		dirty_vdevs = 0;
3999		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL) {
4000			vdev_sync(vd, txg);
4001			dirty_vdevs++;
4002		}
4003
4004		bplist_sync(bpl, tx);
4005	} while (dirty_vdevs);
4006
4007	bplist_close(bpl);
4008
4009	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4010
4011	/*
4012	 * Rewrite the vdev configuration (which includes the uberblock)
4013	 * to commit the transaction group.
4014	 *
4015	 * If there are no dirty vdevs, we sync the uberblock to a few
4016	 * random top-level vdevs that are known to be visible in the
4017	 * config cache (see spa_vdev_add() for a complete description).
4018	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
4019	 */
4020	for (;;) {
4021		/*
4022		 * We hold SCL_STATE to prevent vdev open/close/etc.
4023		 * while we're attempting to write the vdev labels.
4024		 */
4025		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4026
4027		if (list_is_empty(&spa->spa_config_dirty_list)) {
4028			vdev_t *svd[SPA_DVAS_PER_BP];
4029			int svdcount = 0;
4030			int children = rvd->vdev_children;
4031			int c0 = spa_get_random(children);
4032			int c;
4033
4034			for (c = 0; c < children; c++) {
4035				vd = rvd->vdev_child[(c0 + c) % children];
4036				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4037					continue;
4038				svd[svdcount++] = vd;
4039				if (svdcount == SPA_DVAS_PER_BP)
4040					break;
4041			}
4042			error = vdev_config_sync(svd, svdcount, txg);
4043		} else {
4044			error = vdev_config_sync(rvd->vdev_child,
4045			    rvd->vdev_children, txg);
4046		}
4047
4048		spa_config_exit(spa, SCL_STATE, FTAG);
4049
4050		if (error == 0)
4051			break;
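		/*
		 * The label/uberblock writes failed; suspend the pool,
		 * wait for I/O to be resumed, and then retry.
		 */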
4052		zio_suspend(spa, NULL);
4053		zio_resume_wait(spa);
4054	}
4055	dmu_tx_commit(tx);
4056
4057	/*
4058	 * Clear the dirty config list.
4059	 */
4060	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
4061		vdev_config_clean(vd);
4062
4063	/*
4064	 * Now that the new config has synced transactionally,
4065	 * let it become visible to the config cache.
4066	 */
4067	if (spa->spa_config_syncing != NULL) {
4068		spa_config_set(spa, spa->spa_config_syncing);
4069		spa->spa_config_txg = txg;
4070		spa->spa_config_syncing = NULL;
4071	}
4072
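	/*
	 * Take the traverse lock as writer and roll spa_ubsync forward to
	 * the uberblock we just wrote, so traversals see the committed txg.
	 */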
4073	spa->spa_traverse_wanted = B_TRUE;
4074	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
4075	spa->spa_traverse_wanted = B_FALSE;
4076	spa->spa_ubsync = spa->spa_uberblock;
4077	rw_exit(&spa->spa_traverse_lock);
4078
4079	/*
4080	 * Clean up the ZIL records for the synced txg.
4081	 */
4082	dsl_pool_zil_clean(dp);
4083
4084	/*
4085	 * Update usable space statistics.
4086	 */
4087	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
4088		vdev_sync_done(vd, txg);
4089
4090	/*
4091	 * It had better be the case that we didn't dirty anything
4092	 * since vdev_config_sync().
4093	 */
4094	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4095	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4096	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4097	ASSERT(bpl->bpl_queue == NULL);
4098
4099	spa_config_exit(spa, SCL_CONFIG, FTAG);
4100
4101	/*
4102	 * If any async tasks have been requested, kick them off.
4103	 */
4104	spa_async_dispatch(spa);
4105}
4106
4107/*
4108 * Sync all pools.  We don't want to hold the namespace lock across these
4109 * operations, so we take a reference on the spa_t and drop the lock during the
4110 * sync.
4111 */
4112void
4113spa_sync_allpools(void)
4114{
4115	spa_t *spa = NULL;
4116	mutex_enter(&spa_namespace_lock);
4117	while ((spa = spa_next(spa)) != NULL) {
4118		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
4119			continue;
4120		spa_open_ref(spa, FTAG);
4121		mutex_exit(&spa_namespace_lock);
4122		txg_wait_synced(spa_get_dsl(spa), 0);
4123		mutex_enter(&spa_namespace_lock);
4124		spa_close(spa, FTAG);
4125	}
4126	mutex_exit(&spa_namespace_lock);
4127}
4128
4129/*
4130 * ==========================================================================
4131 * Miscellaneous routines
4132 * ==========================================================================
4133 */
4134
4135/*
4136 * Remove all pools in the system.
4137 */
4138void
4139spa_evict_all(void)
4140{
4141	spa_t *spa;
4142
4143	/*
4144	 * Remove all cached state.  All pools should be closed now,
4145	 * so every spa in the AVL tree should be unreferenced.
4146	 */
4147	mutex_enter(&spa_namespace_lock);
4148	while ((spa = spa_next(NULL)) != NULL) {
4149		/*
4150		 * Stop async tasks.  The async thread may need to detach
4151		 * a device that's been replaced, which requires grabbing
4152		 * spa_namespace_lock, so we must drop it here.
4153		 */
4154		spa_open_ref(spa, FTAG);
4155		mutex_exit(&spa_namespace_lock);
4156		spa_async_suspend(spa);
4157		mutex_enter(&spa_namespace_lock);
4158		spa_close(spa, FTAG);
4159
4160		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4161			spa_unload(spa);
4162			spa_deactivate(spa);
4163		}
4164		spa_remove(spa);
4165	}
4166	mutex_exit(&spa_namespace_lock);
4167}
4168
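/*
 * Look up a vdev by guid.  The main vdev tree is searched first; if
 * 'l2cache' is set, the pool's L2ARC devices are searched as well.
 */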
4169vdev_t *
4170spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
4171{
4172	vdev_t *vd;
4173	int i;
4174
4175	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
4176		return (vd);
4177
4178	if (l2cache) {
4179		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
4180			vd = spa->spa_l2cache.sav_vdevs[i];
4181			if (vd->vdev_guid == guid)
4182				return (vd);
4183		}
4184	}
4185
4186	return (NULL);
4187}
4188
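/*
 * Raise the pool's on-disk version.  The new version is recorded in the
 * uberblock, the config is dirtied, and we wait for the change to sync out.
 */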
4189void
4190spa_upgrade(spa_t *spa, uint64_t version)
4191{
4192	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4193
4194	/*
4195	 * This should only be called for a non-faulted pool, and since a
4196	 * future version would result in an unopenable pool, this shouldn't be
4197	 * possible.
4198	 */
4199	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4200	ASSERT(version >= spa->spa_uberblock.ub_version);
4201
4202	spa->spa_uberblock.ub_version = version;
4203	vdev_config_dirty(spa->spa_root_vdev);
4204
4205	spa_config_exit(spa, SCL_ALL, FTAG);
4206
4207	txg_wait_synced(spa_get_dsl(spa), 0);
4208}
4209
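/*
 * Determine whether 'guid' names one of this pool's hot spares, including
 * spares whose addition is still pending.
 */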
4210boolean_t
4211spa_has_spare(spa_t *spa, uint64_t guid)
4212{
4213	int i;
4214	uint64_t spareguid;
4215	spa_aux_vdev_t *sav = &spa->spa_spares;
4216
4217	for (i = 0; i < sav->sav_count; i++)
4218		if (sav->sav_vdevs[i]->vdev_guid == guid)
4219			return (B_TRUE);
4220
4221	for (i = 0; i < sav->sav_npending; i++) {
4222		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4223		    &spareguid) == 0 && spareguid == guid)
4224			return (B_TRUE);
4225	}
4226
4227	return (B_FALSE);
4228}
4229
4230/*
4231 * Check if a pool has an active shared spare device.
4232 * Note: reference count of an active spare is 2, as a spare and as a replace
4233 * Note: an active spare's refcount is 2 (as a spare and as a replacement).
4234static boolean_t
4235spa_has_active_shared_spare(spa_t *spa)
4236{
4237	int i, refcnt;
4238	uint64_t pool;
4239	spa_aux_vdev_t *sav = &spa->spa_spares;
4240
4241	for (i = 0; i < sav->sav_count; i++) {
4242		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
4243		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
4244		    refcnt > 2)
4245			return (B_TRUE);
4246	}
4247
4248	return (B_FALSE);
4249}
4250
4251/*
4252 * Post a sysevent corresponding to the given event.  The 'name' must be one of
4253 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4254 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4255 * in the userland libzpool, as we don't want consumers to misinterpret ztest
4256 * or zdb as real changes.
4257 */
4258void
4259spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4260{
4261#if 0
4262#ifdef _KERNEL
4263	sysevent_t		*ev;
4264	sysevent_attr_list_t	*attr = NULL;
4265	sysevent_value_t	value;
4266	sysevent_id_t		eid;
4267
4268	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4269	    SE_SLEEP);
4270
4271	value.value_type = SE_DATA_TYPE_STRING;
4272	value.value.sv_string = spa_name(spa);
4273	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4274		goto done;
4275
4276	value.value_type = SE_DATA_TYPE_UINT64;
4277	value.value.sv_uint64 = spa_guid(spa);
4278	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4279		goto done;
4280
4281	if (vd) {
4282		value.value_type = SE_DATA_TYPE_UINT64;
4283		value.value.sv_uint64 = vd->vdev_guid;
4284		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4285		    SE_SLEEP) != 0)
4286			goto done;
4287
4288		if (vd->vdev_path) {
4289			value.value_type = SE_DATA_TYPE_STRING;
4290			value.value.sv_string = vd->vdev_path;
4291			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4292			    &value, SE_SLEEP) != 0)
4293				goto done;
4294		}
4295	}
4296
4297	if (sysevent_attach_attributes(ev, attr) != 0)
4298		goto done;
4299	attr = NULL;
4300
4301	(void) log_sysevent(ev, SE_SLEEP, &eid);
4302
4303done:
4304	if (attr)
4305		sysevent_free_attr(attr);
4306	sysevent_free(ev);
4307#endif
4308#endif
4309}
4310