/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef	_KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)
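
/*
 * For example, ZTI_FIX(8) requests a taskq with exactly eight threads,
 * while ZTI_PCT(n) would size the taskq as a percentage of online CPUs
 * (see the zti_modes_t comments above).
 */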

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};
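
/*
 * Rows above follow the I/O type order listed in the previous comment
 * (NULL, READ, WRITE, FREE, CLAIM, IOCTL) and columns follow
 * zio_taskq_types[].  For example, the WRITE row issues I/O from a batch
 * (CPU-percentage) taskq and uses fixed 5-, 8-, and 5-thread taskqs for
 * high-priority issue, interrupt, and high-priority interrupt work.
 */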

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
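
/*
 * The resulting layout is a nested nvlist: each pool property is stored
 * under its zpool_prop_to_name() key as an nvlist holding a ZPROP_SOURCE
 * entry and either a string or a uint64 ZPROP_VALUE entry, e.g. (the values
 * shown are illustrative only):
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE);
 *
 * adds "capacity" -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = 42 }.
 */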

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		dsl_dir_t *freedir = pool->dp_free_dir;

		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir will
		 * be NULL.
		 */
		if (freedir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    freedir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

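/*
 * Record a new 'cachefile' setting by prepending a config dirent to
 * spa_config_list: an empty string selects the default spa_config_path,
 * "none" disables the cache file (NULL path), and any other value is used
 * verbatim.  If need_sync is set, an async config update is also requested.
 */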
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

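/*
 * Set pool properties.  The nvlist is validated first; version upgrades and
 * feature enables are applied immediately through the spa_sync_version sync
 * task, while any remaining properties are applied in one batch through the
 * spa_sync_props sync task.
 */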
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver, 6);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

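/*
 * Check-half of the pool GUID change sync task: refuse the change (ENXIO)
 * unless the entire vdev tree is currently healthy, since vdevs that are
 * not present when the GUID changes would be orphaned from the pool.
 */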
/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

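/*
 * Sync-half of the pool GUID change: install the new guid on the root vdev,
 * adjust its guid sum, dirty the config, and log the change to pool history.
 */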
static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

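/*
 * AVL comparison callback for the error lists; entries are ordered by a raw
 * byte comparison of their bookmarks.
 */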
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

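/*
 * Create a single zio taskq for the given mode: zti_mode_null creates
 * nothing, zti_mode_fixed uses 'value' threads, and the batch and
 * online-percent modes size the taskq as a percentage of CPUs via
 * TASKQ_THREADS_CPU_PCT.
 */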
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

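/*
 * Read a packed nvlist from the MOS: the object's bonus buffer holds the
 * packed size, the object data holds the packed bytes, and the result is
 * unpacked into *value.
 */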
1506192830Sedstatic int
1507192830Sedload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1508192830Sed{
1509192830Sed	dmu_buf_t *db;
1510192830Sed	char *packed = NULL;
1511192830Sed	size_t nvsize = 0;
1512192830Sed	int error;
1513192830Sed	*value = NULL;
1514192830Sed
1515192830Sed	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1516192830Sed	nvsize = *(uint64_t *)db->db_data;
1517192830Sed	dmu_buf_rele(db, FTAG);
1518192830Sed
1519192830Sed	packed = kmem_alloc(nvsize, KM_SLEEP);
1520192830Sed	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1521192830Sed	    DMU_READ_PREFETCH);
1522192830Sed	if (error == 0)
1523192830Sed		error = nvlist_unpack(packed, nvsize, value, 0);
1524192830Sed	kmem_free(packed, nvsize);
1525192830Sed
1526192830Sed	return (error);
1527192830Sed}
1528192830Sed
1529192830Sed/*
1530192830Sed * Check whether the given vdev could not be opened; if so, post a sysevent
1531192830Sed * to notify the autoreplace code that the device has been removed.
1532192830Sed */
1533192830Sedstatic void
1534192830Sedspa_check_removed(vdev_t *vd)
1535192830Sed{
1536192830Sed	for (int c = 0; c < vd->vdev_children; c++)
1537192830Sed		spa_check_removed(vd->vdev_child[c]);
1538192830Sed
1539192830Sed	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1540192830Sed	    !vd->vdev_ishole) {
1541192830Sed		zfs_post_autoreplace(vd->vdev_spa, vd);
1542192856Sed		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1543192856Sed	}
1544192856Sed}
1545192856Sed
1546192914Sed/*
1547192830Sed * Validate the current config against the MOS config
1548192830Sed */
1549192830Sedstatic boolean_t
1550192830Sedspa_config_valid(spa_t *spa, nvlist_t *config)
1551213567Sed{
1552192830Sed	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1553213567Sed	nvlist_t *nv;
1554192830Sed
1555192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1556192830Sed
1557192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1558192830Sed	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1559192830Sed
1560192830Sed	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1561192830Sed
1562192830Sed	/*
1563192830Sed	 * If we're doing a normal import, then build up any additional
1564192830Sed	 * diagnostic information about missing devices in this config.
1565192830Sed	 * We'll pass this up to the user for further processing.
1566192830Sed	 */
1567192830Sed	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1568192830Sed		nvlist_t **child, *nv;
1569192830Sed		uint64_t idx = 0;
1570192830Sed
1571192830Sed		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1572192830Sed		    KM_SLEEP);
1573192830Sed		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1574192830Sed
1575192830Sed		for (int c = 0; c < rvd->vdev_children; c++) {
1576192830Sed			vdev_t *tvd = rvd->vdev_child[c];
1577192830Sed			vdev_t *mtvd  = mrvd->vdev_child[c];
1578192830Sed
1579192830Sed			if (tvd->vdev_ops == &vdev_missing_ops &&
1580192830Sed			    mtvd->vdev_ops != &vdev_missing_ops &&
1581192830Sed			    mtvd->vdev_islog)
1582192830Sed				child[idx++] = vdev_config_generate(spa, mtvd,
1583192830Sed				    B_FALSE, 0);
1584192830Sed		}
1585192830Sed
1586192830Sed		if (idx) {
1587192830Sed			VERIFY(nvlist_add_nvlist_array(nv,
1588192830Sed			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1589192830Sed			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1590192830Sed			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1591192830Sed
1592192830Sed			for (int i = 0; i < idx; i++)
1593192830Sed				nvlist_free(child[i]);
1594192830Sed		}
1595192830Sed		nvlist_free(nv);
1596192830Sed		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1597192830Sed	}
1598192830Sed
1599192830Sed	/*
1600192830Sed	 * Compare the root vdev tree with the information we have
1601192830Sed	 * from the MOS config (mrvd). Check each top-level vdev
1602192830Sed	 * with the corresponding MOS config top-level (mtvd).
1603192830Sed	 */
1604192830Sed	for (int c = 0; c < rvd->vdev_children; c++) {
1605192830Sed		vdev_t *tvd = rvd->vdev_child[c];
1606192830Sed		vdev_t *mtvd  = mrvd->vdev_child[c];
1607192830Sed
1608192830Sed		/*
1609192830Sed		 * Resolve any "missing" vdevs in the current configuration.
1610192830Sed		 * If we find that the MOS config has more accurate information
1611192830Sed		 * about the top-level vdev, then use that vdev instead.
1612192830Sed		 */
1613192830Sed		if (tvd->vdev_ops == &vdev_missing_ops &&
1614192830Sed		    mtvd->vdev_ops != &vdev_missing_ops) {
1615192830Sed
1616192830Sed			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1617192830Sed				continue;
1618192830Sed
1619192830Sed			/*
1620192830Sed			 * Device specific actions.
1621192830Sed			 */
1622192830Sed			if (mtvd->vdev_islog) {
1623192830Sed				spa_set_log_state(spa, SPA_LOG_CLEAR);
1624192830Sed			} else {
1625192830Sed				/*
1626192830Sed				 * XXX - once we have 'readonly' pool
1627192830Sed				 * support we should be able to handle
1628192830Sed				 * missing data devices by transitioning
1629192830Sed				 * the pool to readonly.
1630192830Sed				 */
1631192830Sed				continue;
1632192830Sed			}
1633192830Sed
1634192830Sed			/*
1635192830Sed			 * Swap the missing vdev with the data we were
1636192830Sed			 * able to obtain from the MOS config.
1637192830Sed			 */
1638192830Sed			vdev_remove_child(rvd, tvd);
1639192830Sed			vdev_remove_child(mrvd, mtvd);
1640192830Sed
1641192830Sed			vdev_add_child(rvd, mtvd);
1642192830Sed			vdev_add_child(mrvd, tvd);
1643192830Sed
1644192830Sed			spa_config_exit(spa, SCL_ALL, FTAG);
1645192830Sed			vdev_load(mtvd);
1646192830Sed			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1647192830Sed
1648192830Sed			vdev_reopen(rvd);
1649192830Sed		} else if (mtvd->vdev_islog) {
1650192830Sed			/*
1651192830Sed			 * Load the slog device's state from the MOS config
1652192830Sed			 * since it's possible that the label does not
1653192830Sed			 * contain the most up-to-date information.
1654192830Sed			 */
1655192830Sed			vdev_load_log_state(tvd, mtvd);
1656192830Sed			vdev_reopen(tvd);
1657192830Sed		}
1658192830Sed	}
1659192830Sed	vdev_free(mrvd);
1660192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
1661192830Sed
1662192830Sed	/*
1663192830Sed	 * Ensure we were able to validate the config.
1664192830Sed	 */
1665192830Sed	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1666192830Sed}
1667192830Sed
1668192830Sed/*
1669192830Sed * Check for missing log devices
1670192830Sed */
1671192830Sedstatic boolean_t
1672192830Sedspa_check_logs(spa_t *spa)
1673192830Sed{
1674192830Sed	boolean_t rv = B_FALSE;
1675192830Sed
1676192830Sed	switch (spa->spa_log_state) {
1677192830Sed	case SPA_LOG_MISSING:
1678192830Sed		/* need to recheck in case slog has been restored */
1679192830Sed	case SPA_LOG_UNKNOWN:
1680192830Sed		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1681192830Sed		    NULL, DS_FIND_CHILDREN) != 0);
1682192830Sed		if (rv)
1683192830Sed			spa_set_log_state(spa, SPA_LOG_MISSING);
1684192830Sed		break;
1685192830Sed	}
1686192830Sed	return (rv);
1687192830Sed}
1688192830Sed
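/*
 * Passivate the metaslab group of every top-level log vdev so that no new
 * allocations are directed to the slog.  Returns B_TRUE if at least one
 * log device was found.
 */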
1689192830Sedstatic boolean_t
1690192830Sedspa_passivate_log(spa_t *spa)
1691192830Sed{
1692192830Sed	vdev_t *rvd = spa->spa_root_vdev;
1693192830Sed	boolean_t slog_found = B_FALSE;
1694192830Sed
1695192830Sed	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1696192830Sed
1697192830Sed	if (!spa_has_slogs(spa))
1698192830Sed		return (B_FALSE);
1699192830Sed
1700192830Sed	for (int c = 0; c < rvd->vdev_children; c++) {
1701192830Sed		vdev_t *tvd = rvd->vdev_child[c];
1702192830Sed		metaslab_group_t *mg = tvd->vdev_mg;
1703192830Sed
1704192830Sed		if (tvd->vdev_islog) {
1705192830Sed			metaslab_group_passivate(mg);
1706192914Sed			slog_found = B_TRUE;
1707192830Sed		}
1708192830Sed	}
1709192830Sed
1710192830Sed	return (slog_found);
1711192856Sed}
1712192830Sed
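/*
 * Reactivate the metaslab group of every top-level log vdev, undoing the
 * effect of spa_passivate_log().
 */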
1713192830Sedstatic void
1714192830Sedspa_activate_log(spa_t *spa)
1715192830Sed{
1716192830Sed	vdev_t *rvd = spa->spa_root_vdev;
1717192830Sed
1718192830Sed	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1719192830Sed
1720192830Sed	for (int c = 0; c < rvd->vdev_children; c++) {
1721192830Sed		vdev_t *tvd = rvd->vdev_child[c];
1722192914Sed		metaslab_group_t *mg = tvd->vdev_mg;
1723192830Sed
1724192830Sed		if (tvd->vdev_islog)
1725192830Sed			metaslab_group_activate(mg);
1726192830Sed	}
1727192830Sed}
1728192830Sed
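/*
 * Offline the intent log of every dataset in the pool; on success, sync out
 * the current txg so that the "stubby" log blocks can be removed by
 * zil_sync().
 */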
1729192830Sedint
1730192830Sedspa_offline_log(spa_t *spa)
1731192830Sed{
1732192830Sed	int error;
1733192830Sed
1734192830Sed	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1735192830Sed	    NULL, DS_FIND_CHILDREN);
1736192830Sed	if (error == 0) {
1737192830Sed		/*
1738192830Sed		 * We successfully offlined the log device, sync out the
1739192830Sed		 * current txg so that the "stubby" block can be removed
1740192830Sed		 * by zil_sync().
1741192830Sed		 */
1742192830Sed		txg_wait_synced(spa->spa_dsl_pool, 0);
1743192830Sed	}
1744192830Sed	return (error);
1745192830Sed}
1746192830Sed
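/*
 * Run spa_check_removed() over an auxiliary vdev group (spares or l2cache).
 */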
1747192830Sedstatic void
1748192830Sedspa_aux_check_removed(spa_aux_vdev_t *sav)
1749192830Sed{
1750192830Sed	int i;
1751192830Sed
1752192830Sed	for (i = 0; i < sav->sav_count; i++)
1753192830Sed		spa_check_removed(sav->sav_vdevs[i]);
1754192830Sed}
1755192830Sed
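/*
 * Record the highest birth txg seen among successfully claimed log blocks.
 * spa_load_impl() later waits for this txg to sync so that claimed blocks
 * don't appear to be from the future.
 */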
1756192830Sedvoid
1757192830Sedspa_claim_notify(zio_t *zio)
1758192830Sed{
1759192830Sed	spa_t *spa = zio->io_spa;
1760192830Sed
1761192830Sed	if (zio->io_error)
1762192830Sed		return;
1763192830Sed
1764192830Sed	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
1765192830Sed	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1766192830Sed		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1767192830Sed	mutex_exit(&spa->spa_props_lock);
1768192830Sed}
1769192830Sed
1770192830Sedtypedef struct spa_load_error {
1771192830Sed	uint64_t	sle_meta_count;
1772192830Sed	uint64_t	sle_data_count;
1773192830Sed} spa_load_error_t;
1774192830Sed
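/*
 * Completion callback for the verification reads issued by
 * spa_load_verify_cb(): tally any error as either a metadata or a data
 * error and free the temporary buffer.
 */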
1775192830Sedstatic void
1776192830Sedspa_load_verify_done(zio_t *zio)
1777192830Sed{
1778192830Sed	blkptr_t *bp = zio->io_bp;
1779192830Sed	spa_load_error_t *sle = zio->io_private;
1780192830Sed	dmu_object_type_t type = BP_GET_TYPE(bp);
1781192830Sed	int error = zio->io_error;
1782192830Sed
1783192830Sed	if (error) {
1784192830Sed		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1785192830Sed		    type != DMU_OT_INTENT_LOG)
1786192830Sed			atomic_add_64(&sle->sle_meta_count, 1);
1787192830Sed		else
1788192830Sed			atomic_add_64(&sle->sle_data_count, 1);
1789192830Sed	}
1790192830Sed	zio_data_buf_free(zio->io_data, zio->io_size);
1791192830Sed}
1792192830Sed
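/*
 * Pool traversal callback: issue a speculative, scrub-priority read for
 * each block pointer visited; errors are counted in spa_load_verify_done().
 */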
1793192830Sed/*ARGSUSED*/
1794192830Sedstatic int
1795192830Sedspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1796192830Sed    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1797192830Sed{
1798192830Sed	if (bp != NULL) {
1799192830Sed		zio_t *rio = arg;
1800192830Sed		size_t size = BP_GET_PSIZE(bp);
1801192830Sed		void *data = zio_data_buf_alloc(size);
1802192830Sed
1803192830Sed		zio_nowait(zio_read(rio, spa, bp, data, size,
1804192830Sed		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1805192830Sed		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1806192830Sed		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1807192830Sed	}
1808192830Sed	return (0);
1809192830Sed}
1810192830Sed
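/*
 * Traverse the pool from spa_verify_min_txg, reading every block, and use
 * the rewind policy's error limits to decide whether the load may proceed.
 */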
1811192830Sedstatic int
1812192830Sedspa_load_verify(spa_t *spa)
1813192830Sed{
1814192830Sed	zio_t *rio;
1815192830Sed	spa_load_error_t sle = { 0 };
1816192830Sed	zpool_rewind_policy_t policy;
1817192830Sed	boolean_t verify_ok = B_FALSE;
1818192830Sed	int error;
1819192830Sed
1820192830Sed	zpool_get_rewind_policy(spa->spa_config, &policy);
1821192830Sed
1822192830Sed	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1823192830Sed		return (0);
1824192830Sed
1825192830Sed	rio = zio_root(spa, NULL, &sle,
1826192830Sed	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1827192830Sed
1828192830Sed	error = traverse_pool(spa, spa->spa_verify_min_txg,
1829192830Sed	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1830192830Sed
1831192830Sed	(void) zio_wait(rio);
1832192830Sed
1833192830Sed	spa->spa_load_meta_errors = sle.sle_meta_count;
1834192830Sed	spa->spa_load_data_errors = sle.sle_data_count;
1835192830Sed
1836192830Sed	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1837192830Sed	    sle.sle_data_count <= policy.zrp_maxdata) {
1838192830Sed		int64_t loss = 0;
1839192830Sed
1840192830Sed		verify_ok = B_TRUE;
1841192830Sed		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1842192830Sed		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1843192830Sed
1844192830Sed		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1845192830Sed		VERIFY(nvlist_add_uint64(spa->spa_load_info,
1846192830Sed		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1847192830Sed		VERIFY(nvlist_add_int64(spa->spa_load_info,
1848192830Sed		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1849192830Sed		VERIFY(nvlist_add_uint64(spa->spa_load_info,
1850192830Sed		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1851192830Sed	} else {
1852192830Sed		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1853192830Sed	}
1854192830Sed
1855192830Sed	if (error) {
1856192830Sed		if (error != ENXIO && error != EIO)
1857192830Sed			error = SET_ERROR(EIO);
1858192830Sed		return (error);
1859192830Sed	}
1860192830Sed
1861192830Sed	return (verify_ok ? 0 : EIO);
1862192830Sed}
1863192830Sed
1864192830Sed/*
1865192830Sed * Find a value in the pool props object.
1866192830Sed */
1867192830Sedstatic void
1868192830Sedspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1869192830Sed{
1870192830Sed	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1871192830Sed	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1872192830Sed}
1873192830Sed
1874192830Sed/*
1875192830Sed * Find a value in the pool directory object.
1876192830Sed */
1877192830Sedstatic int
1878192830Sedspa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1879192830Sed{
1880192830Sed	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1881192830Sed	    name, sizeof (uint64_t), 1, val));
1882192830Sed}
1883192830Sed
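/*
 * Mark the given vdev unopenable with the supplied aux reason and return
 * err; convenience wrapper for the error paths in spa_load_impl().
 */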
1884192830Sedstatic int
1885192830Sedspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1886192830Sed{
1887192830Sed	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1888192830Sed	return (err);
1889192830Sed}
1890192830Sed
1891192830Sed/*
1892192914Sed * Fix up config after a partly-completed split.  This is done with the
1893192830Sed * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
1894192830Sed * pool have that entry in their config, but only the splitting one contains
1895192830Sed * a list of all the guids of the vdevs that are being split off.
1896192830Sed *
1897192830Sed * This function determines what to do with that list: either rejoin
1898192830Sed * all the disks to the pool, or complete the splitting process.  To attempt
1899192830Sed * the rejoin, each disk that is offlined is marked online again, and
1900192830Sed * we do a reopen() call.  If the vdev label for every disk that was
1901192830Sed * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1902192830Sed * then we call vdev_split() on each disk, and complete the split.
1903192830Sed *
1904192830Sed * Otherwise we leave the config alone, with all the vdevs in place in
1905192830Sed * the original pool.
1906192830Sed */
1907192830Sedstatic void
1908192830Sedspa_try_repair(spa_t *spa, nvlist_t *config)
1909192830Sed{
1910192830Sed	uint_t extracted;
1911192830Sed	uint64_t *glist;
1912192830Sed	uint_t i, gcount;
1913192830Sed	nvlist_t *nvl;
1914192830Sed	vdev_t **vd;
1915192830Sed	boolean_t attempt_reopen;
1916192830Sed
1917192914Sed	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1918192830Sed		return;
1919192830Sed
1920192830Sed	/* check that the config is complete */
1921196750Sache	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1922192830Sed	    &glist, &gcount) != 0)
1923192830Sed		return;
1924192830Sed
1925192830Sed	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1926192830Sed
1927196750Sache	/* attempt to online all the vdevs & validate */
1928192830Sed	attempt_reopen = B_TRUE;
1929192830Sed	for (i = 0; i < gcount; i++) {
1930192830Sed		if (glist[i] == 0)	/* vdev is hole */
1931192830Sed			continue;
1932192830Sed
1933192914Sed		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1934192830Sed		if (vd[i] == NULL) {
1935192914Sed			/*
1936192830Sed			 * Don't bother attempting to reopen the disks;
1937192830Sed			 * just do the split.
1938192830Sed			 */
1939192830Sed			attempt_reopen = B_FALSE;
1940192830Sed		} else {
1941192830Sed			/* attempt to re-online it */
1942192830Sed			vd[i]->vdev_offline = B_FALSE;
1943192830Sed		}
1944192830Sed	}
1945192830Sed
1946192830Sed	if (attempt_reopen) {
1947192830Sed		vdev_reopen(spa->spa_root_vdev);
1948192830Sed
1949192830Sed		/* check each device to see what state it's in */
1950192830Sed		for (extracted = 0, i = 0; i < gcount; i++) {
1951192830Sed			if (vd[i] != NULL &&
1952192830Sed			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1953192830Sed				break;
1954192830Sed			++extracted;
1955192830Sed		}
1956192830Sed	}
1957192830Sed
1958192830Sed	/*
1959192830Sed	 * If every disk has been moved to the new pool, or if we never
1960192914Sed	 * even attempted to look at them, then we split them off for
1961192830Sed	 * good.
1962192830Sed	 */
1963192830Sed	if (!attempt_reopen || gcount == extracted) {
1964192830Sed		for (i = 0; i < gcount; i++)
1965192830Sed			if (vd[i] != NULL)
1966192830Sed				vdev_split(vd[i]);
1967192830Sed		vdev_reopen(spa->spa_root_vdev);
1968192830Sed	}
1969192830Sed
1970192830Sed	kmem_free(vd, gcount * sizeof (vdev_t *));
1971192830Sed}
1972196818Sache
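/*
 * Load the pool described by spa->spa_config: extract the pool guid,
 * comment and version, delegate the real work to spa_load_impl(), and
 * record the load time and post an ereport if the load fails.
 */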
1973192830Sedstatic int
1974192830Sedspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1975192830Sed    boolean_t mosconfig)
1976192830Sed{
1977192914Sed	nvlist_t *config = spa->spa_config;
1978192830Sed	char *ereport = FM_EREPORT_ZFS_POOL;
1979192830Sed	char *comment;
1980192830Sed	int error;
1981192830Sed	uint64_t pool_guid;
1982192830Sed	nvlist_t *nvl;
1983192830Sed
1984192830Sed	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1985192830Sed		return (SET_ERROR(EINVAL));
1986192830Sed
1987192830Sed	ASSERT(spa->spa_comment == NULL);
1988192830Sed	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1989192830Sed		spa->spa_comment = spa_strdup(comment);
1990192830Sed
1991213567Sed	/*
1992192830Sed	 * Versioning wasn't explicitly added to the label until later, so if
1993192830Sed	 * it's not present treat it as the initial version.
1994192830Sed	 */
1995192830Sed	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1996192830Sed	    &spa->spa_ubsync.ub_version) != 0)
1997192830Sed		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1998192830Sed
1999192830Sed	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2000192830Sed	    &spa->spa_config_txg);
2001192830Sed
2002192830Sed	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2003192830Sed	    spa_guid_exists(pool_guid, 0)) {
2004192830Sed		error = SET_ERROR(EEXIST);
2005192830Sed	} else {
2006192830Sed		spa->spa_config_guid = pool_guid;
2007192830Sed
2008213567Sed		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2009192830Sed		    &nvl) == 0) {
2010192830Sed			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2011192830Sed			    KM_SLEEP) == 0);
2012192830Sed		}
2013213567Sed
2014192830Sed		nvlist_free(spa->spa_load_info);
2015192830Sed		spa->spa_load_info = fnvlist_alloc();
2016192830Sed
2017192830Sed		gethrestime(&spa->spa_loaded_ts);
2018192830Sed		error = spa_load_impl(spa, pool_guid, config, state, type,
2019192830Sed		    mosconfig, &ereport);
2020192830Sed	}
2021192830Sed
2022213567Sed	spa->spa_minref = refcount_count(&spa->spa_refcount);
2023192914Sed	if (error) {
2024192914Sed		if (error != EEXIST) {
2025192914Sed			spa->spa_loaded_ts.tv_sec = 0;
2026192914Sed			spa->spa_loaded_ts.tv_nsec = 0;
2027192914Sed		}
2028192914Sed		if (error != EBADF) {
2029192914Sed			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2030192830Sed		}
2031192830Sed	}
2032192830Sed	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2033192830Sed	spa->spa_ena = 0;
2034192830Sed
2035192830Sed	return (error);
2036192830Sed}
2037192830Sed
2038192830Sed/*
2039192830Sed * Load an existing storage pool, using the pool's builtin spa_config as a
2040192830Sed * source of configuration information.
2041192830Sed */
2042192830Sedstatic int
2043192830Sedspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2044192830Sed    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2045192830Sed    char **ereport)
2046192830Sed{
2047192830Sed	int error = 0;
2048192830Sed	nvlist_t *nvroot = NULL;
2049192830Sed	nvlist_t *label;
2050192830Sed	vdev_t *rvd;
2051192830Sed	uberblock_t *ub = &spa->spa_uberblock;
2052192830Sed	uint64_t children, config_cache_txg = spa->spa_config_txg;
2053192830Sed	int orig_mode = spa->spa_mode;
2054192830Sed	int parse;
2055192830Sed	uint64_t obj;
2056192830Sed	boolean_t missing_feat_write = B_FALSE;
2057192830Sed
2058192830Sed	/*
2059192830Sed	 * If this is an untrusted config, access the pool in read-only mode.
2060192830Sed	 * This prevents things like resilvering recently removed devices.
2061192830Sed	 */
2062192830Sed	if (!mosconfig)
2063192830Sed		spa->spa_mode = FREAD;
2064192830Sed
2065192830Sed	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2066192830Sed
2067192830Sed	spa->spa_load_state = state;
2068192830Sed
2069192830Sed	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2070192856Sed		return (SET_ERROR(EINVAL));
2071192830Sed
2072192830Sed	parse = (type == SPA_IMPORT_EXISTING ?
2073192856Sed	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2074192830Sed
2075192830Sed	/*
2076192830Sed	 * Create "The Godfather" zio to hold all async IOs
2077192830Sed	 */
2078192830Sed	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2079192830Sed	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2080192830Sed
2081192830Sed	/*
2082192830Sed	 * Parse the configuration into a vdev tree.  We explicitly set the
2083192830Sed	 * value that will be returned by spa_version() since parsing the
2084192830Sed	 * configuration requires knowing the version number.
2085192830Sed	 */
2086192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2087192830Sed	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2088192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
2089192830Sed
2090192914Sed	if (error != 0)
2091192830Sed		return (error);
2092192830Sed
2093192830Sed	ASSERT(spa->spa_root_vdev == rvd);
2094192830Sed
2095192830Sed	if (type != SPA_IMPORT_ASSEMBLE) {
2096192830Sed		ASSERT(spa_guid(spa) == pool_guid);
2097192830Sed	}
2098192830Sed
2099192830Sed	/*
2100192830Sed	 * Try to open all vdevs, loading each label in the process.
2101192830Sed	 */
2102192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2103192830Sed	error = vdev_open(rvd);
2104192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
2105192830Sed	if (error != 0)
2106192830Sed		return (error);
2107192830Sed
2108228627Sdim	/*
2109228627Sdim	 * We need to validate the vdev labels against the configuration that
2110228627Sdim	 * we have in hand, which is dependent on the setting of mosconfig. If
2111228627Sdim	 * mosconfig is true then we're validating the vdev labels based on
2112192830Sed	 * that config.  Otherwise, we're validating against the cached config
2113192830Sed	 * (zpool.cache) that was read when we loaded the zfs module, and then
2114192914Sed	 * later we will recursively call spa_load() and validate against
2115192830Sed	 * the vdev config.
2116192830Sed	 *
2117192830Sed	 * If we're assembling a new pool that's been split off from an
2118192830Sed	 * existing pool, the labels haven't yet been updated so we skip
2119192856Sed	 * validation for now.
2120192856Sed	 */
2121192830Sed	if (type != SPA_IMPORT_ASSEMBLE) {
2122192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2123192856Sed		error = vdev_validate(rvd, mosconfig);
2124192856Sed		spa_config_exit(spa, SCL_ALL, FTAG);
2125192830Sed
2126192830Sed		if (error != 0)
2127192830Sed			return (error);
2128192856Sed
2129192856Sed		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2130192856Sed			return (SET_ERROR(ENXIO));
2131192856Sed	}
2132192856Sed
2133192856Sed	/*
2134192856Sed	 * Find the best uberblock.
2135192856Sed	 */
2136192856Sed	vdev_uberblock_load(rvd, ub, &label);
2137192856Sed
2138192856Sed	/*
2139192856Sed	 * If we weren't able to find a single valid uberblock, return failure.
2140192856Sed	 */
2141192914Sed	if (ub->ub_txg == 0) {
2142192856Sed		nvlist_free(label);
2143192856Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2144192856Sed	}
2145192856Sed
2146192856Sed	/*
2147192914Sed	 * If the pool has an unsupported version we can't open it.
2148192856Sed	 */
2149192856Sed	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2150192856Sed		nvlist_free(label);
2151192856Sed		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2152192856Sed	}
2153192830Sed
2154192830Sed	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2155192830Sed		nvlist_t *features;
2156192830Sed
2157192830Sed		/*
2158192830Sed		 * If we weren't able to find what's necessary for reading the
2159192830Sed		 * MOS in the label, return failure.
2160192830Sed		 */
2161192830Sed		if (label == NULL || nvlist_lookup_nvlist(label,
2162192830Sed		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2163192830Sed			nvlist_free(label);
2164192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2165192830Sed			    ENXIO));
2166192830Sed		}
2167192830Sed
2168192830Sed		/*
2169192830Sed		 * Update our in-core representation with the definitive values
2170192830Sed		 * from the label.
2171192830Sed		 */
2172192830Sed		nvlist_free(spa->spa_label_features);
2173192830Sed		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2174192830Sed	}
2175192830Sed
2176192830Sed	nvlist_free(label);
2177192830Sed
2178192830Sed	/*
2179192830Sed	 * Look through entries in the label nvlist's features_for_read. If
2180192830Sed	 * there is a feature listed there which we don't understand then we
2181192830Sed	 * cannot open a pool.
2182192830Sed	 */
2183192830Sed	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2184192830Sed		nvlist_t *unsup_feat;
2185192830Sed
2186192830Sed		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2187192830Sed		    0);
2188192830Sed
2189192830Sed		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2190192830Sed		    NULL); nvp != NULL;
2191192830Sed		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2192192830Sed			if (!zfeature_is_supported(nvpair_name(nvp))) {
2193192830Sed				VERIFY(nvlist_add_string(unsup_feat,
2194192830Sed				    nvpair_name(nvp), "") == 0);
2195192830Sed			}
2196192830Sed		}
2197192830Sed
2198192830Sed		if (!nvlist_empty(unsup_feat)) {
2199192830Sed			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2200192830Sed			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2201192830Sed			nvlist_free(unsup_feat);
2202192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2203192830Sed			    ENOTSUP));
2204192830Sed		}
2205192830Sed
2206192830Sed		nvlist_free(unsup_feat);
2207192830Sed	}
2208192830Sed
2209192830Sed	/*
2210192830Sed	 * If the vdev guid sum doesn't match the uberblock, we have an
2211192830Sed	 * incomplete configuration.  We first check to see if the pool
2212192830Sed	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2213192830Sed	 * If it is, defer the vdev_guid_sum check till later so we
2214192830Sed	 * can handle missing vdevs.
2215192830Sed	 */
2216192830Sed	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2217192830Sed	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2218192830Sed	    rvd->vdev_guid_sum != ub->ub_guid_sum)
2219213567Sed		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2220192830Sed
2221192830Sed	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2222192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2223192830Sed		spa_try_repair(spa, config);
2224192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
2225192830Sed		nvlist_free(spa->spa_config_splitting);
2226192830Sed		spa->spa_config_splitting = NULL;
2227192830Sed	}
2228192830Sed
2229192914Sed	/*
2230192830Sed	 * Initialize internal SPA structures.
2231192830Sed	 */
2232192830Sed	spa->spa_state = POOL_STATE_ACTIVE;
2233192830Sed	spa->spa_ubsync = spa->spa_uberblock;
2234192830Sed	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2235192830Sed	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2236192830Sed	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2237192830Sed	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2238192830Sed	spa->spa_claim_max_txg = spa->spa_first_txg;
2239192830Sed	spa->spa_prev_software_version = ub->ub_software_version;
2240192830Sed
2241192830Sed	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2242192830Sed	if (error)
2243192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2244192830Sed	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2245192830Sed
2246192830Sed	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2247192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2248192830Sed
2249192830Sed	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2250192830Sed		boolean_t missing_feat_read = B_FALSE;
2251192830Sed		nvlist_t *unsup_feat, *enabled_feat;
2252192830Sed
2253192830Sed		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2254192830Sed		    &spa->spa_feat_for_read_obj) != 0) {
2255192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2256192830Sed		}
2257192830Sed
2258192830Sed		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2259192830Sed		    &spa->spa_feat_for_write_obj) != 0) {
2260192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2261192830Sed		}
2262192830Sed
2263192830Sed		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2264192830Sed		    &spa->spa_feat_desc_obj) != 0) {
2265192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2266192830Sed		}
2267192830Sed
2268192830Sed		enabled_feat = fnvlist_alloc();
2269192830Sed		unsup_feat = fnvlist_alloc();
2270192830Sed
2271192830Sed		if (!feature_is_supported(spa->spa_meta_objset,
2272192830Sed		    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2273192830Sed		    unsup_feat, enabled_feat))
2274192830Sed			missing_feat_read = B_TRUE;
2275192830Sed
2276192830Sed		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2277192830Sed			if (!feature_is_supported(spa->spa_meta_objset,
2278192830Sed			    spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2279192830Sed			    unsup_feat, enabled_feat)) {
2280192830Sed				missing_feat_write = B_TRUE;
2281192830Sed			}
2282192830Sed		}
2283192830Sed
2284192830Sed		fnvlist_add_nvlist(spa->spa_load_info,
2285192830Sed		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2286192830Sed
2287192830Sed		if (!nvlist_empty(unsup_feat)) {
2288192830Sed			fnvlist_add_nvlist(spa->spa_load_info,
2289192830Sed			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2290192830Sed		}
2291192830Sed
2292192830Sed		fnvlist_free(enabled_feat);
2293192830Sed		fnvlist_free(unsup_feat);
2294192830Sed
2295192830Sed		if (!missing_feat_read) {
2296192830Sed			fnvlist_add_boolean(spa->spa_load_info,
2297192830Sed			    ZPOOL_CONFIG_CAN_RDONLY);
2298192830Sed		}
2299192830Sed
2300192830Sed		/*
2301192830Sed		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2302192830Sed		 * twofold: to determine whether the pool is available for
2303192830Sed		 * import in read-write mode and (if it is not) whether the
2304192830Sed		 * pool is available for import in read-only mode. If the pool
2305192830Sed		 * is available for import in read-write mode, it is displayed
2306192830Sed		 * as available in userland; if it is not available for import
2307192830Sed		 * in read-only mode, it is displayed as unavailable in
2308192830Sed		 * userland. If the pool is available for import in read-only
2309192830Sed		 * mode but not read-write mode, it is displayed as unavailable
2310192830Sed		 * in userland with a special note that the pool is actually
2311192830Sed		 * available for open in read-only mode.
2312192830Sed		 *
2313192830Sed		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2314192830Sed		 * missing a feature for write, we must first determine whether
2315192830Sed		 * the pool can be opened read-only before returning to
2316192830Sed		 * userland in order to know whether to display the
2317192830Sed		 * abovementioned note.
2318192830Sed		 */
2319192830Sed		if (missing_feat_read || (missing_feat_write &&
2320192830Sed		    spa_writeable(spa))) {
2321192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2322192830Sed			    ENOTSUP));
2323192830Sed		}
2324192830Sed	}
2325192830Sed
2326192830Sed	spa->spa_is_initializing = B_TRUE;
2327192830Sed	error = dsl_pool_open(spa->spa_dsl_pool);
2328192830Sed	spa->spa_is_initializing = B_FALSE;
2329192830Sed	if (error != 0)
2330192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2331192830Sed
2332192830Sed	if (!mosconfig) {
2333192830Sed		uint64_t hostid;
2334192830Sed		nvlist_t *policy = NULL, *nvconfig;
2335192830Sed
2336192830Sed		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2337192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2338192830Sed
2339192830Sed		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2340192830Sed		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2341192830Sed			char *hostname;
2342192830Sed			unsigned long myhostid = 0;
2343192830Sed
2344192830Sed			VERIFY(nvlist_lookup_string(nvconfig,
2345192830Sed			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2346192914Sed
2347192830Sed#ifdef	_KERNEL
2348192830Sed			myhostid = zone_get_hostid(NULL);
2349192830Sed#else	/* _KERNEL */
2350192830Sed			/*
2351192830Sed			 * We're emulating the system's hostid in userland, so
2352192830Sed			 * we can't use zone_get_hostid().
2353192830Sed			 */
2354192830Sed			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2355192830Sed#endif	/* _KERNEL */
2356192830Sed			if (check_hostid && hostid != 0 && myhostid != 0 &&
2357192830Sed			    hostid != myhostid) {
2358192830Sed				nvlist_free(nvconfig);
2359192830Sed				cmn_err(CE_WARN, "pool '%s' could not be "
2360192830Sed				    "loaded as it was last accessed by "
2361192830Sed				    "another system (host: %s hostid: 0x%lx). "
2362192830Sed				    "See: http://illumos.org/msg/ZFS-8000-EY",
2363192830Sed				    spa_name(spa), hostname,
2364192830Sed				    (unsigned long)hostid);
2365192830Sed				return (SET_ERROR(EBADF));
2366192830Sed			}
2367192830Sed		}
2368192914Sed		if (nvlist_lookup_nvlist(spa->spa_config,
2369192830Sed		    ZPOOL_REWIND_POLICY, &policy) == 0)
2370192830Sed			VERIFY(nvlist_add_nvlist(nvconfig,
2371192830Sed			    ZPOOL_REWIND_POLICY, policy) == 0);
2372192830Sed
2373192830Sed		spa_config_set(spa, nvconfig);
2374192830Sed		spa_unload(spa);
2375192830Sed		spa_deactivate(spa);
2376192830Sed		spa_activate(spa, orig_mode);
2377192830Sed
2378192830Sed		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2379192830Sed	}
2380192830Sed
2381192830Sed	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2382192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2383192830Sed	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2384192830Sed	if (error != 0)
2385192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2386192830Sed
2387192830Sed	/*
2388192830Sed	 * Load the bit that tells us to use the new accounting function
2389192830Sed	 * (raid-z deflation).  If we have an older pool, this will not
2390192830Sed	 * be present.
2391192830Sed	 */
2392192830Sed	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2393192830Sed	if (error != 0 && error != ENOENT)
2394192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2395192830Sed
2396192830Sed	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2397192830Sed	    &spa->spa_creation_version);
2398192830Sed	if (error != 0 && error != ENOENT)
2399192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2400192830Sed
2401192830Sed	/*
2402192830Sed	 * Load the persistent error log.  If we have an older pool, this will
2403192830Sed	 * not be present.
2404192830Sed	 */
2405192830Sed	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2406192914Sed	if (error != 0 && error != ENOENT)
2407192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2408192830Sed
2409192914Sed	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2410192830Sed	    &spa->spa_errlog_scrub);
2411192830Sed	if (error != 0 && error != ENOENT)
2412192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2413192830Sed
2414192830Sed	/*
2415192830Sed	 * Load the history object.  If we have an older pool, this
2416192830Sed	 * will not be present.
2417192830Sed	 */
2418192830Sed	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2419192830Sed	if (error != 0 && error != ENOENT)
2420192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2421192830Sed
2422192830Sed	/*
2423192830Sed	 * If we're assembling the pool from the split-off vdevs of
2424192830Sed	 * an existing pool, we don't want to attach the spares & cache
2425192830Sed	 * devices.
2426192856Sed	 */
2427192830Sed
2428192830Sed	/*
2429192830Sed	 * Load any hot spares for this pool.
2430192830Sed	 */
2431192830Sed	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2432192830Sed	if (error != 0 && error != ENOENT)
2433192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2434192830Sed	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2435192830Sed		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2436192830Sed		if (load_nvlist(spa, spa->spa_spares.sav_object,
2437192830Sed		    &spa->spa_spares.sav_config) != 0)
2438192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2439192830Sed
2440192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2441192830Sed		spa_load_spares(spa);
2442192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
2443192830Sed	} else if (error == 0) {
2444196818Sache		spa->spa_spares.sav_sync = B_TRUE;
2445192830Sed	}
2446192830Sed
2447192830Sed	/*
2448192830Sed	 * Load any level 2 ARC devices for this pool.
2449192830Sed	 */
2450192830Sed	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2451192830Sed	    &spa->spa_l2cache.sav_object);
2452192830Sed	if (error != 0 && error != ENOENT)
2453192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2454192830Sed	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2455192830Sed		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2456192830Sed		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2457192830Sed		    &spa->spa_l2cache.sav_config) != 0)
2458192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2459192830Sed
2460192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2461192830Sed		spa_load_l2cache(spa);
2462192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
2463192830Sed	} else if (error == 0) {
2464192830Sed		spa->spa_l2cache.sav_sync = B_TRUE;
2465192830Sed	}
2466192830Sed
2467192830Sed	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2468192830Sed
2469192830Sed	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2470192830Sed	if (error && error != ENOENT)
2471192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2472192830Sed
2473192830Sed	if (error == 0) {
2474192830Sed		uint64_t autoreplace;
2475192830Sed
2476192830Sed		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2477192830Sed		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2478192830Sed		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2479192830Sed		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2480192830Sed		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2481192830Sed		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2482192830Sed		    &spa->spa_dedup_ditto);
2483192830Sed
2484192830Sed		spa->spa_autoreplace = (autoreplace != 0);
2485192830Sed	}
2486192830Sed
2487192830Sed	/*
2488192830Sed	 * If the 'autoreplace' property is set, then post a resource notifying
2489192830Sed	 * the ZFS DE that it should not issue any faults for unopenable
2490192914Sed	 * devices.  We also iterate over the vdevs, and post a sysevent for any
2491192830Sed	 * unopenable vdevs so that the normal autoreplace handler can take
2492192830Sed	 * over.
2493192830Sed	 */
2494192914Sed	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2495192830Sed		spa_check_removed(spa->spa_root_vdev);
2496192830Sed		/*
2497192830Sed		 * For the import case, this is done in spa_import(), because
2498192830Sed		 * at this point we're using the spare definitions from
2499192830Sed		 * the MOS config, not necessarily from the userland config.
2500192830Sed		 */
2501192830Sed		if (state != SPA_LOAD_IMPORT) {
2502192830Sed			spa_aux_check_removed(&spa->spa_spares);
2503192856Sed			spa_aux_check_removed(&spa->spa_l2cache);
2504192830Sed		}
2505192856Sed	}
2506192830Sed
2507192830Sed	/*
2508192830Sed	 * Load the vdev state for all toplevel vdevs.
2509192830Sed	 */
2510192830Sed	vdev_load(rvd);
2511192830Sed
2512192830Sed	/*
2513192830Sed	 * Propagate the leaf DTLs we just loaded all the way up the tree.
2514192830Sed	 */
2515192856Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2516192856Sed	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2517192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
2518192830Sed
2519192830Sed	/*
2520192830Sed	 * Load the DDTs (dedup tables).
2521196818Sache	 */
2522192830Sed	error = ddt_load(spa);
2523192830Sed	if (error != 0)
2524192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2525192830Sed
2526192830Sed	spa_update_dspace(spa);
2527192830Sed
2528192830Sed	/*
2529192830Sed	 * Validate the config, using the MOS config to fill in any
2530192830Sed	 * information which might be missing.  If we fail to validate
2531192830Sed	 * the config then declare the pool unfit for use. If we're
2532192830Sed	 * assembling a pool from a split, the log is not transferred
2533192830Sed	 * over.
2534192830Sed	 */
2535192830Sed	if (type != SPA_IMPORT_ASSEMBLE) {
2536192830Sed		nvlist_t *nvconfig;
2537192830Sed
2538192830Sed		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2539192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2540192830Sed
2541192830Sed		if (!spa_config_valid(spa, nvconfig)) {
2542192830Sed			nvlist_free(nvconfig);
2543192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2544192830Sed			    ENXIO));
2545192830Sed		}
2546192830Sed		nvlist_free(nvconfig);
2547192830Sed
2548192830Sed		/*
2549192830Sed		 * Now that we've validated the config, check the state of the
2550192830Sed		 * root vdev.  If it can't be opened, it indicates one or
2551192830Sed		 * more toplevel vdevs are faulted.
2552192830Sed		 */
2553192830Sed		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2554192830Sed			return (SET_ERROR(ENXIO));
2555192830Sed
2556192830Sed		if (spa_check_logs(spa)) {
2557192830Sed			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2558192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2559192830Sed		}
2560192830Sed	}
2561192830Sed
2562192830Sed	if (missing_feat_write) {
2563192830Sed		ASSERT(state == SPA_LOAD_TRYIMPORT);
2564192830Sed
2565192830Sed		/*
2566192830Sed		 * At this point, we know that we can open the pool in
2567192830Sed		 * read-only mode but not read-write mode. We now have enough
2568192830Sed		 * information and can return to userland.
2569192830Sed		 */
2570192830Sed		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2571192830Sed	}
2572192830Sed
2573192830Sed	/*
2574192830Sed	 * We've successfully opened the pool, verify that we're ready
2575192830Sed	 * to start pushing transactions.
2576192830Sed	 */
2577192830Sed	if (state != SPA_LOAD_TRYIMPORT) {
2578192830Sed		if ((error = spa_load_verify(spa)) != 0)
2579192830Sed			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2580192830Sed			    error));
2581192830Sed	}
2582192830Sed
2583192830Sed	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2584192830Sed	    spa->spa_load_max_txg == UINT64_MAX)) {
2585192830Sed		dmu_tx_t *tx;
2586192914Sed		int need_update = B_FALSE;
2587192830Sed
2588192830Sed		ASSERT(state != SPA_LOAD_TRYIMPORT);
2589192830Sed
2590192830Sed		/*
2591192830Sed		 * Claim log blocks that haven't been committed yet.
2592192830Sed		 * This must all happen in a single txg.
2593192830Sed		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2594192830Sed		 * invoked from zil_claim_log_block()'s i/o done callback.
2595192830Sed		 * Price of rollback is that we abandon the log.
2596192830Sed		 */
2597192830Sed		spa->spa_claiming = B_TRUE;
2598192830Sed
2599192830Sed		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2600192830Sed		    spa_first_txg(spa));
2601192830Sed		(void) dmu_objset_find(spa_name(spa),
2602192830Sed		    zil_claim, tx, DS_FIND_CHILDREN);
2603192830Sed		dmu_tx_commit(tx);
2604192830Sed
2605192830Sed		spa->spa_claiming = B_FALSE;
2606192830Sed
2607192830Sed		spa_set_log_state(spa, SPA_LOG_GOOD);
2608192830Sed		spa->spa_sync_on = B_TRUE;
2609192830Sed		txg_sync_start(spa->spa_dsl_pool);
2610192830Sed
2611192914Sed		/*
2612192830Sed		 * Wait for all claims to sync.  We sync up to the highest
2613192830Sed		 * claimed log block birth time so that claimed log blocks
2614192830Sed		 * don't appear to be from the future.  spa_claim_max_txg
2615192830Sed		 * will have been set for us by either zil_check_log_chain()
2616192830Sed		 * (invoked from spa_check_logs()) or zil_claim() above.
2617192830Sed		 */
2618192830Sed		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2619192830Sed
2620192830Sed		/*
2621192914Sed		 * If the config cache is stale, or we have uninitialized
2622192830Sed		 * metaslabs (see spa_vdev_add()), then update the config.
2623192830Sed		 *
2624192830Sed		 * If this is a verbatim import, trust the current
2625192830Sed		 * in-core spa_config and update the disk labels.
2626192830Sed		 */
2627192830Sed		if (config_cache_txg != spa->spa_config_txg ||
2628192914Sed		    state == SPA_LOAD_IMPORT ||
2629192830Sed		    state == SPA_LOAD_RECOVER ||
2630192830Sed		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2631192830Sed			need_update = B_TRUE;
2632192830Sed
2633192830Sed		for (int c = 0; c < rvd->vdev_children; c++)
2634192830Sed			if (rvd->vdev_child[c]->vdev_ms_array == 0)
2635192830Sed				need_update = B_TRUE;
2636192830Sed
2637192830Sed		/*
2638192830Sed		 * Update the config cache asynchronously in case we're the
2639192830Sed		 * root pool, in which case the config cache isn't writable yet.
2640192830Sed		 */
2641192830Sed		if (need_update)
2642192830Sed			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2643192830Sed
2644192830Sed		/*
2645192830Sed		 * Check all DTLs to see if anything needs resilvering.
2646192830Sed		 */
2647192830Sed		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2648192830Sed		    vdev_resilver_needed(rvd, NULL, NULL))
2649192830Sed			spa_async_request(spa, SPA_ASYNC_RESILVER);
2650192830Sed
2651192830Sed		/*
2652192830Sed		 * Log the fact that we booted up (so that we can detect if
2653192830Sed		 * we rebooted in the middle of an operation).
2654192830Sed		 */
2655192830Sed		spa_history_log_version(spa, "open");
2656192830Sed
2657192830Sed		/*
2658192830Sed		 * Delete any inconsistent datasets.
2659192830Sed		 */
2660192830Sed		(void) dmu_objset_find(spa_name(spa),
2661192830Sed		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2662213567Sed
2663192830Sed		/*
2664192830Sed		 * Clean up any stale temporary dataset userrefs.
2665192830Sed		 */
2666192830Sed		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2667192830Sed	}
2668192914Sed
2669192830Sed	return (0);
2670192830Sed}
2671192830Sed
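/*
 * Rewind helper for spa_load_best(): tear the pool down, lower the maximum
 * allowed txg by one, and retry the load.
 */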
2672192830Sedstatic int
2673192830Sedspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2674192830Sed{
2675192830Sed	int mode = spa->spa_mode;
2676192830Sed
2677192830Sed	spa_unload(spa);
2678192830Sed	spa_deactivate(spa);
2679192830Sed
2680192830Sed	spa->spa_load_max_txg--;
2681192830Sed
2682192830Sed	spa_activate(spa, mode);
2683192830Sed	spa_async_suspend(spa);
2684192830Sed
2685192830Sed	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2686192830Sed}
2687192830Sed
2688192830Sed/*
2689192830Sed * If spa_load() fails, this function will try loading prior txgs. If
2690192830Sed * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds, the pool
2691192830Sed * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER, this
2692192830Sed * function will not rewind the pool and will return the same error as
2693192830Sed * spa_load().
2694192830Sed */
2695192830Sedstatic int
2696192830Sedspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2697192914Sed    uint64_t max_request, int rewind_flags)
2698192830Sed{
2699192830Sed	nvlist_t *loadinfo = NULL;
2700192830Sed	nvlist_t *config = NULL;
2701192830Sed	int load_error, rewind_error;
2702192830Sed	uint64_t safe_rewind_txg;
2703192914Sed	uint64_t min_txg;
2704192830Sed
2705192830Sed	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2706192830Sed		spa->spa_load_max_txg = spa->spa_load_txg;
2707192830Sed		spa_set_log_state(spa, SPA_LOG_CLEAR);
2708192830Sed	} else {
2709192914Sed		spa->spa_load_max_txg = max_request;
2710192830Sed	}
2711192830Sed
2712192830Sed	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2713192830Sed	    mosconfig);
2714192830Sed	if (load_error == 0)
2715192830Sed		return (0);
2716192830Sed
2717192830Sed	if (spa->spa_root_vdev != NULL)
2718192830Sed		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2719192830Sed
2720192830Sed	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2721192830Sed	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2722192830Sed
2723192830Sed	if (rewind_flags & ZPOOL_NEVER_REWIND) {
2724192830Sed		nvlist_free(config);
2725192830Sed		return (load_error);
2726192830Sed	}
2727192830Sed
2728192830Sed	if (state == SPA_LOAD_RECOVER) {
2729192830Sed		/* Price of rolling back is discarding txgs, including log */
2730192830Sed		spa_set_log_state(spa, SPA_LOG_CLEAR);
2731192830Sed	} else {
2732213567Sed		/*
2733213567Sed		 * If we aren't rolling back save the load info from our first
2734192830Sed		 * import attempt so that we can restore it after attempting
2735192830Sed		 * to rewind.
2736192830Sed		 */
2737192830Sed		loadinfo = spa->spa_load_info;
2738192830Sed		spa->spa_load_info = fnvlist_alloc();
2739192830Sed	}
2740192830Sed
2741192830Sed	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2742192830Sed	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2743192830Sed	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2744192830Sed	    TXG_INITIAL : safe_rewind_txg;
2745192830Sed
2746192914Sed	/*
2747192830Sed	 * Continue as long as we're finding errors, we're still within
2748192830Sed	 * the acceptable rewind range, and we're still finding uberblocks
2749192830Sed	 */
2750192830Sed	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2751192830Sed	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2752192830Sed		if (spa->spa_load_max_txg < safe_rewind_txg)
2753192830Sed			spa->spa_extreme_rewind = B_TRUE;
2754192830Sed		rewind_error = spa_load_retry(spa, state, mosconfig);
2755192830Sed	}
2756192830Sed
2757192830Sed	spa->spa_extreme_rewind = B_FALSE;
2758192830Sed	spa->spa_load_max_txg = UINT64_MAX;
2759192830Sed
2760192830Sed	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2761192830Sed		spa_config_set(spa, config);
2762192830Sed
2763192830Sed	if (state == SPA_LOAD_RECOVER) {
2764192830Sed		ASSERT3P(loadinfo, ==, NULL);
2765192830Sed		return (rewind_error);
2766192830Sed	} else {
2767192830Sed		/* Store the rewind info as part of the initial load info */
2768192830Sed		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2769192830Sed		    spa->spa_load_info);
2770192830Sed
2771192830Sed		/* Restore the initial load info */
2772192830Sed		fnvlist_free(spa->spa_load_info);
2773192830Sed		spa->spa_load_info = loadinfo;
2774192830Sed
2775192830Sed		return (load_error);
2776192830Sed	}
2777192830Sed}
2778192830Sed
2779192830Sed/*
2780192830Sed * Pool Open/Import
2781192830Sed *
2782192830Sed * The import case is identical to an open except that the configuration is sent
2783192830Sed * down from userland, instead of grabbed from the configuration cache.  For the
2784192830Sed * case of an open, the pool configuration will exist in the
2785192830Sed * POOL_STATE_UNINITIALIZED state.
2786192830Sed *
2787192830Sed * The stats information (gen/count/ustats) is used to gather vdev statistics at
2788192914Sed * the same time we open the pool, without having to keep around the spa_t in
2789192830Sed * ambiguous state.
2790192830Sed */
2791192830Sedstatic int
2792192830Sedspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2793192830Sed    nvlist_t **config)
2794192830Sed{
2795192830Sed	spa_t *spa;
2796192830Sed	spa_load_state_t state = SPA_LOAD_OPEN;
2797192830Sed	int error;
2798192830Sed	int locked = B_FALSE;
2799192914Sed	int firstopen = B_FALSE;
2800192830Sed
2801192830Sed	*spapp = NULL;
2802192830Sed
2803192830Sed	/*
2804192830Sed	 * As disgusting as this is, we need to support recursive calls to this
2805192830Sed	 * function because dsl_dir_open() is called during spa_load(), and ends
2806192830Sed	 * up calling spa_open() again.  The real fix is to figure out how to
2807192830Sed	 * avoid dsl_dir_open() calling this in the first place.
2808192830Sed	 */
2809192830Sed	if (mutex_owner(&spa_namespace_lock) != curthread) {
2810192830Sed		mutex_enter(&spa_namespace_lock);
2811192830Sed		locked = B_TRUE;
2812192830Sed	}
2813192830Sed
2814192830Sed	if ((spa = spa_lookup(pool)) == NULL) {
2815192830Sed		if (locked)
2816192830Sed			mutex_exit(&spa_namespace_lock);
2817192830Sed		return (SET_ERROR(ENOENT));
2818192830Sed	}
2819192830Sed
2820192830Sed	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2821192830Sed		zpool_rewind_policy_t policy;
2822192830Sed
2823192830Sed		firstopen = B_TRUE;
2824192830Sed
2825192830Sed		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2826192830Sed		    &policy);
2827192830Sed		if (policy.zrp_request & ZPOOL_DO_REWIND)
2828192830Sed			state = SPA_LOAD_RECOVER;
2829192830Sed
2830192830Sed		spa_activate(spa, spa_mode_global);
2831192830Sed
2832192830Sed		if (state != SPA_LOAD_RECOVER)
2833192830Sed			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2834192830Sed
2835192830Sed		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2836192830Sed		    policy.zrp_request);
2837192830Sed
2838192830Sed		if (error == EBADF) {
2839192830Sed			/*
2840192830Sed			 * If vdev_validate() returns failure (indicated by
2841192830Sed			 * EBADF), it means that one of the vdevs reports
2842192830Sed			 * that the pool has been exported or destroyed.  If
2843192830Sed			 * this is the case, the config cache is out of sync and
2844192830Sed			 * we should remove the pool from the namespace.
2845192830Sed			 */
2846192830Sed			spa_unload(spa);
2847192830Sed			spa_deactivate(spa);
2848192830Sed			spa_config_sync(spa, B_TRUE, B_TRUE);
2849192830Sed			spa_remove(spa);
2850192830Sed			if (locked)
2851192830Sed				mutex_exit(&spa_namespace_lock);
2852192914Sed			return (SET_ERROR(ENOENT));
2853192830Sed		}
2854192830Sed
2855192830Sed		if (error) {
2856192830Sed			/*
2857192830Sed			 * We can't open the pool, but we still have useful
2858192830Sed			 * information: the state of each vdev after the
2859192830Sed			 * attempted vdev_open().  Return this to the user.
2860192830Sed			 */
2861192830Sed			if (config != NULL && spa->spa_config) {
2862192830Sed				VERIFY(nvlist_dup(spa->spa_config, config,
2863192830Sed				    KM_SLEEP) == 0);
2864192830Sed				VERIFY(nvlist_add_nvlist(*config,
2865192914Sed				    ZPOOL_CONFIG_LOAD_INFO,
2866192830Sed				    spa->spa_load_info) == 0);
2867192830Sed			}
2868192830Sed			spa_unload(spa);
2869192830Sed			spa_deactivate(spa);
2870192830Sed			spa->spa_last_open_failed = error;
2871192830Sed			if (locked)
2872192830Sed				mutex_exit(&spa_namespace_lock);
2873192830Sed			*spapp = NULL;
2874192830Sed			return (error);
2875192830Sed		}
2876192830Sed	}
2877192830Sed
2878192830Sed	spa_open_ref(spa, tag);
2879192830Sed
2880192830Sed	if (config != NULL)
2881192830Sed		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2882192830Sed
2883192830Sed	/*
2884192830Sed	 * If we've recovered the pool, pass back any information we
2885192830Sed	 * gathered while doing the load.
2886192830Sed	 */
2887192830Sed	if (state == SPA_LOAD_RECOVER) {
2888192830Sed		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2889192830Sed		    spa->spa_load_info) == 0);
2890192830Sed	}
2891192914Sed
2892192914Sed	if (locked) {
2893192830Sed		spa->spa_last_open_failed = 0;
2894192830Sed		spa->spa_last_ubsync_txg = 0;
2895192830Sed		spa->spa_load_txg = 0;
2896192830Sed		mutex_exit(&spa_namespace_lock);
2897192830Sed#ifdef __FreeBSD__
2898192830Sed#ifdef _KERNEL
2899192830Sed		if (firstopen)
2900192830Sed			zvol_create_minors(spa->spa_name);
2901192830Sed#endif
2902192830Sed#endif
2903192830Sed	}
2904192830Sed
2905192830Sed	*spapp = spa;
2906192830Sed
2907192830Sed	return (0);
2908192830Sed}
2909192830Sed
2910192830Sedint
2911192830Sedspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2912192830Sed    nvlist_t **config)
2913192830Sed{
2914192830Sed	return (spa_open_common(name, spapp, tag, policy, config));
2915192830Sed}
2916192830Sed
2917192830Sedint
2918192830Sedspa_open(const char *name, spa_t **spapp, void *tag)
2919192830Sed{
2920192830Sed	return (spa_open_common(name, spapp, tag, NULL, NULL));
2921192830Sed}
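/*
 * Illustrative sketch (compiled out, not part of the upstream file): a
 * typical in-kernel consumer of the open path above pairs spa_open() with
 * spa_close() around a short-lived operation, using the caller tag (FTAG)
 * that spa_close() later matches.  The spa_example_open_version name is
 * hypothetical; spa_open(), spa_close() and spa_version() are the real
 * entry points used elsewhere in this file.
 */
#if 0	/* example only */
static int
spa_example_open_version(const char *name, uint64_t *versionp)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(name, &spa, FTAG)) != 0)
		return (error);		/* e.g. ENOENT for an unknown pool */

	*versionp = spa_version(spa);

	spa_close(spa, FTAG);
	return (0);
}
#endif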
2922192830Sed
2923192830Sed/*
2924192914Sed * Look up the given spa_t, incrementing the inject count in the process,
2925192830Sed * preventing it from being exported or destroyed.
2926192830Sed */
2927192914Sedspa_t *
2928192830Sedspa_inject_addref(char *name)
2929192830Sed{
2930192830Sed	spa_t *spa;
2931192830Sed
2932192830Sed	mutex_enter(&spa_namespace_lock);
2933192830Sed	if ((spa = spa_lookup(name)) == NULL) {
2934192830Sed		mutex_exit(&spa_namespace_lock);
2935192830Sed		return (NULL);
2936192830Sed	}
2937192830Sed	spa->spa_inject_ref++;
2938192830Sed	mutex_exit(&spa_namespace_lock);
2939192830Sed
2940192830Sed	return (spa);
2941192830Sed}
2942228627Sdim
2943192830Sedvoid
2944192830Sedspa_inject_delref(spa_t *spa)
2945192830Sed{
2946192830Sed	mutex_enter(&spa_namespace_lock);
2947192830Sed	spa->spa_inject_ref--;
2948213567Sed	mutex_exit(&spa_namespace_lock);
2949192830Sed}
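/*
 * Illustrative sketch (compiled out): fault-injection consumers such as
 * zinject bracket their work with the reference counting above so that the
 * pool cannot be exported or destroyed while an injection handler still
 * refers to it.  The handler body and the spa_example_inject name are
 * hypothetical.
 */
#if 0	/* example only */
static int
spa_example_inject(char *poolname)
{
	spa_t *spa;

	if ((spa = spa_inject_addref(poolname)) == NULL)
		return (SET_ERROR(ENOENT));

	/* ... install or remove an injection handler for 'spa' here ... */

	spa_inject_delref(spa);
	return (0);
}
#endif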
2950192830Sed
2951192830Sed/*
2952192830Sed * Add spares device information to the nvlist.
2953192830Sed */
2954192830Sedstatic void
2955192830Sedspa_add_spares(spa_t *spa, nvlist_t *config)
2956192830Sed{
2957192830Sed	nvlist_t **spares;
2958192830Sed	uint_t i, nspares;
2959192830Sed	nvlist_t *nvroot;
2960192830Sed	uint64_t guid;
2961192830Sed	vdev_stat_t *vs;
2962192830Sed	uint_t vsc;
2963192830Sed	uint64_t pool;
2964192830Sed
2965192830Sed	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2966192830Sed
2967192914Sed	if (spa->spa_spares.sav_count == 0)
2968192830Sed		return;
2969192830Sed
2970192830Sed	VERIFY(nvlist_lookup_nvlist(config,
2971192830Sed	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2972192830Sed	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2973192830Sed	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2974192830Sed	if (nspares != 0) {
2975192830Sed		VERIFY(nvlist_add_nvlist_array(nvroot,
2976192830Sed		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2977192830Sed		VERIFY(nvlist_lookup_nvlist_array(nvroot,
2978192830Sed		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2979192830Sed
2980192830Sed		/*
2981192830Sed		 * Go through and find any spares which have since been
2982192830Sed		 * repurposed as active spares.  If this is the case, update
2983192830Sed		 * their status appropriately.
2984192830Sed		 */
2985192830Sed		for (i = 0; i < nspares; i++) {
2986192830Sed			VERIFY(nvlist_lookup_uint64(spares[i],
2987192830Sed			    ZPOOL_CONFIG_GUID, &guid) == 0);
2988192830Sed			if (spa_spare_exists(guid, &pool, NULL) &&
2989192830Sed			    pool != 0ULL) {
2990192830Sed				VERIFY(nvlist_lookup_uint64_array(
2991192830Sed				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
2992192830Sed				    (uint64_t **)&vs, &vsc) == 0);
2993192830Sed				vs->vs_state = VDEV_STATE_CANT_OPEN;
2994192830Sed				vs->vs_aux = VDEV_AUX_SPARED;
2995192830Sed			}
2996192830Sed		}
2997192830Sed	}
2998192914Sed}
2999192830Sed
3000192830Sed/*
3001192830Sed * Add l2cache device information to the nvlist, including vdev stats.
3002192830Sed */
3003192830Sedstatic void
3004192830Sedspa_add_l2cache(spa_t *spa, nvlist_t *config)
3005192830Sed{
3006192830Sed	nvlist_t **l2cache;
3007192830Sed	uint_t i, j, nl2cache;
3008192830Sed	nvlist_t *nvroot;
3009192830Sed	uint64_t guid;
3010192830Sed	vdev_t *vd;
3011192830Sed	vdev_stat_t *vs;
3012192830Sed	uint_t vsc;
3013192830Sed
3014192830Sed	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3015192830Sed
3016192830Sed	if (spa->spa_l2cache.sav_count == 0)
3017192830Sed		return;
3018192830Sed
3019192830Sed	VERIFY(nvlist_lookup_nvlist(config,
3020192830Sed	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3021192830Sed	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3022192830Sed	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3023192830Sed	if (nl2cache != 0) {
3024192830Sed		VERIFY(nvlist_add_nvlist_array(nvroot,
3025192830Sed		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3026192830Sed		VERIFY(nvlist_lookup_nvlist_array(nvroot,
3027192830Sed		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3028192830Sed
3029192830Sed		/*
3030192830Sed		 * Update level 2 cache device stats.
3031192830Sed		 */
3032192830Sed
3033192830Sed		for (i = 0; i < nl2cache; i++) {
3034192830Sed			VERIFY(nvlist_lookup_uint64(l2cache[i],
3035192830Sed			    ZPOOL_CONFIG_GUID, &guid) == 0);
3036192830Sed
3037192830Sed			vd = NULL;
3038192830Sed			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3039192830Sed				if (guid ==
3040192830Sed				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3041192830Sed					vd = spa->spa_l2cache.sav_vdevs[j];
3042192830Sed					break;
3043192830Sed				}
3044192830Sed			}
3045192830Sed			ASSERT(vd != NULL);
3046192830Sed
3047192830Sed			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3048192830Sed			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3049192830Sed			    == 0);
3050192830Sed			vdev_get_stats(vd, vs);
3051192830Sed		}
3052192830Sed	}
3053192830Sed}
3054192830Sed
3055192830Sedstatic void
3056192914Sedspa_add_feature_stats(spa_t *spa, nvlist_t *config)
3057192914Sed{
3058192914Sed	nvlist_t *features;
3059192914Sed	zap_cursor_t zc;
3060192914Sed	zap_attribute_t za;
3061192914Sed
3062192914Sed	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3063192914Sed	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3064192914Sed
3065192914Sed	/* We may be unable to read features if the pool is suspended. */
3066192914Sed	if (spa_suspended(spa))
3067192914Sed		goto out;
3068192914Sed
3069192914Sed	if (spa->spa_feat_for_read_obj != 0) {
3070192830Sed		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3071192830Sed		    spa->spa_feat_for_read_obj);
3072192830Sed		    zap_cursor_retrieve(&zc, &za) == 0;
3073192830Sed		    zap_cursor_advance(&zc)) {
3074192830Sed			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3075192830Sed			    za.za_num_integers == 1);
3076192830Sed			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3077192830Sed			    za.za_first_integer));
3078192830Sed		}
3079192830Sed		zap_cursor_fini(&zc);
3080192830Sed	}
3081192830Sed
3082192830Sed	if (spa->spa_feat_for_write_obj != 0) {
3083192830Sed		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3084192830Sed		    spa->spa_feat_for_write_obj);
3085192830Sed		    zap_cursor_retrieve(&zc, &za) == 0;
3086192830Sed		    zap_cursor_advance(&zc)) {
3087192830Sed			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3088192830Sed			    za.za_num_integers == 1);
3089192914Sed			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3090192830Sed			    za.za_first_integer));
3091192830Sed		}
3092192830Sed		zap_cursor_fini(&zc);
3093192830Sed	}
3094192830Sed
3095192830Sedout:
3096192830Sed	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3097192830Sed	    features) == 0);
3098192830Sed	nvlist_free(features);
3099192830Sed}
3100192830Sed
3101192830Sedint
3102192830Sedspa_get_stats(const char *name, nvlist_t **config,
3103192830Sed    char *altroot, size_t buflen)
3104192830Sed{
3105192830Sed	int error;
3106192830Sed	spa_t *spa;
3107192830Sed
3108192830Sed	*config = NULL;
3109192830Sed	error = spa_open_common(name, &spa, FTAG, NULL, config);
3110192830Sed
3111192830Sed	if (spa != NULL) {
3112192830Sed		/*
3113192830Sed		 * This still leaves a window of inconsistency where the spares
3114192830Sed		 * or l2cache devices could change and the config would be
3115192830Sed		 * self-inconsistent.
3116192830Sed		 */
3117192830Sed		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3118192830Sed
3119192830Sed		if (*config != NULL) {
3120192830Sed			uint64_t loadtimes[2];
3121192830Sed
3122192830Sed			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3123192830Sed			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3124192830Sed			VERIFY(nvlist_add_uint64_array(*config,
3125192830Sed			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3126192830Sed
3127192830Sed			VERIFY(nvlist_add_uint64(*config,
3128192830Sed			    ZPOOL_CONFIG_ERRCOUNT,
3129192830Sed			    spa_get_errlog_size(spa)) == 0);
3130192830Sed
3131192830Sed			if (spa_suspended(spa))
3132192830Sed				VERIFY(nvlist_add_uint64(*config,
3133192830Sed				    ZPOOL_CONFIG_SUSPENDED,
3134192830Sed				    spa->spa_failmode) == 0);
3135192830Sed
3136192830Sed			spa_add_spares(spa, *config);
3137192830Sed			spa_add_l2cache(spa, *config);
3138192830Sed			spa_add_feature_stats(spa, *config);
3139192830Sed		}
3140192830Sed	}
3141192830Sed
3142192830Sed	/*
3143192830Sed	 * We want to get the alternate root even for faulted pools, so we cheat
3144192830Sed	 * and call spa_lookup() directly.
3145192830Sed	 */
3146192830Sed	if (altroot) {
3147192830Sed		if (spa == NULL) {
3148192830Sed			mutex_enter(&spa_namespace_lock);
3149192830Sed			spa = spa_lookup(name);
3150192830Sed			if (spa)
3151192830Sed				spa_altroot(spa, altroot, buflen);
3152192830Sed			else
3153192830Sed				altroot[0] = '\0';
3154192830Sed			spa = NULL;
3155192830Sed			mutex_exit(&spa_namespace_lock);
3156192830Sed		} else {
3157192830Sed			spa_altroot(spa, altroot, buflen);
3158192830Sed		}
3159192830Sed	}
3160192830Sed
3161192914Sed	if (spa != NULL) {
3162192830Sed		spa_config_exit(spa, SCL_CONFIG, FTAG);
3163192830Sed		spa_close(spa, FTAG);
3164192830Sed	}
3165192830Sed
3166192830Sed	return (error);
3167192830Sed}
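/*
 * Illustrative sketch (compiled out): the ioctl path drives spa_get_stats()
 * roughly like this.  Even when the open fails, *config may still carry the
 * per-vdev state gathered during the failed load, so whatever config comes
 * back is consumed and freed.  spa_example_stats is a hypothetical name.
 */
#if 0	/* example only */
static int
spa_example_stats(const char *name)
{
	nvlist_t *config = NULL;
	char altroot[MAXPATHLEN];
	int error;

	error = spa_get_stats(name, &config, altroot, sizeof (altroot));
	/* altroot now holds the pool's alternate root, if any */
	if (config != NULL) {
		/* inspect ZPOOL_CONFIG_ERRCOUNT, spares, l2cache, ... */
		nvlist_free(config);
	}
	return (error);
}
#endif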
3168192830Sed
3169192830Sed/*
3170192830Sed * Validate that the auxiliary device array is well formed.  We must have an
3171192830Sed * array of nvlists, each of which describes a valid leaf vdev.  If this is an
3172192830Sed * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3173192830Sed * specified, as long as they are well-formed.
3174192830Sed */
3175192830Sedstatic int
3176192830Sedspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3177192830Sed    spa_aux_vdev_t *sav, const char *config, uint64_t version,
3178192830Sed    vdev_labeltype_t label)
3179192830Sed{
3180192830Sed	nvlist_t **dev;
3181192830Sed	uint_t i, ndev;
3182192830Sed	vdev_t *vd;
3183192830Sed	int error;
3184192830Sed
3185192830Sed	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3186192830Sed
3187192830Sed	/*
3188192830Sed	 * It's acceptable to have no devs specified.
3189192830Sed	 */
3190192830Sed	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3191192830Sed		return (0);
3192192830Sed
3193192830Sed	if (ndev == 0)
3194192830Sed		return (SET_ERROR(EINVAL));
3195192830Sed
3196192830Sed	/*
3197192830Sed	 * Make sure the pool is formatted with a version that supports this
3198192830Sed	 * device type.
3199192830Sed	 */
3200192914Sed	if (spa_version(spa) < version)
3201192914Sed		return (SET_ERROR(ENOTSUP));
3202192914Sed
3203192830Sed	/*
3204192830Sed	 * Set the pending device list so we correctly handle device in-use
3205192830Sed	 * checking.
3206192830Sed	 */
3207192830Sed	sav->sav_pending = dev;
3208192830Sed	sav->sav_npending = ndev;
3209192830Sed
3210192830Sed	for (i = 0; i < ndev; i++) {
3211192830Sed		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3212192830Sed		    mode)) != 0)
3213192830Sed			goto out;
3214192830Sed
3215192830Sed		if (!vd->vdev_ops->vdev_op_leaf) {
3216192830Sed			vdev_free(vd);
3217192830Sed			error = SET_ERROR(EINVAL);
3218192830Sed			goto out;
3219192830Sed		}
3220192830Sed
3221192830Sed		/*
3222192830Sed		 * The L2ARC currently only supports disk devices in
3223192830Sed		 * kernel context.  For user-level testing, we allow it.
3224192830Sed		 */
3225192830Sed#ifdef _KERNEL
3226192830Sed		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3227192830Sed		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3228192830Sed			error = SET_ERROR(ENOTBLK);
3229192830Sed			vdev_free(vd);
3230192830Sed			goto out;
3231192830Sed		}
3232192830Sed#endif
3233192830Sed		vd->vdev_top = vd;
3234192830Sed
3235192830Sed		if ((error = vdev_open(vd)) == 0 &&
3236192830Sed		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
3237192830Sed			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3238192830Sed			    vd->vdev_guid) == 0);
3239192830Sed		}
3240192830Sed
3241192830Sed		vdev_free(vd);
3242192830Sed
3243192830Sed		if (error &&
3244192830Sed		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3245228627Sdim			goto out;
3246192830Sed		else
3247192830Sed			error = 0;
3248192830Sed	}
3249192830Sed
3250192830Sedout:
3251192830Sed	sav->sav_pending = NULL;
3252192830Sed	sav->sav_npending = 0;
3253192830Sed	return (error);
3254192830Sed}
3255192830Sed
3256192830Sedstatic int
3257192830Sedspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3258192830Sed{
3259192830Sed	int error;
3260192830Sed
3261192830Sed	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3262192830Sed
3263192830Sed	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3264192830Sed	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3265192830Sed	    VDEV_LABEL_SPARE)) != 0) {
3266192830Sed		return (error);
3267192830Sed	}
3268192830Sed
3269192830Sed	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3270192830Sed	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3271192830Sed	    VDEV_LABEL_L2CACHE));
3272192830Sed}
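/*
 * Illustrative sketch (compiled out): spa_validate_aux() expects the
 * auxiliary devices to appear in nvroot as flat nvlist arrays, one leaf vdev
 * per element.  This builds a minimal single-disk spare array; the device
 * path and the spa_example_spare_nvroot name are made up, and callers
 * normally receive such an nvlist from user space rather than building it.
 */
#if 0	/* example only */
static nvlist_t *
spa_example_spare_nvroot(void)
{
	nvlist_t *nvroot, *spare;

	VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH,
	    "/dev/da1") == 0);			/* hypothetical device */

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spare, 1) == 0);

	nvlist_free(spare);	/* nvroot holds its own copy */
	return (nvroot);
}
#endif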
3273192830Sed
3274192830Sedstatic void
3275192830Sedspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3276192830Sed    const char *config)
3277192830Sed{
3278192830Sed	int i;
3279192830Sed
3280192830Sed	if (sav->sav_config != NULL) {
3281192830Sed		nvlist_t **olddevs;
3282192830Sed		uint_t oldndevs;
3283192830Sed		nvlist_t **newdevs;
3284192830Sed
3285192830Sed		/*
3286192830Sed		 * Generate a new dev list by concatenating the new devs
3287192830Sed		 * with the current dev list.
3288192830Sed		 */
3289192830Sed		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3290192830Sed		    &olddevs, &oldndevs) == 0);
3291192830Sed
3292192830Sed		newdevs = kmem_alloc(sizeof (void *) *
3293192830Sed		    (ndevs + oldndevs), KM_SLEEP);
3294192830Sed		for (i = 0; i < oldndevs; i++)
3295192830Sed			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3296192830Sed			    KM_SLEEP) == 0);
3297192830Sed		for (i = 0; i < ndevs; i++)
3298192830Sed			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3299192830Sed			    KM_SLEEP) == 0);
3300192830Sed
3301192830Sed		VERIFY(nvlist_remove(sav->sav_config, config,
3302192830Sed		    DATA_TYPE_NVLIST_ARRAY) == 0);
3303192830Sed
3304192830Sed		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3305192914Sed		    config, newdevs, ndevs + oldndevs) == 0);
3306192830Sed		for (i = 0; i < oldndevs + ndevs; i++)
3307192830Sed			nvlist_free(newdevs[i]);
3308192830Sed		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3309192830Sed	} else {
3310192830Sed		/*
3311192830Sed		 * Generate a new dev list.
3312192830Sed		 */
3313192830Sed		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3314192830Sed		    KM_SLEEP) == 0);
3315192830Sed		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3316192830Sed		    devs, ndevs) == 0);
3317192830Sed	}
3318192830Sed}
3319192830Sed
3320192830Sed/*
3321192830Sed * Stop and drop level 2 ARC devices
3322192830Sed */
3323192830Sedvoid
3324192830Sedspa_l2cache_drop(spa_t *spa)
3325192830Sed{
3326192830Sed	vdev_t *vd;
3327192830Sed	int i;
3328192830Sed	spa_aux_vdev_t *sav = &spa->spa_l2cache;
3329192830Sed
3330192830Sed	for (i = 0; i < sav->sav_count; i++) {
3331192830Sed		uint64_t pool;
3332192830Sed
3333192830Sed		vd = sav->sav_vdevs[i];
3334192830Sed		ASSERT(vd != NULL);
3335192830Sed
3336192830Sed		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3337192830Sed		    pool != 0ULL && l2arc_vdev_present(vd))
3338192830Sed			l2arc_remove_vdev(vd);
3339192830Sed	}
3340192830Sed}
3341192830Sed
3342192830Sed/*
3343192830Sed * Pool Creation
3344192830Sed */
3345192830Sedint
3346192830Sedspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3347192830Sed    nvlist_t *zplprops)
3348192830Sed{
3349192914Sed	spa_t *spa;
3350192830Sed	char *altroot = NULL;
3351192830Sed	vdev_t *rvd;
3352192830Sed	dsl_pool_t *dp;
3353192830Sed	dmu_tx_t *tx;
3354192830Sed	int error = 0;
3355192830Sed	uint64_t txg = TXG_INITIAL;
3356192830Sed	nvlist_t **spares, **l2cache;
3357192830Sed	uint_t nspares, nl2cache;
3358192830Sed	uint64_t version, obj;
3359192830Sed	boolean_t has_features;
3360192830Sed
3361192830Sed	/*
3362192830Sed	 * If this pool already exists, return failure.
3363192830Sed	 */
3364192830Sed	mutex_enter(&spa_namespace_lock);
3365192830Sed	if (spa_lookup(pool) != NULL) {
3366192830Sed		mutex_exit(&spa_namespace_lock);
3367192830Sed		return (SET_ERROR(EEXIST));
3368192830Sed	}
3369192830Sed
3370192830Sed	/*
3371192830Sed	 * Allocate a new spa_t structure.
3372192830Sed	 */
3373192830Sed	(void) nvlist_lookup_string(props,
3374192830Sed	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3375192830Sed	spa = spa_add(pool, NULL, altroot);
3376192830Sed	spa_activate(spa, spa_mode_global);
3377192830Sed
3378192830Sed	if (props && (error = spa_prop_validate(spa, props))) {
3379192830Sed		spa_deactivate(spa);
3380192830Sed		spa_remove(spa);
3381192830Sed		mutex_exit(&spa_namespace_lock);
3382192830Sed		return (error);
3383192830Sed	}
3384192830Sed
3385192830Sed	has_features = B_FALSE;
3386192830Sed	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3387192830Sed	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3388192830Sed		if (zpool_prop_feature(nvpair_name(elem)))
3389192830Sed			has_features = B_TRUE;
3390192830Sed	}
3391192830Sed
3392192830Sed	if (has_features || nvlist_lookup_uint64(props,
3393192830Sed	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3394192830Sed		version = SPA_VERSION;
3395192830Sed	}
3396192830Sed	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3397192830Sed
3398192830Sed	spa->spa_first_txg = txg;
3399192830Sed	spa->spa_uberblock.ub_txg = txg - 1;
3400192830Sed	spa->spa_uberblock.ub_version = version;
3401192830Sed	spa->spa_ubsync = spa->spa_uberblock;
3402192830Sed
3403192830Sed	/*
3404192830Sed	 * Create "The Godfather" zio to hold all async IOs
3405192830Sed	 */
3406192830Sed	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3407192830Sed	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3408192830Sed
3409192830Sed	/*
3410192830Sed	 * Create the root vdev.
3411192830Sed	 */
3412192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3413192830Sed
3414192830Sed	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3415192830Sed
3416192830Sed	ASSERT(error != 0 || rvd != NULL);
3417192830Sed	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3418192830Sed
3419192830Sed	if (error == 0 && !zfs_allocatable_devs(nvroot))
3420192830Sed		error = SET_ERROR(EINVAL);
3421192830Sed
3422192830Sed	if (error == 0 &&
3423192830Sed	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3424192830Sed	    (error = spa_validate_aux(spa, nvroot, txg,
3425192830Sed	    VDEV_ALLOC_ADD)) == 0) {
3426192830Sed		for (int c = 0; c < rvd->vdev_children; c++) {
3427192830Sed			vdev_ashift_optimize(rvd->vdev_child[c]);
3428192830Sed			vdev_metaslab_set_size(rvd->vdev_child[c]);
3429192830Sed			vdev_expand(rvd->vdev_child[c], txg);
3430192830Sed		}
3431192914Sed	}
3432192914Sed
3433192914Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3434192914Sed
3435192830Sed	if (error != 0) {
3436196818Sache		spa_unload(spa);
3437192830Sed		spa_deactivate(spa);
3438196818Sache		spa_remove(spa);
3439192830Sed		mutex_exit(&spa_namespace_lock);
3440192830Sed		return (error);
3441192830Sed	}
3442196818Sache
3443192830Sed	/*
3444192830Sed	 * Get the list of spares, if specified.
3445192830Sed	 */
3446192830Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3447192830Sed	    &spares, &nspares) == 0) {
3448192830Sed		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3449192830Sed		    KM_SLEEP) == 0);
3450192830Sed		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3451192830Sed		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3452192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3453192830Sed		spa_load_spares(spa);
3454192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
3455192830Sed		spa->spa_spares.sav_sync = B_TRUE;
3456192830Sed	}
3457192830Sed
3458192830Sed	/*
3459192830Sed	 * Get the list of level 2 cache devices, if specified.
3460192830Sed	 */
3461192830Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3462192830Sed	    &l2cache, &nl2cache) == 0) {
3463192830Sed		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3464192830Sed		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
3465192830Sed		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3466192830Sed		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3467192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3468192830Sed		spa_load_l2cache(spa);
3469192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
3470192830Sed		spa->spa_l2cache.sav_sync = B_TRUE;
3471192830Sed	}
3472192830Sed
3473192830Sed	spa->spa_is_initializing = B_TRUE;
3474192830Sed	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3475192830Sed	spa->spa_meta_objset = dp->dp_meta_objset;
3476192830Sed	spa->spa_is_initializing = B_FALSE;
3477192830Sed
3478192830Sed	/*
3479192830Sed	 * Create DDTs (dedup tables).
3480192830Sed	 */
3481192830Sed	ddt_create(spa);
3482192830Sed
3483192830Sed	spa_update_dspace(spa);
3484192830Sed
3485192830Sed	tx = dmu_tx_create_assigned(dp, txg);
3486192830Sed
3487192830Sed	/*
3488192830Sed	 * Create the pool config object.
3489192830Sed	 */
3490192830Sed	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3491192830Sed	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3492192830Sed	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3493192830Sed
3494192830Sed	if (zap_add(spa->spa_meta_objset,
3495192830Sed	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3496192830Sed	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3497192830Sed		cmn_err(CE_PANIC, "failed to add pool config");
3498192830Sed	}
3499192830Sed
3500192830Sed	if (spa_version(spa) >= SPA_VERSION_FEATURES)
3501192830Sed		spa_feature_create_zap_objects(spa, tx);
3502192830Sed
3503192830Sed	if (zap_add(spa->spa_meta_objset,
3504192830Sed	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3505192830Sed	    sizeof (uint64_t), 1, &version, tx) != 0) {
3506192830Sed		cmn_err(CE_PANIC, "failed to add pool version");
3507192830Sed	}
3508192830Sed
3509192830Sed	/* Newly created pools with the right version are always deflated. */
3510192830Sed	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3511192830Sed		spa->spa_deflate = TRUE;
3512192830Sed		if (zap_add(spa->spa_meta_objset,
3513192830Sed		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3514192830Sed		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3515192830Sed			cmn_err(CE_PANIC, "failed to add deflate");
3516192830Sed		}
3517192830Sed	}
3518192830Sed
3519192830Sed	/*
3520192830Sed	 * Create the deferred-free bpobj.  Turn off compression
3521192830Sed	 * because sync-to-convergence takes longer if the blocksize
3522192830Sed	 * keeps changing.
3523192830Sed	 */
3524192830Sed	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3525192830Sed	dmu_object_set_compress(spa->spa_meta_objset, obj,
3526192830Sed	    ZIO_COMPRESS_OFF, tx);
3527192830Sed	if (zap_add(spa->spa_meta_objset,
3528192830Sed	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3529192830Sed	    sizeof (uint64_t), 1, &obj, tx) != 0) {
3530192830Sed		cmn_err(CE_PANIC, "failed to add bpobj");
3531192830Sed	}
3532192830Sed	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3533192830Sed	    spa->spa_meta_objset, obj));
3534192830Sed
3535192830Sed	/*
3536192830Sed	 * Create the pool's history object.
3537192830Sed	 */
3538192830Sed	if (version >= SPA_VERSION_ZPOOL_HISTORY)
3539192830Sed		spa_history_create_obj(spa, tx);
3540192830Sed
3541192830Sed	/*
3542192830Sed	 * Set pool properties.
3543192830Sed	 */
3544192830Sed	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3545192830Sed	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3546192830Sed	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3547192830Sed	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3548192830Sed
3549192830Sed	if (props != NULL) {
3550192830Sed		spa_configfile_set(spa, props, B_FALSE);
3551192830Sed		spa_sync_props(props, tx);
3552192830Sed	}
3553192830Sed
3554192830Sed	dmu_tx_commit(tx);
3555192830Sed
3556192830Sed	spa->spa_sync_on = B_TRUE;
3557192830Sed	txg_sync_start(spa->spa_dsl_pool);
3558192830Sed
3559192830Sed	/*
3560192830Sed	 * We explicitly wait for the first transaction to complete so that our
3561192830Sed	 * bean counters are appropriately updated.
3562192830Sed	 */
3563192830Sed	txg_wait_synced(spa->spa_dsl_pool, txg);
3564192830Sed
3565192830Sed	spa_config_sync(spa, B_FALSE, B_TRUE);
3566192830Sed
3567192830Sed	spa_history_log_version(spa, "create");
3568192830Sed
3569192830Sed	spa->spa_minref = refcount_count(&spa->spa_refcount);
3570192830Sed
3571192830Sed	mutex_exit(&spa_namespace_lock);
3572192830Sed
3573192830Sed	return (0);
3574192830Sed}
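/*
 * Illustrative sketch (compiled out): the pool-create ioctl hands
 * spa_create() a root vdev tree of the same shape parsed above.  Here a
 * single hypothetical disk is wrapped in a root vdev; props and zplprops are
 * left NULL so pool and dataset properties take their defaults.  The
 * spa_example_create name and the device path are made up.
 */
#if 0	/* example only */
static int
spa_example_create(const char *poolname)
{
	nvlist_t *nvroot, *disk;
	int error;

	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
	    "/dev/da0") == 0);			/* hypothetical device */

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &disk, 1) == 0);

	error = spa_create(poolname, nvroot, NULL, NULL);

	nvlist_free(disk);
	nvlist_free(nvroot);
	return (error);
}
#endif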
3575192830Sed
3576192830Sed#ifdef _KERNEL
3577192830Sed#if defined(sun)
3578192830Sed/*
3579192830Sed * Get the root pool information from the root disk, then import the root pool
3580192830Sed * at system boot time.
3581192830Sed */
3582192830Sedextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3583192830Sed
3584192830Sedstatic nvlist_t *
3585192830Sedspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3586192830Sed{
3587192830Sed	nvlist_t *config;
3588192830Sed	nvlist_t *nvtop, *nvroot;
3589192830Sed	uint64_t pgid;
3590192830Sed
3591192830Sed	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3592192830Sed		return (NULL);
3593192830Sed
3594192830Sed	/*
3595192830Sed	 * Add this top-level vdev to the child array.
3596192830Sed	 */
3597192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3598192830Sed	    &nvtop) == 0);
3599192830Sed	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3600192830Sed	    &pgid) == 0);
3601192830Sed	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3602192830Sed
3603192830Sed	/*
3604192830Sed	 * Put this pool's top-level vdevs into a root vdev.
3605192830Sed	 */
3606192830Sed	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3607192830Sed	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3608192830Sed	    VDEV_TYPE_ROOT) == 0);
3609192830Sed	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3610192830Sed	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3611192830Sed	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3612192830Sed	    &nvtop, 1) == 0);
3613192830Sed
3614192830Sed	/*
3615192830Sed	 * Replace the existing vdev_tree with the new root vdev in
3616192830Sed	 * this pool's configuration (remove the old, add the new).
3617192830Sed	 */
3618192830Sed	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3619192830Sed	nvlist_free(nvroot);
3620192830Sed	return (config);
3621192830Sed}
3622192830Sed
3623192830Sed/*
3624192830Sed * Walk the vdev tree and see if we can find a device with "better"
3625192830Sed * configuration. A configuration is "better" if the label on that
3626192830Sed * device has a more recent txg.
3627192830Sed */
3628192830Sedstatic void
3629192830Sedspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3630192830Sed{
3631192830Sed	for (int c = 0; c < vd->vdev_children; c++)
3632192830Sed		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3633192830Sed
3634192830Sed	if (vd->vdev_ops->vdev_op_leaf) {
3635192830Sed		nvlist_t *label;
3636192830Sed		uint64_t label_txg;
3637192830Sed
3638192830Sed		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3639192830Sed		    &label) != 0)
3640192830Sed			return;
3641192830Sed
3642192830Sed		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3643192830Sed		    &label_txg) == 0);
3644192830Sed
3645192830Sed		/*
3646192830Sed		 * Do we have a better boot device?
3647192830Sed		 */
3648192830Sed		if (label_txg > *txg) {
3649192830Sed			*txg = label_txg;
3650192830Sed			*avd = vd;
3651192830Sed		}
3652192830Sed		nvlist_free(label);
3653192830Sed	}
3654192830Sed}
3655192830Sed
3656192830Sed/*
3657192830Sed * Import a root pool.
3658192830Sed *
3659192830Sed * For x86, devpath_list will consist of the devid and/or physpath name of
3660192830Sed * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3661192830Sed * The GRUB "findroot" command will return the vdev we should boot.
3662192830Sed *
3663192830Sed * For Sparc, devpath_list consists of the physpath name of the booting
3664192830Sed * device, no matter whether the root pool is a single-device or mirrored pool.
3665192830Sed * e.g.
3666192830Sed *	"/pci@1f,0/ide@d/disk@0,0:a"
3667192830Sed */
3668192830Sedint
3669192830Sedspa_import_rootpool(char *devpath, char *devid)
3670192830Sed{
3671192830Sed	spa_t *spa;
3672192830Sed	vdev_t *rvd, *bvd, *avd = NULL;
3673192830Sed	nvlist_t *config, *nvtop;
3674192830Sed	uint64_t guid, txg;
3675192830Sed	char *pname;
3676192914Sed	int error;
3677192914Sed
3678192830Sed	/*
3679192830Sed	 * Read the label from the boot device and generate a configuration.
3680192830Sed	 */
3681192830Sed	config = spa_generate_rootconf(devpath, devid, &guid);
3682192830Sed#if defined(_OBP) && defined(_KERNEL)
3683192830Sed	if (config == NULL) {
3684192830Sed		if (strstr(devpath, "/iscsi/ssd") != NULL) {
3685192830Sed			/* iscsi boot */
3686192830Sed			get_iscsi_bootpath_phy(devpath);
3687192830Sed			config = spa_generate_rootconf(devpath, devid, &guid);
3688192830Sed		}
3689192830Sed	}
3690192830Sed#endif
3691192830Sed	if (config == NULL) {
3692192830Sed		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3693192830Sed		    devpath);
3694192830Sed		return (SET_ERROR(EIO));
3695192830Sed	}
3696192830Sed
3697192830Sed	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3698192830Sed	    &pname) == 0);
3699192830Sed	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3700192830Sed
3701192830Sed	mutex_enter(&spa_namespace_lock);
3702192830Sed	if ((spa = spa_lookup(pname)) != NULL) {
3703192830Sed		/*
3704192830Sed		 * Remove the existing root pool from the namespace so that we
3705192914Sed		 * can replace it with the correct config we just read in.
3706192914Sed		 */
3707192914Sed		spa_remove(spa);
3708192914Sed	}
3709192914Sed
3710192830Sed	spa = spa_add(pname, config, NULL);
3711192830Sed	spa->spa_is_root = B_TRUE;
3712192830Sed	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3713192830Sed
3714192830Sed	/*
3715192830Sed	 * Build up a vdev tree based on the boot device's label config.
3716192830Sed	 */
3717192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3718192830Sed	    &nvtop) == 0);
3719192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3720192830Sed	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3721192830Sed	    VDEV_ALLOC_ROOTPOOL);
3722192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3723192830Sed	if (error) {
3724192830Sed		mutex_exit(&spa_namespace_lock);
3725192830Sed		nvlist_free(config);
3726192830Sed		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3727192830Sed		    pname);
3728192830Sed		return (error);
3729192830Sed	}
3730192830Sed
3731192830Sed	/*
3732192830Sed	 * Get the boot vdev.
3733192830Sed	 */
3734192830Sed	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3735192830Sed		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3736192830Sed		    (u_longlong_t)guid);
3737192830Sed		error = SET_ERROR(ENOENT);
3738192830Sed		goto out;
3739192830Sed	}
3740192830Sed
3741192830Sed	/*
3742192830Sed	 * Determine if there is a better boot device.
3743192914Sed	 */
3744192830Sed	avd = bvd;
3745192830Sed	spa_alt_rootvdev(rvd, &avd, &txg);
3746192830Sed	if (avd != bvd) {
3747192830Sed		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3748192830Sed		    "try booting from '%s'", avd->vdev_path);
3749192830Sed		error = SET_ERROR(EINVAL);
3750192830Sed		goto out;
3751192830Sed	}
3752192830Sed
3753192830Sed	/*
3754192830Sed	 * If the boot device is part of a spare vdev then ensure that
3755192830Sed	 * we're booting off the active spare.
3756192830Sed	 */
3757192830Sed	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3758192830Sed	    !bvd->vdev_isspare) {
3759192830Sed		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3760192830Sed		    "try booting from '%s'",
3761192830Sed		    bvd->vdev_parent->
3762192830Sed		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3763192830Sed		error = SET_ERROR(EINVAL);
3764192830Sed		goto out;
3765192830Sed	}
3766192830Sed
3767192830Sed	error = 0;
3768192830Sedout:
3769192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3770192830Sed	vdev_free(rvd);
3771192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3772192830Sed	mutex_exit(&spa_namespace_lock);
3773192830Sed
3774192830Sed	nvlist_free(config);
3775192830Sed	return (error);
3776192830Sed}
3777192830Sed
3778192830Sed#else
3779192856Sed
3780192830Sedextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
3781192830Sed    uint64_t *count);
3782192830Sed
3783192830Sedstatic nvlist_t *
3784192830Sedspa_generate_rootconf(const char *name)
3785192830Sed{
3786192830Sed	nvlist_t **configs, **tops;
3787192830Sed	nvlist_t *config;
3788192830Sed	nvlist_t *best_cfg, *nvtop, *nvroot;
3789192830Sed	uint64_t *holes;
3790192830Sed	uint64_t best_txg;
3791192830Sed	uint64_t nchildren;
3792192830Sed	uint64_t pgid;
3793192830Sed	uint64_t count;
3794192830Sed	uint64_t i;
3795192830Sed	uint_t   nholes;
3796192914Sed
3797192830Sed	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
3798192914Sed		return (NULL);
3799192830Sed
3800192830Sed	ASSERT3U(count, !=, 0);
3801192830Sed	best_txg = 0;
3802192830Sed	for (i = 0; i < count; i++) {
3803192830Sed		uint64_t txg;
3804192830Sed
3805192830Sed		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
3806192830Sed		    &txg) == 0);
3807192830Sed		if (txg > best_txg) {
3808192830Sed			best_txg = txg;
3809192830Sed			best_cfg = configs[i];
3810192830Sed		}
3811192830Sed	}
3812192830Sed
3813192830Sed	/*
3814192830Sed	 * Multi-vdev root pool configuration discovery is not supported yet.
3815192830Sed	 */
3816192856Sed	nchildren = 1;
3817192830Sed	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
3818192830Sed	holes = NULL;
3819192830Sed	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
3820192830Sed	    &holes, &nholes);
3821192830Sed
3822192830Sed	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
3823192830Sed	for (i = 0; i < nchildren; i++) {
3824192830Sed		if (i >= count)
3825192830Sed			break;
3826192830Sed		if (configs[i] == NULL)
3827192830Sed			continue;
3828192830Sed		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
3829192830Sed		    &nvtop) == 0);
3830192830Sed		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
3831192830Sed	}
3832192830Sed	for (i = 0; holes != NULL && i < nholes; i++) {
3833192914Sed		if (i >= nchildren)
3834192830Sed			continue;
3835192830Sed		if (tops[holes[i]] != NULL)
3836192830Sed			continue;
3837192830Sed		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
3838192830Sed		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
3839192830Sed		    VDEV_TYPE_HOLE) == 0);
3840192830Sed		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
3841192830Sed		    holes[i]) == 0);
3842192830Sed		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
3843192830Sed		    0) == 0);
3844192830Sed	}
3845192830Sed	for (i = 0; i < nchildren; i++) {
3846192830Sed		if (tops[i] != NULL)
3847192830Sed			continue;
3848192830Sed		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
3849192830Sed		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
3850192830Sed		    VDEV_TYPE_MISSING) == 0);
3851192830Sed		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
3852192830Sed		    i) == 0);
3853192830Sed		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
3854192830Sed		    0) == 0);
3855192830Sed	}
3856192830Sed
3857192830Sed	/*
3858192830Sed	 * Create pool config based on the best vdev config.
3859192830Sed	 */
3860192830Sed	nvlist_dup(best_cfg, &config, KM_SLEEP);
3861192830Sed
3862192830Sed	/*
3863192830Sed	 * Put this pool's top-level vdevs into a root vdev.
3864192830Sed	 */
3865192830Sed	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3866192830Sed	    &pgid) == 0);
3867192830Sed	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3868192830Sed	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3869192830Sed	    VDEV_TYPE_ROOT) == 0);
3870192830Sed	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3871192830Sed	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3872192830Sed	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3873192830Sed	    tops, nchildren) == 0);
3874192830Sed
3875192830Sed	/*
3876192830Sed	 * Replace the existing vdev_tree with the new root vdev in
3877192830Sed	 * this pool's configuration (remove the old, add the new).
3878192830Sed	 */
3879192830Sed	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3880192830Sed
3881192830Sed	/*
3882192830Sed	 * Drop vdev config elements that should not be present at pool level.
3883192830Sed	 */
3884192830Sed	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
3885192830Sed	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
3886192830Sed
3887192830Sed	for (i = 0; i < count; i++)
3888192830Sed		nvlist_free(configs[i]);
3889192830Sed	kmem_free(configs, count * sizeof(void *));
3890192830Sed	for (i = 0; i < nchildren; i++)
3891192830Sed		nvlist_free(tops[i]);
3892192830Sed	kmem_free(tops, nchildren * sizeof(void *));
3893192830Sed	nvlist_free(nvroot);
3894192830Sed	return (config);
3895192830Sed}
3896192830Sed
3897192830Sedint
3898192830Sedspa_import_rootpool(const char *name)
3899192830Sed{
3900192830Sed	spa_t *spa;
3901192830Sed	vdev_t *rvd, *bvd, *avd = NULL;
3902192830Sed	nvlist_t *config, *nvtop;
3903192830Sed	uint64_t txg;
3904192830Sed	char *pname;
3905192830Sed	int error;
3906192830Sed
3907192830Sed	/*
3908192830Sed	 * Read the label from the boot device and generate a configuration.
3909192830Sed	 */
3910192830Sed	config = spa_generate_rootconf(name);
3911192830Sed
3912192830Sed	mutex_enter(&spa_namespace_lock);
3913192830Sed	if (config != NULL) {
3914192830Sed		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3915192830Sed		    &pname) == 0 && strcmp(name, pname) == 0);
3916192830Sed		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
3917192830Sed		    == 0);
3918192830Sed
3919192830Sed		if ((spa = spa_lookup(pname)) != NULL) {
3920192830Sed			/*
3921192830Sed			 * Remove the existing root pool from the namespace so
3922192830Sed			 * that we can replace it with the correct config
3923192830Sed			 * we just read in.
3924192830Sed			 */
3925192830Sed			spa_remove(spa);
3926192830Sed		}
3927192830Sed		spa = spa_add(pname, config, NULL);
3928192830Sed
3929192830Sed		/*
3930192830Sed		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
3931192830Sed		 * via spa_version().
3932192830Sed		 */
3933192830Sed		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3934192830Sed		    &spa->spa_ubsync.ub_version) != 0)
3935192830Sed			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3936192830Sed	} else if ((spa = spa_lookup(name)) == NULL) {
3937192830Sed		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
3938192830Sed		    name);
		mutex_exit(&spa_namespace_lock);
3939192830Sed		return (SET_ERROR(EIO));
3940192830Sed	} else {
3941192830Sed		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
3942192830Sed	}
3943192830Sed	spa->spa_is_root = B_TRUE;
3944192830Sed	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3945192830Sed
3946192830Sed	/*
3947192830Sed	 * Build up a vdev tree based on the boot device's label config.
3948192830Sed	 */
3949192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3950192830Sed	    &nvtop) == 0);
3951192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3952192830Sed	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3953192830Sed	    VDEV_ALLOC_ROOTPOOL);
3954192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3955192830Sed	if (error) {
3956192830Sed		mutex_exit(&spa_namespace_lock);
3957192830Sed		nvlist_free(config);
3958192830Sed		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3959192914Sed		    pname);
3960192830Sed		return (error);
3961192830Sed	}
3962192830Sed
3963192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3964192830Sed	vdev_free(rvd);
3965192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
3966192914Sed	mutex_exit(&spa_namespace_lock);
3967192830Sed
3968192830Sed	nvlist_free(config);
3969192830Sed	return (0);
3970192830Sed}
3971192830Sed
3972192830Sed#endif	/* sun */
3973192830Sed#endif
3974192830Sed
3975192830Sed/*
3976192830Sed * Import a non-root pool into the system.
3977192830Sed */
3978192830Sedint
3979192830Sedspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3980192830Sed{
3981192830Sed	spa_t *spa;
3982192830Sed	char *altroot = NULL;
3983192830Sed	spa_load_state_t state = SPA_LOAD_IMPORT;
3984192830Sed	zpool_rewind_policy_t policy;
3985192830Sed	uint64_t mode = spa_mode_global;
3986192830Sed	uint64_t readonly = B_FALSE;
3987192830Sed	int error;
3988192830Sed	nvlist_t *nvroot;
3989192830Sed	nvlist_t **spares, **l2cache;
3990192830Sed	uint_t nspares, nl2cache;
3991192830Sed
3992192830Sed	/*
3993192830Sed	 * If a pool with this name exists, return failure.
3994192830Sed	 */
3995192830Sed	mutex_enter(&spa_namespace_lock);
3996192830Sed	if (spa_lookup(pool) != NULL) {
3997192830Sed		mutex_exit(&spa_namespace_lock);
3998192830Sed		return (SET_ERROR(EEXIST));
3999192830Sed	}
4000192830Sed
4001192830Sed	/*
4002192830Sed	 * Create and initialize the spa structure.
4003192830Sed	 */
4004192830Sed	(void) nvlist_lookup_string(props,
4005192830Sed	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4006192830Sed	(void) nvlist_lookup_uint64(props,
4007192830Sed	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
4008192830Sed	if (readonly)
4009192830Sed		mode = FREAD;
4010192830Sed	spa = spa_add(pool, config, altroot);
4011192830Sed	spa->spa_import_flags = flags;
4012192830Sed
4013192830Sed	/*
4014192830Sed	 * Verbatim import - Take a pool and insert it into the namespace
4015192830Sed	 * as if it had been loaded at boot.
4016192830Sed	 */
4017192830Sed	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4018192830Sed		if (props != NULL)
4019192830Sed			spa_configfile_set(spa, props, B_FALSE);
4020192830Sed
4021192830Sed		spa_config_sync(spa, B_FALSE, B_TRUE);
4022192830Sed
4023192830Sed		mutex_exit(&spa_namespace_lock);
4024192830Sed		spa_history_log_version(spa, "import");
4025192830Sed
4026192830Sed		return (0);
4027192830Sed	}
4028192830Sed
4029192830Sed	spa_activate(spa, mode);
4030192830Sed
4031192830Sed	/*
4032192830Sed	 * Don't start async tasks until we know everything is healthy.
4033192830Sed	 */
4034192830Sed	spa_async_suspend(spa);
4035192830Sed
4036192830Sed	zpool_get_rewind_policy(config, &policy);
4037192830Sed	if (policy.zrp_request & ZPOOL_DO_REWIND)
4038192830Sed		state = SPA_LOAD_RECOVER;
4039192830Sed
4040192830Sed	/*
4041192830Sed	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
4042192830Sed	 * because the user-supplied config is actually the one to trust when
4043192830Sed	 * doing an import.
4044192830Sed	 */
4045192830Sed	if (state != SPA_LOAD_RECOVER)
4046192830Sed		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4047192830Sed
4048192830Sed	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4049192830Sed	    policy.zrp_request);
4050192830Sed
4051192830Sed	/*
4052192830Sed	 * Propagate anything learned while loading the pool and pass it
4053192830Sed	 * back to caller (i.e. rewind info, missing devices, etc).
4054192830Sed	 */
4055192830Sed	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4056192830Sed	    spa->spa_load_info) == 0);
4057192830Sed
4058192830Sed	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4059192830Sed	/*
4060192830Sed	 * Toss any existing sparelist, as it doesn't have any validity
4061192830Sed	 * anymore, and conflicts with spa_has_spare().
4062192830Sed	 */
4063192830Sed	if (spa->spa_spares.sav_config) {
4064192830Sed		nvlist_free(spa->spa_spares.sav_config);
4065192830Sed		spa->spa_spares.sav_config = NULL;
4066192830Sed		spa_load_spares(spa);
4067192830Sed	}
4068192830Sed	if (spa->spa_l2cache.sav_config) {
4069192830Sed		nvlist_free(spa->spa_l2cache.sav_config);
4070192830Sed		spa->spa_l2cache.sav_config = NULL;
4071192830Sed		spa_load_l2cache(spa);
4072192830Sed	}
4073192830Sed
4074192830Sed	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4075192830Sed	    &nvroot) == 0);
4076192830Sed	if (error == 0)
4077192830Sed		error = spa_validate_aux(spa, nvroot, -1ULL,
4078192830Sed		    VDEV_ALLOC_SPARE);
4079192830Sed	if (error == 0)
4080192830Sed		error = spa_validate_aux(spa, nvroot, -1ULL,
4081192830Sed		    VDEV_ALLOC_L2CACHE);
4082192830Sed	spa_config_exit(spa, SCL_ALL, FTAG);
4083192830Sed
4084192830Sed	if (props != NULL)
4085192830Sed		spa_configfile_set(spa, props, B_FALSE);
4086192830Sed
4087192830Sed	if (error != 0 || (props && spa_writeable(spa) &&
4088192830Sed	    (error = spa_prop_set(spa, props)))) {
4089192830Sed		spa_unload(spa);
4090192830Sed		spa_deactivate(spa);
4091192830Sed		spa_remove(spa);
4092192830Sed		mutex_exit(&spa_namespace_lock);
4093192830Sed		return (error);
4094192830Sed	}
4095192830Sed
4096192830Sed	spa_async_resume(spa);
4097192830Sed
4098192830Sed	/*
4099192830Sed	 * Override any spares and level 2 cache devices as specified by
4100192830Sed	 * the user, as these may have correct device names/devids, etc.
4101192830Sed	 */
4102192830Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4103192830Sed	    &spares, &nspares) == 0) {
4104192830Sed		if (spa->spa_spares.sav_config)
4105192830Sed			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4106192830Sed			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4107192830Sed		else
4108192830Sed			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4109192830Sed			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4110192830Sed		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4111192830Sed		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4112192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4113192830Sed		spa_load_spares(spa);
4114192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
4115192830Sed		spa->spa_spares.sav_sync = B_TRUE;
4116192830Sed	}
4117192830Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4118192830Sed	    &l2cache, &nl2cache) == 0) {
4119192830Sed		if (spa->spa_l2cache.sav_config)
4120192830Sed			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4121192830Sed			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4122192830Sed		else
4123192830Sed			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4124192830Sed			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
4125192856Sed		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4126192830Sed		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4127192830Sed		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4128192830Sed		spa_load_l2cache(spa);
4129192830Sed		spa_config_exit(spa, SCL_ALL, FTAG);
4130192830Sed		spa->spa_l2cache.sav_sync = B_TRUE;
4131192830Sed	}
4132192830Sed
4133192830Sed	/*
4134192830Sed	 * Check for any removed devices.
4135192830Sed	 */
4136192830Sed	if (spa->spa_autoreplace) {
4137192830Sed		spa_aux_check_removed(&spa->spa_spares);
4138192830Sed		spa_aux_check_removed(&spa->spa_l2cache);
4139192830Sed	}
4140192830Sed
4141192830Sed	if (spa_writeable(spa)) {
4142192914Sed		/*
4143192914Sed		 * Update the config cache to include the newly-imported pool.
4144192830Sed		 */
4145192830Sed		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4146192830Sed	}
4147192830Sed
4148192830Sed	/*
4149192830Sed	 * It's possible that the pool was expanded while it was exported.
4150192830Sed	 * We kick off an async task to handle this for us.
4151192830Sed	 */
4152192830Sed	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4153192830Sed
4154192830Sed	mutex_exit(&spa_namespace_lock);
4155192830Sed	spa_history_log_version(spa, "import");
4156192830Sed
4157192830Sed#ifdef __FreeBSD__
4158192830Sed#ifdef _KERNEL
4159192830Sed	zvol_create_minors(pool);
4160192914Sed#endif
4161192830Sed#endif
4162192830Sed	return (0);
4163192830Sed}
4164192830Sed
4165192830Sednvlist_t *
4166192830Sedspa_tryimport(nvlist_t *tryconfig)
4167192830Sed{
4168192830Sed	nvlist_t *config = NULL;
4169192830Sed	char *poolname;
4170192830Sed	spa_t *spa;
4171192830Sed	uint64_t state;
4172192830Sed	int error;
4173192830Sed
4174192830Sed	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4175192830Sed		return (NULL);
4176192830Sed
4177192830Sed	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4178192830Sed		return (NULL);
4179192830Sed
4180192830Sed	/*
4181192830Sed	 * Create and initialize the spa structure.
4182192830Sed	 */
4183192830Sed	mutex_enter(&spa_namespace_lock);
4184192830Sed	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4185192830Sed	spa_activate(spa, FREAD);
4186192830Sed
4187192830Sed	/*
4188192830Sed	 * Pass off the heavy lifting to spa_load().
4189192830Sed	 * Pass TRUE for mosconfig because the user-supplied config
4190192830Sed	 * is actually the one to trust when doing an import.
4191192914Sed	 */
4192192830Sed	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4193192830Sed
4194192830Sed	/*
4195192830Sed	 * If 'tryconfig' was at least parsable, return the current config.
4196192830Sed	 */
4197192830Sed	if (spa->spa_root_vdev != NULL) {
4198192830Sed		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4199192830Sed		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4200192830Sed		    poolname) == 0);
4201192830Sed		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4202192830Sed		    state) == 0);
4203192830Sed		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4204192830Sed		    spa->spa_uberblock.ub_timestamp) == 0);
4205192830Sed		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4206192830Sed		    spa->spa_load_info) == 0);
4207192830Sed
4208192830Sed		/*
4209192830Sed		 * If the bootfs property exists on this pool then we
4210192830Sed		 * copy it out so that external consumers can tell which
4211192830Sed		 * pools are bootable.
4212192830Sed		 */
4213192830Sed		if ((!error || error == EEXIST) && spa->spa_bootfs) {
4214192830Sed			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4215192830Sed
4216192830Sed			/*
4217192830Sed			 * We have to play games with the name since the
4218192830Sed			 * pool was opened as TRYIMPORT_NAME.
4219192830Sed			 */
4220192830Sed			if (dsl_dsobj_to_dsname(spa_name(spa),
4221192830Sed			    spa->spa_bootfs, tmpname) == 0) {
4222192830Sed				char *cp;
4223192830Sed				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4224192830Sed
4225192830Sed				cp = strchr(tmpname, '/');
4226192830Sed				if (cp == NULL) {
4227192830Sed					(void) strlcpy(dsname, tmpname,
4228192830Sed					    MAXPATHLEN);
4229192830Sed				} else {
4230192830Sed					(void) snprintf(dsname, MAXPATHLEN,
4231192830Sed					    "%s/%s", poolname, ++cp);
4232192830Sed				}
4233192830Sed				VERIFY(nvlist_add_string(config,
4234192830Sed				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4235192830Sed				kmem_free(dsname, MAXPATHLEN);
4236192830Sed			}
4237192830Sed			kmem_free(tmpname, MAXPATHLEN);
4238192830Sed		}
4239192830Sed
4240192830Sed		/*
4241192830Sed		 * Add the list of hot spares and level 2 cache devices.
4242192830Sed		 */
4243192830Sed		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4244192830Sed		spa_add_spares(spa, config);
4245192830Sed		spa_add_l2cache(spa, config);
4246192830Sed		spa_config_exit(spa, SCL_CONFIG, FTAG);
4247192830Sed	}
4248192830Sed
4249192830Sed	spa_unload(spa);
4250192830Sed	spa_deactivate(spa);
4251192830Sed	spa_remove(spa);
4252192830Sed	mutex_exit(&spa_namespace_lock);
4253192830Sed
4254192830Sed	return (config);
4255192830Sed}
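/*
 * Illustrative sketch (compiled out): zpool(8) drives the two entry points
 * above back to back through their ioctls -- spa_tryimport() validates a
 * config assembled from the on-disk labels and returns a fleshed-out copy,
 * which is then handed to spa_import().  A flags value of 0 requests a
 * normal (non-verbatim) import.  The spa_example_tryimport_then_import name
 * is hypothetical.
 */
#if 0	/* example only */
static int
spa_example_tryimport_then_import(nvlist_t *labelconfig)
{
	nvlist_t *config;
	char *poolname;
	int error;

	if ((config = spa_tryimport(labelconfig)) == NULL)
		return (SET_ERROR(EINVAL));

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &poolname) == 0);
	error = spa_import(poolname, config, NULL, 0);

	nvlist_free(config);
	return (error);
}
#endif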
4256192830Sed
4257192830Sed/*
4258192830Sed * Pool export/destroy
4259192830Sed *
4260192830Sed * The act of destroying or exporting a pool is very simple.  We make sure there
4261192830Sed * is no more pending I/O and that any references to the pool are gone.  Then, we
4262192830Sed * update the pool state and sync all the labels to disk, removing the
4263192830Sed * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4264192830Sed * we don't sync the labels or remove the configuration cache.
4265192830Sed */
4266192830Sedstatic int
4267192830Sedspa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4268192830Sed    boolean_t force, boolean_t hardforce)
4269192830Sed{
4270192830Sed	spa_t *spa;
4271192830Sed
4272192830Sed	if (oldconfig)
4273192830Sed		*oldconfig = NULL;
4274192830Sed
4275192830Sed	if (!(spa_mode_global & FWRITE))
4276192830Sed		return (SET_ERROR(EROFS));
4277192830Sed
4278192830Sed	mutex_enter(&spa_namespace_lock);
4279192830Sed	if ((spa = spa_lookup(pool)) == NULL) {
4280192830Sed		mutex_exit(&spa_namespace_lock);
4281192830Sed		return (SET_ERROR(ENOENT));
4282192830Sed	}
4283192830Sed
4284192830Sed	/*
4285192830Sed	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4286192830Sed	 * reacquire the namespace lock, and see if we can export.
4287192830Sed	 */
4288192830Sed	spa_open_ref(spa, FTAG);
4289192830Sed	mutex_exit(&spa_namespace_lock);
4290192830Sed	spa_async_suspend(spa);
4291192830Sed	mutex_enter(&spa_namespace_lock);
4292192830Sed	spa_close(spa, FTAG);
4293192830Sed
4294192830Sed	/*
4295192830Sed	 * The pool will be in core if it's openable,
4296192830Sed	 * in which case we can modify its state.
4297192830Sed	 */
4298192830Sed	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4299192830Sed		/*
4300192830Sed		 * Objsets may be open only because they're dirty, so we
4301192830Sed		 * have to force it to sync before checking spa_refcnt.
4302192830Sed		 */
4303192830Sed		txg_wait_synced(spa->spa_dsl_pool, 0);
4304192830Sed
4305192830Sed		/*
4306192830Sed		 * A pool cannot be exported or destroyed if there are active
4307192830Sed		 * references.  If we are resetting a pool, allow references by
4308192830Sed		 * fault injection handlers.
4309192830Sed		 */
4310192830Sed		if (!spa_refcount_zero(spa) ||
4311192830Sed		    (spa->spa_inject_ref != 0 &&
4312192830Sed		    new_state != POOL_STATE_UNINITIALIZED)) {
4313192830Sed			spa_async_resume(spa);
4314192830Sed			mutex_exit(&spa_namespace_lock);
4315192914Sed			return (SET_ERROR(EBUSY));
4316192830Sed		}
4317192830Sed
4318192830Sed		/*
4319192830Sed		 * A pool cannot be exported if it has an active shared spare.
4320192830Sed		 * This is to prevent other pools stealing the active spare
4321192830Sed		 * from an exported pool.  At the user's explicit request,
4322192830Sed		 * such a pool can still be forcibly exported.
4323192830Sed		 */
4324192830Sed		if (!force && new_state == POOL_STATE_EXPORTED &&
4325192830Sed		    spa_has_active_shared_spare(spa)) {
4326192830Sed			spa_async_resume(spa);
4327192830Sed			mutex_exit(&spa_namespace_lock);
4328192830Sed			return (SET_ERROR(EXDEV));
4329192830Sed		}
4330192830Sed
4331192830Sed		/*
4332192830Sed		 * We want this to be reflected on every label,
4333192830Sed		 * so mark them all dirty.  spa_unload() will do the
4334192830Sed		 * final sync that pushes these changes out.
4335192830Sed		 */
4336192830Sed		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4337192830Sed			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4338192830Sed			spa->spa_state = new_state;
4339192830Sed			spa->spa_final_txg = spa_last_synced_txg(spa) +
4340192830Sed			    TXG_DEFER_SIZE + 1;
4341192830Sed			vdev_config_dirty(spa->spa_root_vdev);
4342192830Sed			spa_config_exit(spa, SCL_ALL, FTAG);
4343192830Sed		}
4344192830Sed	}
4345192830Sed
4346192830Sed	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4347192830Sed
4348192830Sed	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4349192830Sed		spa_unload(spa);
4350192830Sed		spa_deactivate(spa);
4351192830Sed	}
4352192830Sed
4353192830Sed	if (oldconfig && spa->spa_config)
4354192830Sed		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4355192830Sed
4356192830Sed	if (new_state != POOL_STATE_UNINITIALIZED) {
4357192830Sed		if (!hardforce)
4358192830Sed			spa_config_sync(spa, B_TRUE, B_TRUE);
4359192830Sed		spa_remove(spa);
4360192830Sed	}
4361192830Sed	mutex_exit(&spa_namespace_lock);
4362192914Sed
4363192830Sed	return (0);
4364192830Sed}
4365192830Sed
4366192830Sed/*
4367192830Sed * Destroy a storage pool.
4368192830Sed */
4369192830Sedint
4370192830Sedspa_destroy(char *pool)
4371192830Sed{
4372192830Sed	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4373192830Sed	    B_FALSE, B_FALSE));
4374192830Sed}
4375192830Sed
4376192830Sed/*
4377192830Sed * Export a storage pool.
4378192830Sed */
4379192830Sedint
4380192830Sedspa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4381192830Sed    boolean_t hardforce)
4382192830Sed{
4383192830Sed	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4384192830Sed	    force, hardforce));
4385192830Sed}
4386192830Sed
4387192830Sed/*
4388192830Sed * Similar to spa_export(), this unloads the spa_t without actually removing it
4389192830Sed * from the namespace in any way.
4390192830Sed */
4391192830Sedint
4392192830Sedspa_reset(char *pool)
4393192830Sed{
4394192830Sed	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4395192830Sed	    B_FALSE, B_FALSE));
4396192830Sed}
4397192830Sed
4398192830Sed/*
4399192830Sed * ==========================================================================
4400192830Sed * Device manipulation
4401192830Sed * ==========================================================================
4402192830Sed */
4403192830Sed
4404192830Sed/*
4405192830Sed * Add a device to a storage pool.
4406192830Sed */
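/*
 * The nvroot may describe new top-level vdevs as well as hot spares
 * (ZPOOL_CONFIG_SPARES) and level 2 cache devices (ZPOOL_CONFIG_L2CACHE);
 * any combination is accepted as long as at least one of them is present.
 */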
4407192830Sedint
4408192830Sedspa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4409192830Sed{
4410192830Sed	uint64_t txg, id;
4411192830Sed	int error;
4412192830Sed	vdev_t *rvd = spa->spa_root_vdev;
4413192830Sed	vdev_t *vd, *tvd;
4414192830Sed	nvlist_t **spares, **l2cache;
4415192830Sed	uint_t nspares, nl2cache;
4416192830Sed
4417192830Sed	ASSERT(spa_writeable(spa));
4418192830Sed
4419192830Sed	txg = spa_vdev_enter(spa);
4420192830Sed
4421192830Sed	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4422192830Sed	    VDEV_ALLOC_ADD)) != 0)
4423192830Sed		return (spa_vdev_exit(spa, NULL, txg, error));
4424192830Sed
4425192830Sed	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
4426192830Sed
4427192830Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4428192830Sed	    &nspares) != 0)
4429192856Sed		nspares = 0;
4430192830Sed
4431192856Sed	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4432192830Sed	    &nl2cache) != 0)
4433192830Sed		nl2cache = 0;
4434192830Sed
4435192830Sed	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4436192830Sed		return (spa_vdev_exit(spa, vd, txg, EINVAL));
4437192856Sed
4438192914Sed	if (vd->vdev_children != 0 &&
4439192856Sed	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
4440192856Sed		return (spa_vdev_exit(spa, vd, txg, error));
4441192856Sed
4442192856Sed	/*
4443192856Sed	 * We must validate the spares and l2cache devices after checking the
4444192856Sed	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
4445192856Sed	 */
4446192856Sed	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4447192830Sed		return (spa_vdev_exit(spa, vd, txg, error));
4448192830Sed
4449192830Sed	/*
4450192830Sed	 * Transfer each new top-level vdev from vd to rvd.
4451192830Sed	 */
4452192830Sed	for (int c = 0; c < vd->vdev_children; c++) {
4453192830Sed
4454192830Sed		/*
4455192830Sed		 * Set the vdev id to the first hole, if one exists.
4456192830Sed		 */
4457192830Sed		for (id = 0; id < rvd->vdev_children; id++) {
4458192830Sed			if (rvd->vdev_child[id]->vdev_ishole) {
4459192830Sed				vdev_free(rvd->vdev_child[id]);
4460192830Sed				break;
4461192830Sed			}
4462192830Sed		}
4463192830Sed		tvd = vd->vdev_child[c];
4464192830Sed		vdev_remove_child(vd, tvd);
4465192830Sed		tvd->vdev_id = id;
4466192830Sed		vdev_add_child(rvd, tvd);
4467192830Sed		vdev_config_dirty(tvd);
4468192830Sed	}
4469192914Sed
4470192830Sed	if (nspares != 0) {
4471192830Sed		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4472192830Sed		    ZPOOL_CONFIG_SPARES);
4473192830Sed		spa_load_spares(spa);
4474192830Sed		spa->spa_spares.sav_sync = B_TRUE;
4475192830Sed	}
4476192830Sed
4477192830Sed	if (nl2cache != 0) {
4478192914Sed		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4479192830Sed		    ZPOOL_CONFIG_L2CACHE);
4480192830Sed		spa_load_l2cache(spa);
4481192830Sed		spa->spa_l2cache.sav_sync = B_TRUE;
4482192914Sed	}
4483192830Sed
4484192830Sed	/*
4485192830Sed	 * We have to be careful when adding new vdevs to an existing pool.
4486192830Sed	 * If other threads start allocating from these vdevs before we
4487192914Sed	 * sync the config cache, and we lose power, then upon reboot we may
4488192830Sed	 * fail to open the pool because there are DVAs that the config cache
4489192830Sed	 * can't translate.  Therefore, we first add the vdevs without
4490192830Sed	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4491192830Sed	 * and then let spa_config_update() initialize the new metaslabs.
4492192830Sed	 *
4493192830Sed	 * spa_load() checks for added-but-not-initialized vdevs, so that
4494192830Sed	 * if we lose power at any point in this sequence, the remaining
4495192830Sed	 * steps will be completed the next time we load the pool.
4496192830Sed	 */
4497192830Sed	(void) spa_vdev_exit(spa, vd, txg, 0);
4498192830Sed
4499192830Sed	mutex_enter(&spa_namespace_lock);
4500192830Sed	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4501192830Sed	mutex_exit(&spa_namespace_lock);
4502192830Sed
4503192830Sed	return (0);
4504192830Sed}
4505192830Sed
4506192830Sed/*
4507192830Sed * Attach a device to a mirror.  The arguments are the path to any device
4508192830Sed * in the mirror, and the nvroot for the new device.  If the path specifies
4509192830Sed * a device that is not mirrored, we automatically insert the mirror vdev.
4510192830Sed *
4511192830Sed * If 'replacing' is specified, the new device is intended to replace the
4512192830Sed * existing device; in this case the two devices are made into their own
4513192830Sed * mirror using the 'replacing' vdev, which is functionally identical to
4514192830Sed * the mirror vdev (it actually reuses all the same ops) but has a few
4515192830Sed * extra rules: you can't attach to it after it's been created, and upon
4516192830Sed * completion of resilvering, the first disk (the one being replaced)
4517192830Sed * is automatically detached.
4518192830Sed */
4519192830Sedint
4520192830Sedspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4521192830Sed{
4522192830Sed	uint64_t txg, dtl_max_txg;
4523192830Sed	vdev_t *rvd = spa->spa_root_vdev;
4524192830Sed	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4525192830Sed	vdev_ops_t *pvops;
4526192830Sed	char *oldvdpath, *newvdpath;
4527192830Sed	int newvd_isspare;
4528192830Sed	int error;
4529192830Sed
4530192830Sed	ASSERT(spa_writeable(spa));
4531192830Sed
4532192830Sed	txg = spa_vdev_enter(spa);
4533192830Sed
4534192830Sed	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4535192830Sed
4536192830Sed	if (oldvd == NULL)
4537192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4538192914Sed
4539192830Sed	if (!oldvd->vdev_ops->vdev_op_leaf)
4540192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4541192830Sed
4542192830Sed	pvd = oldvd->vdev_parent;
4543192830Sed
4544192830Sed	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4545192830Sed	    VDEV_ALLOC_ATTACH)) != 0)
4546192830Sed		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4547192830Sed
4548192830Sed	if (newrootvd->vdev_children != 1)
4549192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4550192830Sed
4551192830Sed	newvd = newrootvd->vdev_child[0];
4552192830Sed
4553192914Sed	if (!newvd->vdev_ops->vdev_op_leaf)
4554192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4555192830Sed
4556192830Sed	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4557192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, error));
4558192830Sed
4559192830Sed	/*
4560192914Sed	 * Spares can't replace logs
4561192830Sed	 */
4562192830Sed	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4563192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4564192830Sed
4565192830Sed	if (!replacing) {
4566192830Sed		/*
4567192830Sed		 * For attach, the only allowable parent is a mirror or the root
4568192830Sed		 * vdev.
4569192830Sed		 */
4570192830Sed		if (pvd->vdev_ops != &vdev_mirror_ops &&
4571192830Sed		    pvd->vdev_ops != &vdev_root_ops)
4572192830Sed			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4573192830Sed
4574192830Sed		pvops = &vdev_mirror_ops;
4575192830Sed	} else {
4576192830Sed		/*
4577192830Sed		 * Active hot spares can only be replaced by inactive hot
4578192830Sed		 * spares.
4579192830Sed		 */
4580192830Sed		if (pvd->vdev_ops == &vdev_spare_ops &&
4581192830Sed		    oldvd->vdev_isspare &&
4582192830Sed		    !spa_has_spare(spa, newvd->vdev_guid))
4583192830Sed			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4584192830Sed
4585192830Sed		/*
4586192830Sed		 * If the source is a hot spare, and the parent isn't already a
4587192830Sed		 * spare, then we want to create a new hot spare.  Otherwise, we
4588192830Sed		 * want to create a replacing vdev.  The user is not allowed to
4589192830Sed		 * attach to a spared vdev child unless the 'isspare' state is
4590192830Sed		 * the same (spare replaces spare, non-spare replaces
4591192830Sed		 * non-spare).
4592192830Sed		 */
4593192830Sed		if (pvd->vdev_ops == &vdev_replacing_ops &&
4594192830Sed		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4595192830Sed			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4596192830Sed		} else if (pvd->vdev_ops == &vdev_spare_ops &&
4597192830Sed		    newvd->vdev_isspare != oldvd->vdev_isspare) {
4598192830Sed			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4599192830Sed		}
4600192830Sed
4601192830Sed		if (newvd->vdev_isspare)
4602192830Sed			pvops = &vdev_spare_ops;
4603192830Sed		else
4604192830Sed			pvops = &vdev_replacing_ops;
4605192830Sed	}
4606192830Sed
4607192830Sed	/*
4608192830Sed	 * Make sure the new device is big enough.
4609192830Sed	 */
4610192830Sed	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4611192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4612192830Sed
4613192830Sed	/*
4614192830Sed	 * The new device cannot have a higher alignment requirement
4615192830Sed	 * than the top-level vdev.
4616192830Sed	 */
4617192830Sed	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4618192830Sed		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4619192830Sed
4620192830Sed	/*
4621192830Sed	 * If this is an in-place replacement, update oldvd's path and devid
4622192830Sed	 * to make it distinguishable from newvd, and unopenable from now on.
4623192830Sed	 */
4624192830Sed	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4625192830Sed		spa_strfree(oldvd->vdev_path);
4626192830Sed		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4627192830Sed		    KM_SLEEP);
4628192830Sed		(void) sprintf(oldvd->vdev_path, "%s/%s",
4629192830Sed		    newvd->vdev_path, "old");
4630192830Sed		if (oldvd->vdev_devid != NULL) {
4631192830Sed			spa_strfree(oldvd->vdev_devid);
4632192830Sed			oldvd->vdev_devid = NULL;
4633192830Sed		}
4634192830Sed	}
4635192830Sed
4636192830Sed	/* mark the device being resilvered */
4637192830Sed	newvd->vdev_resilver_txg = txg;
4638192830Sed
4639192830Sed	/*
4640192830Sed	 * If the parent is not a mirror, or if we're replacing, insert the new
4641192830Sed	 * mirror/replacing/spare vdev above oldvd.
4642192830Sed	 */
4643192830Sed	if (pvd->vdev_ops != pvops)
4644192830Sed		pvd = vdev_add_parent(oldvd, pvops);
4645192830Sed
4646192830Sed	ASSERT(pvd->vdev_top->vdev_parent == rvd);
4647192830Sed	ASSERT(pvd->vdev_ops == pvops);
4648192830Sed	ASSERT(oldvd->vdev_parent == pvd);
4649192830Sed
4650192830Sed	/*
4651192830Sed	 * Extract the new device from its root and add it to pvd.
4652192830Sed	 */
4653192830Sed	vdev_remove_child(newrootvd, newvd);
4654192830Sed	newvd->vdev_id = pvd->vdev_children;
4655192830Sed	newvd->vdev_crtxg = oldvd->vdev_crtxg;
4656192830Sed	vdev_add_child(pvd, newvd);
4657192830Sed
4658192830Sed	tvd = newvd->vdev_top;
4659192830Sed	ASSERT(pvd->vdev_top == tvd);
4660192830Sed	ASSERT(tvd->vdev_parent == rvd);
4661192830Sed
4662192830Sed	vdev_config_dirty(tvd);
4663192830Sed
4664192830Sed	/*
4665192830Sed	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4666192830Sed	 * for any dmu_sync-ed blocks.  It will propagate upward when
4667192830Sed	 * spa_vdev_exit() calls vdev_dtl_reassess().
4668192830Sed	 */
4669192830Sed	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4670192830Sed
4671192830Sed	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4672192830Sed	    dtl_max_txg - TXG_INITIAL);
4673192830Sed
4674192830Sed	if (newvd->vdev_isspare) {
4675192830Sed		spa_spare_activate(newvd);
4676192830Sed		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4677192830Sed	}
4678192830Sed
4679192830Sed	oldvdpath = spa_strdup(oldvd->vdev_path);
4680192830Sed	newvdpath = spa_strdup(newvd->vdev_path);
4681192830Sed	newvd_isspare = newvd->vdev_isspare;
4682192830Sed
4683192830Sed	/*
4684192830Sed	 * Mark newvd's DTL dirty in this txg.
4685192830Sed	 */
4686192830Sed	vdev_dirty(tvd, VDD_DTL, newvd, txg);
4687192830Sed
4688192830Sed	/*
4689192830Sed	 * Restart the resilver
4690192830Sed	 */
4691192830Sed	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4692192830Sed
4693192830Sed	/*
4694192830Sed	 * Commit the config
4695192830Sed	 */
4696192830Sed	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4697192830Sed
4698192830Sed	spa_history_log_internal(spa, "vdev attach", NULL,
4699192830Sed	    "%s vdev=%s %s vdev=%s",
4700192830Sed	    replacing && newvd_isspare ? "spare in" :
4701192830Sed	    replacing ? "replace" : "attach", newvdpath,
4702192830Sed	    replacing ? "for" : "to", oldvdpath);
4703192830Sed
4704192830Sed	spa_strfree(oldvdpath);
4705192830Sed	spa_strfree(newvdpath);
4706192830Sed
4707192830Sed	if (spa->spa_bootfs)
4708192830Sed		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4709192830Sed
4710192830Sed	return (0);
4711192830Sed}
4712192830Sed
4713192830Sed/*
4714192830Sed * Detach a device from a mirror or replacing vdev.
4715192830Sed *
4716192830Sed * If 'replace_done' is specified, only detach if the parent
4717192830Sed * is a replacing or a spare vdev.
4718192830Sed */
4719192830Sedint
4720192830Sedspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4721192830Sed{
4722192830Sed	uint64_t txg;
4723192830Sed	int error;
4724192830Sed	vdev_t *rvd = spa->spa_root_vdev;
4725192830Sed	vdev_t *vd, *pvd, *cvd, *tvd;
4726192830Sed	boolean_t unspare = B_FALSE;
4727192830Sed	uint64_t unspare_guid = 0;
4728192830Sed	char *vdpath;
4729192830Sed
4730192830Sed	ASSERT(spa_writeable(spa));
4731192830Sed
4732192830Sed	txg = spa_vdev_enter(spa);
4733192830Sed
4734192830Sed	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4735192830Sed
4736192830Sed	if (vd == NULL)
4737192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4738192830Sed
4739192830Sed	if (!vd->vdev_ops->vdev_op_leaf)
4740192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4741192830Sed
4742192830Sed	pvd = vd->vdev_parent;
4743192830Sed
4744192830Sed	/*
4745192830Sed	 * If the parent/child relationship is not as expected, don't do it.
4746192830Sed	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4747192830Sed	 * vdev that's replacing B with C.  The user's intent in replacing
4748192830Sed	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
4749192830Sed	 * the replace by detaching C, the expected behavior is to end up
4750192830Sed	 * M(A,B).  But suppose that right after deciding to detach C,
4751192830Sed	 * the replacement of B completes.  We would have M(A,C), and then
4752192830Sed	 * ask to detach C, which would leave us with just A -- not what
4753192830Sed	 * the user wanted.  To prevent this, we make sure that the
4754192830Sed	 * parent/child relationship hasn't changed -- in this example,
4755192830Sed	 * that C's parent is still the replacing vdev R.
4756192830Sed	 */
4757192830Sed	if (pvd->vdev_guid != pguid && pguid != 0)
4758192830Sed		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4759192830Sed
4760192830Sed	/*
4761192830Sed	 * With 'replace_done' set, only detach from a 'replacing' or 'spare' vdev.
4762192830Sed	 */
4763192830Sed	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4764192830Sed	    pvd->vdev_ops != &vdev_spare_ops)
4765192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4766192830Sed
4767192830Sed	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4768192830Sed	    spa_version(spa) >= SPA_VERSION_SPARES);
4769192830Sed
4770192830Sed	/*
4771192830Sed	 * Only mirror, replacing, and spare vdevs support detach.
4772192830Sed	 */
4773192830Sed	if (pvd->vdev_ops != &vdev_replacing_ops &&
4774192830Sed	    pvd->vdev_ops != &vdev_mirror_ops &&
4775192830Sed	    pvd->vdev_ops != &vdev_spare_ops)
4776192830Sed		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4777192830Sed
4778192830Sed	/*
4779192830Sed	 * If this device has the only valid copy of some data,
4780192830Sed	 * we cannot safely detach it.
4781192830Sed	 */
4782192830Sed	if (vdev_dtl_required(vd))
4783192830Sed		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4784192830Sed
4785192830Sed	ASSERT(pvd->vdev_children >= 2);
4786192830Sed
4787192830Sed	/*
4788192830Sed	 * If we are detaching the second disk from a replacing vdev, then
4789192830Sed	 * check to see if we changed the original vdev's path to have "/old"
4790192830Sed	 * at the end in spa_vdev_attach().  If so, undo that change now.
4791192830Sed	 */
4792192830Sed	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4793192830Sed	    vd->vdev_path != NULL) {
4794192830Sed		size_t len = strlen(vd->vdev_path);
4795192830Sed
4796192830Sed		for (int c = 0; c < pvd->vdev_children; c++) {
4797192830Sed			cvd = pvd->vdev_child[c];
4798192830Sed
4799192830Sed			if (cvd == vd || cvd->vdev_path == NULL)
4800192830Sed				continue;
4801192830Sed
4802192830Sed			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4803192830Sed			    strcmp(cvd->vdev_path + len, "/old") == 0) {
4804192830Sed				spa_strfree(cvd->vdev_path);
4805192830Sed				cvd->vdev_path = spa_strdup(vd->vdev_path);
4806192830Sed				break;
4807192830Sed			}
4808192830Sed		}
4809192830Sed	}
4810192830Sed
4811192830Sed	/*
4812192830Sed	 * If we are detaching the original disk from a spare, then it implies
4813192830Sed	 * that the spare should become a real disk, and be removed from the
4814192830Sed	 * active spare list for the pool.
4815192830Sed	 */
4816192830Sed	if (pvd->vdev_ops == &vdev_spare_ops &&
4817192830Sed	    vd->vdev_id == 0 &&
4818192830Sed	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4819192830Sed		unspare = B_TRUE;
4820192830Sed
4821192830Sed	/*
4822192830Sed	 * Erase the disk labels so the disk can be used for other things.
4823192830Sed	 * This must be done after all other error cases are handled,
4824192830Sed	 * but before we disembowel vd (so we can still do I/O to it).
4825192830Sed	 * But if we can't do it, don't treat the error as fatal --
4826192830Sed	 * it may be that the unwritability of the disk is the reason
4827192830Sed	 * it's being detached!
4828192830Sed	 */
4829192830Sed	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4830192830Sed
4831192830Sed	/*
4832192830Sed	 * Remove vd from its parent and compact the parent's children.
4833192830Sed	 */
4834192830Sed	vdev_remove_child(pvd, vd);
4835192830Sed	vdev_compact_children(pvd);
4836192830Sed
4837192830Sed	/*
4838192830Sed	 * Remember one of the remaining children so we can get tvd below.
4839192830Sed	 */
4840192830Sed	cvd = pvd->vdev_child[pvd->vdev_children - 1];
4841192830Sed
4842192830Sed	/*
4843192830Sed	 * If we need to remove the remaining child from the list of hot spares,
4844192830Sed	 * do it now, marking the vdev as no longer a spare in the process.
4845192830Sed	 * We must do this before vdev_remove_parent(), because that can
4846192830Sed	 * change the GUID if it creates a new toplevel GUID.  For a similar
4847192830Sed	 * reason, we must remove the spare now, in the same txg as the detach;
4848192830Sed	 * otherwise someone could attach a new sibling, change the GUID, and
4849192830Sed	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4850192830Sed	 */
4851192830Sed	if (unspare) {
4852192830Sed		ASSERT(cvd->vdev_isspare);
4853192830Sed		spa_spare_remove(cvd);
4854192830Sed		unspare_guid = cvd->vdev_guid;
4855192830Sed		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4856192830Sed		cvd->vdev_unspare = B_TRUE;
4857192830Sed	}
4858192830Sed
4859192830Sed	/*
4860192830Sed	 * If the parent mirror/replacing vdev only has one child,
4861192830Sed	 * the parent is no longer needed.  Remove it from the tree.
4862192914Sed	 */
4863192830Sed	if (pvd->vdev_children == 1) {
4864192914Sed		if (pvd->vdev_ops == &vdev_spare_ops)
4865192830Sed			cvd->vdev_unspare = B_FALSE;
4866192830Sed		vdev_remove_parent(cvd);
4867192830Sed	}
4868192830Sed
4870192830Sed	/*
4871192830Sed	 * We don't set tvd until now because the parent we just removed
4872192830Sed	 * may have been the previous top-level vdev.
4873192830Sed	 */
4874192830Sed	tvd = cvd->vdev_top;
4875192830Sed	ASSERT(tvd->vdev_parent == rvd);
4876192830Sed
4877192830Sed	/*
4878192830Sed	 * Reevaluate the parent vdev state.
4879192830Sed	 */
4880192830Sed	vdev_propagate_state(cvd);
4881192830Sed
4882192830Sed	/*
4883192830Sed	 * If the 'autoexpand' property is set on the pool then automatically
4884192830Sed	 * try to expand the size of the pool. For example if the device we
4885192830Sed	 * just detached was smaller than the others, it may be possible to
4886192830Sed	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4887192830Sed	 * first so that we can obtain the updated sizes of the leaf vdevs.
4888192830Sed	 */
4889192830Sed	if (spa->spa_autoexpand) {
4890192830Sed		vdev_reopen(tvd);
4891192830Sed		vdev_expand(tvd, txg);
4892192830Sed	}
4893192830Sed
4894192830Sed	vdev_config_dirty(tvd);
4895192830Sed
4896192830Sed	/*
4897192830Sed	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
4898192830Sed	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4899192830Sed	 * But first make sure we're not on any *other* txg's DTL list, to
4900192830Sed	 * prevent vd from being accessed after it's freed.
4901192830Sed	 */
4902192830Sed	vdpath = spa_strdup(vd->vdev_path);
4903192830Sed	for (int t = 0; t < TXG_SIZE; t++)
4904192830Sed		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4905192830Sed	vd->vdev_detached = B_TRUE;
4906192830Sed	vdev_dirty(tvd, VDD_DTL, vd, txg);
4907192914Sed
4908192830Sed	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4909192830Sed
4910192830Sed	/* hang on to the spa before we release the lock */
4911192830Sed	spa_open_ref(spa, FTAG);
4912192830Sed
4913192830Sed	error = spa_vdev_exit(spa, vd, txg, 0);
4914192830Sed
4915192830Sed	spa_history_log_internal(spa, "detach", NULL,
4916192830Sed	    "vdev=%s", vdpath);
4917192830Sed	spa_strfree(vdpath);
4918192830Sed
4919192830Sed	/*
4920192830Sed	 * If this was the removal of the original device in a hot spare vdev,
4921192830Sed	 * then we want to go through and remove the device from the hot spare
4922192830Sed	 * list of every other pool.
4923192830Sed	 */
4924192830Sed	if (unspare) {
4925192830Sed		spa_t *altspa = NULL;
4926192830Sed
4927192914Sed		mutex_enter(&spa_namespace_lock);
4928192830Sed		while ((altspa = spa_next(altspa)) != NULL) {
4929192830Sed			if (altspa->spa_state != POOL_STATE_ACTIVE ||
4930192914Sed			    altspa == spa)
4931192830Sed				continue;
4932192830Sed
4933192830Sed			spa_open_ref(altspa, FTAG);
4934192830Sed			mutex_exit(&spa_namespace_lock);
4935192830Sed			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4936192830Sed			mutex_enter(&spa_namespace_lock);
4937192830Sed			spa_close(altspa, FTAG);
4938192830Sed		}
4939192830Sed		mutex_exit(&spa_namespace_lock);
4940192830Sed
4941192830Sed		/* search the rest of the vdevs for spares to remove */
4942192830Sed		spa_vdev_resilver_done(spa);
4943192830Sed	}
4944192830Sed
4945192830Sed	/* all done with the spa; OK to release */
4946192914Sed	mutex_enter(&spa_namespace_lock);
4947192830Sed	spa_close(spa, FTAG);
4948192830Sed	mutex_exit(&spa_namespace_lock);
4949192830Sed
4950192830Sed	return (error);
4951192830Sed}
4952192830Sed
4953192830Sed/*
4954192830Sed * Split a set of devices from their mirrors, and create a new pool from them.
4955192830Sed */
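/*
 * Outline (see the body below): the intent log is flushed, each selected
 * leaf must belong to a healthy top-level mirror, the leaves are taken
 * offline and recorded under ZPOOL_CONFIG_SPLIT in the original config,
 * the new pool is assembled from them and loaded, and the leaves are then
 * detached from the original pool.  With 'exp' set the new pool is
 * exported rather than left active.
 */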
4956192830Sedint
4957192830Sedspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4958192830Sed    nvlist_t *props, boolean_t exp)
4959192914Sed{
4960192830Sed	int error = 0;
4961192830Sed	uint64_t txg, *glist;
4962192830Sed	spa_t *newspa;
4963192830Sed	uint_t c, children, lastlog;
4964192830Sed	nvlist_t **child, *nvl, *tmp;
4965192830Sed	dmu_tx_t *tx;
4966192830Sed	char *altroot = NULL;
4967192830Sed	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
4968192830Sed	boolean_t activate_slog;
4969192914Sed
4970192830Sed	ASSERT(spa_writeable(spa));
4971192830Sed
4972192830Sed	txg = spa_vdev_enter(spa);
4973192830Sed
4974192830Sed	/* clear the log and flush everything up to now */
4975192830Sed	activate_slog = spa_passivate_log(spa);
4976192830Sed	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4977192830Sed	error = spa_offline_log(spa);
4978192830Sed	txg = spa_vdev_config_enter(spa);
4979192830Sed
4980192830Sed	if (activate_slog)
4981192830Sed		spa_activate_log(spa);
4982192830Sed
4983192830Sed	if (error != 0)
4984192830Sed		return (spa_vdev_exit(spa, NULL, txg, error));
4985192830Sed
4986192830Sed	/* check new spa name before going any further */
4987192830Sed	if (spa_lookup(newname) != NULL)
4988192830Sed		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4989192830Sed
4990192830Sed	/*
4991192830Sed	 * scan through all the children to ensure they're all mirrors
4992192914Sed	 */
4993192830Sed	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4994192830Sed	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4995192830Sed	    &children) != 0)
4996192830Sed		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4997192830Sed
4998192830Sed	/* first, check to ensure we've got the right child count */
4999192830Sed	rvd = spa->spa_root_vdev;
5000192830Sed	lastlog = 0;
5001192830Sed	for (c = 0; c < rvd->vdev_children; c++) {
5002192830Sed		vdev_t *vd = rvd->vdev_child[c];
5003192830Sed
5004192830Sed		/* don't count the holes & logs as children */
5005192830Sed		if (vd->vdev_islog || vd->vdev_ishole) {
5006192830Sed			if (lastlog == 0)
5007192830Sed				lastlog = c;
5008192830Sed			continue;
5009192830Sed		}
5010192830Sed
5011192830Sed		lastlog = 0;
5012192830Sed	}
5013192830Sed	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5014192830Sed		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5015192830Sed
5016192830Sed	/* next, ensure no spare or cache devices are part of the split */
5017192830Sed	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5018192830Sed	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5019192830Sed		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5020192830Sed
5021192830Sed	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5022192830Sed	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5023192830Sed
5024192830Sed	/* then, loop over each vdev and validate it */
5025192830Sed	for (c = 0; c < children; c++) {
5026192830Sed		uint64_t is_hole = 0;
5027192830Sed
5028192830Sed		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5029192830Sed		    &is_hole);
5030192830Sed
5031192830Sed		if (is_hole != 0) {
5032192830Sed			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5033192830Sed			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5034192830Sed				continue;
5035192830Sed			} else {
5036192830Sed				error = SET_ERROR(EINVAL);
5037192830Sed				break;
5038192830Sed			}
5039192830Sed		}
5040192830Sed
5041192830Sed		/* which disk is going to be split? */
5042192830Sed		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5043192830Sed		    &glist[c]) != 0) {
5044192830Sed			error = SET_ERROR(EINVAL);
5045192830Sed			break;
5046192830Sed		}
5047192830Sed
5048192830Sed		/* look it up in the spa */
5049192830Sed		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5050192830Sed		if (vml[c] == NULL) {
5051192830Sed			error = SET_ERROR(ENODEV);
5052192830Sed			break;
5053192830Sed		}
5054192830Sed
5055192830Sed		/* make sure there's nothing stopping the split */
5056192830Sed		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5057192830Sed		    vml[c]->vdev_islog ||
5058192830Sed		    vml[c]->vdev_ishole ||
5059192830Sed		    vml[c]->vdev_isspare ||
5060192830Sed		    vml[c]->vdev_isl2cache ||
5061192830Sed		    !vdev_writeable(vml[c]) ||
5062192830Sed		    vml[c]->vdev_children != 0 ||
5063192830Sed		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5064192830Sed		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5065192830Sed			error = SET_ERROR(EINVAL);
5066192830Sed			break;
5067192830Sed		}
5068192830Sed
5069192830Sed		if (vdev_dtl_required(vml[c])) {
5070192830Sed			error = SET_ERROR(EBUSY);
5071192830Sed			break;
5072192830Sed		}
5073192830Sed
5074192830Sed		/* we need certain info from the top level */
5075192830Sed		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5076192830Sed		    vml[c]->vdev_top->vdev_ms_array) == 0);
5077192830Sed		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5078192830Sed		    vml[c]->vdev_top->vdev_ms_shift) == 0);
5079192830Sed		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5080192830Sed		    vml[c]->vdev_top->vdev_asize) == 0);
5081192830Sed		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5082196750Sache		    vml[c]->vdev_top->vdev_ashift) == 0);
5083192830Sed	}
5084192856Sed
5085192830Sed	if (error != 0) {
5086192830Sed		kmem_free(vml, children * sizeof (vdev_t *));
5087192830Sed		kmem_free(glist, children * sizeof (uint64_t));
5088192830Sed		return (spa_vdev_exit(spa, NULL, txg, error));
5089192830Sed	}
5090192830Sed
5091192830Sed	/* stop writers from using the disks */
5092192830Sed	for (c = 0; c < children; c++) {
5093192830Sed		if (vml[c] != NULL)
5094192830Sed			vml[c]->vdev_offline = B_TRUE;
5095192830Sed	}
5096192830Sed	vdev_reopen(spa->spa_root_vdev);
5097192830Sed
5098192830Sed	/*
5099192830Sed	 * Temporarily record the splitting vdevs in the spa config.  This
5100192830Sed	 * will disappear once the config is regenerated.
5101192830Sed	 */
5102192830Sed	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5103192830Sed	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5104192830Sed	    glist, children) == 0);
5105192830Sed	kmem_free(glist, children * sizeof (uint64_t));
5106192830Sed
5107192830Sed	mutex_enter(&spa->spa_props_lock);
5108192830Sed	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5109192830Sed	    nvl) == 0);
5110192830Sed	mutex_exit(&spa->spa_props_lock);
5111192830Sed	spa->spa_config_splitting = nvl;
5112192830Sed	vdev_config_dirty(spa->spa_root_vdev);
5113192830Sed
5114192830Sed	/* configure and create the new pool */
5115192830Sed	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5116192830Sed	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5117192830Sed	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5118192830Sed	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5119192830Sed	    spa_version(spa)) == 0);
5120192830Sed	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5121192830Sed	    spa->spa_config_txg) == 0);
5122192830Sed	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5123192830Sed	    spa_generate_guid(NULL)) == 0);
5124192830Sed	(void) nvlist_lookup_string(props,
5125192830Sed	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5126192830Sed
5127192830Sed	/* add the new pool to the namespace */
5128192830Sed	newspa = spa_add(newname, config, altroot);
5129192830Sed	newspa->spa_config_txg = spa->spa_config_txg;
5130192856Sed	spa_set_log_state(newspa, SPA_LOG_CLEAR);
5131192830Sed
5132192830Sed	/* release the spa config lock, retaining the namespace lock */
5133192830Sed	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5134192830Sed
5135192830Sed	if (zio_injection_enabled)
5136192830Sed		zio_handle_panic_injection(spa, FTAG, 1);
5137192830Sed
5138192830Sed	spa_activate(newspa, spa_mode_global);
5139192830Sed	spa_async_suspend(newspa);
5140192830Sed
5141192830Sed#ifndef sun
5142192830Sed	/* mark that we are creating new spa by splitting */
5143192830Sed	newspa->spa_splitting_newspa = B_TRUE;
5144192830Sed#endif
5145192830Sed	/* create the new pool from the disks of the original pool */
5146192830Sed	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5147192856Sed#ifndef sun
5148192830Sed	newspa->spa_splitting_newspa = B_FALSE;
5149192830Sed#endif
5150192830Sed	if (error)
5151192830Sed		goto out;
5152192830Sed
5153192830Sed	/* if that worked, generate a real config for the new pool */
5154192830Sed	if (newspa->spa_root_vdev != NULL) {
5155192830Sed		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5156192830Sed		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
5157192830Sed		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5158192830Sed		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5159192830Sed		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5160192830Sed		    B_TRUE));
5161192830Sed	}
5162192830Sed
5163192830Sed	/* set the props */
5164192830Sed	if (props != NULL) {
5165192830Sed		spa_configfile_set(newspa, props, B_FALSE);
5166192830Sed		error = spa_prop_set(newspa, props);
5167192830Sed		if (error)
5168192830Sed			goto out;
5169192830Sed	}
5170192830Sed
5171192830Sed	/* flush everything */
5172192830Sed	txg = spa_vdev_config_enter(newspa);
5173192830Sed	vdev_config_dirty(newspa->spa_root_vdev);
5174192830Sed	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5175192830Sed
5176192830Sed	if (zio_injection_enabled)
5177192830Sed		zio_handle_panic_injection(spa, FTAG, 2);
5178192830Sed
5179192830Sed	spa_async_resume(newspa);
5180192830Sed
5181192830Sed	/* finally, update the original pool's config */
5182192830Sed	txg = spa_vdev_config_enter(spa);
5183192830Sed	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5184192830Sed	error = dmu_tx_assign(tx, TXG_WAIT);
5185192830Sed	if (error != 0)
5186192830Sed		dmu_tx_abort(tx);
5187192830Sed	for (c = 0; c < children; c++) {
5188192830Sed		if (vml[c] != NULL) {
5189192914Sed			vdev_split(vml[c]);
5190192830Sed			if (error == 0)
5191192830Sed				spa_history_log_internal(spa, "detach", tx,
5192192830Sed				    "vdev=%s", vml[c]->vdev_path);
5193192830Sed			vdev_free(vml[c]);
5194192830Sed		}
5195192830Sed	}
5196192830Sed	vdev_config_dirty(spa->spa_root_vdev);
5197192830Sed	spa->spa_config_splitting = NULL;
5198192830Sed	nvlist_free(nvl);
5199192830Sed	if (error == 0)
5200192830Sed		dmu_tx_commit(tx);
5201192830Sed	(void) spa_vdev_exit(spa, NULL, txg, 0);
5202192830Sed
5203192830Sed	if (zio_injection_enabled)
5204192830Sed		zio_handle_panic_injection(spa, FTAG, 3);
5205192830Sed
5206192830Sed	/* split is complete; log a history record */
5207192830Sed	spa_history_log_internal(newspa, "split", NULL,
5208192830Sed	    "from pool %s", spa_name(spa));
5209192830Sed
5210192830Sed	kmem_free(vml, children * sizeof (vdev_t *));
5211192830Sed
5212192830Sed	/* if we're not going to mount the filesystems in userland, export */
5213192830Sed	if (exp)
5214192830Sed		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5215192830Sed		    B_FALSE, B_FALSE);
5216192830Sed
5217192830Sed	return (error);
5218192830Sed
5219192830Sedout:
5220192830Sed	spa_unload(newspa);
5221192830Sed	spa_deactivate(newspa);
5222192830Sed	spa_remove(newspa);
5223192830Sed
5224192830Sed	txg = spa_vdev_config_enter(spa);
5225192830Sed
5226192830Sed	/* re-online all offlined disks */
5227192830Sed	for (c = 0; c < children; c++) {
5228192830Sed		if (vml[c] != NULL)
5229192830Sed			vml[c]->vdev_offline = B_FALSE;
5230192830Sed	}
5231192830Sed	vdev_reopen(spa->spa_root_vdev);
5232192830Sed
5233192830Sed	nvlist_free(spa->spa_config_splitting);
5234192830Sed	spa->spa_config_splitting = NULL;
5235192830Sed	(void) spa_vdev_exit(spa, NULL, txg, error);
5236192830Sed
5237192914Sed	kmem_free(vml, children * sizeof (vdev_t *));
5238192914Sed	return (error);
5239192830Sed}
5240192830Sed
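/*
 * Return the nvlist in nvpp[] whose ZPOOL_CONFIG_GUID matches target_guid,
 * or NULL if there is no such entry.
 */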
5241192830Sedstatic nvlist_t *
5242192830Sedspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5243192830Sed{
5244192830Sed	for (int i = 0; i < count; i++) {
5245192830Sed		uint64_t guid;
5246192830Sed
5247192830Sed		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5248192830Sed		    &guid) == 0);
5249192830Sed
5250192830Sed		if (guid == target_guid)
5251192830Sed			return (nvpp[i]);
5252192830Sed	}
5253192830Sed
5254192830Sed	return (NULL);
5255192830Sed}
5256192830Sed
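/*
 * Rewrite the 'name' nvlist array in 'config' (the spare or l2cache list)
 * with dev_to_remove omitted.  This assumes dev_to_remove is present in
 * dev[], so the rewritten array has count - 1 entries.
 */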
5257192914Sedstatic void
5258192914Sedspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5259192914Sed	nvlist_t *dev_to_remove)
5260192914Sed{
5261192856Sed	nvlist_t **newdev = NULL;
5262192830Sed
5263192830Sed	if (count > 1)
5264192830Sed		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5265192830Sed
5266192830Sed	for (int i = 0, j = 0; i < count; i++) {
5267192830Sed		if (dev[i] == dev_to_remove)
5268192830Sed			continue;
5269192830Sed		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5270192830Sed	}
5271192830Sed
5272192830Sed	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5273192830Sed	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5274192830Sed
5275192830Sed	for (int i = 0; i < count - 1; i++)
5276192830Sed		nvlist_free(newdev[i]);
5277192830Sed
5278192830Sed	if (count > 1)
5279192830Sed		kmem_free(newdev, (count - 1) * sizeof (void *));
5280192830Sed}
5281192830Sed
5282192830Sed/*
5283192830Sed * Evacuate the device.
5284192830Sed */
5285192830Sedstatic int
5286192830Sedspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5287192830Sed{
5288192830Sed	uint64_t txg;
5289192830Sed	int error = 0;
5290192830Sed
5291192830Sed	ASSERT(MUTEX_HELD(&spa_namespace_lock));
5292192830Sed	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5293192830Sed	ASSERT(vd == vd->vdev_top);
5294192830Sed
5295192830Sed	/*
5296192830Sed	 * Evacuate the device.  We don't hold the config lock as writer
5297192830Sed	 * since we need to do I/O but we do keep the
5298192830Sed	 * spa_namespace_lock held.  Once this completes the device
5299192830Sed	 * should no longer have any blocks allocated on it.
5300192830Sed	 */
5301192830Sed	if (vd->vdev_islog) {
5302192830Sed		if (vd->vdev_stat.vs_alloc != 0)
5303192830Sed			error = spa_offline_log(spa);
5304192830Sed	} else {
5305192830Sed		error = SET_ERROR(ENOTSUP);
5306192830Sed	}
5307192830Sed
5308192830Sed	if (error)
5309192830Sed		return (error);
5310192830Sed
5311192830Sed	/*
5312192830Sed	 * The evacuation succeeded.  Remove any remaining MOS metadata
5313192830Sed	 * associated with this vdev, and wait for these changes to sync.
5314192830Sed	 */
5315192830Sed	ASSERT0(vd->vdev_stat.vs_alloc);
5316192830Sed	txg = spa_vdev_config_enter(spa);
5317192830Sed	vd->vdev_removing = B_TRUE;
5318192830Sed	vdev_dirty(vd, 0, NULL, txg);
5319192830Sed	vdev_config_dirty(vd);
5320192830Sed	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5321192830Sed
5322192830Sed	return (0);
5323192830Sed}
5324192830Sed
5325192830Sed/*
5326192830Sed * Complete the removal by cleaning up the namespace.
5327192830Sed */
5328192830Sedstatic void
5329192830Sedspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5330192830Sed{
5331192830Sed	vdev_t *rvd = spa->spa_root_vdev;
5332192830Sed	uint64_t id = vd->vdev_id;
5333192830Sed	boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5334192830Sed
5335192830Sed	ASSERT(MUTEX_HELD(&spa_namespace_lock));
5336192830Sed	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5337192830Sed	ASSERT(vd == vd->vdev_top);
5338192830Sed
5339192830Sed	/*
5340192830Sed	 * Only remove devices which are empty.
5341192830Sed	 */
5342192830Sed	if (vd->vdev_stat.vs_alloc != 0)
5343192830Sed		return;
5344192830Sed
5345192830Sed	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5346192830Sed
5347192830Sed	if (list_link_active(&vd->vdev_state_dirty_node))
5348192830Sed		vdev_state_clean(vd);
5349	if (list_link_active(&vd->vdev_config_dirty_node))
5350		vdev_config_clean(vd);
5351
5352	vdev_free(vd);
5353
5354	if (last_vdev) {
5355		vdev_compact_children(rvd);
5356	} else {
5357		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5358		vdev_add_child(rvd, vd);
5359	}
5360	vdev_config_dirty(rvd);
5361
5362	/*
5363	 * Reassess the health of our root vdev.
5364	 */
5365	vdev_reopen(rvd);
5366}
5367
5368/*
5369 * Remove a device from the pool.
5370 *
5371 * Removing a device from the vdev namespace requires several steps
5372 * and can take a significant amount of time.  As a result we use
5373 * the spa_vdev_config_[enter/exit] functions which allow us to
5374 * grab and release the spa_config_lock while still holding the namespace
5375 * lock.  During each step the configuration is synced out.
5376 *
5377 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5378 * devices.
5379 */
5380int
5381spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5382{
5383	vdev_t *vd;
5384	metaslab_group_t *mg;
5385	nvlist_t **spares, **l2cache, *nv;
5386	uint64_t txg = 0;
5387	uint_t nspares, nl2cache;
5388	int error = 0;
5389	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5390
5391	ASSERT(spa_writeable(spa));
5392
5393	if (!locked)
5394		txg = spa_vdev_enter(spa);
5395
5396	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5397
5398	if (spa->spa_spares.sav_vdevs != NULL &&
5399	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5400	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5401	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5402		/*
5403		 * Only remove the hot spare if it's not currently in use
5404		 * in this pool.
5405		 */
5406		if (vd == NULL || unspare) {
5407			spa_vdev_remove_aux(spa->spa_spares.sav_config,
5408			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5409			spa_load_spares(spa);
5410			spa->spa_spares.sav_sync = B_TRUE;
5411		} else {
5412			error = SET_ERROR(EBUSY);
5413		}
5414	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
5415	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5416	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5417	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5418		/*
5419		 * Cache devices can always be removed.
5420		 */
5421		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5422		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5423		spa_load_l2cache(spa);
5424		spa->spa_l2cache.sav_sync = B_TRUE;
5425	} else if (vd != NULL && vd->vdev_islog) {
5426		ASSERT(!locked);
5427		ASSERT(vd == vd->vdev_top);
5428
5429		/*
5430		 * XXX - Once we have bp-rewrite this should
5431		 * become the common case.
5432		 */
5433
5434		mg = vd->vdev_mg;
5435
5436		/*
5437		 * Stop allocating from this vdev.
5438		 */
5439		metaslab_group_passivate(mg);
5440
5441		/*
5442		 * Wait for the youngest allocations and frees to sync,
5443		 * and then wait for the deferral of those frees to finish.
5444		 */
5445		spa_vdev_config_exit(spa, NULL,
5446		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5447
5448		/*
5449		 * Attempt to evacuate the vdev.
5450		 */
5451		error = spa_vdev_remove_evacuate(spa, vd);
5452
5453		txg = spa_vdev_config_enter(spa);
5454
5455		/*
5456		 * If we couldn't evacuate the vdev, unwind.
5457		 */
5458		if (error) {
5459			metaslab_group_activate(mg);
5460			return (spa_vdev_exit(spa, NULL, txg, error));
5461		}
5462
5463		/*
5464		 * Clean up the vdev namespace.
5465		 */
5466		spa_vdev_remove_from_namespace(spa, vd);
5467
5468	} else if (vd != NULL) {
5469		/*
5470		 * Normal vdevs cannot be removed (yet).
5471		 */
5472		error = SET_ERROR(ENOTSUP);
5473	} else {
5474		/*
5475		 * There is no vdev of any kind with the specified guid.
5476		 */
5477		error = SET_ERROR(ENOENT);
5478	}
5479
5480	if (!locked)
5481		return (spa_vdev_exit(spa, NULL, txg, error));
5482
5483	return (error);
5484}
5485
5486/*
5487 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5488 * currently spared, so we can detach it.
5489 */
5490static vdev_t *
5491spa_vdev_resilver_done_hunt(vdev_t *vd)
5492{
5493	vdev_t *newvd, *oldvd;
5494
5495	for (int c = 0; c < vd->vdev_children; c++) {
5496		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5497		if (oldvd != NULL)
5498			return (oldvd);
5499	}
5500
5501	/*
5502	 * Check for a completed replacement.  We always consider the first
5503	 * vdev in the list to be the oldest vdev, and the last one to be
5504	 * the newest (see spa_vdev_attach() for how that works).  In
5505	 * the case where the newest vdev is faulted, we will not automatically
5506	 * remove it after a resilver completes.  This is OK as it will require
5507	 * user intervention to determine which disk the admin wishes to keep.
5508	 */
5509	if (vd->vdev_ops == &vdev_replacing_ops) {
5510		ASSERT(vd->vdev_children > 1);
5511
5512		newvd = vd->vdev_child[vd->vdev_children - 1];
5513		oldvd = vd->vdev_child[0];
5514
5515		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5516		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5517		    !vdev_dtl_required(oldvd))
5518			return (oldvd);
5519	}
5520
5521	/*
5522	 * Check for a completed resilver with the 'unspare' flag set.
5523	 */
5524	if (vd->vdev_ops == &vdev_spare_ops) {
5525		vdev_t *first = vd->vdev_child[0];
5526		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5527
5528		if (last->vdev_unspare) {
5529			oldvd = first;
5530			newvd = last;
5531		} else if (first->vdev_unspare) {
5532			oldvd = last;
5533			newvd = first;
5534		} else {
5535			oldvd = NULL;
5536		}
5537
5538		if (oldvd != NULL &&
5539		    vdev_dtl_empty(newvd, DTL_MISSING) &&
5540		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5541		    !vdev_dtl_required(oldvd))
5542			return (oldvd);
5543
5544		/*
5545		 * If there are more than two spares attached to a disk,
5546		 * and those spares are not required, then we want to
5547		 * attempt to free them up now so that they can be used
5548		 * by other pools.  Once we're back down to a single
5549		 * disk+spare, we stop removing them.
5550		 */
5551		if (vd->vdev_children > 2) {
5552			newvd = vd->vdev_child[1];
5553
5554			if (newvd->vdev_isspare && last->vdev_isspare &&
5555			    vdev_dtl_empty(last, DTL_MISSING) &&
5556			    vdev_dtl_empty(last, DTL_OUTAGE) &&
5557			    !vdev_dtl_required(newvd))
5558				return (newvd);
5559		}
5560	}
5561
5562	return (NULL);
5563}
5564
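/*
 * Detach every vdev found by spa_vdev_resilver_done_hunt().  When a
 * hot-spared device has just finished being replaced, the hot spare itself
 * is detached as well.
 */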
5565static void
5566spa_vdev_resilver_done(spa_t *spa)
5567{
5568	vdev_t *vd, *pvd, *ppvd;
5569	uint64_t guid, sguid, pguid, ppguid;
5570
5571	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5572
5573	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5574		pvd = vd->vdev_parent;
5575		ppvd = pvd->vdev_parent;
5576		guid = vd->vdev_guid;
5577		pguid = pvd->vdev_guid;
5578		ppguid = ppvd->vdev_guid;
5579		sguid = 0;
5580		/*
5581		 * If we have just finished replacing a hot spared device, then
5582		 * we need to detach the hot spare (the second child of the
5583		 * spare vdev) as well.
5584		 */
5585		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5586		    ppvd->vdev_children == 2) {
5587			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5588			sguid = ppvd->vdev_child[1]->vdev_guid;
5589		}
5590		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
5591
5592		spa_config_exit(spa, SCL_ALL, FTAG);
5593		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5594			return;
5595		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5596			return;
5597		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5598	}
5599
5600	spa_config_exit(spa, SCL_ALL, FTAG);
5601}
5602
5603/*
5604 * Update the stored path or FRU for this vdev.
5605 */
5606int
5607spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5608    boolean_t ispath)
5609{
5610	vdev_t *vd;
5611	boolean_t sync = B_FALSE;
5612
5613	ASSERT(spa_writeable(spa));
5614
5615	spa_vdev_state_enter(spa, SCL_ALL);
5616
5617	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5618		return (spa_vdev_state_exit(spa, NULL, ENOENT));
5619
5620	if (!vd->vdev_ops->vdev_op_leaf)
5621		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5622
5623	if (ispath) {
5624		if (strcmp(value, vd->vdev_path) != 0) {
5625			spa_strfree(vd->vdev_path);
5626			vd->vdev_path = spa_strdup(value);
5627			sync = B_TRUE;
5628		}
5629	} else {
5630		if (vd->vdev_fru == NULL) {
5631			vd->vdev_fru = spa_strdup(value);
5632			sync = B_TRUE;
5633		} else if (strcmp(value, vd->vdev_fru) != 0) {
5634			spa_strfree(vd->vdev_fru);
5635			vd->vdev_fru = spa_strdup(value);
5636			sync = B_TRUE;
5637		}
5638	}
5639
5640	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5641}
5642
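/*
 * Convenience wrappers around spa_vdev_set_common().
 */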
5643int
5644spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5645{
5646	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5647}
5648
5649int
5650spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5651{
5652	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5653}
5654
5655/*
5656 * ==========================================================================
5657 * SPA Scanning
5658 * ==========================================================================
5659 */
5660
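/*
 * Stop an in-progress scan.  An active resilver cannot be cancelled.
 */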
5661int
5662spa_scan_stop(spa_t *spa)
5663{
5664	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5665	if (dsl_scan_resilvering(spa->spa_dsl_pool))
5666		return (SET_ERROR(EBUSY));
5667	return (dsl_scan_cancel(spa->spa_dsl_pool));
5668}
5669
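/*
 * Start a scan (scrub or resilver).  A resilver request with nothing to
 * resilver is completed immediately via SPA_ASYNC_RESILVER_DONE.
 */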
5670int
5671spa_scan(spa_t *spa, pool_scan_func_t func)
5672{
5673	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5674
5675	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5676		return (SET_ERROR(ENOTSUP));
5677
5678	/*
5679	 * If a resilver was requested, but there is no DTL on a
5680	 * writeable leaf device, we have nothing to do.
5681	 */
5682	if (func == POOL_SCAN_RESILVER &&
5683	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5684		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5685		return (0);
5686	}
5687
5688	return (dsl_scan(spa->spa_dsl_pool, func));
5689}
5690
5691/*
5692 * ==========================================================================
5693 * SPA async task processing
5694 * ==========================================================================
5695 */
5696
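/*
 * Recursively mark any vdev with vdev_remove_wanted set as REMOVED and
 * clear its error counters.
 */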
5697static void
5698spa_async_remove(spa_t *spa, vdev_t *vd)
5699{
5700	if (vd->vdev_remove_wanted) {
5701		vd->vdev_remove_wanted = B_FALSE;
5702		vd->vdev_delayed_close = B_FALSE;
5703		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5704
5705		/*
5706		 * We want to clear the stats, but we don't want to do a full
5707		 * vdev_clear() as that will cause us to throw away
5708		 * degraded/faulted state as well as attempt to reopen the
5709		 * device, all of which is a waste.
5710		 */
5711		vd->vdev_stat.vs_read_errors = 0;
5712		vd->vdev_stat.vs_write_errors = 0;
5713		vd->vdev_stat.vs_checksum_errors = 0;
5714
5715		vdev_state_dirty(vd->vdev_top);
5716	}
5717
5718	for (int c = 0; c < vd->vdev_children; c++)
5719		spa_async_remove(spa, vd->vdev_child[c]);
5720}
5721
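/*
 * Recursively reopen any vdev with vdev_probe_wanted set; vdev_open()
 * performs the actual probe.
 */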
5722static void
5723spa_async_probe(spa_t *spa, vdev_t *vd)
5724{
5725	if (vd->vdev_probe_wanted) {
5726		vd->vdev_probe_wanted = B_FALSE;
5727		vdev_reopen(vd);	/* vdev_open() does the actual probe */
5728	}
5729
5730	for (int c = 0; c < vd->vdev_children; c++)
5731		spa_async_probe(spa, vd->vdev_child[c]);
5732}
5733
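/*
 * Post an ESC_ZFS_VDEV_AUTOEXPAND sysevent for every leaf vdev with a known
 * physical path, so userland can react to the expanded device size.
 */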
5734static void
5735spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5736{
5737	sysevent_id_t eid;
5738	nvlist_t *attr;
5739	char *physpath;
5740
5741	if (!spa->spa_autoexpand)
5742		return;
5743
5744	for (int c = 0; c < vd->vdev_children; c++) {
5745		vdev_t *cvd = vd->vdev_child[c];
5746		spa_async_autoexpand(spa, cvd);
5747	}
5748
5749	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5750		return;
5751
5752	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5753	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5754
5755	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5756	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5757
5758	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5759	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
5760
5761	nvlist_free(attr);
5762	kmem_free(physpath, MAXPATHLEN);
5763}
5764
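/*
 * Main async worker: handles config updates, autoexpand, device probes,
 * resilver completion and resilver restarts.  Device removals are handled
 * separately by spa_async_thread_vd().
 */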
5765static void
5766spa_async_thread(void *arg)
5767{
5768	spa_t *spa = arg;
5769	int tasks;
5770
5771	ASSERT(spa->spa_sync_on);
5772
5773	mutex_enter(&spa->spa_async_lock);
5774	tasks = spa->spa_async_tasks;
5775	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
5776	mutex_exit(&spa->spa_async_lock);
5777
5778	/*
5779	 * See if the config needs to be updated.
5780	 */
5781	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5782		uint64_t old_space, new_space;
5783
5784		mutex_enter(&spa_namespace_lock);
5785		old_space = metaslab_class_get_space(spa_normal_class(spa));
5786		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5787		new_space = metaslab_class_get_space(spa_normal_class(spa));
5788		mutex_exit(&spa_namespace_lock);
5789
5790		/*
5791		 * If the pool grew as a result of the config update,
5792		 * then log an internal history event.
5793		 */
5794		if (new_space != old_space) {
5795			spa_history_log_internal(spa, "vdev online", NULL,
5796			    "pool '%s' size: %llu(+%llu)",
5797			    spa_name(spa), new_space, new_space - old_space);
5798		}
5799	}
5800
5801	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5802		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5803		spa_async_autoexpand(spa, spa->spa_root_vdev);
5804		spa_config_exit(spa, SCL_CONFIG, FTAG);
5805	}
5806
5807	/*
5808	 * See if any devices need to be probed.
5809	 */
5810	if (tasks & SPA_ASYNC_PROBE) {
5811		spa_vdev_state_enter(spa, SCL_NONE);
5812		spa_async_probe(spa, spa->spa_root_vdev);
5813		(void) spa_vdev_state_exit(spa, NULL, 0);
5814	}
5815
5816	/*
5817	 * If any devices are done replacing, detach them.
5818	 */
5819	if (tasks & SPA_ASYNC_RESILVER_DONE)
5820		spa_vdev_resilver_done(spa);
5821
5822	/*
5823	 * Kick off a resilver.
5824	 */
5825	if (tasks & SPA_ASYNC_RESILVER)
5826		dsl_resilver_restart(spa->spa_dsl_pool, 0);
5827
5828	/*
5829	 * Let the world know that we're done.
5830	 */
5831	mutex_enter(&spa->spa_async_lock);
5832	spa->spa_async_thread = NULL;
5833	cv_broadcast(&spa->spa_async_cv);
5834	mutex_exit(&spa->spa_async_lock);
5835	thread_exit();
5836}
5837
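/*
 * Async worker dedicated to SPA_ASYNC_REMOVE requests; it loops until no
 * further removals are pending.
 */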
5838static void
5839spa_async_thread_vd(void *arg)
5840{
5841	spa_t *spa = arg;
5842	int tasks;
5843
5844	ASSERT(spa->spa_sync_on);
5845
5846	mutex_enter(&spa->spa_async_lock);
5847	tasks = spa->spa_async_tasks;
5848retry:
5849	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
5850	mutex_exit(&spa->spa_async_lock);
5851
5852	/*
5853	 * See if any devices need to be marked REMOVED.
5854	 */
5855	if (tasks & SPA_ASYNC_REMOVE) {
5856		spa_vdev_state_enter(spa, SCL_NONE);
5857		spa_async_remove(spa, spa->spa_root_vdev);
5858		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5859			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5860		for (int i = 0; i < spa->spa_spares.sav_count; i++)
5861			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5862		(void) spa_vdev_state_exit(spa, NULL, 0);
5863	}
5864
5865	/*
5866	 * Let the world know that we're done.
5867	 */
5868	mutex_enter(&spa->spa_async_lock);
5869	tasks = spa->spa_async_tasks;
5870	if ((tasks & SPA_ASYNC_REMOVE) != 0)
5871		goto retry;
5872	spa->spa_async_thread_vd = NULL;
5873	cv_broadcast(&spa->spa_async_cv);
5874	mutex_exit(&spa->spa_async_lock);
5875	thread_exit();
5876}
5877
5878void
5879spa_async_suspend(spa_t *spa)
5880{
5881	mutex_enter(&spa->spa_async_lock);
5882	spa->spa_async_suspended++;
5883	while (spa->spa_async_thread != NULL ||
5884	    spa->spa_async_thread_vd != NULL)
5885		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5886	mutex_exit(&spa->spa_async_lock);
5887}
5888
5889void
5890spa_async_resume(spa_t *spa)
5891{
5892	mutex_enter(&spa->spa_async_lock);
5893	ASSERT(spa->spa_async_suspended != 0);
5894	spa->spa_async_suspended--;
5895	mutex_exit(&spa->spa_async_lock);
5896}
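
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * operations that must not race with the async threads bracket their
 * work with the pair above, e.g.:
 *
 *	spa_async_suspend(spa);
 *	... reconfigure or tear down the pool ...
 *	spa_async_resume(spa);
 *
 * spa_async_suspend() bumps spa_async_suspended, which prevents the
 * dispatchers below from creating new async threads, and then waits for
 * any running async thread to exit.  See spa_evict_all() later in this
 * file for a real caller.
 */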
5897
5898static boolean_t
5899spa_async_tasks_pending(spa_t *spa)
5900{
5901	uint_t non_config_tasks;
5902	uint_t config_task;
5903	boolean_t config_task_suspended;
5904
5905	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
5906	    SPA_ASYNC_REMOVE);
5907	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
5908	if (spa->spa_ccw_fail_time == 0) {
5909		config_task_suspended = B_FALSE;
5910	} else {
5911		config_task_suspended =
5912		    (gethrtime() - spa->spa_ccw_fail_time) <
5913		    (zfs_ccw_retry_interval * NANOSEC);
5914	}
5915
5916	return (non_config_tasks || (config_task && !config_task_suspended));
5917}
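
/*
 * Editor's sketch (not part of the original source): the config-update
 * backoff above amounts to a "has the retry interval elapsed since the
 * last cache-file write failure" test.  The helper below shows that test
 * in isolation; its name is hypothetical and it is not called anywhere
 * in this file.
 */
static boolean_t
spa_ccw_retry_elapsed(hrtime_t fail_time, uint64_t retry_secs)
{
	/* No recorded failure: nothing to back off from. */
	if (fail_time == 0)
		return (B_TRUE);

	/* Allow a retry only once retry_secs have passed since fail_time. */
	return (gethrtime() - fail_time >= retry_secs * NANOSEC);
}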
5918
5919static void
5920spa_async_dispatch(spa_t *spa)
5921{
5922	mutex_enter(&spa->spa_async_lock);
5923	if (spa_async_tasks_pending(spa) &&
5924	    !spa->spa_async_suspended &&
5925	    spa->spa_async_thread == NULL &&
5926	    rootdir != NULL)
5927		spa->spa_async_thread = thread_create(NULL, 0,
5928		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5929	mutex_exit(&spa->spa_async_lock);
5930}
5931
5932static void
5933spa_async_dispatch_vd(spa_t *spa)
5934{
5935	mutex_enter(&spa->spa_async_lock);
5936	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
5937	    !spa->spa_async_suspended &&
5938	    spa->spa_async_thread_vd == NULL &&
5939	    rootdir != NULL)
5940		spa->spa_async_thread_vd = thread_create(NULL, 0,
5941		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
5942	mutex_exit(&spa->spa_async_lock);
5943}
5944
5945void
5946spa_async_request(spa_t *spa, int task)
5947{
5948	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5949	mutex_enter(&spa->spa_async_lock);
5950	spa->spa_async_tasks |= task;
5951	mutex_exit(&spa->spa_async_lock);
5952	spa_async_dispatch_vd(spa);
5953}
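
/*
 * Editor's note (usage sketch, not part of the original source): callers
 * hand work to the async machinery simply by OR-ing in a task bit, e.g.:
 *
 *	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 *
 * spa_async_request() itself only kicks the vdev-removal dispatcher;
 * the remaining task bits are picked up by spa_async_dispatch(), which
 * spa_sync() calls at the end of every transaction group.
 */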
5954
5955/*
5956 * ==========================================================================
5957 * SPA syncing routines
5958 * ==========================================================================
5959 */
5960
5961static int
5962bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5963{
5964	bpobj_t *bpo = arg;
5965	bpobj_enqueue(bpo, bp, tx);
5966	return (0);
5967}
5968
5969static int
5970spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5971{
5972	zio_t *zio = arg;
5973
5974	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5975	    BP_GET_PSIZE(bp), zio->io_flags));
5976	return (0);
5977}
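
/*
 * Editor's note (sketch, not part of the original source): these two
 * callbacks give spa_sync() its "free now" versus "free later" choice.
 * Early sync passes issue the frees immediately:
 *
 *	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 *	bplist_iterate(free_bpl, spa_free_sync_cb, zio, tx);
 *	VERIFY(zio_wait(zio) == 0);
 *
 * while later passes enqueue the block pointers on the deferred bpobj
 * via bpobj_enqueue_cb(), to be freed in a future txg (see the
 * zfs_sync_pass_deferred_free check in spa_sync() below).
 */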
5978
5979static void
5980spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5981{
5982	char *packed = NULL;
5983	size_t bufsize;
5984	size_t nvsize = 0;
5985	dmu_buf_t *db;
5986
5987	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5988
5989	/*
5990	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5991	 * information.  This avoids the dbuf_will_dirty() path and
5992	 * saves us a pre-read to get data we don't actually care about.
5993	 */
5994	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5995	packed = kmem_alloc(bufsize, KM_SLEEP);
5996
5997	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5998	    KM_SLEEP) == 0);
5999	bzero(packed + nvsize, bufsize - nvsize);
6000
6001	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
6002
6003	kmem_free(packed, bufsize);
6004
6005	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
6006	dmu_buf_will_dirty(db, tx);
6007	*(uint64_t *)db->db_data = nvsize;
6008	dmu_buf_rele(db, FTAG);
6009}
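
/*
 * Editor's sketch (not part of the original source): reading such an
 * object back reverses the steps above -- the bonus buffer holds the
 * packed size and the packed bytes follow in the object's data.  The
 * helper name is hypothetical and the function is not called here.
 */
static int
spa_read_nvlist_sketch(spa_t *spa, uint64_t obj, nvlist_t **nvp)
{
	dmu_buf_t *db;
	char *packed;
	size_t nvsize;
	int error;

	/* The object's bonus buffer records the packed nvlist size. */
	VERIFY(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db) == 0);
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, nvp, 0);
	kmem_free(packed, nvsize);

	return (error);
}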
6010
6011static void
6012spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
6013    const char *config, const char *entry)
6014{
6015	nvlist_t *nvroot;
6016	nvlist_t **list;
6017	int i;
6018
6019	if (!sav->sav_sync)
6020		return;
6021
6022	/*
6023	 * Update the MOS nvlist describing the list of available devices.
6024	 * spa_validate_aux() will have already made sure this nvlist is
6025	 * valid and the vdevs are labeled appropriately.
6026	 */
6027	if (sav->sav_object == 0) {
6028		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
6029		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
6030		    sizeof (uint64_t), tx);
6031		VERIFY(zap_update(spa->spa_meta_objset,
6032		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
6033		    &sav->sav_object, tx) == 0);
6034	}
6035
6036	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6037	if (sav->sav_count == 0) {
6038		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
6039	} else {
6040		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
6041		for (i = 0; i < sav->sav_count; i++)
6042			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
6043			    B_FALSE, VDEV_CONFIG_L2CACHE);
6044		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
6045		    sav->sav_count) == 0);
6046		for (i = 0; i < sav->sav_count; i++)
6047			nvlist_free(list[i]);
6048		kmem_free(list, sav->sav_count * sizeof (void *));
6049	}
6050
6051	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
6052	nvlist_free(nvroot);
6053
6054	sav->sav_sync = B_FALSE;
6055}
6056
6057static void
6058spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
6059{
6060	nvlist_t *config;
6061
6062	if (list_is_empty(&spa->spa_config_dirty_list))
6063		return;
6064
6065	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6066
6067	config = spa_config_generate(spa, spa->spa_root_vdev,
6068	    dmu_tx_get_txg(tx), B_FALSE);
6069
6070	/*
6071	 * If we're upgrading the spa version then make sure that
6072	 * the config object gets updated with the correct version.
6073	 */
6074	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
6075		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6076		    spa->spa_uberblock.ub_version);
6077
6078	spa_config_exit(spa, SCL_STATE, FTAG);
6079
6080	if (spa->spa_config_syncing)
6081		nvlist_free(spa->spa_config_syncing);
6082	spa->spa_config_syncing = config;
6083
6084	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6085}
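
/*
 * Editor's note (not part of the original source): the nvlist stashed in
 * spa_config_syncing above is not made visible to the config cache right
 * away; spa_sync() only calls spa_config_set() on it after the labels
 * and uberblock for this txg have been written by vdev_config_sync().
 */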
6086
6087static void
6088spa_sync_version(void *arg, dmu_tx_t *tx)
6089{
6090	uint64_t *versionp = arg;
6091	uint64_t version = *versionp;
6092	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6093
6094	/*
6095	 * Setting the version is special cased when first creating the pool.
6096	 */
6097	ASSERT(tx->tx_txg != TXG_INITIAL);
6098
6099	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6100	ASSERT(version >= spa_version(spa));
6101
6102	spa->spa_uberblock.ub_version = version;
6103	vdev_config_dirty(spa->spa_root_vdev);
6104	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6105}
6106
6107/*
6108 * Set zpool properties.
6109 */
6110static void
6111spa_sync_props(void *arg, dmu_tx_t *tx)
6112{
6113	nvlist_t *nvp = arg;
6114	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6115	objset_t *mos = spa->spa_meta_objset;
6116	nvpair_t *elem = NULL;
6117
6118	mutex_enter(&spa->spa_props_lock);
6119
6120	while ((elem = nvlist_next_nvpair(nvp, elem))) {
6121		uint64_t intval;
6122		char *strval, *fname;
6123		zpool_prop_t prop;
6124		const char *propname;
6125		zprop_type_t proptype;
6126		zfeature_info_t *feature;
6127
6128		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
6129		case ZPROP_INVAL:
6130			/*
6131			 * We checked this earlier in spa_prop_validate().
6132			 */
6133			ASSERT(zpool_prop_feature(nvpair_name(elem)));
6134
6135			fname = strchr(nvpair_name(elem), '@') + 1;
6136			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
6137
6138			spa_feature_enable(spa, feature, tx);
6139			spa_history_log_internal(spa, "set", tx,
6140			    "%s=enabled", nvpair_name(elem));
6141			break;
6142
6143		case ZPOOL_PROP_VERSION:
6144			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
6145			/*
6146			 * The version is synced separately before other
6147			 * properties and should be correct by now.
6148			 */
6149			ASSERT3U(spa_version(spa), >=, intval);
6150			break;
6151
6152		case ZPOOL_PROP_ALTROOT:
6153			/*
6154			 * 'altroot' is a non-persistent property. It should
6155			 * have been set temporarily at creation or import time.
6156			 */
6157			ASSERT(spa->spa_root != NULL);
6158			break;
6159
6160		case ZPOOL_PROP_READONLY:
6161		case ZPOOL_PROP_CACHEFILE:
6162			/*
6163			 * 'readonly' and 'cachefile' are also non-persistent
6164			 * properties.
6165			 */
6166			break;
6167		case ZPOOL_PROP_COMMENT:
6168			VERIFY(nvpair_value_string(elem, &strval) == 0);
6169			if (spa->spa_comment != NULL)
6170				spa_strfree(spa->spa_comment);
6171			spa->spa_comment = spa_strdup(strval);
6172			/*
6173			 * We need to dirty the configuration on all the vdevs
6174			 * so that their labels get updated.  It's unnecessary
6175			 * to do this for pool creation since the vdev's
6176			 * configuration has already been dirtied.
6177			 */
6178			if (tx->tx_txg != TXG_INITIAL)
6179				vdev_config_dirty(spa->spa_root_vdev);
6180			spa_history_log_internal(spa, "set", tx,
6181			    "%s=%s", nvpair_name(elem), strval);
6182			break;
6183		default:
6184			/*
6185			 * Set pool property values in the poolprops mos object.
6186			 */
6187			if (spa->spa_pool_props_object == 0) {
6188				spa->spa_pool_props_object =
6189				    zap_create_link(mos, DMU_OT_POOL_PROPS,
6190				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6191				    tx);
6192			}
6193
6194			/* normalize the property name */
6195			propname = zpool_prop_to_name(prop);
6196			proptype = zpool_prop_get_type(prop);
6197
6198			if (nvpair_type(elem) == DATA_TYPE_STRING) {
6199				ASSERT(proptype == PROP_TYPE_STRING);
6200				VERIFY(nvpair_value_string(elem, &strval) == 0);
6201				VERIFY(zap_update(mos,
6202				    spa->spa_pool_props_object, propname,
6203				    1, strlen(strval) + 1, strval, tx) == 0);
6204				spa_history_log_internal(spa, "set", tx,
6205				    "%s=%s", nvpair_name(elem), strval);
6206			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6207				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
6208
6209				if (proptype == PROP_TYPE_INDEX) {
6210					const char *unused;
6211					VERIFY(zpool_prop_index_to_string(
6212					    prop, intval, &unused) == 0);
6213				}
6214				VERIFY(zap_update(mos,
6215				    spa->spa_pool_props_object, propname,
6216				    8, 1, &intval, tx) == 0);
6217				spa_history_log_internal(spa, "set", tx,
6218				    "%s=%lld", nvpair_name(elem), intval);
6219			} else {
6220				ASSERT(0); /* not allowed */
6221			}
6222
6223			switch (prop) {
6224			case ZPOOL_PROP_DELEGATION:
6225				spa->spa_delegation = intval;
6226				break;
6227			case ZPOOL_PROP_BOOTFS:
6228				spa->spa_bootfs = intval;
6229				break;
6230			case ZPOOL_PROP_FAILUREMODE:
6231				spa->spa_failmode = intval;
6232				break;
6233			case ZPOOL_PROP_AUTOEXPAND:
6234				spa->spa_autoexpand = intval;
6235				if (tx->tx_txg != TXG_INITIAL)
6236					spa_async_request(spa,
6237					    SPA_ASYNC_AUTOEXPAND);
6238				break;
6239			case ZPOOL_PROP_DEDUPDITTO:
6240				spa->spa_dedup_ditto = intval;
6241				break;
6242			default:
6243				break;
6244			}
6245		}
6246
6247	}
6248
6249	mutex_exit(&spa->spa_props_lock);
6250}
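
/*
 * Editor's note (usage sketch, not part of the original source; the
 * exact arguments are assumptions): spa_sync_props() runs as a sync
 * task registered from open context by spa_prop_set(), roughly:
 *
 *	error = dsl_sync_task(spa_name(spa), NULL, spa_sync_props, nvp, 6);
 *
 * so it always executes in syncing context with a valid tx, after the
 * properties in nvp have been checked by spa_prop_validate().
 */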
6251
6252/*
6253 * Perform one-time upgrade on-disk changes.  spa_version() does not
6254 * reflect the new version this txg, so there must be no changes this
6255 * txg to anything that the upgrade code depends on after it executes.
6256 * Therefore this must be called after dsl_pool_sync() does the sync
6257 * tasks.
6258 */
6259static void
6260spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6261{
6262	dsl_pool_t *dp = spa->spa_dsl_pool;
6263
6264	ASSERT(spa->spa_sync_pass == 1);
6265
6266	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6267
6268	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6269	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6270		dsl_pool_create_origin(dp, tx);
6271
6272		/* Keeping the origin open increases spa_minref */
6273		spa->spa_minref += 3;
6274	}
6275
6276	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6277	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6278		dsl_pool_upgrade_clones(dp, tx);
6279	}
6280
6281	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6282	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6283		dsl_pool_upgrade_dir_clones(dp, tx);
6284
6285		/* Keeping the freedir open increases spa_minref */
6286		spa->spa_minref += 3;
6287	}
6288
6289	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6290	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6291		spa_feature_create_zap_objects(spa, tx);
6292	}
6293	rrw_exit(&dp->dp_config_rwlock, FTAG);
6294}
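
/*
 * Editor's note (sketch, not part of the original source): each upgrade
 * hook above is gated by the same "crossing the version boundary this
 * txg" test:
 *
 *	if (spa->spa_ubsync.ub_version < SPA_VERSION_X &&
 *	    spa->spa_uberblock.ub_version >= SPA_VERSION_X)
 *		... one-time upgrade work ...
 *
 * spa_ubsync still holds the last-synced version while spa_uberblock
 * already carries the new one, so each body runs exactly once, in the
 * txg that performs that upgrade.
 */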
6295
6296/*
6297 * Sync the specified transaction group.  New blocks may be dirtied as
6298 * part of the process, so we iterate until it converges.
6299 */
6300void
6301spa_sync(spa_t *spa, uint64_t txg)
6302{
6303	dsl_pool_t *dp = spa->spa_dsl_pool;
6304	objset_t *mos = spa->spa_meta_objset;
6305	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6306	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6307	vdev_t *rvd = spa->spa_root_vdev;
6308	vdev_t *vd;
6309	dmu_tx_t *tx;
6310	int error;
6311
6312	VERIFY(spa_writeable(spa));
6313
6314	/*
6315	 * Lock out configuration changes.
6316	 */
6317	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6318
6319	spa->spa_syncing_txg = txg;
6320	spa->spa_sync_pass = 0;
6321
6322	/*
6323	 * If there are any pending vdev state changes, convert them
6324	 * into config changes that go out with this transaction group.
6325	 */
6326	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6327	while (list_head(&spa->spa_state_dirty_list) != NULL) {
6328		/*
6329		 * We need the write lock here because, for aux vdevs,
6330		 * calling vdev_config_dirty() modifies sav_config.
6331		 * This is ugly and will become unnecessary when we
6332		 * eliminate the aux vdev wart by integrating all vdevs
6333		 * into the root vdev tree.
6334		 */
6335		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6336		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6337		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6338			vdev_state_clean(vd);
6339			vdev_config_dirty(vd);
6340		}
6341		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6342		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6343	}
6344	spa_config_exit(spa, SCL_STATE, FTAG);
6345
6346	tx = dmu_tx_create_assigned(dp, txg);
6347
6348	spa->spa_sync_starttime = gethrtime();
6349#ifdef illumos
6350	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6351	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
6352#else	/* FreeBSD */
6353#ifdef _KERNEL
6354	callout_reset(&spa->spa_deadman_cycid,
6355	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
6356#endif
6357#endif
6358
6359	/*
6360	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6361	 * set spa_deflate if we have no raid-z vdevs.
6362	 */
6363	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6364	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6365		int i;
6366
6367		for (i = 0; i < rvd->vdev_children; i++) {
6368			vd = rvd->vdev_child[i];
6369			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6370				break;
6371		}
6372		if (i == rvd->vdev_children) {
6373			spa->spa_deflate = TRUE;
6374			VERIFY(0 == zap_add(spa->spa_meta_objset,
6375			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6376			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6377		}
6378	}
6379
6380	/*
6381	 * If anything has changed in this txg, or if someone is waiting
6382	 * for this txg to sync (eg, spa_vdev_remove()), push the
6383	 * deferred frees from the previous txg.  If not, leave them
6384	 * alone so that we don't generate work on an otherwise idle
6385	 * system.
6386	 */
6387	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6388	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6389	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6390	    ((dsl_scan_active(dp->dp_scan) ||
6391	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6392		zio_t *zio = zio_root(spa, NULL, NULL, 0);
6393		VERIFY3U(bpobj_iterate(defer_bpo,
6394		    spa_free_sync_cb, zio, tx), ==, 0);
6395		VERIFY0(zio_wait(zio));
6396	}
6397
6398	/*
6399	 * Iterate to convergence.
6400	 */
6401	do {
6402		int pass = ++spa->spa_sync_pass;
6403
6404		spa_sync_config_object(spa, tx);
6405		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6406		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6407		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6408		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6409		spa_errlog_sync(spa, txg);
6410		dsl_pool_sync(dp, txg);
6411
6412		if (pass < zfs_sync_pass_deferred_free) {
6413			zio_t *zio = zio_root(spa, NULL, NULL, 0);
6414			bplist_iterate(free_bpl, spa_free_sync_cb,
6415			    zio, tx);
6416			VERIFY(zio_wait(zio) == 0);
6417		} else {
6418			bplist_iterate(free_bpl, bpobj_enqueue_cb,
6419			    defer_bpo, tx);
6420		}
6421
6422		ddt_sync(spa, txg);
6423		dsl_scan_sync(dp, tx);
6424
6425		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6426			vdev_sync(vd, txg);
6427
6428		if (pass == 1)
6429			spa_sync_upgrades(spa, tx);
6430
6431	} while (dmu_objset_is_dirty(mos, txg));
6432
6433	/*
6434	 * Rewrite the vdev configuration (which includes the uberblock)
6435	 * to commit the transaction group.
6436	 *
6437	 * If there are no dirty vdevs, we sync the uberblock to a few
6438	 * random top-level vdevs that are known to be visible in the
6439	 * config cache (see spa_vdev_add() for a complete description).
6440	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6441	 */
6442	for (;;) {
6443		/*
6444		 * We hold SCL_STATE to prevent vdev open/close/etc.
6445		 * while we're attempting to write the vdev labels.
6446		 */
6447		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6448
6449		if (list_is_empty(&spa->spa_config_dirty_list)) {
6450			vdev_t *svd[SPA_DVAS_PER_BP];
6451			int svdcount = 0;
6452			int children = rvd->vdev_children;
6453			int c0 = spa_get_random(children);
6454
6455			for (int c = 0; c < children; c++) {
6456				vd = rvd->vdev_child[(c0 + c) % children];
6457				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6458					continue;
6459				svd[svdcount++] = vd;
6460				if (svdcount == SPA_DVAS_PER_BP)
6461					break;
6462			}
6463			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6464			if (error != 0)
6465				error = vdev_config_sync(svd, svdcount, txg,
6466				    B_TRUE);
6467		} else {
6468			error = vdev_config_sync(rvd->vdev_child,
6469			    rvd->vdev_children, txg, B_FALSE);
6470			if (error != 0)
6471				error = vdev_config_sync(rvd->vdev_child,
6472				    rvd->vdev_children, txg, B_TRUE);
6473		}
6474
6475		if (error == 0)
6476			spa->spa_last_synced_guid = rvd->vdev_guid;
6477
6478		spa_config_exit(spa, SCL_STATE, FTAG);
6479
6480		if (error == 0)
6481			break;
6482		zio_suspend(spa, NULL);
6483		zio_resume_wait(spa);
6484	}
6485	dmu_tx_commit(tx);
6486
6487#ifdef illumos
6488	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
6489#else	/* FreeBSD */
6490#ifdef _KERNEL
6491	callout_drain(&spa->spa_deadman_cycid);
6492#endif
6493#endif
6494
6495	/*
6496	 * Clear the dirty config list.
6497	 */
6498	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6499		vdev_config_clean(vd);
6500
6501	/*
6502	 * Now that the new config has synced transactionally,
6503	 * let it become visible to the config cache.
6504	 */
6505	if (spa->spa_config_syncing != NULL) {
6506		spa_config_set(spa, spa->spa_config_syncing);
6507		spa->spa_config_txg = txg;
6508		spa->spa_config_syncing = NULL;
6509	}
6510
6511	spa->spa_ubsync = spa->spa_uberblock;
6512
6513	dsl_pool_sync_done(dp, txg);
6514
6515	/*
6516	 * Update usable space statistics.
6517	 */
6518	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
6519		vdev_sync_done(vd, txg);
6520
6521	spa_update_dspace(spa);
6522
6523	/*
6524	 * It had better be the case that we didn't dirty anything
6525	 * since vdev_config_sync().
6526	 */
6527	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6528	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6529	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6530
6531	spa->spa_sync_pass = 0;
6532
6533	spa_config_exit(spa, SCL_CONFIG, FTAG);
6534
6535	spa_handle_ignored_writes(spa);
6536
6537	/*
6538	 * If any async tasks have been requested, kick them off.
6539	 */
6540	spa_async_dispatch(spa);
6541	spa_async_dispatch_vd(spa);
6542}
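
/*
 * Editor's note (not part of the original source): spa_sync() is driven
 * by the per-pool txg sync thread (txg_sync_thread() in txg.c) once per
 * transaction group rather than being called directly by administrative
 * code.  Callers that need their changes on stable storage simply wait
 * for the txg to sync, e.g.:
 *
 *	txg_wait_synced(spa_get_dsl(spa), 0);
 *
 * as spa_sync_allpools() and spa_upgrade() below do.
 */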
6543
6544/*
6545 * Sync all pools.  We don't want to hold the namespace lock across these
6546 * operations, so we take a reference on the spa_t and drop the lock during the
6547 * sync.
6548 */
6549void
6550spa_sync_allpools(void)
6551{
6552	spa_t *spa = NULL;
6553	mutex_enter(&spa_namespace_lock);
6554	while ((spa = spa_next(spa)) != NULL) {
6555		if (spa_state(spa) != POOL_STATE_ACTIVE ||
6556		    !spa_writeable(spa) || spa_suspended(spa))
6557			continue;
6558		spa_open_ref(spa, FTAG);
6559		mutex_exit(&spa_namespace_lock);
6560		txg_wait_synced(spa_get_dsl(spa), 0);
6561		mutex_enter(&spa_namespace_lock);
6562		spa_close(spa, FTAG);
6563	}
6564	mutex_exit(&spa_namespace_lock);
6565}
6566
6567/*
6568 * ==========================================================================
6569 * Miscellaneous routines
6570 * ==========================================================================
6571 */
6572
6573/*
6574 * Remove all pools in the system.
6575 */
6576void
6577spa_evict_all(void)
6578{
6579	spa_t *spa;
6580
6581	/*
6582	 * Remove all cached state.  All pools should be closed now,
6583	 * so every spa in the AVL tree should be unreferenced.
6584	 */
6585	mutex_enter(&spa_namespace_lock);
6586	while ((spa = spa_next(NULL)) != NULL) {
6587		/*
6588		 * Stop async tasks.  The async thread may need to detach
6589		 * a device that's been replaced, which requires grabbing
6590		 * spa_namespace_lock, so we must drop it here.
6591		 */
6592		spa_open_ref(spa, FTAG);
6593		mutex_exit(&spa_namespace_lock);
6594		spa_async_suspend(spa);
6595		mutex_enter(&spa_namespace_lock);
6596		spa_close(spa, FTAG);
6597
6598		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6599			spa_unload(spa);
6600			spa_deactivate(spa);
6601		}
6602		spa_remove(spa);
6603	}
6604	mutex_exit(&spa_namespace_lock);
6605}
6606
6607vdev_t *
6608spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6609{
6610	vdev_t *vd;
6611	int i;
6612
6613	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6614		return (vd);
6615
6616	if (aux) {
6617		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6618			vd = spa->spa_l2cache.sav_vdevs[i];
6619			if (vd->vdev_guid == guid)
6620				return (vd);
6621		}
6622
6623		for (i = 0; i < spa->spa_spares.sav_count; i++) {
6624			vd = spa->spa_spares.sav_vdevs[i];
6625			if (vd->vdev_guid == guid)
6626				return (vd);
6627		}
6628	}
6629
6630	return (NULL);
6631}
6632
6633void
6634spa_upgrade(spa_t *spa, uint64_t version)
6635{
6636	ASSERT(spa_writeable(spa));
6637
6638	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6639
6640	/*
6641	 * This should only be called for a non-faulted pool, and since a
6642	 * future version would result in an unopenable pool, this shouldn't be
6643	 * possible.
6644	 */
6645	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6646	ASSERT(version >= spa->spa_uberblock.ub_version);
6647
6648	spa->spa_uberblock.ub_version = version;
6649	vdev_config_dirty(spa->spa_root_vdev);
6650
6651	spa_config_exit(spa, SCL_ALL, FTAG);
6652
6653	txg_wait_synced(spa_get_dsl(spa), 0);
6654}
6655
6656boolean_t
6657spa_has_spare(spa_t *spa, uint64_t guid)
6658{
6659	int i;
6660	uint64_t spareguid;
6661	spa_aux_vdev_t *sav = &spa->spa_spares;
6662
6663	for (i = 0; i < sav->sav_count; i++)
6664		if (sav->sav_vdevs[i]->vdev_guid == guid)
6665			return (B_TRUE);
6666
6667	for (i = 0; i < sav->sav_npending; i++) {
6668		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6669		    &spareguid) == 0 && spareguid == guid)
6670			return (B_TRUE);
6671	}
6672
6673	return (B_FALSE);
6674}
6675
6676/*
6677 * Check if a pool has an active shared spare device.
6678 * Note: the reference count of an active spare is 2: as a spare and as a replacement.
6679 */
6680static boolean_t
6681spa_has_active_shared_spare(spa_t *spa)
6682{
6683	int i, refcnt;
6684	uint64_t pool;
6685	spa_aux_vdev_t *sav = &spa->spa_spares;
6686
6687	for (i = 0; i < sav->sav_count; i++) {
6688		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6689		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6690		    refcnt > 2)
6691			return (B_TRUE);
6692	}
6693
6694	return (B_FALSE);
6695}
6696
6697/*
6698 * Post a sysevent corresponding to the given event.  The 'name' must be one of
6699 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
6700 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
6701 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6702 * or zdb as real changes.
6703 */
6704void
6705spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6706{
6707#ifdef _KERNEL
6708	sysevent_t		*ev;
6709	sysevent_attr_list_t	*attr = NULL;
6710	sysevent_value_t	value;
6711	sysevent_id_t		eid;
6712
6713	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6714	    SE_SLEEP);
6715
6716	value.value_type = SE_DATA_TYPE_STRING;
6717	value.value.sv_string = spa_name(spa);
6718	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6719		goto done;
6720
6721	value.value_type = SE_DATA_TYPE_UINT64;
6722	value.value.sv_uint64 = spa_guid(spa);
6723	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6724		goto done;
6725
6726	if (vd) {
6727		value.value_type = SE_DATA_TYPE_UINT64;
6728		value.value.sv_uint64 = vd->vdev_guid;
6729		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6730		    SE_SLEEP) != 0)
6731			goto done;
6732
6733		if (vd->vdev_path) {
6734			value.value_type = SE_DATA_TYPE_STRING;
6735			value.value.sv_string = vd->vdev_path;
6736			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6737			    &value, SE_SLEEP) != 0)
6738				goto done;
6739		}
6740	}
6741
6742	if (sysevent_attach_attributes(ev, attr) != 0)
6743		goto done;
6744	attr = NULL;
6745
6746	(void) log_sysevent(ev, SE_SLEEP, &eid);
6747
6748done:
6749	if (attr)
6750		sysevent_free_attr(attr);
6751	sysevent_free(ev);
6752#endif
6753}
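
/*
 * Editor's note (usage sketch, not part of the original source): callers
 * post events using the ESC_ZFS_* names from sys/sysevent/eventdefs.h,
 * for example:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 *
 * The vdev argument may be NULL for pool-wide events, in which case only
 * the pool name and guid are attached to the payload.
 */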
6754