spa.c (251631) → spa.c (251636)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
27 */
28
29/*
30 * SPA: Storage Pool Allocator
31 *
32 * This file contains all the routines used when modifying on-disk SPA state.
33 * This includes opening, importing, destroying, exporting a pool, and syncing a
34 * pool.
35 */
36
37#include <sys/zfs_context.h>
38#include <sys/fm/fs/zfs.h>
39#include <sys/spa_impl.h>
40#include <sys/zio.h>
41#include <sys/zio_checksum.h>
42#include <sys/dmu.h>
43#include <sys/dmu_tx.h>
44#include <sys/zap.h>
45#include <sys/zil.h>
46#include <sys/ddt.h>
47#include <sys/vdev_impl.h>
48#include <sys/metaslab.h>
49#include <sys/metaslab_impl.h>
50#include <sys/uberblock_impl.h>
51#include <sys/txg.h>
52#include <sys/avl.h>
53#include <sys/dmu_traverse.h>
54#include <sys/dmu_objset.h>
55#include <sys/unique.h>
56#include <sys/dsl_pool.h>
57#include <sys/dsl_dataset.h>
58#include <sys/dsl_dir.h>
59#include <sys/dsl_prop.h>
60#include <sys/dsl_synctask.h>
61#include <sys/fs/zfs.h>
62#include <sys/arc.h>
63#include <sys/callb.h>
64#include <sys/spa_boot.h>
65#include <sys/zfs_ioctl.h>
66#include <sys/dsl_scan.h>
67#include <sys/dmu_send.h>
68#include <sys/dsl_destroy.h>
69#include <sys/dsl_userhold.h>
70#include <sys/zfeature.h>
71#include <sys/zvol.h>
72#include <sys/trim_map.h>
73
74#ifdef _KERNEL
75#include <sys/callb.h>
76#include <sys/cpupart.h>
77#include <sys/zone.h>
78#endif /* _KERNEL */
79
80#include "zfs_prop.h"
81#include "zfs_comutil.h"
82
83/* Check hostid on import? */
84static int check_hostid = 1;
85
86SYSCTL_DECL(_vfs_zfs);
87TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
88SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
89 "Check hostid on import?");
90
91/*
92 * The interval, in seconds, at which failed configuration cache file writes
93 * should be retried.
94 */
95static int zfs_ccw_retry_interval = 300;
96
91typedef enum zti_modes {
92 zti_mode_fixed, /* value is # of threads (min 1) */
93 zti_mode_online_percent, /* value is % of online CPUs */
94 zti_mode_batch, /* cpu-intensive; value is ignored */
95 zti_mode_null, /* don't create a taskq */
96 zti_nmodes
97} zti_modes_t;
98
99#define ZTI_FIX(n) { zti_mode_fixed, (n) }
100#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
101#define ZTI_BATCH { zti_mode_batch, 0 }
102#define ZTI_NULL { zti_mode_null, 0 }
103
104#define ZTI_ONE ZTI_FIX(1)
105
106typedef struct zio_taskq_info {
107 enum zti_modes zti_mode;
108 uint_t zti_value;
109} zio_taskq_info_t;
110
111static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
112 "issue", "issue_high", "intr", "intr_high"
113};
114
115/*
116 * Define the taskq threads for the following I/O types:
117 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
118 */
119const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
120 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
121 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
122 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
123 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
124 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
125 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
126 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
127};
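/*
 * Reading the table above: rows are indexed by zio_type_t in the order
 * given in the comment (NULL, READ, WRITE, FREE, CLAIM, IOCTL) and
 * columns by the zio_taskq_types strings. For example, READ gets eight
 * fixed "issue" threads and a batch "intr" taskq, while WRITE issues
 * through a batch taskq (sized by zio_taskq_batch_pct) plus five
 * high-priority issue threads.
 */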
128
129static void spa_sync_version(void *arg, dmu_tx_t *tx);
130static void spa_sync_props(void *arg, dmu_tx_t *tx);
131static boolean_t spa_has_active_shared_spare(spa_t *spa);
132static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
133 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
134 char **ereport);
135static void spa_vdev_resilver_done(spa_t *spa);
136
137uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
138#ifdef PSRSET_BIND
139id_t zio_taskq_psrset_bind = PS_NONE;
140#endif
141#ifdef SYSDC
142boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
143#endif
144uint_t zio_taskq_basedc = 80; /* base duty cycle */
145
146boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
147extern int zfs_sync_pass_deferred_free;
148
149#ifndef illumos
150extern void spa_deadman(void *arg);
151#endif
152
153/*
154 * This (illegal) pool name is used when temporarily importing a spa_t in order
155 * to get the vdev stats associated with the imported devices.
156 */
157#define TRYIMPORT_NAME "$import"
158
159/*
160 * ==========================================================================
161 * SPA properties routines
162 * ==========================================================================
163 */
164
165/*
166 * Add a (source=src, propname=propval) list to an nvlist.
167 */
168static void
169spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
170 uint64_t intval, zprop_source_t src)
171{
172 const char *propname = zpool_prop_to_name(prop);
173 nvlist_t *propval;
174
175 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
176 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
177
178 if (strval != NULL)
179 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
180 else
181 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
182
183 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
184 nvlist_free(propval);
185}
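/*
 * Each call adds a nested nvlist of the form
 * propname -> { ZPROP_SOURCE = src, ZPROP_VALUE = strval or intval }
 * to nvl, so string and numeric properties share one representation.
 */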
186
187/*
188 * Get property values from the spa configuration.
189 */
190static void
191spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
192{
193 vdev_t *rvd = spa->spa_root_vdev;
194 dsl_pool_t *pool = spa->spa_dsl_pool;
195 uint64_t size;
196 uint64_t alloc;
197 uint64_t space;
198 uint64_t cap, version;
199 zprop_source_t src = ZPROP_SRC_NONE;
200 spa_config_dirent_t *dp;
201
202 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
203
204 if (rvd != NULL) {
205 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
206 size = metaslab_class_get_space(spa_normal_class(spa));
207 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
208 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
209 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
210 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
211 size - alloc, src);
212
213 space = 0;
214 for (int c = 0; c < rvd->vdev_children; c++) {
215 vdev_t *tvd = rvd->vdev_child[c];
216 space += tvd->vdev_max_asize - tvd->vdev_asize;
217 }
218 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
219 src);
220
221 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
222 (spa_mode(spa) == FREAD), src);
223
224 cap = (size == 0) ? 0 : (alloc * 100 / size);
225 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
226
227 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
228 ddt_get_pool_dedup_ratio(spa), src);
229
230 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
231 rvd->vdev_state, src);
232
233 version = spa_version(spa);
234 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
235 src = ZPROP_SRC_DEFAULT;
236 else
237 src = ZPROP_SRC_LOCAL;
238 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
239 }
240
241 if (pool != NULL) {
242 dsl_dir_t *freedir = pool->dp_free_dir;
243
244 /*
 245 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
 246 * when opening pools created before this version, freedir will be NULL.
247 */
248 if (freedir != NULL) {
249 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
250 freedir->dd_phys->dd_used_bytes, src);
251 } else {
252 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
253 NULL, 0, src);
254 }
255 }
256
257 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
258
259 if (spa->spa_comment != NULL) {
260 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
261 0, ZPROP_SRC_LOCAL);
262 }
263
264 if (spa->spa_root != NULL)
265 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
266 0, ZPROP_SRC_LOCAL);
267
268 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
269 if (dp->scd_path == NULL) {
270 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
271 "none", 0, ZPROP_SRC_LOCAL);
272 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
273 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
274 dp->scd_path, 0, ZPROP_SRC_LOCAL);
275 }
276 }
277}
278
279/*
280 * Get zpool property values.
281 */
282int
283spa_prop_get(spa_t *spa, nvlist_t **nvp)
284{
285 objset_t *mos = spa->spa_meta_objset;
286 zap_cursor_t zc;
287 zap_attribute_t za;
288 int err;
289
290 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
291
292 mutex_enter(&spa->spa_props_lock);
293
294 /*
295 * Get properties from the spa config.
296 */
297 spa_prop_get_config(spa, nvp);
298
 299 /* If there is no pool property object, there are no more props to get. */
300 if (mos == NULL || spa->spa_pool_props_object == 0) {
301 mutex_exit(&spa->spa_props_lock);
302 return (0);
303 }
304
305 /*
306 * Get properties from the MOS pool property object.
307 */
308 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
309 (err = zap_cursor_retrieve(&zc, &za)) == 0;
310 zap_cursor_advance(&zc)) {
311 uint64_t intval = 0;
312 char *strval = NULL;
313 zprop_source_t src = ZPROP_SRC_DEFAULT;
314 zpool_prop_t prop;
315
316 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
317 continue;
318
319 switch (za.za_integer_length) {
320 case 8:
321 /* integer property */
322 if (za.za_first_integer !=
323 zpool_prop_default_numeric(prop))
324 src = ZPROP_SRC_LOCAL;
325
326 if (prop == ZPOOL_PROP_BOOTFS) {
327 dsl_pool_t *dp;
328 dsl_dataset_t *ds = NULL;
329
330 dp = spa_get_dsl(spa);
331 dsl_pool_config_enter(dp, FTAG);
332 if (err = dsl_dataset_hold_obj(dp,
333 za.za_first_integer, FTAG, &ds)) {
334 dsl_pool_config_exit(dp, FTAG);
335 break;
336 }
337
338 strval = kmem_alloc(
339 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
340 KM_SLEEP);
341 dsl_dataset_name(ds, strval);
342 dsl_dataset_rele(ds, FTAG);
343 dsl_pool_config_exit(dp, FTAG);
344 } else {
345 strval = NULL;
346 intval = za.za_first_integer;
347 }
348
349 spa_prop_add_list(*nvp, prop, strval, intval, src);
350
351 if (strval != NULL)
352 kmem_free(strval,
353 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
354
355 break;
356
357 case 1:
358 /* string property */
359 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
360 err = zap_lookup(mos, spa->spa_pool_props_object,
361 za.za_name, 1, za.za_num_integers, strval);
362 if (err) {
363 kmem_free(strval, za.za_num_integers);
364 break;
365 }
366 spa_prop_add_list(*nvp, prop, strval, 0, src);
367 kmem_free(strval, za.za_num_integers);
368 break;
369
370 default:
371 break;
372 }
373 }
374 zap_cursor_fini(&zc);
375 mutex_exit(&spa->spa_props_lock);
376out:
377 if (err && err != ENOENT) {
378 nvlist_free(*nvp);
379 *nvp = NULL;
380 return (err);
381 }
382
383 return (0);
384}
385
386/*
387 * Validate the given pool properties nvlist and modify the list
388 * for the property values to be set.
389 */
390static int
391spa_prop_validate(spa_t *spa, nvlist_t *props)
392{
393 nvpair_t *elem;
394 int error = 0, reset_bootfs = 0;
395 uint64_t objnum = 0;
396 boolean_t has_feature = B_FALSE;
397
398 elem = NULL;
399 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
400 uint64_t intval;
401 char *strval, *slash, *check, *fname;
402 const char *propname = nvpair_name(elem);
403 zpool_prop_t prop = zpool_name_to_prop(propname);
404
405 switch (prop) {
406 case ZPROP_INVAL:
407 if (!zpool_prop_feature(propname)) {
408 error = SET_ERROR(EINVAL);
409 break;
410 }
411
412 /*
413 * Sanitize the input.
414 */
415 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
416 error = SET_ERROR(EINVAL);
417 break;
418 }
419
420 if (nvpair_value_uint64(elem, &intval) != 0) {
421 error = SET_ERROR(EINVAL);
422 break;
423 }
424
425 if (intval != 0) {
426 error = SET_ERROR(EINVAL);
427 break;
428 }
429
430 fname = strchr(propname, '@') + 1;
431 if (zfeature_lookup_name(fname, NULL) != 0) {
432 error = SET_ERROR(EINVAL);
433 break;
434 }
435
436 has_feature = B_TRUE;
437 break;
438
439 case ZPOOL_PROP_VERSION:
440 error = nvpair_value_uint64(elem, &intval);
441 if (!error &&
442 (intval < spa_version(spa) ||
443 intval > SPA_VERSION_BEFORE_FEATURES ||
444 has_feature))
445 error = SET_ERROR(EINVAL);
446 break;
447
448 case ZPOOL_PROP_DELEGATION:
449 case ZPOOL_PROP_AUTOREPLACE:
450 case ZPOOL_PROP_LISTSNAPS:
451 case ZPOOL_PROP_AUTOEXPAND:
452 error = nvpair_value_uint64(elem, &intval);
453 if (!error && intval > 1)
454 error = SET_ERROR(EINVAL);
455 break;
456
457 case ZPOOL_PROP_BOOTFS:
458 /*
459 * If the pool version is less than SPA_VERSION_BOOTFS,
460 * or the pool is still being created (version == 0),
461 * the bootfs property cannot be set.
462 */
463 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
464 error = SET_ERROR(ENOTSUP);
465 break;
466 }
467
468 /*
469 * Make sure the vdev config is bootable
470 */
471 if (!vdev_is_bootable(spa->spa_root_vdev)) {
472 error = SET_ERROR(ENOTSUP);
473 break;
474 }
475
476 reset_bootfs = 1;
477
478 error = nvpair_value_string(elem, &strval);
479
480 if (!error) {
481 objset_t *os;
482 uint64_t compress;
483
484 if (strval == NULL || strval[0] == '\0') {
485 objnum = zpool_prop_default_numeric(
486 ZPOOL_PROP_BOOTFS);
487 break;
488 }
489
490 if (error = dmu_objset_hold(strval, FTAG, &os))
491 break;
492
493 /* Must be ZPL and not gzip compressed. */
494
495 if (dmu_objset_type(os) != DMU_OST_ZFS) {
496 error = SET_ERROR(ENOTSUP);
497 } else if ((error =
498 dsl_prop_get_int_ds(dmu_objset_ds(os),
499 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
500 &compress)) == 0 &&
501 !BOOTFS_COMPRESS_VALID(compress)) {
502 error = SET_ERROR(ENOTSUP);
503 } else {
504 objnum = dmu_objset_id(os);
505 }
506 dmu_objset_rele(os, FTAG);
507 }
508 break;
509
510 case ZPOOL_PROP_FAILUREMODE:
511 error = nvpair_value_uint64(elem, &intval);
512 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
513 intval > ZIO_FAILURE_MODE_PANIC))
514 error = SET_ERROR(EINVAL);
515
516 /*
517 * This is a special case which only occurs when
518 * the pool has completely failed. This allows
519 * the user to change the in-core failmode property
520 * without syncing it out to disk (I/Os might
521 * currently be blocked). We do this by returning
522 * EIO to the caller (spa_prop_set) to trick it
523 * into thinking we encountered a property validation
524 * error.
525 */
526 if (!error && spa_suspended(spa)) {
527 spa->spa_failmode = intval;
528 error = SET_ERROR(EIO);
529 }
530 break;
531
532 case ZPOOL_PROP_CACHEFILE:
533 if ((error = nvpair_value_string(elem, &strval)) != 0)
534 break;
535
536 if (strval[0] == '\0')
537 break;
538
539 if (strcmp(strval, "none") == 0)
540 break;
541
542 if (strval[0] != '/') {
543 error = SET_ERROR(EINVAL);
544 break;
545 }
546
547 slash = strrchr(strval, '/');
548 ASSERT(slash != NULL);
549
550 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
551 strcmp(slash, "/..") == 0)
552 error = SET_ERROR(EINVAL);
553 break;
554
555 case ZPOOL_PROP_COMMENT:
556 if ((error = nvpair_value_string(elem, &strval)) != 0)
557 break;
558 for (check = strval; *check != '\0'; check++) {
559 /*
560 * The kernel doesn't have an easy isprint()
561 * check. For this kernel check, we merely
562 * check ASCII apart from DEL. Fix this if
563 * there is an easy-to-use kernel isprint().
564 */
565 if (*check >= 0x7f) {
566 error = SET_ERROR(EINVAL);
567 break;
568 }
 569
570 }
571 if (strlen(strval) > ZPROP_MAX_COMMENT)
572 error = E2BIG;
573 break;
574
575 case ZPOOL_PROP_DEDUPDITTO:
576 if (spa_version(spa) < SPA_VERSION_DEDUP)
577 error = SET_ERROR(ENOTSUP);
578 else
579 error = nvpair_value_uint64(elem, &intval);
580 if (error == 0 &&
581 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
582 error = SET_ERROR(EINVAL);
583 break;
584 }
585
586 if (error)
587 break;
588 }
589
590 if (!error && reset_bootfs) {
591 error = nvlist_remove(props,
592 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
593
594 if (!error) {
595 error = nvlist_add_uint64(props,
596 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
597 }
598 }
599
600 return (error);
601}
602
603void
604spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
605{
606 char *cachefile;
607 spa_config_dirent_t *dp;
608
609 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
610 &cachefile) != 0)
611 return;
612
613 dp = kmem_alloc(sizeof (spa_config_dirent_t),
614 KM_SLEEP);
615
616 if (cachefile[0] == '\0')
617 dp->scd_path = spa_strdup(spa_config_path);
618 else if (strcmp(cachefile, "none") == 0)
619 dp->scd_path = NULL;
620 else
621 dp->scd_path = spa_strdup(cachefile);
622
623 list_insert_head(&spa->spa_config_list, dp);
624 if (need_sync)
625 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
626}
627
628int
629spa_prop_set(spa_t *spa, nvlist_t *nvp)
630{
631 int error;
632 nvpair_t *elem = NULL;
633 boolean_t need_sync = B_FALSE;
634
635 if ((error = spa_prop_validate(spa, nvp)) != 0)
636 return (error);
637
638 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
639 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
640
641 if (prop == ZPOOL_PROP_CACHEFILE ||
642 prop == ZPOOL_PROP_ALTROOT ||
643 prop == ZPOOL_PROP_READONLY)
644 continue;
645
646 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
647 uint64_t ver;
648
649 if (prop == ZPOOL_PROP_VERSION) {
650 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
651 } else {
652 ASSERT(zpool_prop_feature(nvpair_name(elem)));
653 ver = SPA_VERSION_FEATURES;
654 need_sync = B_TRUE;
655 }
656
657 /* Save time if the version is already set. */
658 if (ver == spa_version(spa))
659 continue;
660
661 /*
662 * In addition to the pool directory object, we might
663 * create the pool properties object, the features for
664 * read object, the features for write object, or the
665 * feature descriptions object.
666 */
667 error = dsl_sync_task(spa->spa_name, NULL,
668 spa_sync_version, &ver, 6);
669 if (error)
670 return (error);
671 continue;
672 }
673
674 need_sync = B_TRUE;
675 break;
676 }
677
678 if (need_sync) {
679 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
680 nvp, 6));
681 }
682
683 return (0);
684}
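/*
 * Illustrative caller-side sketch (not taken from this file; the chosen
 * property and comment string are arbitrary examples): setting the pool
 * comment goes through the validate/sync path implemented above.
 *
 *	nvlist_t *props;
 *
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), "scratch pool") == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 */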
685
686/*
687 * If the bootfs property value is dsobj, clear it.
688 */
689void
690spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
691{
692 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
693 VERIFY(zap_remove(spa->spa_meta_objset,
694 spa->spa_pool_props_object,
695 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
696 spa->spa_bootfs = 0;
697 }
698}
699
700/*ARGSUSED*/
701static int
702spa_change_guid_check(void *arg, dmu_tx_t *tx)
703{
704 uint64_t *newguid = arg;
705 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
706 vdev_t *rvd = spa->spa_root_vdev;
707 uint64_t vdev_state;
708
709 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
710 vdev_state = rvd->vdev_state;
711 spa_config_exit(spa, SCL_STATE, FTAG);
712
713 if (vdev_state != VDEV_STATE_HEALTHY)
714 return (SET_ERROR(ENXIO));
715
716 ASSERT3U(spa_guid(spa), !=, *newguid);
717
718 return (0);
719}
720
721static void
722spa_change_guid_sync(void *arg, dmu_tx_t *tx)
723{
724 uint64_t *newguid = arg;
725 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
726 uint64_t oldguid;
727 vdev_t *rvd = spa->spa_root_vdev;
728
729 oldguid = spa_guid(spa);
730
731 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
732 rvd->vdev_guid = *newguid;
733 rvd->vdev_guid_sum += (*newguid - oldguid);
734 vdev_config_dirty(rvd);
735 spa_config_exit(spa, SCL_STATE, FTAG);
736
737 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
738 oldguid, *newguid);
739}
740
741/*
742 * Change the GUID for the pool. This is done so that we can later
743 * re-import a pool built from a clone of our own vdevs. We will modify
744 * the root vdev's guid, our own pool guid, and then mark all of our
745 * vdevs dirty. Note that we must make sure that all our vdevs are
746 * online when we do this, or else any vdevs that weren't present
747 * would be orphaned from our pool. We are also going to issue a
748 * sysevent to update any watchers.
749 */
750int
751spa_change_guid(spa_t *spa)
752{
753 int error;
754 uint64_t guid;
755
756 mutex_enter(&spa_namespace_lock);
757 guid = spa_generate_guid(NULL);
758
759 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
760 spa_change_guid_sync, &guid, 5);
761
762 if (error == 0) {
763 spa_config_sync(spa, B_FALSE, B_TRUE);
764 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
765 }
766
767 mutex_exit(&spa_namespace_lock);
768
769 return (error);
770}
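/*
 * spa_change_guid() is typically reached via the pool reguid ioctl
 * ("zpool reguid"); spa_change_guid_check() above requires a healthy
 * root vdev first, so a pool with missing devices cannot be reguided.
 */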
771
772/*
773 * ==========================================================================
774 * SPA state manipulation (open/create/destroy/import/export)
775 * ==========================================================================
776 */
777
778static int
779spa_error_entry_compare(const void *a, const void *b)
780{
781 spa_error_entry_t *sa = (spa_error_entry_t *)a;
782 spa_error_entry_t *sb = (spa_error_entry_t *)b;
783 int ret;
784
785 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
786 sizeof (zbookmark_t));
787
788 if (ret < 0)
789 return (-1);
790 else if (ret > 0)
791 return (1);
792 else
793 return (0);
794}
795
796/*
797 * Utility function which retrieves copies of the current logs and
798 * re-initializes them in the process.
799 */
800void
801spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
802{
803 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
804
805 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
806 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
807
808 avl_create(&spa->spa_errlist_scrub,
809 spa_error_entry_compare, sizeof (spa_error_entry_t),
810 offsetof(spa_error_entry_t, se_avl));
811 avl_create(&spa->spa_errlist_last,
812 spa_error_entry_compare, sizeof (spa_error_entry_t),
813 offsetof(spa_error_entry_t, se_avl));
814}
815
816static taskq_t *
817spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
818 uint_t value)
819{
820 uint_t flags = TASKQ_PREPOPULATE;
821 boolean_t batch = B_FALSE;
822
823 switch (mode) {
824 case zti_mode_null:
825 return (NULL); /* no taskq needed */
826
827 case zti_mode_fixed:
828 ASSERT3U(value, >=, 1);
829 value = MAX(value, 1);
830 break;
831
832 case zti_mode_batch:
833 batch = B_TRUE;
834 flags |= TASKQ_THREADS_CPU_PCT;
835 value = zio_taskq_batch_pct;
836 break;
837
838 case zti_mode_online_percent:
839 flags |= TASKQ_THREADS_CPU_PCT;
840 break;
841
842 default:
843 panic("unrecognized mode for %s taskq (%u:%u) in "
844 "spa_activate()",
845 name, mode, value);
846 break;
847 }
848
849#ifdef SYSDC
850 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
851 if (batch)
852 flags |= TASKQ_DC_BATCH;
853
854 return (taskq_create_sysdc(name, value, 50, INT_MAX,
855 spa->spa_proc, zio_taskq_basedc, flags));
856 }
857#endif
858 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
859 spa->spa_proc, flags));
860}
861
862static void
863spa_create_zio_taskqs(spa_t *spa)
864{
865 for (int t = 0; t < ZIO_TYPES; t++) {
866 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
867 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
868 enum zti_modes mode = ztip->zti_mode;
869 uint_t value = ztip->zti_value;
870 char name[32];
871
872 (void) snprintf(name, sizeof (name),
873 "%s_%s", zio_type_name[t], zio_taskq_types[q]);
874
875 spa->spa_zio_taskq[t][q] =
876 spa_taskq_create(spa, name, mode, value);
877 }
878 }
879}
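/*
 * The nested loops above create one taskq per (zio type, taskq type)
 * pair that has a non-null entry in zio_taskqs, named
 * "<zio type>_<issue|issue_high|intr|intr_high>", which makes a pool's
 * ZIO worker threads easy to identify in thread listings.
 */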
880
881#ifdef _KERNEL
882#ifdef SPA_PROCESS
883static void
884spa_thread(void *arg)
885{
886 callb_cpr_t cprinfo;
887
888 spa_t *spa = arg;
889 user_t *pu = PTOU(curproc);
890
891 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
892 spa->spa_name);
893
894 ASSERT(curproc != &p0);
895 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
896 "zpool-%s", spa->spa_name);
897 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
898
899#ifdef PSRSET_BIND
900 /* bind this thread to the requested psrset */
901 if (zio_taskq_psrset_bind != PS_NONE) {
902 pool_lock();
903 mutex_enter(&cpu_lock);
904 mutex_enter(&pidlock);
905 mutex_enter(&curproc->p_lock);
906
907 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
908 0, NULL, NULL) == 0) {
909 curthread->t_bind_pset = zio_taskq_psrset_bind;
910 } else {
911 cmn_err(CE_WARN,
912 "Couldn't bind process for zfs pool \"%s\" to "
913 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
914 }
915
916 mutex_exit(&curproc->p_lock);
917 mutex_exit(&pidlock);
918 mutex_exit(&cpu_lock);
919 pool_unlock();
920 }
921#endif
922
923#ifdef SYSDC
924 if (zio_taskq_sysdc) {
925 sysdc_thread_enter(curthread, 100, 0);
926 }
927#endif
928
929 spa->spa_proc = curproc;
930 spa->spa_did = curthread->t_did;
931
932 spa_create_zio_taskqs(spa);
933
934 mutex_enter(&spa->spa_proc_lock);
935 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
936
937 spa->spa_proc_state = SPA_PROC_ACTIVE;
938 cv_broadcast(&spa->spa_proc_cv);
939
940 CALLB_CPR_SAFE_BEGIN(&cprinfo);
941 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
942 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
943 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
944
945 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
946 spa->spa_proc_state = SPA_PROC_GONE;
947 spa->spa_proc = &p0;
948 cv_broadcast(&spa->spa_proc_cv);
949 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
950
951 mutex_enter(&curproc->p_lock);
952 lwp_exit();
953}
954#endif /* SPA_PROCESS */
955#endif
956
957/*
958 * Activate an uninitialized pool.
959 */
960static void
961spa_activate(spa_t *spa, int mode)
962{
963 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
964
965 spa->spa_state = POOL_STATE_ACTIVE;
966 spa->spa_mode = mode;
967
968 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
969 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
970
971 /* Try to create a covering process */
972 mutex_enter(&spa->spa_proc_lock);
973 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
974 ASSERT(spa->spa_proc == &p0);
975 spa->spa_did = 0;
976
977#ifdef SPA_PROCESS
978 /* Only create a process if we're going to be around a while. */
979 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
980 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
981 NULL, 0) == 0) {
982 spa->spa_proc_state = SPA_PROC_CREATED;
983 while (spa->spa_proc_state == SPA_PROC_CREATED) {
984 cv_wait(&spa->spa_proc_cv,
985 &spa->spa_proc_lock);
986 }
987 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
988 ASSERT(spa->spa_proc != &p0);
989 ASSERT(spa->spa_did != 0);
990 } else {
991#ifdef _KERNEL
992 cmn_err(CE_WARN,
993 "Couldn't create process for zfs pool \"%s\"\n",
994 spa->spa_name);
995#endif
996 }
997 }
998#endif /* SPA_PROCESS */
999 mutex_exit(&spa->spa_proc_lock);
1000
1001 /* If we didn't create a process, we need to create our taskqs. */
1002 ASSERT(spa->spa_proc == &p0);
1003 if (spa->spa_proc == &p0) {
1004 spa_create_zio_taskqs(spa);
1005 }
1006
1007 /*
1008 * Start TRIM thread.
1009 */
1010 trim_thread_create(spa);
1011
1012 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1013 offsetof(vdev_t, vdev_config_dirty_node));
1014 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1015 offsetof(vdev_t, vdev_state_dirty_node));
1016
1017 txg_list_create(&spa->spa_vdev_txg_list,
1018 offsetof(struct vdev, vdev_txg_node));
1019
1020 avl_create(&spa->spa_errlist_scrub,
1021 spa_error_entry_compare, sizeof (spa_error_entry_t),
1022 offsetof(spa_error_entry_t, se_avl));
1023 avl_create(&spa->spa_errlist_last,
1024 spa_error_entry_compare, sizeof (spa_error_entry_t),
1025 offsetof(spa_error_entry_t, se_avl));
1026}
1027
1028/*
1029 * Opposite of spa_activate().
1030 */
1031static void
1032spa_deactivate(spa_t *spa)
1033{
1034 ASSERT(spa->spa_sync_on == B_FALSE);
1035 ASSERT(spa->spa_dsl_pool == NULL);
1036 ASSERT(spa->spa_root_vdev == NULL);
1037 ASSERT(spa->spa_async_zio_root == NULL);
1038 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1039
1040 /*
1041 * Stop TRIM thread in case spa_unload() wasn't called directly
1042 * before spa_deactivate().
1043 */
1044 trim_thread_destroy(spa);
1045
1046 txg_list_destroy(&spa->spa_vdev_txg_list);
1047
1048 list_destroy(&spa->spa_config_dirty_list);
1049 list_destroy(&spa->spa_state_dirty_list);
1050
1051 for (int t = 0; t < ZIO_TYPES; t++) {
1052 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1053 if (spa->spa_zio_taskq[t][q] != NULL)
1054 taskq_destroy(spa->spa_zio_taskq[t][q]);
1055 spa->spa_zio_taskq[t][q] = NULL;
1056 }
1057 }
1058
1059 metaslab_class_destroy(spa->spa_normal_class);
1060 spa->spa_normal_class = NULL;
1061
1062 metaslab_class_destroy(spa->spa_log_class);
1063 spa->spa_log_class = NULL;
1064
1065 /*
1066 * If this was part of an import or the open otherwise failed, we may
1067 * still have errors left in the queues. Empty them just in case.
1068 */
1069 spa_errlog_drain(spa);
1070
1071 avl_destroy(&spa->spa_errlist_scrub);
1072 avl_destroy(&spa->spa_errlist_last);
1073
1074 spa->spa_state = POOL_STATE_UNINITIALIZED;
1075
1076 mutex_enter(&spa->spa_proc_lock);
1077 if (spa->spa_proc_state != SPA_PROC_NONE) {
1078 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1079 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1080 cv_broadcast(&spa->spa_proc_cv);
1081 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1082 ASSERT(spa->spa_proc != &p0);
1083 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1084 }
1085 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1086 spa->spa_proc_state = SPA_PROC_NONE;
1087 }
1088 ASSERT(spa->spa_proc == &p0);
1089 mutex_exit(&spa->spa_proc_lock);
1090
1091#ifdef SPA_PROCESS
1092 /*
1093 * We want to make sure spa_thread() has actually exited the ZFS
1094 * module, so that the module can't be unloaded out from underneath
1095 * it.
1096 */
1097 if (spa->spa_did != 0) {
1098 thread_join(spa->spa_did);
1099 spa->spa_did = 0;
1100 }
1101#endif /* SPA_PROCESS */
1102}
1103
1104/*
1105 * Verify a pool configuration, and construct the vdev tree appropriately. This
1106 * will create all the necessary vdevs in the appropriate layout, with each vdev
1107 * in the CLOSED state. This will prep the pool before open/creation/import.
1108 * All vdev validation is done by the vdev_alloc() routine.
1109 */
1110static int
1111spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1112 uint_t id, int atype)
1113{
1114 nvlist_t **child;
1115 uint_t children;
1116 int error;
1117
1118 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1119 return (error);
1120
1121 if ((*vdp)->vdev_ops->vdev_op_leaf)
1122 return (0);
1123
1124 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1125 &child, &children);
1126
1127 if (error == ENOENT)
1128 return (0);
1129
1130 if (error) {
1131 vdev_free(*vdp);
1132 *vdp = NULL;
1133 return (SET_ERROR(EINVAL));
1134 }
1135
1136 for (int c = 0; c < children; c++) {
1137 vdev_t *vd;
1138 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1139 atype)) != 0) {
1140 vdev_free(*vdp);
1141 *vdp = NULL;
1142 return (error);
1143 }
1144 }
1145
1146 ASSERT(*vdp != NULL);
1147
1148 return (0);
1149}
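/*
 * For example, a config whose vdev tree is a single two-way mirror comes
 * back from spa_config_parse() as root -> mirror -> { disk, disk }, with
 * every vdev still in the CLOSED state; the actual vdev_open() happens
 * later in the open/create/import paths.
 */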
1150
1151/*
1152 * Opposite of spa_load().
1153 */
1154static void
1155spa_unload(spa_t *spa)
1156{
1157 int i;
1158
1159 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1160
1161 /*
1162 * Stop TRIM thread.
1163 */
1164 trim_thread_destroy(spa);
1165
1166 /*
1167 * Stop async tasks.
1168 */
1169 spa_async_suspend(spa);
1170
1171 /*
1172 * Stop syncing.
1173 */
1174 if (spa->spa_sync_on) {
1175 txg_sync_stop(spa->spa_dsl_pool);
1176 spa->spa_sync_on = B_FALSE;
1177 }
1178
1179 /*
1180 * Wait for any outstanding async I/O to complete.
1181 */
1182 if (spa->spa_async_zio_root != NULL) {
1183 (void) zio_wait(spa->spa_async_zio_root);
1184 spa->spa_async_zio_root = NULL;
1185 }
1186
1187 bpobj_close(&spa->spa_deferred_bpobj);
1188
1189 /*
1190 * Close the dsl pool.
1191 */
1192 if (spa->spa_dsl_pool) {
1193 dsl_pool_close(spa->spa_dsl_pool);
1194 spa->spa_dsl_pool = NULL;
1195 spa->spa_meta_objset = NULL;
1196 }
1197
1198 ddt_unload(spa);
1199
1200 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1201
1202 /*
1203 * Drop and purge level 2 cache
1204 */
1205 spa_l2cache_drop(spa);
1206
1207 /*
1208 * Close all vdevs.
1209 */
1210 if (spa->spa_root_vdev)
1211 vdev_free(spa->spa_root_vdev);
1212 ASSERT(spa->spa_root_vdev == NULL);
1213
1214 for (i = 0; i < spa->spa_spares.sav_count; i++)
1215 vdev_free(spa->spa_spares.sav_vdevs[i]);
1216 if (spa->spa_spares.sav_vdevs) {
1217 kmem_free(spa->spa_spares.sav_vdevs,
1218 spa->spa_spares.sav_count * sizeof (void *));
1219 spa->spa_spares.sav_vdevs = NULL;
1220 }
1221 if (spa->spa_spares.sav_config) {
1222 nvlist_free(spa->spa_spares.sav_config);
1223 spa->spa_spares.sav_config = NULL;
1224 }
1225 spa->spa_spares.sav_count = 0;
1226
1227 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1228 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1229 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1230 }
1231 if (spa->spa_l2cache.sav_vdevs) {
1232 kmem_free(spa->spa_l2cache.sav_vdevs,
1233 spa->spa_l2cache.sav_count * sizeof (void *));
1234 spa->spa_l2cache.sav_vdevs = NULL;
1235 }
1236 if (spa->spa_l2cache.sav_config) {
1237 nvlist_free(spa->spa_l2cache.sav_config);
1238 spa->spa_l2cache.sav_config = NULL;
1239 }
1240 spa->spa_l2cache.sav_count = 0;
1241
1242 spa->spa_async_suspended = 0;
1243
1244 if (spa->spa_comment != NULL) {
1245 spa_strfree(spa->spa_comment);
1246 spa->spa_comment = NULL;
1247 }
1248
1249 spa_config_exit(spa, SCL_ALL, FTAG);
1250}
1251
1252/*
1253 * Load (or re-load) the current list of vdevs describing the active spares for
1254 * this pool. When this is called, we have some form of basic information in
1255 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1256 * then re-generate a more complete list including status information.
1257 */
1258static void
1259spa_load_spares(spa_t *spa)
1260{
1261 nvlist_t **spares;
1262 uint_t nspares;
1263 int i;
1264 vdev_t *vd, *tvd;
1265
1266 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1267
1268 /*
1269 * First, close and free any existing spare vdevs.
1270 */
1271 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1272 vd = spa->spa_spares.sav_vdevs[i];
1273
1274 /* Undo the call to spa_activate() below */
1275 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1276 B_FALSE)) != NULL && tvd->vdev_isspare)
1277 spa_spare_remove(tvd);
1278 vdev_close(vd);
1279 vdev_free(vd);
1280 }
1281
1282 if (spa->spa_spares.sav_vdevs)
1283 kmem_free(spa->spa_spares.sav_vdevs,
1284 spa->spa_spares.sav_count * sizeof (void *));
1285
1286 if (spa->spa_spares.sav_config == NULL)
1287 nspares = 0;
1288 else
1289 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1290 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1291
1292 spa->spa_spares.sav_count = (int)nspares;
1293 spa->spa_spares.sav_vdevs = NULL;
1294
1295 if (nspares == 0)
1296 return;
1297
1298 /*
1299 * Construct the array of vdevs, opening them to get status in the
 1300 * process. For each spare, there are potentially two different vdev_t
1301 * structures associated with it: one in the list of spares (used only
1302 * for basic validation purposes) and one in the active vdev
1303 * configuration (if it's spared in). During this phase we open and
1304 * validate each vdev on the spare list. If the vdev also exists in the
1305 * active configuration, then we also mark this vdev as an active spare.
1306 */
1307 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1308 KM_SLEEP);
1309 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1310 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1311 VDEV_ALLOC_SPARE) == 0);
1312 ASSERT(vd != NULL);
1313
1314 spa->spa_spares.sav_vdevs[i] = vd;
1315
1316 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1317 B_FALSE)) != NULL) {
1318 if (!tvd->vdev_isspare)
1319 spa_spare_add(tvd);
1320
1321 /*
1322 * We only mark the spare active if we were successfully
1323 * able to load the vdev. Otherwise, importing a pool
1324 * with a bad active spare would result in strange
 1325 * behavior, because multiple pools would think the spare
1326 * is actively in use.
1327 *
1328 * There is a vulnerability here to an equally bizarre
1329 * circumstance, where a dead active spare is later
1330 * brought back to life (onlined or otherwise). Given
1331 * the rarity of this scenario, and the extra complexity
1332 * it adds, we ignore the possibility.
1333 */
1334 if (!vdev_is_dead(tvd))
1335 spa_spare_activate(tvd);
1336 }
1337
1338 vd->vdev_top = vd;
1339 vd->vdev_aux = &spa->spa_spares;
1340
1341 if (vdev_open(vd) != 0)
1342 continue;
1343
1344 if (vdev_validate_aux(vd) == 0)
1345 spa_spare_add(vd);
1346 }
1347
1348 /*
1349 * Recompute the stashed list of spares, with status information
1350 * this time.
1351 */
1352 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1353 DATA_TYPE_NVLIST_ARRAY) == 0);
1354
1355 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1356 KM_SLEEP);
1357 for (i = 0; i < spa->spa_spares.sav_count; i++)
1358 spares[i] = vdev_config_generate(spa,
1359 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1360 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1361 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1362 for (i = 0; i < spa->spa_spares.sav_count; i++)
1363 nvlist_free(spares[i]);
1364 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1365}
1366
1367/*
1368 * Load (or re-load) the current list of vdevs describing the active l2cache for
1369 * this pool. When this is called, we have some form of basic information in
1370 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1371 * then re-generate a more complete list including status information.
1372 * Devices which are already active have their details maintained, and are
1373 * not re-opened.
1374 */
1375static void
1376spa_load_l2cache(spa_t *spa)
1377{
1378 nvlist_t **l2cache;
1379 uint_t nl2cache;
1380 int i, j, oldnvdevs;
1381 uint64_t guid;
1382 vdev_t *vd, **oldvdevs, **newvdevs;
1383 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1384
1385 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1386
1387 if (sav->sav_config != NULL) {
1388 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1389 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1390 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1391 } else {
1392 nl2cache = 0;
1393 newvdevs = NULL;
1394 }
1395
1396 oldvdevs = sav->sav_vdevs;
1397 oldnvdevs = sav->sav_count;
1398 sav->sav_vdevs = NULL;
1399 sav->sav_count = 0;
1400
1401 /*
1402 * Process new nvlist of vdevs.
1403 */
1404 for (i = 0; i < nl2cache; i++) {
1405 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1406 &guid) == 0);
1407
1408 newvdevs[i] = NULL;
1409 for (j = 0; j < oldnvdevs; j++) {
1410 vd = oldvdevs[j];
1411 if (vd != NULL && guid == vd->vdev_guid) {
1412 /*
1413 * Retain previous vdev for add/remove ops.
1414 */
1415 newvdevs[i] = vd;
1416 oldvdevs[j] = NULL;
1417 break;
1418 }
1419 }
1420
1421 if (newvdevs[i] == NULL) {
1422 /*
1423 * Create new vdev
1424 */
1425 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1426 VDEV_ALLOC_L2CACHE) == 0);
1427 ASSERT(vd != NULL);
1428 newvdevs[i] = vd;
1429
1430 /*
1431 * Commit this vdev as an l2cache device,
1432 * even if it fails to open.
1433 */
1434 spa_l2cache_add(vd);
1435
1436 vd->vdev_top = vd;
1437 vd->vdev_aux = sav;
1438
1439 spa_l2cache_activate(vd);
1440
1441 if (vdev_open(vd) != 0)
1442 continue;
1443
1444 (void) vdev_validate_aux(vd);
1445
1446 if (!vdev_is_dead(vd))
1447 l2arc_add_vdev(spa, vd);
1448 }
1449 }
1450
1451 /*
1452 * Purge vdevs that were dropped
1453 */
1454 for (i = 0; i < oldnvdevs; i++) {
1455 uint64_t pool;
1456
1457 vd = oldvdevs[i];
1458 if (vd != NULL) {
1459 ASSERT(vd->vdev_isl2cache);
1460
1461 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1462 pool != 0ULL && l2arc_vdev_present(vd))
1463 l2arc_remove_vdev(vd);
1464 vdev_clear_stats(vd);
1465 vdev_free(vd);
1466 }
1467 }
1468
1469 if (oldvdevs)
1470 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1471
1472 if (sav->sav_config == NULL)
1473 goto out;
1474
1475 sav->sav_vdevs = newvdevs;
1476 sav->sav_count = (int)nl2cache;
1477
1478 /*
1479 * Recompute the stashed list of l2cache devices, with status
1480 * information this time.
1481 */
1482 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1483 DATA_TYPE_NVLIST_ARRAY) == 0);
1484
1485 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1486 for (i = 0; i < sav->sav_count; i++)
1487 l2cache[i] = vdev_config_generate(spa,
1488 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1489 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1490 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1491out:
1492 for (i = 0; i < sav->sav_count; i++)
1493 nvlist_free(l2cache[i]);
1494 if (sav->sav_count)
1495 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1496}
1497
1498static int
1499load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1500{
1501 dmu_buf_t *db;
1502 char *packed = NULL;
1503 size_t nvsize = 0;
1504 int error;
1505 *value = NULL;
1506
1507 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1508 nvsize = *(uint64_t *)db->db_data;
1509 dmu_buf_rele(db, FTAG);
1510
1511 packed = kmem_alloc(nvsize, KM_SLEEP);
1512 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1513 DMU_READ_PREFETCH);
1514 if (error == 0)
1515 error = nvlist_unpack(packed, nvsize, value, 0);
1516 kmem_free(packed, nvsize);
1517
1518 return (error);
1519}
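/*
 * The object read here is a packed-nvlist object: its bonus buffer holds
 * the packed size and the object body holds the packed bytes, which is
 * why the size comes from dmu_bonus_hold() before the dmu_read() and
 * nvlist_unpack().
 */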
1520
1521/*
1522 * Checks to see if the given vdev could not be opened, in which case we post a
1523 * sysevent to notify the autoreplace code that the device has been removed.
1524 */
1525static void
1526spa_check_removed(vdev_t *vd)
1527{
1528 for (int c = 0; c < vd->vdev_children; c++)
1529 spa_check_removed(vd->vdev_child[c]);
1530
1531 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1532 !vd->vdev_ishole) {
1533 zfs_post_autoreplace(vd->vdev_spa, vd);
1534 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1535 }
1536}
1537
1538/*
1539 * Validate the current config against the MOS config
1540 */
1541static boolean_t
1542spa_config_valid(spa_t *spa, nvlist_t *config)
1543{
1544 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1545 nvlist_t *nv;
1546
1547 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1548
1549 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1550 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1551
1552 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1553
1554 /*
1555 * If we're doing a normal import, then build up any additional
1556 * diagnostic information about missing devices in this config.
1557 * We'll pass this up to the user for further processing.
1558 */
1559 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1560 nvlist_t **child, *nv;
1561 uint64_t idx = 0;
1562
1563 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1564 KM_SLEEP);
1565 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1566
1567 for (int c = 0; c < rvd->vdev_children; c++) {
1568 vdev_t *tvd = rvd->vdev_child[c];
1569 vdev_t *mtvd = mrvd->vdev_child[c];
1570
1571 if (tvd->vdev_ops == &vdev_missing_ops &&
1572 mtvd->vdev_ops != &vdev_missing_ops &&
1573 mtvd->vdev_islog)
1574 child[idx++] = vdev_config_generate(spa, mtvd,
1575 B_FALSE, 0);
1576 }
1577
1578 if (idx) {
1579 VERIFY(nvlist_add_nvlist_array(nv,
1580 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1581 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1582 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1583
1584 for (int i = 0; i < idx; i++)
1585 nvlist_free(child[i]);
1586 }
1587 nvlist_free(nv);
1588 kmem_free(child, rvd->vdev_children * sizeof (char **));
1589 }
1590
1591 /*
1592 * Compare the root vdev tree with the information we have
1593 * from the MOS config (mrvd). Check each top-level vdev
1594 * with the corresponding MOS config top-level (mtvd).
1595 */
1596 for (int c = 0; c < rvd->vdev_children; c++) {
1597 vdev_t *tvd = rvd->vdev_child[c];
1598 vdev_t *mtvd = mrvd->vdev_child[c];
1599
1600 /*
1601 * Resolve any "missing" vdevs in the current configuration.
1602 * If we find that the MOS config has more accurate information
 1603 * about the top-level vdev, then use that vdev instead.
1604 */
1605 if (tvd->vdev_ops == &vdev_missing_ops &&
1606 mtvd->vdev_ops != &vdev_missing_ops) {
1607
1608 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1609 continue;
1610
1611 /*
1612 * Device specific actions.
1613 */
1614 if (mtvd->vdev_islog) {
1615 spa_set_log_state(spa, SPA_LOG_CLEAR);
1616 } else {
1617 /*
1618 * XXX - once we have 'readonly' pool
1619 * support we should be able to handle
1620 * missing data devices by transitioning
1621 * the pool to readonly.
1622 */
1623 continue;
1624 }
1625
1626 /*
1627 * Swap the missing vdev with the data we were
1628 * able to obtain from the MOS config.
1629 */
1630 vdev_remove_child(rvd, tvd);
1631 vdev_remove_child(mrvd, mtvd);
1632
1633 vdev_add_child(rvd, mtvd);
1634 vdev_add_child(mrvd, tvd);
1635
1636 spa_config_exit(spa, SCL_ALL, FTAG);
1637 vdev_load(mtvd);
1638 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1639
1640 vdev_reopen(rvd);
1641 } else if (mtvd->vdev_islog) {
1642 /*
1643 * Load the slog device's state from the MOS config
1644 * since it's possible that the label does not
1645 * contain the most up-to-date information.
1646 */
1647 vdev_load_log_state(tvd, mtvd);
1648 vdev_reopen(tvd);
1649 }
1650 }
1651 vdev_free(mrvd);
1652 spa_config_exit(spa, SCL_ALL, FTAG);
1653
1654 /*
1655 * Ensure we were able to validate the config.
1656 */
1657 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1658}
1659
1660/*
1661 * Check for missing log devices
1662 */
1663static boolean_t
1664spa_check_logs(spa_t *spa)
1665{
1666 boolean_t rv = B_FALSE;
1667
1668 switch (spa->spa_log_state) {
1669 case SPA_LOG_MISSING:
1670 /* need to recheck in case slog has been restored */
1671 case SPA_LOG_UNKNOWN:
1672 rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1673 NULL, DS_FIND_CHILDREN) != 0);
1674 if (rv)
1675 spa_set_log_state(spa, SPA_LOG_MISSING);
1676 break;
1677 }
1678 return (rv);
1679}
1680
1681static boolean_t
1682spa_passivate_log(spa_t *spa)
1683{
1684 vdev_t *rvd = spa->spa_root_vdev;
1685 boolean_t slog_found = B_FALSE;
1686
1687 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1688
1689 if (!spa_has_slogs(spa))
1690 return (B_FALSE);
1691
1692 for (int c = 0; c < rvd->vdev_children; c++) {
1693 vdev_t *tvd = rvd->vdev_child[c];
1694 metaslab_group_t *mg = tvd->vdev_mg;
1695
1696 if (tvd->vdev_islog) {
1697 metaslab_group_passivate(mg);
1698 slog_found = B_TRUE;
1699 }
1700 }
1701
1702 return (slog_found);
1703}
1704
1705static void
1706spa_activate_log(spa_t *spa)
1707{
1708 vdev_t *rvd = spa->spa_root_vdev;
1709
1710 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1711
1712 for (int c = 0; c < rvd->vdev_children; c++) {
1713 vdev_t *tvd = rvd->vdev_child[c];
1714 metaslab_group_t *mg = tvd->vdev_mg;
1715
1716 if (tvd->vdev_islog)
1717 metaslab_group_activate(mg);
1718 }
1719}
1720
1721int
1722spa_offline_log(spa_t *spa)
1723{
1724 int error;
1725
1726 error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1727 NULL, DS_FIND_CHILDREN);
1728 if (error == 0) {
1729 /*
1730 * We successfully offlined the log device, sync out the
1731 * current txg so that the "stubby" block can be removed
1732 * by zil_sync().
1733 */
1734 txg_wait_synced(spa->spa_dsl_pool, 0);
1735 }
1736 return (error);
1737}
1738
1739static void
1740spa_aux_check_removed(spa_aux_vdev_t *sav)
1741{
1742 int i;
1743
1744 for (i = 0; i < sav->sav_count; i++)
1745 spa_check_removed(sav->sav_vdevs[i]);
1746}
1747
1748void
1749spa_claim_notify(zio_t *zio)
1750{
1751 spa_t *spa = zio->io_spa;
1752
1753 if (zio->io_error)
1754 return;
1755
1756 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1757 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1758 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1759 mutex_exit(&spa->spa_props_lock);
1760}
1761
1762typedef struct spa_load_error {
1763 uint64_t sle_meta_count;
1764 uint64_t sle_data_count;
1765} spa_load_error_t;
1766
1767static void
1768spa_load_verify_done(zio_t *zio)
1769{
1770 blkptr_t *bp = zio->io_bp;
1771 spa_load_error_t *sle = zio->io_private;
1772 dmu_object_type_t type = BP_GET_TYPE(bp);
1773 int error = zio->io_error;
1774
1775 if (error) {
1776 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1777 type != DMU_OT_INTENT_LOG)
1778 atomic_add_64(&sle->sle_meta_count, 1);
1779 else
1780 atomic_add_64(&sle->sle_data_count, 1);
1781 }
1782 zio_data_buf_free(zio->io_data, zio->io_size);
1783}
1784
1785/*ARGSUSED*/
1786static int
1787spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1788 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1789{
1790 if (bp != NULL) {
1791 zio_t *rio = arg;
1792 size_t size = BP_GET_PSIZE(bp);
1793 void *data = zio_data_buf_alloc(size);
1794
1795 zio_nowait(zio_read(rio, spa, bp, data, size,
1796 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1797 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1798 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1799 }
1800 return (0);
1801}
1802
1803static int
1804spa_load_verify(spa_t *spa)
1805{
1806 zio_t *rio;
1807 spa_load_error_t sle = { 0 };
1808 zpool_rewind_policy_t policy;
1809 boolean_t verify_ok = B_FALSE;
1810 int error;
1811
1812 zpool_get_rewind_policy(spa->spa_config, &policy);
1813
1814 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1815 return (0);
1816
1817 rio = zio_root(spa, NULL, &sle,
1818 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1819
1820 error = traverse_pool(spa, spa->spa_verify_min_txg,
1821 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1822
1823 (void) zio_wait(rio);
1824
1825 spa->spa_load_meta_errors = sle.sle_meta_count;
1826 spa->spa_load_data_errors = sle.sle_data_count;
1827
1828 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1829 sle.sle_data_count <= policy.zrp_maxdata) {
1830 int64_t loss = 0;
1831
1832 verify_ok = B_TRUE;
1833 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1834 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1835
1836 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1837 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1838 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1839 VERIFY(nvlist_add_int64(spa->spa_load_info,
1840 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1841 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1842 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1843 } else {
1844 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1845 }
1846
1847 if (error) {
1848 if (error != ENXIO && error != EIO)
1849 error = SET_ERROR(EIO);
1850 return (error);
1851 }
1852
1853 return (verify_ok ? 0 : EIO);
1854}
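/*
 * Summary of the verification above: unless the rewind policy requests
 * ZPOOL_NEVER_REWIND, the pool is traversed from spa_verify_min_txg and
 * every reachable block is read speculatively. The load only counts as
 * verified if the metadata and data error totals stay within the
 * policy's zrp_maxmeta/zrp_maxdata limits; otherwise EIO is returned so
 * the caller can fall back to an earlier txg.
 */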
1855
1856/*
1857 * Find a value in the pool props object.
1858 */
1859static void
1860spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1861{
1862 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1863 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1864}
1865
1866/*
1867 * Find a value in the pool directory object.
1868 */
1869static int
1870spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1871{
1872 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1873 name, sizeof (uint64_t), 1, val));
1874}
1875
1876static int
1877spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1878{
1879 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1880 return (err);
1881}
1882
1883/*
1884 * Fix up config after a partly-completed split. This is done with the
1885 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1886 * pool have that entry in their config, but only the splitting one contains
1887 * a list of all the guids of the vdevs that are being split off.
1888 *
1889 * This function determines what to do with that list: either rejoin
1890 * all the disks to the pool, or complete the splitting process. To attempt
1891 * the rejoin, each disk that is offlined is marked online again, and
1892 * we do a reopen() call. If the vdev label for every disk that was
1893 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1894 * then we call vdev_split() on each disk, and complete the split.
1895 *
1896 * Otherwise we leave the config alone, with all the vdevs in place in
1897 * the original pool.
1898 */
1899static void
1900spa_try_repair(spa_t *spa, nvlist_t *config)
1901{
1902 uint_t extracted;
1903 uint64_t *glist;
1904 uint_t i, gcount;
1905 nvlist_t *nvl;
1906 vdev_t **vd;
1907 boolean_t attempt_reopen;
1908
1909 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1910 return;
1911
1912 /* check that the config is complete */
1913 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1914 &glist, &gcount) != 0)
1915 return;
1916
1917 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1918
1919 /* attempt to online all the vdevs & validate */
1920 attempt_reopen = B_TRUE;
1921 for (i = 0; i < gcount; i++) {
1922 if (glist[i] == 0) /* vdev is hole */
1923 continue;
1924
1925 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1926 if (vd[i] == NULL) {
1927 /*
1928 * Don't bother attempting to reopen the disks;
1929 * just do the split.
1930 */
1931 attempt_reopen = B_FALSE;
1932 } else {
1933 /* attempt to re-online it */
1934 vd[i]->vdev_offline = B_FALSE;
1935 }
1936 }
1937
1938 if (attempt_reopen) {
1939 vdev_reopen(spa->spa_root_vdev);
1940
1941 /* check each device to see what state it's in */
1942 for (extracted = 0, i = 0; i < gcount; i++) {
1943 if (vd[i] != NULL &&
1944 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1945 break;
1946 ++extracted;
1947 }
1948 }
1949
1950 /*
1951 * If every disk has been moved to the new pool, or if we never
1952 * even attempted to look at them, then we split them off for
1953 * good.
1954 */
1955 if (!attempt_reopen || gcount == extracted) {
1956 for (i = 0; i < gcount; i++)
1957 if (vd[i] != NULL)
1958 vdev_split(vd[i]);
1959 vdev_reopen(spa->spa_root_vdev);
1960 }
1961
1962 kmem_free(vd, gcount * sizeof (vdev_t *));
1963}
1964
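/*
 * Top-level pool load entry point: pull the pool guid, comment, version and
 * config txg out of the supplied config, then hand off to spa_load_impl().
 * Posts an ereport if the load fails (except for EBADF).
 */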
1965static int
1966spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1967 boolean_t mosconfig)
1968{
1969 nvlist_t *config = spa->spa_config;
1970 char *ereport = FM_EREPORT_ZFS_POOL;
1971 char *comment;
1972 int error;
1973 uint64_t pool_guid;
1974 nvlist_t *nvl;
1975
1976 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1977 return (SET_ERROR(EINVAL));
1978
1979 ASSERT(spa->spa_comment == NULL);
1980 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1981 spa->spa_comment = spa_strdup(comment);
1982
1983 /*
1984 * Versioning wasn't explicitly added to the label until later, so if
1985 * it's not present treat it as the initial version.
1986 */
1987 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1988 &spa->spa_ubsync.ub_version) != 0)
1989 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1990
1991 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1992 &spa->spa_config_txg);
1993
1994 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1995 spa_guid_exists(pool_guid, 0)) {
1996 error = SET_ERROR(EEXIST);
1997 } else {
1998 spa->spa_config_guid = pool_guid;
1999
2000 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2001 &nvl) == 0) {
2002 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2003 KM_SLEEP) == 0);
2004 }
2005
2006 nvlist_free(spa->spa_load_info);
2007 spa->spa_load_info = fnvlist_alloc();
2008
2009 gethrestime(&spa->spa_loaded_ts);
2010 error = spa_load_impl(spa, pool_guid, config, state, type,
2011 mosconfig, &ereport);
2012 }
2013
2014 spa->spa_minref = refcount_count(&spa->spa_refcount);
2015 if (error) {
2016 if (error != EEXIST) {
2017 spa->spa_loaded_ts.tv_sec = 0;
2018 spa->spa_loaded_ts.tv_nsec = 0;
2019 }
2020 if (error != EBADF) {
2021 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2022 }
2023 }
2024 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2025 spa->spa_ena = 0;
2026
2027 return (error);
2028}
2029
2030/*
2031 * Load an existing storage pool, using the pool's builtin spa_config as a
2032 * source of configuration information.
2033 */
2034static int
2035spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2036 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2037 char **ereport)
2038{
2039 int error = 0;
2040 nvlist_t *nvroot = NULL;
2041 nvlist_t *label;
2042 vdev_t *rvd;
2043 uberblock_t *ub = &spa->spa_uberblock;
2044 uint64_t children, config_cache_txg = spa->spa_config_txg;
2045 int orig_mode = spa->spa_mode;
2046 int parse;
2047 uint64_t obj;
2048 boolean_t missing_feat_write = B_FALSE;
2049
2050 /*
2051 * If this is an untrusted config, access the pool in read-only mode.
2052 * This prevents things like resilvering recently removed devices.
2053 */
2054 if (!mosconfig)
2055 spa->spa_mode = FREAD;
2056
2057 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2058
2059 spa->spa_load_state = state;
2060
2061 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2062 return (SET_ERROR(EINVAL));
2063
2064 parse = (type == SPA_IMPORT_EXISTING ?
2065 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2066
2067 /*
2068 * Create "The Godfather" zio to hold all async IOs
2069 */
2070 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2071 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2072
2073 /*
2074 * Parse the configuration into a vdev tree. We explicitly set the
2075 * value that will be returned by spa_version() since parsing the
2076 * configuration requires knowing the version number.
2077 */
2078 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2079 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2080 spa_config_exit(spa, SCL_ALL, FTAG);
2081
2082 if (error != 0)
2083 return (error);
2084
2085 ASSERT(spa->spa_root_vdev == rvd);
2086
2087 if (type != SPA_IMPORT_ASSEMBLE) {
2088 ASSERT(spa_guid(spa) == pool_guid);
2089 }
2090
2091 /*
2092 * Try to open all vdevs, loading each label in the process.
2093 */
2094 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2095 error = vdev_open(rvd);
2096 spa_config_exit(spa, SCL_ALL, FTAG);
2097 if (error != 0)
2098 return (error);
2099
2100 /*
2101 * We need to validate the vdev labels against the configuration that
2102 * we have in hand, which is dependent on the setting of mosconfig. If
2103 * mosconfig is true then we're validating the vdev labels based on
2104 * that config. Otherwise, we're validating against the cached config
2105 * (zpool.cache) that was read when we loaded the zfs module, and then
2106 * later we will recursively call spa_load() and validate against
2107 * the vdev config.
2108 *
2109 * If we're assembling a new pool that's been split off from an
2110 * existing pool, the labels haven't yet been updated so we skip
2111 * validation for now.
2112 */
2113 if (type != SPA_IMPORT_ASSEMBLE) {
2114 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2115 error = vdev_validate(rvd, mosconfig);
2116 spa_config_exit(spa, SCL_ALL, FTAG);
2117
2118 if (error != 0)
2119 return (error);
2120
2121 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2122 return (SET_ERROR(ENXIO));
2123 }
2124
2125 /*
2126 * Find the best uberblock.
2127 */
2128 vdev_uberblock_load(rvd, ub, &label);
2129
2130 /*
2131 * If we weren't able to find a single valid uberblock, return failure.
2132 */
2133 if (ub->ub_txg == 0) {
2134 nvlist_free(label);
2135 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2136 }
2137
2138 /*
2139 * If the pool has an unsupported version we can't open it.
2140 */
2141 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2142 nvlist_free(label);
2143 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2144 }
2145
2146 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2147 nvlist_t *features;
2148
2149 /*
2150 * If we weren't able to find what's necessary for reading the
2151 * MOS in the label, return failure.
2152 */
2153 if (label == NULL || nvlist_lookup_nvlist(label,
2154 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2155 nvlist_free(label);
2156 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2157 ENXIO));
2158 }
2159
2160 /*
2161 * Update our in-core representation with the definitive values
2162 * from the label.
2163 */
2164 nvlist_free(spa->spa_label_features);
2165 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2166 }
2167
2168 nvlist_free(label);
2169
2170 /*
2171 * Look through entries in the label nvlist's features_for_read. If
2172 * there is a feature listed there which we don't understand then we
2173 * cannot open a pool.
2174 */
2175 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2176 nvlist_t *unsup_feat;
2177
2178 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2179 0);
2180
2181 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2182 NULL); nvp != NULL;
2183 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2184 if (!zfeature_is_supported(nvpair_name(nvp))) {
2185 VERIFY(nvlist_add_string(unsup_feat,
2186 nvpair_name(nvp), "") == 0);
2187 }
2188 }
2189
2190 if (!nvlist_empty(unsup_feat)) {
2191 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2192 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2193 nvlist_free(unsup_feat);
2194 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2195 ENOTSUP));
2196 }
2197
2198 nvlist_free(unsup_feat);
2199 }
2200
2201 /*
2202 * If the vdev guid sum doesn't match the uberblock, we have an
2203 * incomplete configuration. We first check to see if the pool
2204	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2205	 * If it is, defer the vdev_guid_sum check until later so we
2206 * can handle missing vdevs.
2207 */
2208 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2209 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2210 rvd->vdev_guid_sum != ub->ub_guid_sum)
2211 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2212
2213 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2214 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2215 spa_try_repair(spa, config);
2216 spa_config_exit(spa, SCL_ALL, FTAG);
2217 nvlist_free(spa->spa_config_splitting);
2218 spa->spa_config_splitting = NULL;
2219 }
2220
2221 /*
2222 * Initialize internal SPA structures.
2223 */
2224 spa->spa_state = POOL_STATE_ACTIVE;
2225 spa->spa_ubsync = spa->spa_uberblock;
2226 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2227 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2228 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2229 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2230 spa->spa_claim_max_txg = spa->spa_first_txg;
2231 spa->spa_prev_software_version = ub->ub_software_version;
2232
2233 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2234 if (error)
2235 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2236 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2237
2238 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2239 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2240
2241 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2242 boolean_t missing_feat_read = B_FALSE;
2243 nvlist_t *unsup_feat, *enabled_feat;
2244
2245 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2246 &spa->spa_feat_for_read_obj) != 0) {
2247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2248 }
2249
2250 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2251 &spa->spa_feat_for_write_obj) != 0) {
2252 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2253 }
2254
2255 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2256 &spa->spa_feat_desc_obj) != 0) {
2257 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2258 }
2259
2260 enabled_feat = fnvlist_alloc();
2261 unsup_feat = fnvlist_alloc();
2262
2263 if (!feature_is_supported(spa->spa_meta_objset,
2264 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2265 unsup_feat, enabled_feat))
2266 missing_feat_read = B_TRUE;
2267
2268 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2269 if (!feature_is_supported(spa->spa_meta_objset,
2270 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2271 unsup_feat, enabled_feat)) {
2272 missing_feat_write = B_TRUE;
2273 }
2274 }
2275
2276 fnvlist_add_nvlist(spa->spa_load_info,
2277 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2278
2279 if (!nvlist_empty(unsup_feat)) {
2280 fnvlist_add_nvlist(spa->spa_load_info,
2281 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2282 }
2283
2284 fnvlist_free(enabled_feat);
2285 fnvlist_free(unsup_feat);
2286
2287 if (!missing_feat_read) {
2288 fnvlist_add_boolean(spa->spa_load_info,
2289 ZPOOL_CONFIG_CAN_RDONLY);
2290 }
2291
2292 /*
2293 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2294 * twofold: to determine whether the pool is available for
2295 * import in read-write mode and (if it is not) whether the
2296 * pool is available for import in read-only mode. If the pool
2297 * is available for import in read-write mode, it is displayed
2298 * as available in userland; if it is not available for import
2299 * in read-only mode, it is displayed as unavailable in
2300 * userland. If the pool is available for import in read-only
2301 * mode but not read-write mode, it is displayed as unavailable
2302 * in userland with a special note that the pool is actually
2303 * available for open in read-only mode.
2304 *
2305 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2306 * missing a feature for write, we must first determine whether
2307 * the pool can be opened read-only before returning to
2308 * userland in order to know whether to display the
2309 * abovementioned note.
2310 */
2311 if (missing_feat_read || (missing_feat_write &&
2312 spa_writeable(spa))) {
2313 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2314 ENOTSUP));
2315 }
2316 }
2317
2318 spa->spa_is_initializing = B_TRUE;
2319 error = dsl_pool_open(spa->spa_dsl_pool);
2320 spa->spa_is_initializing = B_FALSE;
2321 if (error != 0)
2322 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2323
2324 if (!mosconfig) {
2325 uint64_t hostid;
2326 nvlist_t *policy = NULL, *nvconfig;
2327
2328 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2329 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2330
2331 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2332 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2333 char *hostname;
2334 unsigned long myhostid = 0;
2335
2336 VERIFY(nvlist_lookup_string(nvconfig,
2337 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2338
2339#ifdef _KERNEL
2340 myhostid = zone_get_hostid(NULL);
2341#else /* _KERNEL */
2342 /*
2343 * We're emulating the system's hostid in userland, so
2344 * we can't use zone_get_hostid().
2345 */
2346 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2347#endif /* _KERNEL */
2348 if (check_hostid && hostid != 0 && myhostid != 0 &&
2349 hostid != myhostid) {
2350 nvlist_free(nvconfig);
2351 cmn_err(CE_WARN, "pool '%s' could not be "
2352 "loaded as it was last accessed by "
2353 "another system (host: %s hostid: 0x%lx). "
2354 "See: http://illumos.org/msg/ZFS-8000-EY",
2355 spa_name(spa), hostname,
2356 (unsigned long)hostid);
2357 return (SET_ERROR(EBADF));
2358 }
2359 }
2360 if (nvlist_lookup_nvlist(spa->spa_config,
2361 ZPOOL_REWIND_POLICY, &policy) == 0)
2362 VERIFY(nvlist_add_nvlist(nvconfig,
2363 ZPOOL_REWIND_POLICY, policy) == 0);
2364
2365 spa_config_set(spa, nvconfig);
2366 spa_unload(spa);
2367 spa_deactivate(spa);
2368 spa_activate(spa, orig_mode);
2369
2370 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2371 }
2372
2373 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2374 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2375 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2376 if (error != 0)
2377 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2378
2379 /*
2380 * Load the bit that tells us to use the new accounting function
2381 * (raid-z deflation). If we have an older pool, this will not
2382 * be present.
2383 */
2384 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2385 if (error != 0 && error != ENOENT)
2386 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2387
2388 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2389 &spa->spa_creation_version);
2390 if (error != 0 && error != ENOENT)
2391 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2392
2393 /*
2394 * Load the persistent error log. If we have an older pool, this will
2395 * not be present.
2396 */
2397 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2398 if (error != 0 && error != ENOENT)
2399 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2400
2401 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2402 &spa->spa_errlog_scrub);
2403 if (error != 0 && error != ENOENT)
2404 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2405
2406 /*
2407 * Load the history object. If we have an older pool, this
2408 * will not be present.
2409 */
2410 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2411 if (error != 0 && error != ENOENT)
2412 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2413
2414 /*
2415 * If we're assembling the pool from the split-off vdevs of
2416 * an existing pool, we don't want to attach the spares & cache
2417 * devices.
2418 */
2419
2420 /*
2421 * Load any hot spares for this pool.
2422 */
2423 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2424 if (error != 0 && error != ENOENT)
2425 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2426 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2427 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2428 if (load_nvlist(spa, spa->spa_spares.sav_object,
2429 &spa->spa_spares.sav_config) != 0)
2430 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2431
2432 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2433 spa_load_spares(spa);
2434 spa_config_exit(spa, SCL_ALL, FTAG);
2435 } else if (error == 0) {
2436 spa->spa_spares.sav_sync = B_TRUE;
2437 }
2438
2439 /*
2440 * Load any level 2 ARC devices for this pool.
2441 */
2442 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2443 &spa->spa_l2cache.sav_object);
2444 if (error != 0 && error != ENOENT)
2445 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2446 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2447 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2448 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2449 &spa->spa_l2cache.sav_config) != 0)
2450 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2451
2452 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2453 spa_load_l2cache(spa);
2454 spa_config_exit(spa, SCL_ALL, FTAG);
2455 } else if (error == 0) {
2456 spa->spa_l2cache.sav_sync = B_TRUE;
2457 }
2458
2459 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2460
2461 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2462 if (error && error != ENOENT)
2463 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2464
2465 if (error == 0) {
2466 uint64_t autoreplace;
2467
2468 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2469 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2470 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2471 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2472 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2473 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2474 &spa->spa_dedup_ditto);
2475
2476 spa->spa_autoreplace = (autoreplace != 0);
2477 }
2478
2479 /*
2480 * If the 'autoreplace' property is set, then post a resource notifying
2481 * the ZFS DE that it should not issue any faults for unopenable
2482 * devices. We also iterate over the vdevs, and post a sysevent for any
2483 * unopenable vdevs so that the normal autoreplace handler can take
2484 * over.
2485 */
2486 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2487 spa_check_removed(spa->spa_root_vdev);
2488 /*
2489 * For the import case, this is done in spa_import(), because
2490 * at this point we're using the spare definitions from
2491 * the MOS config, not necessarily from the userland config.
2492 */
2493 if (state != SPA_LOAD_IMPORT) {
2494 spa_aux_check_removed(&spa->spa_spares);
2495 spa_aux_check_removed(&spa->spa_l2cache);
2496 }
2497 }
2498
2499 /*
2500 * Load the vdev state for all toplevel vdevs.
2501 */
2502 vdev_load(rvd);
2503
2504 /*
2505 * Propagate the leaf DTLs we just loaded all the way up the tree.
2506 */
2507 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2508 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2509 spa_config_exit(spa, SCL_ALL, FTAG);
2510
2511 /*
2512 * Load the DDTs (dedup tables).
2513 */
2514 error = ddt_load(spa);
2515 if (error != 0)
2516 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2517
2518 spa_update_dspace(spa);
2519
2520 /*
2521 * Validate the config, using the MOS config to fill in any
2522 * information which might be missing. If we fail to validate
2523 * the config then declare the pool unfit for use. If we're
2524 * assembling a pool from a split, the log is not transferred
2525 * over.
2526 */
2527 if (type != SPA_IMPORT_ASSEMBLE) {
2528 nvlist_t *nvconfig;
2529
2530 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2531 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2532
2533 if (!spa_config_valid(spa, nvconfig)) {
2534 nvlist_free(nvconfig);
2535 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2536 ENXIO));
2537 }
2538 nvlist_free(nvconfig);
2539
2540 /*
2541 * Now that we've validated the config, check the state of the
2542 * root vdev. If it can't be opened, it indicates one or
2543 * more toplevel vdevs are faulted.
2544 */
2545 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2546 return (SET_ERROR(ENXIO));
2547
2548 if (spa_check_logs(spa)) {
2549 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2550 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2551 }
2552 }
2553
2554 if (missing_feat_write) {
2555 ASSERT(state == SPA_LOAD_TRYIMPORT);
2556
2557 /*
2558 * At this point, we know that we can open the pool in
2559 * read-only mode but not read-write mode. We now have enough
2560 * information and can return to userland.
2561 */
2562 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2563 }
2564
2565 /*
2566 * We've successfully opened the pool, verify that we're ready
2567 * to start pushing transactions.
2568 */
2569 if (state != SPA_LOAD_TRYIMPORT) {
2570 if (error = spa_load_verify(spa))
2571 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2572 error));
2573 }
2574
2575 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2576 spa->spa_load_max_txg == UINT64_MAX)) {
2577 dmu_tx_t *tx;
2578 int need_update = B_FALSE;
2579
2580 ASSERT(state != SPA_LOAD_TRYIMPORT);
2581
2582 /*
2583 * Claim log blocks that haven't been committed yet.
2584 * This must all happen in a single txg.
2585 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2586 * invoked from zil_claim_log_block()'s i/o done callback.
2587 * Price of rollback is that we abandon the log.
2588 */
2589 spa->spa_claiming = B_TRUE;
2590
2591 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2592 spa_first_txg(spa));
2593 (void) dmu_objset_find(spa_name(spa),
2594 zil_claim, tx, DS_FIND_CHILDREN);
2595 dmu_tx_commit(tx);
2596
2597 spa->spa_claiming = B_FALSE;
2598
2599 spa_set_log_state(spa, SPA_LOG_GOOD);
2600 spa->spa_sync_on = B_TRUE;
2601 txg_sync_start(spa->spa_dsl_pool);
2602
2603 /*
2604 * Wait for all claims to sync. We sync up to the highest
2605 * claimed log block birth time so that claimed log blocks
2606 * don't appear to be from the future. spa_claim_max_txg
2607 * will have been set for us by either zil_check_log_chain()
2608 * (invoked from spa_check_logs()) or zil_claim() above.
2609 */
2610 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2611
2612 /*
2613 * If the config cache is stale, or we have uninitialized
2614 * metaslabs (see spa_vdev_add()), then update the config.
2615 *
2616 * If this is a verbatim import, trust the current
2617 * in-core spa_config and update the disk labels.
2618 */
2619 if (config_cache_txg != spa->spa_config_txg ||
2620 state == SPA_LOAD_IMPORT ||
2621 state == SPA_LOAD_RECOVER ||
2622 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2623 need_update = B_TRUE;
2624
2625 for (int c = 0; c < rvd->vdev_children; c++)
2626 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2627 need_update = B_TRUE;
2628
2629 /*
2630		 * Update the config cache asynchronously in case we're the
2631 * root pool, in which case the config cache isn't writable yet.
2632 */
2633 if (need_update)
2634 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2635
2636 /*
2637 * Check all DTLs to see if anything needs resilvering.
2638 */
2639 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2640 vdev_resilver_needed(rvd, NULL, NULL))
2641 spa_async_request(spa, SPA_ASYNC_RESILVER);
2642
2643 /*
2644 * Log the fact that we booted up (so that we can detect if
2645 * we rebooted in the middle of an operation).
2646 */
2647 spa_history_log_version(spa, "open");
2648
2649 /*
2650 * Delete any inconsistent datasets.
2651 */
2652 (void) dmu_objset_find(spa_name(spa),
2653 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2654
2655 /*
2656 * Clean up any stale temporary dataset userrefs.
2657 */
2658 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2659 }
2660
2661 return (0);
2662}
2663
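/*
 * Unload and deactivate the pool, back spa_load_max_txg off by one, and
 * retry the load.  Called repeatedly from spa_load_best() while rewinding.
 */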
2664static int
2665spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2666{
2667 int mode = spa->spa_mode;
2668
2669 spa_unload(spa);
2670 spa_deactivate(spa);
2671
2672 spa->spa_load_max_txg--;
2673
2674 spa_activate(spa, mode);
2675 spa_async_suspend(spa);
2676
2677 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2678}
2679
2680/*
2681 * If spa_load() fails this function will try loading prior txg's. If
2682 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2683 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2684 * function will not rewind the pool and will return the same error as
2685 * spa_load().
2686 */
2687static int
2688spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2689 uint64_t max_request, int rewind_flags)
2690{
2691 nvlist_t *loadinfo = NULL;
2692 nvlist_t *config = NULL;
2693 int load_error, rewind_error;
2694 uint64_t safe_rewind_txg;
2695 uint64_t min_txg;
2696
2697 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2698 spa->spa_load_max_txg = spa->spa_load_txg;
2699 spa_set_log_state(spa, SPA_LOG_CLEAR);
2700 } else {
2701 spa->spa_load_max_txg = max_request;
2702 }
2703
2704 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2705 mosconfig);
2706 if (load_error == 0)
2707 return (0);
2708
2709 if (spa->spa_root_vdev != NULL)
2710 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2711
2712 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2713 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2714
2715 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2716 nvlist_free(config);
2717 return (load_error);
2718 }
2719
2720 if (state == SPA_LOAD_RECOVER) {
2721 /* Price of rolling back is discarding txgs, including log */
2722 spa_set_log_state(spa, SPA_LOG_CLEAR);
2723 } else {
2724 /*
2725 * If we aren't rolling back save the load info from our first
2726 * import attempt so that we can restore it after attempting
2727 * to rewind.
2728 */
2729 loadinfo = spa->spa_load_info;
2730 spa->spa_load_info = fnvlist_alloc();
2731 }
2732
2733 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2734 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2735 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2736 TXG_INITIAL : safe_rewind_txg;
2737
2738 /*
2739 * Continue as long as we're finding errors, we're still within
2740 * the acceptable rewind range, and we're still finding uberblocks
2741	 * the acceptable rewind range, and we're still finding uberblocks.
2742 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2743 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2744 if (spa->spa_load_max_txg < safe_rewind_txg)
2745 spa->spa_extreme_rewind = B_TRUE;
2746 rewind_error = spa_load_retry(spa, state, mosconfig);
2747 }
2748
2749 spa->spa_extreme_rewind = B_FALSE;
2750 spa->spa_load_max_txg = UINT64_MAX;
2751
2752 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2753 spa_config_set(spa, config);
2754
2755 if (state == SPA_LOAD_RECOVER) {
2756 ASSERT3P(loadinfo, ==, NULL);
2757 return (rewind_error);
2758 } else {
2759 /* Store the rewind info as part of the initial load info */
2760 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2761 spa->spa_load_info);
2762
2763 /* Restore the initial load info */
2764 fnvlist_free(spa->spa_load_info);
2765 spa->spa_load_info = loadinfo;
2766
2767 return (load_error);
2768 }
2769}
2770
2771/*
2772 * Pool Open/Import
2773 *
2774 * The import case is identical to an open except that the configuration is sent
2775 * down from userland, instead of grabbed from the configuration cache. For the
2776 * case of an open, the pool configuration will exist in the
2777 * POOL_STATE_UNINITIALIZED state.
2778 *
2779 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2780 * the same time we open the pool, without having to keep around the spa_t in
2781 * some ambiguous state.
2782 */
2783static int
2784spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2785 nvlist_t **config)
2786{
2787 spa_t *spa;
2788 spa_load_state_t state = SPA_LOAD_OPEN;
2789 int error;
2790 int locked = B_FALSE;
2791 int firstopen = B_FALSE;
2792
2793 *spapp = NULL;
2794
2795 /*
2796 * As disgusting as this is, we need to support recursive calls to this
2797 * function because dsl_dir_open() is called during spa_load(), and ends
2798 * up calling spa_open() again. The real fix is to figure out how to
2799 * avoid dsl_dir_open() calling this in the first place.
2800 */
2801 if (mutex_owner(&spa_namespace_lock) != curthread) {
2802 mutex_enter(&spa_namespace_lock);
2803 locked = B_TRUE;
2804 }
2805
2806 if ((spa = spa_lookup(pool)) == NULL) {
2807 if (locked)
2808 mutex_exit(&spa_namespace_lock);
2809 return (SET_ERROR(ENOENT));
2810 }
2811
2812 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2813 zpool_rewind_policy_t policy;
2814
2815 firstopen = B_TRUE;
2816
2817 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2818 &policy);
2819 if (policy.zrp_request & ZPOOL_DO_REWIND)
2820 state = SPA_LOAD_RECOVER;
2821
2822 spa_activate(spa, spa_mode_global);
2823
2824 if (state != SPA_LOAD_RECOVER)
2825 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2826
2827 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2828 policy.zrp_request);
2829
2830 if (error == EBADF) {
2831 /*
2832			 * EBADF), it means that one of the vdev labels indicates
2833 * EBADF), it indicates that one of the vdevs indicates
2834 * that the pool has been exported or destroyed. If
2835 * this is the case, the config cache is out of sync and
2836 * we should remove the pool from the namespace.
2837 */
2838 spa_unload(spa);
2839 spa_deactivate(spa);
2840 spa_config_sync(spa, B_TRUE, B_TRUE);
2841 spa_remove(spa);
2842 if (locked)
2843 mutex_exit(&spa_namespace_lock);
2844 return (SET_ERROR(ENOENT));
2845 }
2846
2847 if (error) {
2848 /*
2849 * We can't open the pool, but we still have useful
2850 * information: the state of each vdev after the
2851 * attempted vdev_open(). Return this to the user.
2852 */
2853 if (config != NULL && spa->spa_config) {
2854 VERIFY(nvlist_dup(spa->spa_config, config,
2855 KM_SLEEP) == 0);
2856 VERIFY(nvlist_add_nvlist(*config,
2857 ZPOOL_CONFIG_LOAD_INFO,
2858 spa->spa_load_info) == 0);
2859 }
2860 spa_unload(spa);
2861 spa_deactivate(spa);
2862 spa->spa_last_open_failed = error;
2863 if (locked)
2864 mutex_exit(&spa_namespace_lock);
2865 *spapp = NULL;
2866 return (error);
2867 }
2868 }
2869
2870 spa_open_ref(spa, tag);
2871
2872 if (config != NULL)
2873 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2874
2875 /*
2876 * If we've recovered the pool, pass back any information we
2877 * gathered while doing the load.
2878 */
2879 if (state == SPA_LOAD_RECOVER) {
2880 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2881 spa->spa_load_info) == 0);
2882 }
2883
2884 if (locked) {
2885 spa->spa_last_open_failed = 0;
2886 spa->spa_last_ubsync_txg = 0;
2887 spa->spa_load_txg = 0;
2888 mutex_exit(&spa_namespace_lock);
2889#ifdef __FreeBSD__
2890#ifdef _KERNEL
2891 if (firstopen)
2892 zvol_create_minors(spa->spa_name);
2893#endif
2894#endif
2895 }
2896
2897 *spapp = spa;
2898
2899 return (0);
2900}
2901
2902int
2903spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2904 nvlist_t **config)
2905{
2906 return (spa_open_common(name, spapp, tag, policy, config));
2907}
2908
2909int
2910spa_open(const char *name, spa_t **spapp, void *tag)
2911{
2912 return (spa_open_common(name, spapp, tag, NULL, NULL));
2913}
2914
2915/*
2916 * Lookup the given spa_t, incrementing the inject count in the process,
2917 * preventing it from being exported or destroyed.
2918 */
2919spa_t *
2920spa_inject_addref(char *name)
2921{
2922 spa_t *spa;
2923
2924 mutex_enter(&spa_namespace_lock);
2925 if ((spa = spa_lookup(name)) == NULL) {
2926 mutex_exit(&spa_namespace_lock);
2927 return (NULL);
2928 }
2929 spa->spa_inject_ref++;
2930 mutex_exit(&spa_namespace_lock);
2931
2932 return (spa);
2933}
2934
2935void
2936spa_inject_delref(spa_t *spa)
2937{
2938 mutex_enter(&spa_namespace_lock);
2939 spa->spa_inject_ref--;
2940 mutex_exit(&spa_namespace_lock);
2941}
2942
2943/*
2944 * Add spares device information to the nvlist.
2945 */
2946static void
2947spa_add_spares(spa_t *spa, nvlist_t *config)
2948{
2949 nvlist_t **spares;
2950 uint_t i, nspares;
2951 nvlist_t *nvroot;
2952 uint64_t guid;
2953 vdev_stat_t *vs;
2954 uint_t vsc;
2955 uint64_t pool;
2956
2957 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2958
2959 if (spa->spa_spares.sav_count == 0)
2960 return;
2961
2962 VERIFY(nvlist_lookup_nvlist(config,
2963 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2964 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2965 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2966 if (nspares != 0) {
2967 VERIFY(nvlist_add_nvlist_array(nvroot,
2968 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2969 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2970 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2971
2972 /*
2973 * Go through and find any spares which have since been
2974 * repurposed as an active spare. If this is the case, update
2975 * their status appropriately.
2976 */
2977 for (i = 0; i < nspares; i++) {
2978 VERIFY(nvlist_lookup_uint64(spares[i],
2979 ZPOOL_CONFIG_GUID, &guid) == 0);
2980 if (spa_spare_exists(guid, &pool, NULL) &&
2981 pool != 0ULL) {
2982 VERIFY(nvlist_lookup_uint64_array(
2983 spares[i], ZPOOL_CONFIG_VDEV_STATS,
2984 (uint64_t **)&vs, &vsc) == 0);
2985 vs->vs_state = VDEV_STATE_CANT_OPEN;
2986 vs->vs_aux = VDEV_AUX_SPARED;
2987 }
2988 }
2989 }
2990}
2991
2992/*
2993 * Add l2cache device information to the nvlist, including vdev stats.
2994 */
2995static void
2996spa_add_l2cache(spa_t *spa, nvlist_t *config)
2997{
2998 nvlist_t **l2cache;
2999 uint_t i, j, nl2cache;
3000 nvlist_t *nvroot;
3001 uint64_t guid;
3002 vdev_t *vd;
3003 vdev_stat_t *vs;
3004 uint_t vsc;
3005
3006 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3007
3008 if (spa->spa_l2cache.sav_count == 0)
3009 return;
3010
3011 VERIFY(nvlist_lookup_nvlist(config,
3012 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3013 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3014 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3015 if (nl2cache != 0) {
3016 VERIFY(nvlist_add_nvlist_array(nvroot,
3017 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3018 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3019 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3020
3021 /*
3022 * Update level 2 cache device stats.
3023 */
3024
3025 for (i = 0; i < nl2cache; i++) {
3026 VERIFY(nvlist_lookup_uint64(l2cache[i],
3027 ZPOOL_CONFIG_GUID, &guid) == 0);
3028
3029 vd = NULL;
3030 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3031 if (guid ==
3032 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3033 vd = spa->spa_l2cache.sav_vdevs[j];
3034 break;
3035 }
3036 }
3037 ASSERT(vd != NULL);
3038
3039 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3040 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3041 == 0);
3042 vdev_get_stats(vd, vs);
3043 }
3044 }
3045}
3046
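/*
 * Add feature reference counts (from the for_read and for_write feature
 * ZAP objects) to the config nvlist under ZPOOL_CONFIG_FEATURE_STATS.
 */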
3047static void
3048spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3049{
3050 nvlist_t *features;
3051 zap_cursor_t zc;
3052 zap_attribute_t za;
3053
3054 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3055 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3056
3057 if (spa->spa_feat_for_read_obj != 0) {
3058 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3059 spa->spa_feat_for_read_obj);
3060 zap_cursor_retrieve(&zc, &za) == 0;
3061 zap_cursor_advance(&zc)) {
3062 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3063 za.za_num_integers == 1);
3064 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3065 za.za_first_integer));
3066 }
3067 zap_cursor_fini(&zc);
3068 }
3069
3070 if (spa->spa_feat_for_write_obj != 0) {
3071 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3072 spa->spa_feat_for_write_obj);
3073 zap_cursor_retrieve(&zc, &za) == 0;
3074 zap_cursor_advance(&zc)) {
3075 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3076 za.za_num_integers == 1);
3077 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3078 za.za_first_integer));
3079 }
3080 zap_cursor_fini(&zc);
3081 }
3082
3083 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3084 features) == 0);
3085 nvlist_free(features);
3086}
3087
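/*
 * Generate the current pool configuration and statistics, including load
 * times, error counts, spare, l2cache and feature information.  The
 * alternate root is returned even for pools that fail to open.
 */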
3088int
3089spa_get_stats(const char *name, nvlist_t **config,
3090 char *altroot, size_t buflen)
3091{
3092 int error;
3093 spa_t *spa;
3094
3095 *config = NULL;
3096 error = spa_open_common(name, &spa, FTAG, NULL, config);
3097
3098 if (spa != NULL) {
3099 /*
3100 * This still leaves a window of inconsistency where the spares
3101 * or l2cache devices could change and the config would be
3102 * self-inconsistent.
3103 */
3104 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3105
3106 if (*config != NULL) {
3107 uint64_t loadtimes[2];
3108
3109 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3110 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3111 VERIFY(nvlist_add_uint64_array(*config,
3112 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3113
3114 VERIFY(nvlist_add_uint64(*config,
3115 ZPOOL_CONFIG_ERRCOUNT,
3116 spa_get_errlog_size(spa)) == 0);
3117
3118 if (spa_suspended(spa))
3119 VERIFY(nvlist_add_uint64(*config,
3120 ZPOOL_CONFIG_SUSPENDED,
3121 spa->spa_failmode) == 0);
3122
3123 spa_add_spares(spa, *config);
3124 spa_add_l2cache(spa, *config);
3125 spa_add_feature_stats(spa, *config);
3126 }
3127 }
3128
3129 /*
3130 * We want to get the alternate root even for faulted pools, so we cheat
3131 * and call spa_lookup() directly.
3132 */
3133 if (altroot) {
3134 if (spa == NULL) {
3135 mutex_enter(&spa_namespace_lock);
3136 spa = spa_lookup(name);
3137 if (spa)
3138 spa_altroot(spa, altroot, buflen);
3139 else
3140 altroot[0] = '\0';
3141 spa = NULL;
3142 mutex_exit(&spa_namespace_lock);
3143 } else {
3144 spa_altroot(spa, altroot, buflen);
3145 }
3146 }
3147
3148 if (spa != NULL) {
3149 spa_config_exit(spa, SCL_CONFIG, FTAG);
3150 spa_close(spa, FTAG);
3151 }
3152
3153 return (error);
3154}
3155
3156/*
3157 * Validate that the auxiliary device array is well formed. We must have an
3158 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3159 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3160 * specified, as long as they are well-formed.
3161 */
3162static int
3163spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3164 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3165 vdev_labeltype_t label)
3166{
3167 nvlist_t **dev;
3168 uint_t i, ndev;
3169 vdev_t *vd;
3170 int error;
3171
3172 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3173
3174 /*
3175 * It's acceptable to have no devs specified.
3176 */
3177 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3178 return (0);
3179
3180 if (ndev == 0)
3181 return (SET_ERROR(EINVAL));
3182
3183 /*
3184 * Make sure the pool is formatted with a version that supports this
3185 * device type.
3186 */
3187 if (spa_version(spa) < version)
3188 return (SET_ERROR(ENOTSUP));
3189
3190 /*
3191 * Set the pending device list so we correctly handle device in-use
3192 * checking.
3193 */
3194 sav->sav_pending = dev;
3195 sav->sav_npending = ndev;
3196
3197 for (i = 0; i < ndev; i++) {
3198 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3199 mode)) != 0)
3200 goto out;
3201
3202 if (!vd->vdev_ops->vdev_op_leaf) {
3203 vdev_free(vd);
3204 error = SET_ERROR(EINVAL);
3205 goto out;
3206 }
3207
3208 /*
3209 * The L2ARC currently only supports disk devices in
3210 * kernel context. For user-level testing, we allow it.
3211 */
3212#ifdef _KERNEL
3213 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3214 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3215 error = SET_ERROR(ENOTBLK);
3216 vdev_free(vd);
3217 goto out;
3218 }
3219#endif
3220 vd->vdev_top = vd;
3221
3222 if ((error = vdev_open(vd)) == 0 &&
3223 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3224 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3225 vd->vdev_guid) == 0);
3226 }
3227
3228 vdev_free(vd);
3229
3230 if (error &&
3231 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3232 goto out;
3233 else
3234 error = 0;
3235 }
3236
3237out:
3238 sav->sav_pending = NULL;
3239 sav->sav_npending = 0;
3240 return (error);
3241}
3242
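/*
 * Validate both the spare and l2cache device lists for this pool.
 */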
3243static int
3244spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3245{
3246 int error;
3247
3248 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3249
3250 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3251 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3252 VDEV_LABEL_SPARE)) != 0) {
3253 return (error);
3254 }
3255
3256 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3257 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3258 VDEV_LABEL_L2CACHE));
3259}
3260
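/*
 * Set the given list of devices in the aux vdev's sav_config, concatenating
 * with any devices already present there.
 */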
3261static void
3262spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3263 const char *config)
3264{
3265 int i;
3266
3267 if (sav->sav_config != NULL) {
3268 nvlist_t **olddevs;
3269 uint_t oldndevs;
3270 nvlist_t **newdevs;
3271
3272 /*
3273		 * Generate new dev list by concatenating with the
3274 * current dev list.
3275 */
3276 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3277 &olddevs, &oldndevs) == 0);
3278
3279 newdevs = kmem_alloc(sizeof (void *) *
3280 (ndevs + oldndevs), KM_SLEEP);
3281 for (i = 0; i < oldndevs; i++)
3282 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3283 KM_SLEEP) == 0);
3284 for (i = 0; i < ndevs; i++)
3285 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3286 KM_SLEEP) == 0);
3287
3288 VERIFY(nvlist_remove(sav->sav_config, config,
3289 DATA_TYPE_NVLIST_ARRAY) == 0);
3290
3291 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3292 config, newdevs, ndevs + oldndevs) == 0);
3293 for (i = 0; i < oldndevs + ndevs; i++)
3294 nvlist_free(newdevs[i]);
3295 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3296 } else {
3297 /*
3298 * Generate a new dev list.
3299 */
3300 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3301 KM_SLEEP) == 0);
3302 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3303 devs, ndevs) == 0);
3304 }
3305}
3306
3307/*
3308 * Stop and drop level 2 ARC devices
3309 */
3310void
3311spa_l2cache_drop(spa_t *spa)
3312{
3313 vdev_t *vd;
3314 int i;
3315 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3316
3317 for (i = 0; i < sav->sav_count; i++) {
3318 uint64_t pool;
3319
3320 vd = sav->sav_vdevs[i];
3321 ASSERT(vd != NULL);
3322
3323 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3324 pool != 0ULL && l2arc_vdev_present(vd))
3325 l2arc_remove_vdev(vd);
3326 }
3327}
3328
3329/*
3330 * Pool Creation
3331 */
3332int
3333spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3334 nvlist_t *zplprops)
3335{
3336 spa_t *spa;
3337 char *altroot = NULL;
3338 vdev_t *rvd;
3339 dsl_pool_t *dp;
3340 dmu_tx_t *tx;
3341 int error = 0;
3342 uint64_t txg = TXG_INITIAL;
3343 nvlist_t **spares, **l2cache;
3344 uint_t nspares, nl2cache;
3345 uint64_t version, obj;
3346 boolean_t has_features;
3347
3348 /*
3349 * If this pool already exists, return failure.
3350 */
3351 mutex_enter(&spa_namespace_lock);
3352 if (spa_lookup(pool) != NULL) {
3353 mutex_exit(&spa_namespace_lock);
3354 return (SET_ERROR(EEXIST));
3355 }
3356
3357 /*
3358 * Allocate a new spa_t structure.
3359 */
3360 (void) nvlist_lookup_string(props,
3361 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3362 spa = spa_add(pool, NULL, altroot);
3363 spa_activate(spa, spa_mode_global);
3364
3365 if (props && (error = spa_prop_validate(spa, props))) {
3366 spa_deactivate(spa);
3367 spa_remove(spa);
3368 mutex_exit(&spa_namespace_lock);
3369 return (error);
3370 }
3371
3372 has_features = B_FALSE;
3373 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3374 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3375 if (zpool_prop_feature(nvpair_name(elem)))
3376 has_features = B_TRUE;
3377 }
3378
3379 if (has_features || nvlist_lookup_uint64(props,
3380 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3381 version = SPA_VERSION;
3382 }
3383 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3384
3385 spa->spa_first_txg = txg;
3386 spa->spa_uberblock.ub_txg = txg - 1;
3387 spa->spa_uberblock.ub_version = version;
3388 spa->spa_ubsync = spa->spa_uberblock;
3389
3390 /*
3391 * Create "The Godfather" zio to hold all async IOs
3392 */
3393 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3394 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3395
3396 /*
3397 * Create the root vdev.
3398 */
3399 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3400
3401 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3402
3403 ASSERT(error != 0 || rvd != NULL);
3404 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3405
3406 if (error == 0 && !zfs_allocatable_devs(nvroot))
3407 error = SET_ERROR(EINVAL);
3408
3409 if (error == 0 &&
3410 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3411 (error = spa_validate_aux(spa, nvroot, txg,
3412 VDEV_ALLOC_ADD)) == 0) {
3413 for (int c = 0; c < rvd->vdev_children; c++) {
3414 vdev_metaslab_set_size(rvd->vdev_child[c]);
3415 vdev_expand(rvd->vdev_child[c], txg);
3416 }
3417 }
3418
3419 spa_config_exit(spa, SCL_ALL, FTAG);
3420
3421 if (error != 0) {
3422 spa_unload(spa);
3423 spa_deactivate(spa);
3424 spa_remove(spa);
3425 mutex_exit(&spa_namespace_lock);
3426 return (error);
3427 }
3428
3429 /*
3430 * Get the list of spares, if specified.
3431 */
3432 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3433 &spares, &nspares) == 0) {
3434 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3435 KM_SLEEP) == 0);
3436 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3437 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3438 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3439 spa_load_spares(spa);
3440 spa_config_exit(spa, SCL_ALL, FTAG);
3441 spa->spa_spares.sav_sync = B_TRUE;
3442 }
3443
3444 /*
3445 * Get the list of level 2 cache devices, if specified.
3446 */
3447 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3448 &l2cache, &nl2cache) == 0) {
3449 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3450 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3451 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3452 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3453 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3454 spa_load_l2cache(spa);
3455 spa_config_exit(spa, SCL_ALL, FTAG);
3456 spa->spa_l2cache.sav_sync = B_TRUE;
3457 }
3458
3459 spa->spa_is_initializing = B_TRUE;
3460 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3461 spa->spa_meta_objset = dp->dp_meta_objset;
3462 spa->spa_is_initializing = B_FALSE;
3463
3464 /*
3465 * Create DDTs (dedup tables).
3466 */
3467 ddt_create(spa);
3468
3469 spa_update_dspace(spa);
3470
3471 tx = dmu_tx_create_assigned(dp, txg);
3472
3473 /*
3474 * Create the pool config object.
3475 */
3476 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3477 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3478 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3479
3480 if (zap_add(spa->spa_meta_objset,
3481 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3482 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3483 cmn_err(CE_PANIC, "failed to add pool config");
3484 }
3485
3486 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3487 spa_feature_create_zap_objects(spa, tx);
3488
3489 if (zap_add(spa->spa_meta_objset,
3490 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3491 sizeof (uint64_t), 1, &version, tx) != 0) {
3492 cmn_err(CE_PANIC, "failed to add pool version");
3493 }
3494
3495 /* Newly created pools with the right version are always deflated. */
3496 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3497 spa->spa_deflate = TRUE;
3498 if (zap_add(spa->spa_meta_objset,
3499 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3500 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3501 cmn_err(CE_PANIC, "failed to add deflate");
3502 }
3503 }
3504
3505 /*
3506 * Create the deferred-free bpobj. Turn off compression
3507 * because sync-to-convergence takes longer if the blocksize
3508 * keeps changing.
3509 */
3510 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3511 dmu_object_set_compress(spa->spa_meta_objset, obj,
3512 ZIO_COMPRESS_OFF, tx);
3513 if (zap_add(spa->spa_meta_objset,
3514 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3515 sizeof (uint64_t), 1, &obj, tx) != 0) {
3516 cmn_err(CE_PANIC, "failed to add bpobj");
3517 }
3518 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3519 spa->spa_meta_objset, obj));
3520
3521 /*
3522 * Create the pool's history object.
3523 */
3524 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3525 spa_history_create_obj(spa, tx);
3526
3527 /*
3528 * Set pool properties.
3529 */
3530 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3531 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3532 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3533 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3534
3535 if (props != NULL) {
3536 spa_configfile_set(spa, props, B_FALSE);
3537 spa_sync_props(props, tx);
3538 }
3539
3540 dmu_tx_commit(tx);
3541
3542 spa->spa_sync_on = B_TRUE;
3543 txg_sync_start(spa->spa_dsl_pool);
3544
3545 /*
3546 * We explicitly wait for the first transaction to complete so that our
3547 * bean counters are appropriately updated.
3548 */
3549 txg_wait_synced(spa->spa_dsl_pool, txg);
3550
3551 spa_config_sync(spa, B_FALSE, B_TRUE);
3552
3553 spa_history_log_version(spa, "create");
3554
3555 spa->spa_minref = refcount_count(&spa->spa_refcount);
3556
3557 mutex_exit(&spa_namespace_lock);
3558
3559 return (0);
3560}
3561
3562#ifdef _KERNEL
3563#if defined(sun)
3564/*
3565 * Get the root pool information from the root disk, then import the root pool
3566 * at system boot time.
3567 */
3568extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3569
3570static nvlist_t *
3571spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3572{
3573 nvlist_t *config;
3574 nvlist_t *nvtop, *nvroot;
3575 uint64_t pgid;
3576
3577 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3578 return (NULL);
3579
3580 /*
3581 * Add this top-level vdev to the child array.
3582 */
3583 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3584 &nvtop) == 0);
3585 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3586 &pgid) == 0);
3587 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3588
3589 /*
3590 * Put this pool's top-level vdevs into a root vdev.
3591 */
3592 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3593 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3594 VDEV_TYPE_ROOT) == 0);
3595 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3596 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3597 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3598 &nvtop, 1) == 0);
3599
3600 /*
3601 * Replace the existing vdev_tree with the new root vdev in
3602 * this pool's configuration (remove the old, add the new).
3603 */
3604 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3605 nvlist_free(nvroot);
3606 return (config);
3607}
3608
3609/*
3610 * Walk the vdev tree and see if we can find a device with "better"
3611 * configuration. A configuration is "better" if the label on that
3612 * device has a more recent txg.
3613 */
3614static void
3615spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3616{
3617 for (int c = 0; c < vd->vdev_children; c++)
3618 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3619
3620 if (vd->vdev_ops->vdev_op_leaf) {
3621 nvlist_t *label;
3622 uint64_t label_txg;
3623
3624 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3625 &label) != 0)
3626 return;
3627
3628 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3629 &label_txg) == 0);
3630
3631 /*
3632 * Do we have a better boot device?
3633 */
3634 if (label_txg > *txg) {
3635 *txg = label_txg;
3636 *avd = vd;
3637 }
3638 nvlist_free(label);
3639 }
3640}
3641
3642/*
3643 * Import a root pool.
3644 *
3645 * For x86, devpath_list will consist of the devid and/or physpath name of
3646 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3647 * The GRUB "findroot" command will return the vdev we should boot.
3648 *
3649 * For SPARC, devpath_list consists of the physpath name of the booting device,
3650 * regardless of whether the root pool is a single-device pool or a mirrored pool.
3651 * e.g.
3652 * "/pci@1f,0/ide@d/disk@0,0:a"
3653 */
3654int
3655spa_import_rootpool(char *devpath, char *devid)
3656{
3657 spa_t *spa;
3658 vdev_t *rvd, *bvd, *avd = NULL;
3659 nvlist_t *config, *nvtop;
3660 uint64_t guid, txg;
3661 char *pname;
3662 int error;
3663
3664 /*
3665 * Read the label from the boot device and generate a configuration.
3666 */
3667 config = spa_generate_rootconf(devpath, devid, &guid);
3668#if defined(_OBP) && defined(_KERNEL)
3669 if (config == NULL) {
3670 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3671 /* iscsi boot */
3672 get_iscsi_bootpath_phy(devpath);
3673 config = spa_generate_rootconf(devpath, devid, &guid);
3674 }
3675 }
3676#endif
3677 if (config == NULL) {
3678 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3679 devpath);
3680 return (SET_ERROR(EIO));
3681 }
3682
3683 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3684 &pname) == 0);
3685 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3686
3687 mutex_enter(&spa_namespace_lock);
3688 if ((spa = spa_lookup(pname)) != NULL) {
3689 /*
3690 * Remove the existing root pool from the namespace so that we
3691 * can replace it with the correct config we just read in.
3692 */
3693 spa_remove(spa);
3694 }
3695
3696 spa = spa_add(pname, config, NULL);
3697 spa->spa_is_root = B_TRUE;
3698 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3699
3700 /*
3701 * Build up a vdev tree based on the boot device's label config.
3702 */
3703 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3704 &nvtop) == 0);
3705 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3706 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3707 VDEV_ALLOC_ROOTPOOL);
3708 spa_config_exit(spa, SCL_ALL, FTAG);
3709 if (error) {
3710 mutex_exit(&spa_namespace_lock);
3711 nvlist_free(config);
3712 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3713 pname);
3714 return (error);
3715 }
3716
3717 /*
3718 * Get the boot vdev.
3719 */
3720 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3721 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3722 (u_longlong_t)guid);
3723 error = SET_ERROR(ENOENT);
3724 goto out;
3725 }
3726
3727 /*
3728 * Determine if there is a better boot device.
3729 */
3730 avd = bvd;
3731 spa_alt_rootvdev(rvd, &avd, &txg);
3732 if (avd != bvd) {
3733 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3734 "try booting from '%s'", avd->vdev_path);
3735 error = SET_ERROR(EINVAL);
3736 goto out;
3737 }
3738
3739 /*
3740 * If the boot device is part of a spare vdev then ensure that
3741 * we're booting off the active spare.
3742 */
3743 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3744 !bvd->vdev_isspare) {
3745 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3746 "try booting from '%s'",
3747 bvd->vdev_parent->
3748 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3749 error = SET_ERROR(EINVAL);
3750 goto out;
3751 }
3752
3753 error = 0;
3754out:
3755 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3756 vdev_free(rvd);
3757 spa_config_exit(spa, SCL_ALL, FTAG);
3758 mutex_exit(&spa_namespace_lock);
3759
3760 nvlist_free(config);
3761 return (error);
3762}
3763
3764#else
3765
3766extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
3767 uint64_t *count);
3768
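/*
 * Generate a root pool configuration for the named pool by reading the labels
 * of its GEOM providers, selecting the config with the highest txg and
 * wrapping its top-level vdevs in a root vdev.  Holes and missing top-level
 * vdevs are filled in with placeholder entries.
 */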
3769static nvlist_t *
3770spa_generate_rootconf(const char *name)
3771{
3772 nvlist_t **configs, **tops;
3773 nvlist_t *config;
3774 nvlist_t *best_cfg, *nvtop, *nvroot;
3775 uint64_t *holes;
3776 uint64_t best_txg;
3777 uint64_t nchildren;
3778 uint64_t pgid;
3779 uint64_t count;
3780 uint64_t i;
3781 uint_t nholes;
3782
3783 if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
3784 return (NULL);
3785
3786 ASSERT3U(count, !=, 0);
3787 best_txg = 0;
3788 for (i = 0; i < count; i++) {
3789 uint64_t txg;
3790
3791 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
3792 &txg) == 0);
3793 if (txg > best_txg) {
3794 best_txg = txg;
3795 best_cfg = configs[i];
3796 }
3797 }
3798
3799 /*
3800 * Multi-vdev root pool configuration discovery is not supported yet.
3801 */
3802 nchildren = 1;
3803 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
3804 holes = NULL;
3805 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
3806 &holes, &nholes);
3807
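	/*
	 * Build one entry per top-level vdev: copy each discovered label's
	 * vdev tree into its slot, insert VDEV_TYPE_HOLE placeholders for
	 * known holes, and mark any slot still unfilled as VDEV_TYPE_MISSING.
	 */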
3808 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
3809 for (i = 0; i < nchildren; i++) {
3810 if (i >= count)
3811 break;
3812 if (configs[i] == NULL)
3813 continue;
3814 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
3815 &nvtop) == 0);
3816 nvlist_dup(nvtop, &tops[i], KM_SLEEP);
3817 }
3818 for (i = 0; holes != NULL && i < nholes; i++) {
3819 if (i >= nchildren)
3820 continue;
3821 if (tops[holes[i]] != NULL)
3822 continue;
3823 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
3824 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
3825 VDEV_TYPE_HOLE) == 0);
3826 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
3827 holes[i]) == 0);
3828 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
3829 0) == 0);
3830 }
3831 for (i = 0; i < nchildren; i++) {
3832 if (tops[i] != NULL)
3833 continue;
3834 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
3835 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
3836 VDEV_TYPE_MISSING) == 0);
3837 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
3838 i) == 0);
3839 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
3840 0) == 0);
3841 }
3842
3843 /*
3844 * Create pool config based on the best vdev config.
3845 */
3846 nvlist_dup(best_cfg, &config, KM_SLEEP);
3847
3848 /*
3849 * Put this pool's top-level vdevs into a root vdev.
3850 */
3851 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3852 &pgid) == 0);
3853 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3854 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3855 VDEV_TYPE_ROOT) == 0);
3856 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3857 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3858 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3859 tops, nchildren) == 0);
3860
3861 /*
3862 * Replace the existing vdev_tree with the new root vdev in
3863 * this pool's configuration (remove the old, add the new).
3864 */
3865 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3866
3867 /*
3868 * Drop vdev config elements that should not be present at pool level.
3869 */
3870 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
3871 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
3872
3873 for (i = 0; i < count; i++)
3874 nvlist_free(configs[i]);
3875 kmem_free(configs, count * sizeof(void *));
3876 for (i = 0; i < nchildren; i++)
3877 nvlist_free(tops[i]);
3878 kmem_free(tops, nchildren * sizeof(void *));
3879 nvlist_free(nvroot);
3880 return (config);
3881}
3882
3883int
3884spa_import_rootpool(const char *name)
3885{
3886 spa_t *spa;
3887 vdev_t *rvd, *bvd, *avd = NULL;
3888 nvlist_t *config, *nvtop;
3889 uint64_t txg;
3890 char *pname;
3891 int error;
3892
3893 /*
3894 * Read the label from the boot device and generate a configuration.
3895 */
3896 config = spa_generate_rootconf(name);
3897
3898 mutex_enter(&spa_namespace_lock);
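	/*
	 * Three cases follow: a label was found and describes the pool; no
	 * label was found but a pool of this name is already known, so its
	 * cached config is reused; or neither, and the import fails.
	 */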
3899 if (config != NULL) {
3900 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3901 &pname) == 0 && strcmp(name, pname) == 0);
3902 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
3903 == 0);
3904
3905 if ((spa = spa_lookup(pname)) != NULL) {
3906 /*
3907 * Remove the existing root pool from the namespace so
3908 * that we can replace it with the correct config
3909 * we just read in.
3910 */
3911 spa_remove(spa);
3912 }
3913 spa = spa_add(pname, config, NULL);
3914
3915 /*
3916 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
3917 * via spa_version().
3918 */
3919 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3920 &spa->spa_ubsync.ub_version) != 0)
3921 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3922	} else if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
3923		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
3924		    name);
3925		return (SET_ERROR(EIO));
3926 } else {
3927 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
3928 }
3929 spa->spa_is_root = B_TRUE;
3930 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3931
3932 /*
3933 * Build up a vdev tree based on the boot device's label config.
3934 */
3935 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3936 &nvtop) == 0);
3937 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3938 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3939 VDEV_ALLOC_ROOTPOOL);
3940 spa_config_exit(spa, SCL_ALL, FTAG);
3941 if (error) {
3942 mutex_exit(&spa_namespace_lock);
3943 nvlist_free(config);
3944		cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
3945		    name);
3946 return (error);
3947 }
3948
3949 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3950 vdev_free(rvd);
3951 spa_config_exit(spa, SCL_ALL, FTAG);
3952 mutex_exit(&spa_namespace_lock);
3953
3954 nvlist_free(config);
3955 return (0);
3956}
3957
3958#endif /* sun */
3959#endif
3960
3961/*
3962 * Import a non-root pool into the system.
3963 */
3964int
3965spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3966{
3967 spa_t *spa;
3968 char *altroot = NULL;
3969 spa_load_state_t state = SPA_LOAD_IMPORT;
3970 zpool_rewind_policy_t policy;
3971 uint64_t mode = spa_mode_global;
3972 uint64_t readonly = B_FALSE;
3973 int error;
3974 nvlist_t *nvroot;
3975 nvlist_t **spares, **l2cache;
3976 uint_t nspares, nl2cache;
3977
3978 /*
3979 * If a pool with this name exists, return failure.
3980 */
3981 mutex_enter(&spa_namespace_lock);
3982 if (spa_lookup(pool) != NULL) {
3983 mutex_exit(&spa_namespace_lock);
3984 return (SET_ERROR(EEXIST));
3985 }
3986
3987 /*
3988 * Create and initialize the spa structure.
3989 */
3990 (void) nvlist_lookup_string(props,
3991 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3992 (void) nvlist_lookup_uint64(props,
3993 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3994 if (readonly)
3995 mode = FREAD;
3996 spa = spa_add(pool, config, altroot);
3997 spa->spa_import_flags = flags;
3998
3999 /*
4000 * Verbatim import - Take a pool and insert it into the namespace
4001 * as if it had been loaded at boot.
4002 */
4003 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4004 if (props != NULL)
4005 spa_configfile_set(spa, props, B_FALSE);
4006
4007 spa_config_sync(spa, B_FALSE, B_TRUE);
4008
4009 mutex_exit(&spa_namespace_lock);
4010 spa_history_log_version(spa, "import");
4011
4012 return (0);
4013 }
4014
4015 spa_activate(spa, mode);
4016
4017 /*
4018 * Don't start async tasks until we know everything is healthy.
4019 */
4020 spa_async_suspend(spa);
4021
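	/*
	 * If the caller's rewind policy asks for a rewind, load in
	 * SPA_LOAD_RECOVER mode so spa_load_best() may fall back to an
	 * earlier txg.
	 */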
4022 zpool_get_rewind_policy(config, &policy);
4023 if (policy.zrp_request & ZPOOL_DO_REWIND)
4024 state = SPA_LOAD_RECOVER;
4025
4026 /*
4027 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
4028 * because the user-supplied config is actually the one to trust when
4029 * doing an import.
4030 */
4031 if (state != SPA_LOAD_RECOVER)
4032 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4033
4034 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4035 policy.zrp_request);
4036
4037 /*
4038 * Propagate anything learned while loading the pool and pass it
4039 * back to caller (i.e. rewind info, missing devices, etc).
4040 */
4041 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4042 spa->spa_load_info) == 0);
4043
4044 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4045 /*
4046	 * Toss any existing sparelist, as it is no longer valid and
4047	 * conflicts with spa_has_spare().
4048 */
4049 if (spa->spa_spares.sav_config) {
4050 nvlist_free(spa->spa_spares.sav_config);
4051 spa->spa_spares.sav_config = NULL;
4052 spa_load_spares(spa);
4053 }
4054 if (spa->spa_l2cache.sav_config) {
4055 nvlist_free(spa->spa_l2cache.sav_config);
4056 spa->spa_l2cache.sav_config = NULL;
4057 spa_load_l2cache(spa);
4058 }
4059
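	/*
	 * Validate any spare and level 2 cache devices named in the
	 * user-supplied vdev tree before accepting them.
	 */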
4060 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4061 &nvroot) == 0);
4062 if (error == 0)
4063 error = spa_validate_aux(spa, nvroot, -1ULL,
4064 VDEV_ALLOC_SPARE);
4065 if (error == 0)
4066 error = spa_validate_aux(spa, nvroot, -1ULL,
4067 VDEV_ALLOC_L2CACHE);
4068 spa_config_exit(spa, SCL_ALL, FTAG);
4069
4070 if (props != NULL)
4071 spa_configfile_set(spa, props, B_FALSE);
4072
4073 if (error != 0 || (props && spa_writeable(spa) &&
4074 (error = spa_prop_set(spa, props)))) {
4075 spa_unload(spa);
4076 spa_deactivate(spa);
4077 spa_remove(spa);
4078 mutex_exit(&spa_namespace_lock);
4079 return (error);
4080 }
4081
4082 spa_async_resume(spa);
4083
4084 /*
4085 * Override any spares and level 2 cache devices as specified by
4086 * the user, as these may have correct device names/devids, etc.
4087 */
4088 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4089 &spares, &nspares) == 0) {
4090 if (spa->spa_spares.sav_config)
4091 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4092 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4093 else
4094 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4095 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4096 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4097 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4098 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4099 spa_load_spares(spa);
4100 spa_config_exit(spa, SCL_ALL, FTAG);
4101 spa->spa_spares.sav_sync = B_TRUE;
4102 }
4103 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4104 &l2cache, &nl2cache) == 0) {
4105 if (spa->spa_l2cache.sav_config)
4106 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4107 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4108 else
4109 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4110 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4111 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4112 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4113 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4114 spa_load_l2cache(spa);
4115 spa_config_exit(spa, SCL_ALL, FTAG);
4116 spa->spa_l2cache.sav_sync = B_TRUE;
4117 }
4118
4119 /*
4120 * Check for any removed devices.
4121 */
4122 if (spa->spa_autoreplace) {
4123 spa_aux_check_removed(&spa->spa_spares);
4124 spa_aux_check_removed(&spa->spa_l2cache);
4125 }
4126
4127 if (spa_writeable(spa)) {
4128 /*
4129 * Update the config cache to include the newly-imported pool.
4130 */
4131 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4132 }
4133
4134 /*
4135 * It's possible that the pool was expanded while it was exported.
4136 * We kick off an async task to handle this for us.
4137 */
4138 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4139
4140 mutex_exit(&spa_namespace_lock);
4141 spa_history_log_version(spa, "import");
4142
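	/*
	 * On FreeBSD, create the /dev/zvol device nodes for any volumes in
	 * the newly imported pool.
	 */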
4143#ifdef __FreeBSD__
4144#ifdef _KERNEL
4145 zvol_create_minors(pool);
4146#endif
4147#endif
4148 return (0);
4149}
4150
4151nvlist_t *
4152spa_tryimport(nvlist_t *tryconfig)
4153{
4154 nvlist_t *config = NULL;
4155 char *poolname;
4156 spa_t *spa;
4157 uint64_t state;
4158 int error;
4159
4160 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4161 return (NULL);
4162
4163 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4164 return (NULL);
4165
4166 /*
4167 * Create and initialize the spa structure.
4168 */
4169 mutex_enter(&spa_namespace_lock);
4170 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4171 spa_activate(spa, FREAD);
4172
4173 /*
4174 * Pass off the heavy lifting to spa_load().
4175 * Pass TRUE for mosconfig because the user-supplied config
4176 * is actually the one to trust when doing an import.
4177 */
4178 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4179
4180 /*
4181 * If 'tryconfig' was at least parsable, return the current config.
4182 */
4183 if (spa->spa_root_vdev != NULL) {
4184 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4185 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4186 poolname) == 0);
4187 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4188 state) == 0);
4189 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4190 spa->spa_uberblock.ub_timestamp) == 0);
4191 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4192 spa->spa_load_info) == 0);
4193
4194 /*
4195 * If the bootfs property exists on this pool then we
4196 * copy it out so that external consumers can tell which
4197 * pools are bootable.
4198 */
4199 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4200 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4201
4202 /*
4203 * We have to play games with the name since the
4204 * pool was opened as TRYIMPORT_NAME.
4205 */
4206 if (dsl_dsobj_to_dsname(spa_name(spa),
4207 spa->spa_bootfs, tmpname) == 0) {
4208 char *cp;
4209 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4210
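				/*
				 * The name we got back is rooted at
				 * TRYIMPORT_NAME; substitute the real pool
				 * name in front of the dataset portion.
				 */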
4211 cp = strchr(tmpname, '/');
4212 if (cp == NULL) {
4213 (void) strlcpy(dsname, tmpname,
4214 MAXPATHLEN);
4215 } else {
4216 (void) snprintf(dsname, MAXPATHLEN,
4217 "%s/%s", poolname, ++cp);
4218 }
4219 VERIFY(nvlist_add_string(config,
4220 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4221 kmem_free(dsname, MAXPATHLEN);
4222 }
4223 kmem_free(tmpname, MAXPATHLEN);
4224 }
4225
4226 /*
4227 * Add the list of hot spares and level 2 cache devices.
4228 */
4229 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4230 spa_add_spares(spa, config);
4231 spa_add_l2cache(spa, config);
4232 spa_config_exit(spa, SCL_CONFIG, FTAG);
4233 }
4234
4235 spa_unload(spa);
4236 spa_deactivate(spa);
4237 spa_remove(spa);
4238 mutex_exit(&spa_namespace_lock);
4239
4240 return (config);
4241}
4242
4243/*
4244 * Pool export/destroy
4245 *
4246 * The act of destroying or exporting a pool is very simple. We make sure there
4247 * is no more pending I/O and that any references to the pool are gone. Then, we
4248 * update the pool state and sync all the labels to disk, removing the
4249 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4250 * we don't sync the labels or remove the configuration cache.
4251 */
4252static int
4253spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4254 boolean_t force, boolean_t hardforce)
4255{
4256 spa_t *spa;
4257
4258 if (oldconfig)
4259 *oldconfig = NULL;
4260
4261 if (!(spa_mode_global & FWRITE))
4262 return (SET_ERROR(EROFS));
4263
4264 mutex_enter(&spa_namespace_lock);
4265 if ((spa = spa_lookup(pool)) == NULL) {
4266 mutex_exit(&spa_namespace_lock);
4267 return (SET_ERROR(ENOENT));
4268 }
4269
4270 /*
4271 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4272 * reacquire the namespace lock, and see if we can export.
4273 */
4274 spa_open_ref(spa, FTAG);
4275 mutex_exit(&spa_namespace_lock);
4276 spa_async_suspend(spa);
4277 mutex_enter(&spa_namespace_lock);
4278 spa_close(spa, FTAG);
4279
4280 /*
4281 * The pool will be in core if it's openable,
4282 * in which case we can modify its state.
4283 */
4284 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4285 /*
4286 * Objsets may be open only because they're dirty, so we
4287 * have to force it to sync before checking spa_refcnt.
4288 */
4289 txg_wait_synced(spa->spa_dsl_pool, 0);
4290
4291 /*
4292 * A pool cannot be exported or destroyed if there are active
4293 * references. If we are resetting a pool, allow references by
4294 * fault injection handlers.
4295 */
4296 if (!spa_refcount_zero(spa) ||
4297 (spa->spa_inject_ref != 0 &&
4298 new_state != POOL_STATE_UNINITIALIZED)) {
4299 spa_async_resume(spa);
4300 mutex_exit(&spa_namespace_lock);
4301 return (SET_ERROR(EBUSY));
4302 }
4303
4304 /*
4305 * A pool cannot be exported if it has an active shared spare.
4306 * This is to prevent other pools stealing the active spare
4307		 * from an exported pool. The user may, however, force the
4308		 * export.
4309 */
4310 if (!force && new_state == POOL_STATE_EXPORTED &&
4311 spa_has_active_shared_spare(spa)) {
4312 spa_async_resume(spa);
4313 mutex_exit(&spa_namespace_lock);
4314 return (SET_ERROR(EXDEV));
4315 }
4316
4317 /*
4318 * We want this to be reflected on every label,
4319 * so mark them all dirty. spa_unload() will do the
4320 * final sync that pushes these changes out.
4321 */
4322 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4323 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4324 spa->spa_state = new_state;
4325 spa->spa_final_txg = spa_last_synced_txg(spa) +
4326 TXG_DEFER_SIZE + 1;
4327 vdev_config_dirty(spa->spa_root_vdev);
4328 spa_config_exit(spa, SCL_ALL, FTAG);
4329 }
4330 }
4331
4332 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4333
4334 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4335 spa_unload(spa);
4336 spa_deactivate(spa);
4337 }
4338
4339 if (oldconfig && spa->spa_config)
4340 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4341
4342 if (new_state != POOL_STATE_UNINITIALIZED) {
4343 if (!hardforce)
4344 spa_config_sync(spa, B_TRUE, B_TRUE);
4345 spa_remove(spa);
4346 }
4347 mutex_exit(&spa_namespace_lock);
4348
4349 return (0);
4350}
4351
4352/*
4353 * Destroy a storage pool.
4354 */
4355int
4356spa_destroy(char *pool)
4357{
4358 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4359 B_FALSE, B_FALSE));
4360}
4361
4362/*
4363 * Export a storage pool.
4364 */
4365int
4366spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4367 boolean_t hardforce)
4368{
4369 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4370 force, hardforce));
4371}
4372
4373/*
4374 * Similar to spa_export(), this unloads the spa_t without actually removing it
4375 * from the namespace in any way.
4376 */
4377int
4378spa_reset(char *pool)
4379{
4380 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4381 B_FALSE, B_FALSE));
4382}
4383
4384/*
4385 * ==========================================================================
4386 * Device manipulation
4387 * ==========================================================================
4388 */
4389
4390/*
4391 * Add a device to a storage pool.
4392 */
4393int
4394spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4395{
4396 uint64_t txg, id;
4397 int error;
4398 vdev_t *rvd = spa->spa_root_vdev;
4399 vdev_t *vd, *tvd;
4400 nvlist_t **spares, **l2cache;
4401 uint_t nspares, nl2cache;
4402
4403 ASSERT(spa_writeable(spa));
4404
4405 txg = spa_vdev_enter(spa);
4406
4407 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4408 VDEV_ALLOC_ADD)) != 0)
4409 return (spa_vdev_exit(spa, NULL, txg, error));
4410
4411 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
4412
4413 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4414 &nspares) != 0)
4415 nspares = 0;
4416
4417 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4418 &nl2cache) != 0)
4419 nl2cache = 0;
4420
4421 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4422 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4423
4424 if (vd->vdev_children != 0 &&
4425 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4426 return (spa_vdev_exit(spa, vd, txg, error));
4427
4428 /*
4429 * We must validate the spares and l2cache devices after checking the
4430 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4431 */
4432 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4433 return (spa_vdev_exit(spa, vd, txg, error));
4434
4435 /*
4436 * Transfer each new top-level vdev from vd to rvd.
4437 */
4438 for (int c = 0; c < vd->vdev_children; c++) {
4439
4440 /*
4441 * Set the vdev id to the first hole, if one exists.
4442 */
4443 for (id = 0; id < rvd->vdev_children; id++) {
4444 if (rvd->vdev_child[id]->vdev_ishole) {
4445 vdev_free(rvd->vdev_child[id]);
4446 break;
4447 }
4448 }
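		/*
		 * If no hole was found, id now equals rvd->vdev_children and
		 * the new top-level vdev is simply appended.
		 */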
4449 tvd = vd->vdev_child[c];
4450 vdev_remove_child(vd, tvd);
4451 tvd->vdev_id = id;
4452 vdev_add_child(rvd, tvd);
4453 vdev_config_dirty(tvd);
4454 }
4455
4456 if (nspares != 0) {
4457 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4458 ZPOOL_CONFIG_SPARES);
4459 spa_load_spares(spa);
4460 spa->spa_spares.sav_sync = B_TRUE;
4461 }
4462
4463 if (nl2cache != 0) {
4464 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4465 ZPOOL_CONFIG_L2CACHE);
4466 spa_load_l2cache(spa);
4467 spa->spa_l2cache.sav_sync = B_TRUE;
4468 }
4469
4470 /*
4471 * We have to be careful when adding new vdevs to an existing pool.
4472 * If other threads start allocating from these vdevs before we
4473 * sync the config cache, and we lose power, then upon reboot we may
4474 * fail to open the pool because there are DVAs that the config cache
4475 * can't translate. Therefore, we first add the vdevs without
4476 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4477 * and then let spa_config_update() initialize the new metaslabs.
4478 *
4479 * spa_load() checks for added-but-not-initialized vdevs, so that
4480 * if we lose power at any point in this sequence, the remaining
4481 * steps will be completed the next time we load the pool.
4482 */
4483 (void) spa_vdev_exit(spa, vd, txg, 0);
4484
4485 mutex_enter(&spa_namespace_lock);
4486 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4487 mutex_exit(&spa_namespace_lock);
4488
4489 return (0);
4490}
4491
4492/*
4493 * Attach a device to a mirror. The arguments are the path to any device
4494 * in the mirror, and the nvroot for the new device. If the path specifies
4495 * a device that is not mirrored, we automatically insert the mirror vdev.
4496 *
4497 * If 'replacing' is specified, the new device is intended to replace the
4498 * existing device; in this case the two devices are made into their own
4499 * mirror using the 'replacing' vdev, which is functionally identical to
4500 * the mirror vdev (it actually reuses all the same ops) but has a few
4501 * extra rules: you can't attach to it after it's been created, and upon
4502 * completion of resilvering, the first disk (the one being replaced)
4503 * is automatically detached.
4504 */
4505int
4506spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4507{
4508 uint64_t txg, dtl_max_txg;
4509 vdev_t *rvd = spa->spa_root_vdev;
4510 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4511 vdev_ops_t *pvops;
4512 char *oldvdpath, *newvdpath;
4513 int newvd_isspare;
4514 int error;
4515
4516 ASSERT(spa_writeable(spa));
4517
4518 txg = spa_vdev_enter(spa);
4519
4520 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4521
4522 if (oldvd == NULL)
4523 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4524
4525 if (!oldvd->vdev_ops->vdev_op_leaf)
4526 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4527
4528 pvd = oldvd->vdev_parent;
4529
4530 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4531 VDEV_ALLOC_ATTACH)) != 0)
4532 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4533
4534 if (newrootvd->vdev_children != 1)
4535 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4536
4537 newvd = newrootvd->vdev_child[0];
4538
4539 if (!newvd->vdev_ops->vdev_op_leaf)
4540 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4541
4542 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4543 return (spa_vdev_exit(spa, newrootvd, txg, error));
4544
4545 /*
4546 * Spares can't replace logs
4547 */
4548 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4549 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4550
4551 if (!replacing) {
4552 /*
4553 * For attach, the only allowable parent is a mirror or the root
4554 * vdev.
4555 */
4556 if (pvd->vdev_ops != &vdev_mirror_ops &&
4557 pvd->vdev_ops != &vdev_root_ops)
4558 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4559
4560 pvops = &vdev_mirror_ops;
4561 } else {
4562 /*
4563 * Active hot spares can only be replaced by inactive hot
4564 * spares.
4565 */
4566 if (pvd->vdev_ops == &vdev_spare_ops &&
4567 oldvd->vdev_isspare &&
4568 !spa_has_spare(spa, newvd->vdev_guid))
4569 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4570
4571 /*
4572 * If the source is a hot spare, and the parent isn't already a
4573 * spare, then we want to create a new hot spare. Otherwise, we
4574 * want to create a replacing vdev. The user is not allowed to
4575 * attach to a spared vdev child unless the 'isspare' state is
4576 * the same (spare replaces spare, non-spare replaces
4577 * non-spare).
4578 */
4579 if (pvd->vdev_ops == &vdev_replacing_ops &&
4580 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4581 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4582 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4583 newvd->vdev_isspare != oldvd->vdev_isspare) {
4584 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4585 }
4586
4587 if (newvd->vdev_isspare)
4588 pvops = &vdev_spare_ops;
4589 else
4590 pvops = &vdev_replacing_ops;
4591 }
4592
4593 /*
4594 * Make sure the new device is big enough.
4595 */
4596 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4597 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4598
4599 /*
4600 * The new device cannot have a higher alignment requirement
4601 * than the top-level vdev.
4602 */
4603 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4604 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4605
4606 /*
4607 * If this is an in-place replacement, update oldvd's path and devid
4608 * to make it distinguishable from newvd, and unopenable from now on.
4609 */
4610 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4611 spa_strfree(oldvd->vdev_path);
4612 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4613 KM_SLEEP);
4614 (void) sprintf(oldvd->vdev_path, "%s/%s",
4615 newvd->vdev_path, "old");
4616 if (oldvd->vdev_devid != NULL) {
4617 spa_strfree(oldvd->vdev_devid);
4618 oldvd->vdev_devid = NULL;
4619 }
4620 }
4621
4622 /* mark the device being resilvered */
4623 newvd->vdev_resilvering = B_TRUE;
4624
4625 /*
4626 * If the parent is not a mirror, or if we're replacing, insert the new
4627 * mirror/replacing/spare vdev above oldvd.
4628 */
4629 if (pvd->vdev_ops != pvops)
4630 pvd = vdev_add_parent(oldvd, pvops);
4631
4632 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4633 ASSERT(pvd->vdev_ops == pvops);
4634 ASSERT(oldvd->vdev_parent == pvd);
4635
4636 /*
4637 * Extract the new device from its root and add it to pvd.
4638 */
4639 vdev_remove_child(newrootvd, newvd);
4640 newvd->vdev_id = pvd->vdev_children;
4641 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4642 vdev_add_child(pvd, newvd);
4643
4644 tvd = newvd->vdev_top;
4645 ASSERT(pvd->vdev_top == tvd);
4646 ASSERT(tvd->vdev_parent == rvd);
4647
4648 vdev_config_dirty(tvd);
4649
4650 /*
4651 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4652 * for any dmu_sync-ed blocks. It will propagate upward when
4653 * spa_vdev_exit() calls vdev_dtl_reassess().
4654 */
4655 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4656
4657 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4658 dtl_max_txg - TXG_INITIAL);
4659
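	/*
	 * If the new device is a hot spare, mark it active and post the
	 * ESC_ZFS_VDEV_SPARE event.
	 */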
4660 if (newvd->vdev_isspare) {
4661 spa_spare_activate(newvd);
4662 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4663 }
4664
4665 oldvdpath = spa_strdup(oldvd->vdev_path);
4666 newvdpath = spa_strdup(newvd->vdev_path);
4667 newvd_isspare = newvd->vdev_isspare;
4668
4669 /*
4670 * Mark newvd's DTL dirty in this txg.
4671 */
4672 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4673
4674 /*
4675 * Restart the resilver
4676 */
4677 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4678
4679 /*
4680 * Commit the config
4681 */
4682 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4683
4684 spa_history_log_internal(spa, "vdev attach", NULL,
4685 "%s vdev=%s %s vdev=%s",
4686 replacing && newvd_isspare ? "spare in" :
4687 replacing ? "replace" : "attach", newvdpath,
4688 replacing ? "for" : "to", oldvdpath);
4689
4690 spa_strfree(oldvdpath);
4691 spa_strfree(newvdpath);
4692
4693 if (spa->spa_bootfs)
4694 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4695
4696 return (0);
4697}
4698
4699/*
4700 * Detach a device from a mirror or replacing vdev.
4701 *
4702 * If 'replace_done' is specified, only detach if the parent
4703 * is a replacing vdev.
4704 */
4705int
4706spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4707{
4708 uint64_t txg;
4709 int error;
4710 vdev_t *rvd = spa->spa_root_vdev;
4711 vdev_t *vd, *pvd, *cvd, *tvd;
4712 boolean_t unspare = B_FALSE;
4713 uint64_t unspare_guid = 0;
4714 char *vdpath;
4715
4716 ASSERT(spa_writeable(spa));
4717
4718 txg = spa_vdev_enter(spa);
4719
4720 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4721
4722 if (vd == NULL)
4723 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4724
4725 if (!vd->vdev_ops->vdev_op_leaf)
4726 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4727
4728 pvd = vd->vdev_parent;
4729
4730 /*
4731 * If the parent/child relationship is not as expected, don't do it.
4732 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4733 * vdev that's replacing B with C. The user's intent in replacing
4734 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4735 * the replace by detaching C, the expected behavior is to end up
4736 * M(A,B). But suppose that right after deciding to detach C,
4737 * the replacement of B completes. We would have M(A,C), and then
4738 * ask to detach C, which would leave us with just A -- not what
4739 * the user wanted. To prevent this, we make sure that the
4740 * parent/child relationship hasn't changed -- in this example,
4741 * that C's parent is still the replacing vdev R.
4742 */
4743 if (pvd->vdev_guid != pguid && pguid != 0)
4744 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4745
4746 /*
4747 * Only 'replacing' or 'spare' vdevs can be replaced.
4748 */
4749 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4750 pvd->vdev_ops != &vdev_spare_ops)
4751 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4752
4753 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4754 spa_version(spa) >= SPA_VERSION_SPARES);
4755
4756 /*
4757 * Only mirror, replacing, and spare vdevs support detach.
4758 */
4759 if (pvd->vdev_ops != &vdev_replacing_ops &&
4760 pvd->vdev_ops != &vdev_mirror_ops &&
4761 pvd->vdev_ops != &vdev_spare_ops)
4762 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4763
4764 /*
4765 * If this device has the only valid copy of some data,
4766 * we cannot safely detach it.
4767 */
4768 if (vdev_dtl_required(vd))
4769 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4770
4771 ASSERT(pvd->vdev_children >= 2);
4772
4773 /*
4774 * If we are detaching the second disk from a replacing vdev, then
4775 * check to see if we changed the original vdev's path to have "/old"
4776 * at the end in spa_vdev_attach(). If so, undo that change now.
4777 */
4778 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4779 vd->vdev_path != NULL) {
4780 size_t len = strlen(vd->vdev_path);
4781
4782 for (int c = 0; c < pvd->vdev_children; c++) {
4783 cvd = pvd->vdev_child[c];
4784
4785 if (cvd == vd || cvd->vdev_path == NULL)
4786 continue;
4787
4788 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4789 strcmp(cvd->vdev_path + len, "/old") == 0) {
4790 spa_strfree(cvd->vdev_path);
4791 cvd->vdev_path = spa_strdup(vd->vdev_path);
4792 break;
4793 }
4794 }
4795 }
4796
4797 /*
4798 * If we are detaching the original disk from a spare, then it implies
4799 * that the spare should become a real disk, and be removed from the
4800 * active spare list for the pool.
4801 */
4802 if (pvd->vdev_ops == &vdev_spare_ops &&
4803 vd->vdev_id == 0 &&
4804 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4805 unspare = B_TRUE;
4806
4807 /*
4808 * Erase the disk labels so the disk can be used for other things.
4809 * This must be done after all other error cases are handled,
4810 * but before we disembowel vd (so we can still do I/O to it).
4811 * But if we can't do it, don't treat the error as fatal --
4812 * it may be that the unwritability of the disk is the reason
4813 * it's being detached!
4814 */
4815 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4816
4817 /*
4818 * Remove vd from its parent and compact the parent's children.
4819 */
4820 vdev_remove_child(pvd, vd);
4821 vdev_compact_children(pvd);
4822
4823 /*
4824 * Remember one of the remaining children so we can get tvd below.
4825 */
4826 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4827
4828 /*
4829 * If we need to remove the remaining child from the list of hot spares,
4830 * do it now, marking the vdev as no longer a spare in the process.
4831 * We must do this before vdev_remove_parent(), because that can
4832 * change the GUID if it creates a new toplevel GUID. For a similar
4833 * reason, we must remove the spare now, in the same txg as the detach;
4834 * otherwise someone could attach a new sibling, change the GUID, and
4835 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4836 */
4837 if (unspare) {
4838 ASSERT(cvd->vdev_isspare);
4839 spa_spare_remove(cvd);
4840 unspare_guid = cvd->vdev_guid;
4841 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4842 cvd->vdev_unspare = B_TRUE;
4843 }
4844
4845 /*
4846 * If the parent mirror/replacing vdev only has one child,
4847 * the parent is no longer needed. Remove it from the tree.
4848 */
4849 if (pvd->vdev_children == 1) {
4850 if (pvd->vdev_ops == &vdev_spare_ops)
4851 cvd->vdev_unspare = B_FALSE;
4852 vdev_remove_parent(cvd);
4853 cvd->vdev_resilvering = B_FALSE;
4854 }
 4855
4857 /*
4858 * We don't set tvd until now because the parent we just removed
4859 * may have been the previous top-level vdev.
4860 */
4861 tvd = cvd->vdev_top;
4862 ASSERT(tvd->vdev_parent == rvd);
4863
4864 /*
4865 * Reevaluate the parent vdev state.
4866 */
4867 vdev_propagate_state(cvd);
4868
4869 /*
4870 * If the 'autoexpand' property is set on the pool then automatically
4871 * try to expand the size of the pool. For example if the device we
4872 * just detached was smaller than the others, it may be possible to
4873 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4874 * first so that we can obtain the updated sizes of the leaf vdevs.
4875 */
4876 if (spa->spa_autoexpand) {
4877 vdev_reopen(tvd);
4878 vdev_expand(tvd, txg);
4879 }
4880
4881 vdev_config_dirty(tvd);
4882
4883 /*
4884 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4885 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4886 * But first make sure we're not on any *other* txg's DTL list, to
4887 * prevent vd from being accessed after it's freed.
4888 */
4889 vdpath = spa_strdup(vd->vdev_path);
4890 for (int t = 0; t < TXG_SIZE; t++)
4891 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4892 vd->vdev_detached = B_TRUE;
4893 vdev_dirty(tvd, VDD_DTL, vd, txg);
4894
4895 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4896
4897 /* hang on to the spa before we release the lock */
4898 spa_open_ref(spa, FTAG);
4899
4900 error = spa_vdev_exit(spa, vd, txg, 0);
4901
4902 spa_history_log_internal(spa, "detach", NULL,
4903 "vdev=%s", vdpath);
4904 spa_strfree(vdpath);
4905
4906 /*
4907 * If this was the removal of the original device in a hot spare vdev,
4908 * then we want to go through and remove the device from the hot spare
4909 * list of every other pool.
4910 */
4911 if (unspare) {
4912 spa_t *altspa = NULL;
4913
4914 mutex_enter(&spa_namespace_lock);
4915 while ((altspa = spa_next(altspa)) != NULL) {
4916 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4917 altspa == spa)
4918 continue;
4919
4920 spa_open_ref(altspa, FTAG);
4921 mutex_exit(&spa_namespace_lock);
4922 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4923 mutex_enter(&spa_namespace_lock);
4924 spa_close(altspa, FTAG);
4925 }
4926 mutex_exit(&spa_namespace_lock);
4927
4928 /* search the rest of the vdevs for spares to remove */
4929 spa_vdev_resilver_done(spa);
4930 }
4931
4932 /* all done with the spa; OK to release */
4933 mutex_enter(&spa_namespace_lock);
4934 spa_close(spa, FTAG);
4935 mutex_exit(&spa_namespace_lock);
4936
4937 return (error);
4938}
4939
4940/*
4941 * Split a set of devices from their mirrors, and create a new pool from them.
4942 */
4943int
4944spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4945 nvlist_t *props, boolean_t exp)
4946{
4947 int error = 0;
4948 uint64_t txg, *glist;
4949 spa_t *newspa;
4950 uint_t c, children, lastlog;
4951 nvlist_t **child, *nvl, *tmp;
4952 dmu_tx_t *tx;
4953 char *altroot = NULL;
4954 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4955 boolean_t activate_slog;
4956
4957 ASSERT(spa_writeable(spa));
4958
4959 txg = spa_vdev_enter(spa);
4960
4961 /* clear the log and flush everything up to now */
4962 activate_slog = spa_passivate_log(spa);
4963 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4964 error = spa_offline_log(spa);
4965 txg = spa_vdev_config_enter(spa);
4966
4967 if (activate_slog)
4968 spa_activate_log(spa);
4969
4970 if (error != 0)
4971 return (spa_vdev_exit(spa, NULL, txg, error));
4972
4973 /* check new spa name before going any further */
4974 if (spa_lookup(newname) != NULL)
4975 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4976
4977 /*
4978 * scan through all the children to ensure they're all mirrors
4979 */
4980 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4981 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4982 &children) != 0)
4983 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4984
4985 /* first, check to ensure we've got the right child count */
4986 rvd = spa->spa_root_vdev;
4987 lastlog = 0;
4988 for (c = 0; c < rvd->vdev_children; c++) {
4989 vdev_t *vd = rvd->vdev_child[c];
4990
4991 /* don't count the holes & logs as children */
4992 if (vd->vdev_islog || vd->vdev_ishole) {
4993 if (lastlog == 0)
4994 lastlog = c;
4995 continue;
4996 }
4997
4998 lastlog = 0;
4999 }
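	/*
	 * The request must supply exactly one child per top-level vdev,
	 * not counting the trailing run of log and hole vdevs tallied above.
	 */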
5000 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5001 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5002
5003 /* next, ensure no spare or cache devices are part of the split */
5004 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5005 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5006 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5007
5008 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5009 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5010
5011 /* then, loop over each vdev and validate it */
5012 for (c = 0; c < children; c++) {
5013 uint64_t is_hole = 0;
5014
5015 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5016 &is_hole);
5017
5018 if (is_hole != 0) {
5019 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5020 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5021 continue;
5022 } else {
5023 error = SET_ERROR(EINVAL);
5024 break;
5025 }
5026 }
5027
5028 /* which disk is going to be split? */
5029 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5030 &glist[c]) != 0) {
5031 error = SET_ERROR(EINVAL);
5032 break;
5033 }
5034
5035 /* look it up in the spa */
5036 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5037 if (vml[c] == NULL) {
5038 error = SET_ERROR(ENODEV);
5039 break;
5040 }
5041
5042		/*
		 * Make sure there's nothing stopping the split: each device
		 * must be a healthy, writable leaf (no children) whose parent
		 * is a plain mirror, and must not be a log, hole, spare, or
		 * cache device.
		 */
5043 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5044 vml[c]->vdev_islog ||
5045 vml[c]->vdev_ishole ||
5046 vml[c]->vdev_isspare ||
5047 vml[c]->vdev_isl2cache ||
5048 !vdev_writeable(vml[c]) ||
5049 vml[c]->vdev_children != 0 ||
5050 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5051 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5052 error = SET_ERROR(EINVAL);
5053 break;
5054 }
5055
5056 if (vdev_dtl_required(vml[c])) {
5057 error = SET_ERROR(EBUSY);
5058 break;
5059 }
5060
5061 /* we need certain info from the top level */
5062 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5063 vml[c]->vdev_top->vdev_ms_array) == 0);
5064 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5065 vml[c]->vdev_top->vdev_ms_shift) == 0);
5066 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5067 vml[c]->vdev_top->vdev_asize) == 0);
5068 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5069 vml[c]->vdev_top->vdev_ashift) == 0);
5070 }
5071
5072 if (error != 0) {
5073 kmem_free(vml, children * sizeof (vdev_t *));
5074 kmem_free(glist, children * sizeof (uint64_t));
5075 return (spa_vdev_exit(spa, NULL, txg, error));
5076 }
5077
5078 /* stop writers from using the disks */
5079 for (c = 0; c < children; c++) {
5080 if (vml[c] != NULL)
5081 vml[c]->vdev_offline = B_TRUE;
5082 }
5083 vdev_reopen(spa->spa_root_vdev);
5084
5085 /*
5086 * Temporarily record the splitting vdevs in the spa config. This
5087 * will disappear once the config is regenerated.
5088 */
5089 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5090 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5091 glist, children) == 0);
5092 kmem_free(glist, children * sizeof (uint64_t));
5093
5094 mutex_enter(&spa->spa_props_lock);
5095 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5096 nvl) == 0);
5097 mutex_exit(&spa->spa_props_lock);
5098 spa->spa_config_splitting = nvl;
5099 vdev_config_dirty(spa->spa_root_vdev);
5100
5101 /* configure and create the new pool */
5102 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5103 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5104 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5105 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5106 spa_version(spa)) == 0);
5107 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5108 spa->spa_config_txg) == 0);
5109 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5110 spa_generate_guid(NULL)) == 0);
5111 (void) nvlist_lookup_string(props,
5112 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5113
5114 /* add the new pool to the namespace */
5115 newspa = spa_add(newname, config, altroot);
5116 newspa->spa_config_txg = spa->spa_config_txg;
5117 spa_set_log_state(newspa, SPA_LOG_CLEAR);
5118
5119 /* release the spa config lock, retaining the namespace lock */
5120 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5121
5122 if (zio_injection_enabled)
5123 zio_handle_panic_injection(spa, FTAG, 1);
5124
5125 spa_activate(newspa, spa_mode_global);
5126 spa_async_suspend(newspa);
5127
5128#ifndef sun
5129	/* mark that we are creating a new spa by splitting */
5130 newspa->spa_splitting_newspa = B_TRUE;
5131#endif
5132 /* create the new pool from the disks of the original pool */
5133 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5134#ifndef sun
5135 newspa->spa_splitting_newspa = B_FALSE;
5136#endif
5137 if (error)
5138 goto out;
5139
5140 /* if that worked, generate a real config for the new pool */
5141 if (newspa->spa_root_vdev != NULL) {
5142 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5143 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5144 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5145 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5146 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5147 B_TRUE));
5148 }
5149
5150 /* set the props */
5151 if (props != NULL) {
5152 spa_configfile_set(newspa, props, B_FALSE);
5153 error = spa_prop_set(newspa, props);
5154 if (error)
5155 goto out;
5156 }
5157
5158 /* flush everything */
5159 txg = spa_vdev_config_enter(newspa);
5160 vdev_config_dirty(newspa->spa_root_vdev);
5161 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5162
5163 if (zio_injection_enabled)
5164 zio_handle_panic_injection(spa, FTAG, 2);
5165
5166 spa_async_resume(newspa);
5167
5168 /* finally, update the original pool's config */
5169 txg = spa_vdev_config_enter(spa);
5170 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5171 error = dmu_tx_assign(tx, TXG_WAIT);
5172 if (error != 0)
5173 dmu_tx_abort(tx);
5174 for (c = 0; c < children; c++) {
5175 if (vml[c] != NULL) {
5176 vdev_split(vml[c]);
5177 if (error == 0)
5178 spa_history_log_internal(spa, "detach", tx,
5179 "vdev=%s", vml[c]->vdev_path);
5180 vdev_free(vml[c]);
5181 }
5182 }
5183 vdev_config_dirty(spa->spa_root_vdev);
5184 spa->spa_config_splitting = NULL;
5185 nvlist_free(nvl);
5186 if (error == 0)
5187 dmu_tx_commit(tx);
5188 (void) spa_vdev_exit(spa, NULL, txg, 0);
5189
5190 if (zio_injection_enabled)
5191 zio_handle_panic_injection(spa, FTAG, 3);
5192
5193 /* split is complete; log a history record */
5194 spa_history_log_internal(newspa, "split", NULL,
5195 "from pool %s", spa_name(spa));
5196
5197 kmem_free(vml, children * sizeof (vdev_t *));
5198
5199 /* if we're not going to mount the filesystems in userland, export */
5200 if (exp)
5201 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5202 B_FALSE, B_FALSE);
5203
5204 return (error);
5205
5206out:
5207 spa_unload(newspa);
5208 spa_deactivate(newspa);
5209 spa_remove(newspa);
5210
5211 txg = spa_vdev_config_enter(spa);
5212
5213 /* re-online all offlined disks */
5214 for (c = 0; c < children; c++) {
5215 if (vml[c] != NULL)
5216 vml[c]->vdev_offline = B_FALSE;
5217 }
5218 vdev_reopen(spa->spa_root_vdev);
5219
5220 nvlist_free(spa->spa_config_splitting);
5221 spa->spa_config_splitting = NULL;
5222 (void) spa_vdev_exit(spa, NULL, txg, error);
5223
5224 kmem_free(vml, children * sizeof (vdev_t *));
5225 return (error);
5226}
5227
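/*
 * Return the nvlist in nvpp[] whose ZPOOL_CONFIG_GUID matches target_guid,
 * or NULL if no entry matches.
 */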
5228static nvlist_t *
5229spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5230{
5231 for (int i = 0; i < count; i++) {
5232 uint64_t guid;
5233
5234 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5235 &guid) == 0);
5236
5237 if (guid == target_guid)
5238 return (nvpp[i]);
5239 }
5240
5241 return (NULL);
5242}
5243
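/*
 * Rewrite the 'name' nvlist array in 'config' with dev_to_remove omitted.
 */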
5244static void
5245spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5246 nvlist_t *dev_to_remove)
5247{
5248 nvlist_t **newdev = NULL;
5249
5250 if (count > 1)
5251 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5252
5253 for (int i = 0, j = 0; i < count; i++) {
5254 if (dev[i] == dev_to_remove)
5255 continue;
5256 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5257 }
5258
5259 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5260 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5261
5262 for (int i = 0; i < count - 1; i++)
5263 nvlist_free(newdev[i]);
5264
5265 if (count > 1)
5266 kmem_free(newdev, (count - 1) * sizeof (void *));
5267}
5268
5269/*
5270 * Evacuate the device.
5271 */
5272static int
5273spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5274{
5275 uint64_t txg;
5276 int error = 0;
5277
5278 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5279 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5280 ASSERT(vd == vd->vdev_top);
5281
5282 /*
5283	 * Evacuate the device. We don't hold the config lock as writer
5284	 * since we need to do I/O, but we do keep the
5285	 * spa_namespace_lock held. Once this completes, the device
5286	 * should no longer have any blocks allocated on it.
5287 */
5288 if (vd->vdev_islog) {
5289 if (vd->vdev_stat.vs_alloc != 0)
5290 error = spa_offline_log(spa);
5291 } else {
5292 error = SET_ERROR(ENOTSUP);
5293 }
5294
5295 if (error)
5296 return (error);
5297
5298 /*
5299 * The evacuation succeeded. Remove any remaining MOS metadata
5300 * associated with this vdev, and wait for these changes to sync.
5301 */
5302 ASSERT0(vd->vdev_stat.vs_alloc);
5303 txg = spa_vdev_config_enter(spa);
5304 vd->vdev_removing = B_TRUE;
5305 vdev_dirty(vd, 0, NULL, txg);
5306 vdev_config_dirty(vd);
5307 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5308
5309 return (0);
5310}
5311
5312/*
5313 * Complete the removal by cleaning up the namespace.
5314 */
5315static void
5316spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5317{
5318 vdev_t *rvd = spa->spa_root_vdev;
5319 uint64_t id = vd->vdev_id;
5320 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5321
5322 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5323 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5324 ASSERT(vd == vd->vdev_top);
5325
5326 /*
5327 * Only remove any devices which are empty.
5328 */
5329 if (vd->vdev_stat.vs_alloc != 0)
5330 return;
5331
5332 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5333
5334 if (list_link_active(&vd->vdev_state_dirty_node))
5335 vdev_state_clean(vd);
5336 if (list_link_active(&vd->vdev_config_dirty_node))
5337 vdev_config_clean(vd);
5338
5339 vdev_free(vd);
5340
5341 if (last_vdev) {
5342 vdev_compact_children(rvd);
5343 } else {
5344 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5345 vdev_add_child(rvd, vd);
5346 }
5347 vdev_config_dirty(rvd);
5348
5349 /*
5350 * Reassess the health of our root vdev.
5351 */
5352 vdev_reopen(rvd);
5353}
5354
5355/*
5356 * Remove a device from the pool -
5357 *
5358 * Removing a device from the vdev namespace requires several steps
5359 * and can take a significant amount of time. As a result we use
5360 * the spa_vdev_config_[enter/exit] functions which allow us to
5361 * grab and release the spa_config_lock while still holding the namespace
5362 * lock. During each step the configuration is synced out.
5363 *
5364 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5365 * devices.
5366 */
5367int
5368spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5369{
5370 vdev_t *vd;
5371 metaslab_group_t *mg;
5372 nvlist_t **spares, **l2cache, *nv;
5373 uint64_t txg = 0;
5374 uint_t nspares, nl2cache;
5375 int error = 0;
5376 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5377
5378 ASSERT(spa_writeable(spa));
5379
5380 if (!locked)
5381 txg = spa_vdev_enter(spa);
5382
5383 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5384
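	/*
	 * The guid may name a hot spare, a cache device, a log top-level
	 * vdev, or an ordinary vdev; each case is handled in turn below.
	 */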
5385 if (spa->spa_spares.sav_vdevs != NULL &&
5386 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5387 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5388 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5389 /*
5390 * Only remove the hot spare if it's not currently in use
5391 * in this pool.
5392 */
5393 if (vd == NULL || unspare) {
5394 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5395 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5396 spa_load_spares(spa);
5397 spa->spa_spares.sav_sync = B_TRUE;
5398 } else {
5399 error = SET_ERROR(EBUSY);
5400 }
5401 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5402 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5403 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5404 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5405 /*
5406 * Cache devices can always be removed.
5407 */
5408 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5409 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5410 spa_load_l2cache(spa);
5411 spa->spa_l2cache.sav_sync = B_TRUE;
5412 } else if (vd != NULL && vd->vdev_islog) {
5413 ASSERT(!locked);
5414 ASSERT(vd == vd->vdev_top);
5415
5416 /*
5417 * XXX - Once we have bp-rewrite this should
5418 * become the common case.
5419 */
5420
5421 mg = vd->vdev_mg;
5422
5423 /*
5424 * Stop allocating from this vdev.
5425 */
5426 metaslab_group_passivate(mg);
5427
5428 /*
5429 * Wait for the youngest allocations and frees to sync,
5430 * and then wait for the deferral of those frees to finish.
5431 */
5432 spa_vdev_config_exit(spa, NULL,
5433 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5434
5435 /*
5436 * Attempt to evacuate the vdev.
5437 */
5438 error = spa_vdev_remove_evacuate(spa, vd);
5439
5440 txg = spa_vdev_config_enter(spa);
5441
5442 /*
5443 * If we couldn't evacuate the vdev, unwind.
5444 */
5445 if (error) {
5446 metaslab_group_activate(mg);
5447 return (spa_vdev_exit(spa, NULL, txg, error));
5448 }
5449
5450 /*
5451 * Clean up the vdev namespace.
5452 */
5453 spa_vdev_remove_from_namespace(spa, vd);
5454
5455 } else if (vd != NULL) {
5456 /*
5457 * Normal vdevs cannot be removed (yet).
5458 */
5459 error = SET_ERROR(ENOTSUP);
5460 } else {
5461 /*
5462 * There is no vdev of any kind with the specified guid.
5463 */
5464 error = SET_ERROR(ENOENT);
5465 }
5466
5467 if (!locked)
5468 return (spa_vdev_exit(spa, NULL, txg, error));
5469
5470 return (error);
5471}
5472
5473/*
5474 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5475 * currently spared, so we can detach it.
5476 */
5477static vdev_t *
5478spa_vdev_resilver_done_hunt(vdev_t *vd)
5479{
5480 vdev_t *newvd, *oldvd;
5481
5482 for (int c = 0; c < vd->vdev_children; c++) {
5483 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5484 if (oldvd != NULL)
5485 return (oldvd);
5486 }
5487
5488 /*
5489 * Check for a completed replacement. We always consider the first
5490 * vdev in the list to be the oldest vdev, and the last one to be
5491 * the newest (see spa_vdev_attach() for how that works). In
5492 * the case where the newest vdev is faulted, we will not automatically
5493 * remove it after a resilver completes. This is OK as it will require
5494 * user intervention to determine which disk the admin wishes to keep.
5495 */
5496 if (vd->vdev_ops == &vdev_replacing_ops) {
5497 ASSERT(vd->vdev_children > 1);
5498
5499 newvd = vd->vdev_child[vd->vdev_children - 1];
5500 oldvd = vd->vdev_child[0];
5501
5502 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5503 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5504 !vdev_dtl_required(oldvd))
5505 return (oldvd);
5506 }
5507
5508 /*
5509 * Check for a completed resilver with the 'unspare' flag set.
5510 */
5511 if (vd->vdev_ops == &vdev_spare_ops) {
5512 vdev_t *first = vd->vdev_child[0];
5513 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5514
5515 if (last->vdev_unspare) {
5516 oldvd = first;
5517 newvd = last;
5518 } else if (first->vdev_unspare) {
5519 oldvd = last;
5520 newvd = first;
5521 } else {
5522 oldvd = NULL;
5523 }
5524
5525 if (oldvd != NULL &&
5526 vdev_dtl_empty(newvd, DTL_MISSING) &&
5527 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5528 !vdev_dtl_required(oldvd))
5529 return (oldvd);
5530
5531 /*
5532 * If there are more than two spares attached to a disk,
5533 * and those spares are not required, then we want to
5534 * attempt to free them up now so that they can be used
5535 * by other pools. Once we're back down to a single
5536 * disk+spare, we stop removing them.
5537 */
5538 if (vd->vdev_children > 2) {
5539 newvd = vd->vdev_child[1];
5540
5541 if (newvd->vdev_isspare && last->vdev_isspare &&
5542 vdev_dtl_empty(last, DTL_MISSING) &&
5543 vdev_dtl_empty(last, DTL_OUTAGE) &&
5544 !vdev_dtl_required(newvd))
5545 return (newvd);
5546 }
5547 }
5548
5549 return (NULL);
5550}
5551
5552static void
5553spa_vdev_resilver_done(spa_t *spa)
5554{
5555 vdev_t *vd, *pvd, *ppvd;
5556 uint64_t guid, sguid, pguid, ppguid;
5557
5558 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5559
5560 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5561 pvd = vd->vdev_parent;
5562 ppvd = pvd->vdev_parent;
5563 guid = vd->vdev_guid;
5564 pguid = pvd->vdev_guid;
5565 ppguid = ppvd->vdev_guid;
5566 sguid = 0;
5567 /*
5568 * If we have just finished replacing a hot spared device, then
5569 * we need to detach the parent's first child (the original hot
5570 * spare) as well.
5571 */
5572 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5573 ppvd->vdev_children == 2) {
5574 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5575 sguid = ppvd->vdev_child[1]->vdev_guid;
5576 }
5577 spa_config_exit(spa, SCL_ALL, FTAG);
5578 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5579 return;
5580 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5581 return;
5582 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5583 }
5584
5585 spa_config_exit(spa, SCL_ALL, FTAG);
5586}
5587
5588/*
5589 * Update the stored path or FRU for this vdev.
5590 */
5591int
5592spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5593 boolean_t ispath)
5594{
5595 vdev_t *vd;
5596 boolean_t sync = B_FALSE;
5597
5598 ASSERT(spa_writeable(spa));
5599
5600 spa_vdev_state_enter(spa, SCL_ALL);
5601
5602 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5603 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5604
5605 if (!vd->vdev_ops->vdev_op_leaf)
5606 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5607
5608 if (ispath) {
5609 if (strcmp(value, vd->vdev_path) != 0) {
5610 spa_strfree(vd->vdev_path);
5611 vd->vdev_path = spa_strdup(value);
5612 sync = B_TRUE;
5613 }
5614 } else {
5615 if (vd->vdev_fru == NULL) {
5616 vd->vdev_fru = spa_strdup(value);
5617 sync = B_TRUE;
5618 } else if (strcmp(value, vd->vdev_fru) != 0) {
5619 spa_strfree(vd->vdev_fru);
5620 vd->vdev_fru = spa_strdup(value);
5621 sync = B_TRUE;
5622 }
5623 }
5624
5625 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5626}
5627
5628int
5629spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5630{
5631 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5632}
5633
5634int
5635spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5636{
5637 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5638}
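/*
 * Usage sketch (illustrative only; the device path and FRU strings are
 * made-up examples): callers update a leaf vdev's bookkeeping by guid,
 * e.g.:
 *
 *	error = spa_vdev_setpath(spa, guid, "/dev/da3");
 *	error = spa_vdev_setfru(spa, guid, "hc://:chassis=0:bay=3");
 *
 * Both are thin wrappers around spa_vdev_set_common(), which only queues
 * a config sync when the stored value actually changed.
 */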
5639
5640/*
5641 * ==========================================================================
5642 * SPA Scanning
5643 * ==========================================================================
5644 */
5645
5646int
5647spa_scan_stop(spa_t *spa)
5648{
5649 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5650 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5651 return (SET_ERROR(EBUSY));
5652 return (dsl_scan_cancel(spa->spa_dsl_pool));
5653}
5654
5655int
5656spa_scan(spa_t *spa, pool_scan_func_t func)
5657{
5658 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5659
5660 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5661 return (SET_ERROR(ENOTSUP));
5662
5663 /*
5664 * If a resilver was requested, but there is no DTL on a
5665 * writeable leaf device, we have nothing to do.
5666 */
5667 if (func == POOL_SCAN_RESILVER &&
5668 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5669 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5670 return (0);
5671 }
5672
5673 return (dsl_scan(spa->spa_dsl_pool, func));
5674}
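/*
 * Usage sketch (illustrative, not part of the original code): starting
 * and cancelling a scrub would look roughly like:
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *	...
 *	error = spa_scan_stop(spa);
 *
 * POOL_SCAN_SCRUB is assumed here to be the scrub member of
 * pool_scan_func_t; resilvers use POOL_SCAN_RESILVER and may be
 * short-circuited by the vdev_resilver_needed() check above.
 */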
5675
5676/*
5677 * ==========================================================================
5678 * SPA async task processing
5679 * ==========================================================================
5680 */
5681
5682static void
5683spa_async_remove(spa_t *spa, vdev_t *vd)
5684{
5685 if (vd->vdev_remove_wanted) {
5686 vd->vdev_remove_wanted = B_FALSE;
5687 vd->vdev_delayed_close = B_FALSE;
5688 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5689
5690 /*
5691 * We want to clear the stats, but we don't want to do a full
5692 * vdev_clear() as that will cause us to throw away
5693 * degraded/faulted state as well as attempt to reopen the
5694 * device, all of which is a waste.
5695 */
5696 vd->vdev_stat.vs_read_errors = 0;
5697 vd->vdev_stat.vs_write_errors = 0;
5698 vd->vdev_stat.vs_checksum_errors = 0;
5699
5700 vdev_state_dirty(vd->vdev_top);
5701 }
5702
5703 for (int c = 0; c < vd->vdev_children; c++)
5704 spa_async_remove(spa, vd->vdev_child[c]);
5705}
5706
5707static void
5708spa_async_probe(spa_t *spa, vdev_t *vd)
5709{
5710 if (vd->vdev_probe_wanted) {
5711 vd->vdev_probe_wanted = B_FALSE;
5712 vdev_reopen(vd); /* vdev_open() does the actual probe */
5713 }
5714
5715 for (int c = 0; c < vd->vdev_children; c++)
5716 spa_async_probe(spa, vd->vdev_child[c]);
5717}
5718
5719static void
5720spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5721{
5722 sysevent_id_t eid;
5723 nvlist_t *attr;
5724 char *physpath;
5725
5726 if (!spa->spa_autoexpand)
5727 return;
5728
5729 for (int c = 0; c < vd->vdev_children; c++) {
5730 vdev_t *cvd = vd->vdev_child[c];
5731 spa_async_autoexpand(spa, cvd);
5732 }
5733
5734 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5735 return;
5736
5737 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5738 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5739
5740 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5741 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5742
5743 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5744 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
5745
5746 nvlist_free(attr);
5747 kmem_free(physpath, MAXPATHLEN);
5748}
5749
5750static void
5751spa_async_thread(void *arg)
5752{
5753 spa_t *spa = arg;
5754 int tasks;
5755
5756 ASSERT(spa->spa_sync_on);
5757
5758 mutex_enter(&spa->spa_async_lock);
5759 tasks = spa->spa_async_tasks;
5760 spa->spa_async_tasks = 0;
5761 mutex_exit(&spa->spa_async_lock);
5762
5763 /*
5764 * See if the config needs to be updated.
5765 */
5766 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5767 uint64_t old_space, new_space;
5768
5769 mutex_enter(&spa_namespace_lock);
5770 old_space = metaslab_class_get_space(spa_normal_class(spa));
5771 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5772 new_space = metaslab_class_get_space(spa_normal_class(spa));
5773 mutex_exit(&spa_namespace_lock);
5774
5775 /*
5776 * If the pool grew as a result of the config update,
5777 * then log an internal history event.
5778 */
5779 if (new_space != old_space) {
5780 spa_history_log_internal(spa, "vdev online", NULL,
5781 "pool '%s' size: %llu(+%llu)",
5782 spa_name(spa), new_space, new_space - old_space);
5783 }
5784 }
5785
5786 /*
5787 * See if any devices need to be marked REMOVED.
5788 */
5789 if (tasks & SPA_ASYNC_REMOVE) {
5790 spa_vdev_state_enter(spa, SCL_NONE);
5791 spa_async_remove(spa, spa->spa_root_vdev);
5792 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5793 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5794 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5795 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5796 (void) spa_vdev_state_exit(spa, NULL, 0);
5797 }
5798
5799 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5800 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5801 spa_async_autoexpand(spa, spa->spa_root_vdev);
5802 spa_config_exit(spa, SCL_CONFIG, FTAG);
5803 }
5804
5805 /*
5806 * See if any devices need to be probed.
5807 */
5808 if (tasks & SPA_ASYNC_PROBE) {
5809 spa_vdev_state_enter(spa, SCL_NONE);
5810 spa_async_probe(spa, spa->spa_root_vdev);
5811 (void) spa_vdev_state_exit(spa, NULL, 0);
5812 }
5813
5814 /*
5815 * If any devices are done replacing, detach them.
5816 */
5817 if (tasks & SPA_ASYNC_RESILVER_DONE)
5818 spa_vdev_resilver_done(spa);
5819
5820 /*
5821 * Kick off a resilver.
5822 */
5823 if (tasks & SPA_ASYNC_RESILVER)
5824 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5825
5826 /*
5827 * Let the world know that we're done.
5828 */
5829 mutex_enter(&spa->spa_async_lock);
5830 spa->spa_async_thread = NULL;
5831 cv_broadcast(&spa->spa_async_cv);
5832 mutex_exit(&spa->spa_async_lock);
5833 thread_exit();
5834}
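/*
 * Illustrative note: this thread consumes whatever task bits have
 * accumulated in spa_async_tasks and clears them in one pass, so
 * requests posted before it runs are coalesced.  Requests are posted
 * with spa_async_request(), e.g. in spa_configfile_set():
 *
 *	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 */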
5835
5836void
5837spa_async_suspend(spa_t *spa)
5838{
5839 mutex_enter(&spa->spa_async_lock);
5840 spa->spa_async_suspended++;
5841 while (spa->spa_async_thread != NULL)
5842 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5843 mutex_exit(&spa->spa_async_lock);
5844}
5845
5846void
5847spa_async_resume(spa_t *spa)
5848{
5849 mutex_enter(&spa->spa_async_lock);
5850 ASSERT(spa->spa_async_suspended != 0);
5851 spa->spa_async_suspended--;
5852 mutex_exit(&spa->spa_async_lock);
5853}
5854
97typedef enum zti_modes {
98 zti_mode_fixed, /* value is # of threads (min 1) */
99 zti_mode_online_percent, /* value is % of online CPUs */
100 zti_mode_batch, /* cpu-intensive; value is ignored */
101 zti_mode_null, /* don't create a taskq */
102 zti_nmodes
103} zti_modes_t;
104
105#define ZTI_FIX(n) { zti_mode_fixed, (n) }
106#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
107#define ZTI_BATCH { zti_mode_batch, 0 }
108#define ZTI_NULL { zti_mode_null, 0 }
109
110#define ZTI_ONE ZTI_FIX(1)
111
112typedef struct zio_taskq_info {
113 enum zti_modes zti_mode;
114 uint_t zti_value;
115} zio_taskq_info_t;
116
117static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
118 "issue", "issue_high", "intr", "intr_high"
119};
120
121/*
122 * Define the taskq threads for the following I/O types:
123 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
124 */
125const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
126 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
127 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
128 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
129 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
130 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
131 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
132 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
133};
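/*
 * Reading the table (illustrative): rows follow the I/O types listed in
 * the comment above (NULL, READ, WRITE, FREE, CLAIM, IOCTL) and columns
 * follow zio_taskq_types.  For example, the WRITE row
 *
 *	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
 *
 * gives write "issue" work a batch (CPU-percentage) taskq, while
 * "issue_high", "intr" and "intr_high" get 5, 8 and 5 fixed threads
 * respectively.
 */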
134
135static void spa_sync_version(void *arg, dmu_tx_t *tx);
136static void spa_sync_props(void *arg, dmu_tx_t *tx);
137static boolean_t spa_has_active_shared_spare(spa_t *spa);
138static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
139 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
140 char **ereport);
141static void spa_vdev_resilver_done(spa_t *spa);
142
143uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
144#ifdef PSRSET_BIND
145id_t zio_taskq_psrset_bind = PS_NONE;
146#endif
147#ifdef SYSDC
148boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
149#endif
150uint_t zio_taskq_basedc = 80; /* base duty cycle */
151
152boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
153extern int zfs_sync_pass_deferred_free;
154
155#ifndef illumos
156extern void spa_deadman(void *arg);
157#endif
158
159/*
160 * This (illegal) pool name is used when temporarily importing a spa_t in order
161 * to get the vdev stats associated with the imported devices.
162 */
163#define TRYIMPORT_NAME "$import"
164
165/*
166 * ==========================================================================
167 * SPA properties routines
168 * ==========================================================================
169 */
170
171/*
172 * Add a (source=src, propname=propval) list to an nvlist.
173 */
174static void
175spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
176 uint64_t intval, zprop_source_t src)
177{
178 const char *propname = zpool_prop_to_name(prop);
179 nvlist_t *propval;
180
181 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
182 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
183
184 if (strval != NULL)
185 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
186 else
187 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
188
189 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
190 nvlist_free(propval);
191}
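/*
 * Shape sketch (illustrative): each call nests a small nvlist under the
 * property name, so
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, size, src);
 *
 * leaves the outer list looking roughly like
 *
 *	"size" -> { ZPROP_SOURCE = src, ZPROP_VALUE = size }
 *
 * with ZPROP_VALUE holding either the string or the integer form,
 * depending on which was supplied.
 */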
192
193/*
194 * Get property values from the spa configuration.
195 */
196static void
197spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
198{
199 vdev_t *rvd = spa->spa_root_vdev;
200 dsl_pool_t *pool = spa->spa_dsl_pool;
201 uint64_t size;
202 uint64_t alloc;
203 uint64_t space;
204 uint64_t cap, version;
205 zprop_source_t src = ZPROP_SRC_NONE;
206 spa_config_dirent_t *dp;
207
208 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
209
210 if (rvd != NULL) {
211 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
212 size = metaslab_class_get_space(spa_normal_class(spa));
213 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
214 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
215 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
216 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
217 size - alloc, src);
218
219 space = 0;
220 for (int c = 0; c < rvd->vdev_children; c++) {
221 vdev_t *tvd = rvd->vdev_child[c];
222 space += tvd->vdev_max_asize - tvd->vdev_asize;
223 }
224 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
225 src);
226
227 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
228 (spa_mode(spa) == FREAD), src);
229
230 cap = (size == 0) ? 0 : (alloc * 100 / size);
231 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
232
233 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
234 ddt_get_pool_dedup_ratio(spa), src);
235
236 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
237 rvd->vdev_state, src);
238
239 version = spa_version(spa);
240 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
241 src = ZPROP_SRC_DEFAULT;
242 else
243 src = ZPROP_SRC_LOCAL;
244 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
245 }
246
247 if (pool != NULL) {
248 dsl_dir_t *freedir = pool->dp_free_dir;
249
250 /*
251		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS; when
252		 * opening pools that predate it, freedir will be NULL.
253 */
254 if (freedir != NULL) {
255 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
256 freedir->dd_phys->dd_used_bytes, src);
257 } else {
258 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
259 NULL, 0, src);
260 }
261 }
262
263 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
264
265 if (spa->spa_comment != NULL) {
266 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
267 0, ZPROP_SRC_LOCAL);
268 }
269
270 if (spa->spa_root != NULL)
271 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
272 0, ZPROP_SRC_LOCAL);
273
274 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
275 if (dp->scd_path == NULL) {
276 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
277 "none", 0, ZPROP_SRC_LOCAL);
278 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
279 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
280 dp->scd_path, 0, ZPROP_SRC_LOCAL);
281 }
282 }
283}
284
285/*
286 * Get zpool property values.
287 */
288int
289spa_prop_get(spa_t *spa, nvlist_t **nvp)
290{
291 objset_t *mos = spa->spa_meta_objset;
292 zap_cursor_t zc;
293 zap_attribute_t za;
294 int err;
295
296 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
297
298 mutex_enter(&spa->spa_props_lock);
299
300 /*
301 * Get properties from the spa config.
302 */
303 spa_prop_get_config(spa, nvp);
304
305	/* If there is no pool property object, there is nothing more to get. */
306 if (mos == NULL || spa->spa_pool_props_object == 0) {
307 mutex_exit(&spa->spa_props_lock);
308 return (0);
309 }
310
311 /*
312 * Get properties from the MOS pool property object.
313 */
314 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
315 (err = zap_cursor_retrieve(&zc, &za)) == 0;
316 zap_cursor_advance(&zc)) {
317 uint64_t intval = 0;
318 char *strval = NULL;
319 zprop_source_t src = ZPROP_SRC_DEFAULT;
320 zpool_prop_t prop;
321
322 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
323 continue;
324
325 switch (za.za_integer_length) {
326 case 8:
327 /* integer property */
328 if (za.za_first_integer !=
329 zpool_prop_default_numeric(prop))
330 src = ZPROP_SRC_LOCAL;
331
332 if (prop == ZPOOL_PROP_BOOTFS) {
333 dsl_pool_t *dp;
334 dsl_dataset_t *ds = NULL;
335
336 dp = spa_get_dsl(spa);
337 dsl_pool_config_enter(dp, FTAG);
338 if (err = dsl_dataset_hold_obj(dp,
339 za.za_first_integer, FTAG, &ds)) {
340 dsl_pool_config_exit(dp, FTAG);
341 break;
342 }
343
344 strval = kmem_alloc(
345 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
346 KM_SLEEP);
347 dsl_dataset_name(ds, strval);
348 dsl_dataset_rele(ds, FTAG);
349 dsl_pool_config_exit(dp, FTAG);
350 } else {
351 strval = NULL;
352 intval = za.za_first_integer;
353 }
354
355 spa_prop_add_list(*nvp, prop, strval, intval, src);
356
357 if (strval != NULL)
358 kmem_free(strval,
359 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
360
361 break;
362
363 case 1:
364 /* string property */
365 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
366 err = zap_lookup(mos, spa->spa_pool_props_object,
367 za.za_name, 1, za.za_num_integers, strval);
368 if (err) {
369 kmem_free(strval, za.za_num_integers);
370 break;
371 }
372 spa_prop_add_list(*nvp, prop, strval, 0, src);
373 kmem_free(strval, za.za_num_integers);
374 break;
375
376 default:
377 break;
378 }
379 }
380 zap_cursor_fini(&zc);
381 mutex_exit(&spa->spa_props_lock);
382out:
383 if (err && err != ENOENT) {
384 nvlist_free(*nvp);
385 *nvp = NULL;
386 return (err);
387 }
388
389 return (0);
390}
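/*
 * Caller-side sketch (illustrative only): spa_prop_get() allocates *nvp
 * itself, so a typical consumer looks like
 *
 *	nvlist_t *props = NULL;
 *	if (spa_prop_get(spa, &props) == 0) {
 *		... walk props with nvlist_next_nvpair() ...
 *		nvlist_free(props);
 *	}
 *
 * On any error other than ENOENT from the ZAP walk, *nvp is freed and
 * the error is returned instead.
 */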
391
392/*
393 * Validate the given pool properties nvlist and modify the list
394 * for the property values to be set.
395 */
396static int
397spa_prop_validate(spa_t *spa, nvlist_t *props)
398{
399 nvpair_t *elem;
400 int error = 0, reset_bootfs = 0;
401 uint64_t objnum = 0;
402 boolean_t has_feature = B_FALSE;
403
404 elem = NULL;
405 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
406 uint64_t intval;
407 char *strval, *slash, *check, *fname;
408 const char *propname = nvpair_name(elem);
409 zpool_prop_t prop = zpool_name_to_prop(propname);
410
411 switch (prop) {
412 case ZPROP_INVAL:
413 if (!zpool_prop_feature(propname)) {
414 error = SET_ERROR(EINVAL);
415 break;
416 }
417
418 /*
419 * Sanitize the input.
420 */
421 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
422 error = SET_ERROR(EINVAL);
423 break;
424 }
425
426 if (nvpair_value_uint64(elem, &intval) != 0) {
427 error = SET_ERROR(EINVAL);
428 break;
429 }
430
431 if (intval != 0) {
432 error = SET_ERROR(EINVAL);
433 break;
434 }
435
436 fname = strchr(propname, '@') + 1;
437 if (zfeature_lookup_name(fname, NULL) != 0) {
438 error = SET_ERROR(EINVAL);
439 break;
440 }
441
442 has_feature = B_TRUE;
443 break;
444
445 case ZPOOL_PROP_VERSION:
446 error = nvpair_value_uint64(elem, &intval);
447 if (!error &&
448 (intval < spa_version(spa) ||
449 intval > SPA_VERSION_BEFORE_FEATURES ||
450 has_feature))
451 error = SET_ERROR(EINVAL);
452 break;
453
454 case ZPOOL_PROP_DELEGATION:
455 case ZPOOL_PROP_AUTOREPLACE:
456 case ZPOOL_PROP_LISTSNAPS:
457 case ZPOOL_PROP_AUTOEXPAND:
458 error = nvpair_value_uint64(elem, &intval);
459 if (!error && intval > 1)
460 error = SET_ERROR(EINVAL);
461 break;
462
463 case ZPOOL_PROP_BOOTFS:
464 /*
465 * If the pool version is less than SPA_VERSION_BOOTFS,
466 * or the pool is still being created (version == 0),
467 * the bootfs property cannot be set.
468 */
469 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
470 error = SET_ERROR(ENOTSUP);
471 break;
472 }
473
474 /*
475 * Make sure the vdev config is bootable
476 */
477 if (!vdev_is_bootable(spa->spa_root_vdev)) {
478 error = SET_ERROR(ENOTSUP);
479 break;
480 }
481
482 reset_bootfs = 1;
483
484 error = nvpair_value_string(elem, &strval);
485
486 if (!error) {
487 objset_t *os;
488 uint64_t compress;
489
490 if (strval == NULL || strval[0] == '\0') {
491 objnum = zpool_prop_default_numeric(
492 ZPOOL_PROP_BOOTFS);
493 break;
494 }
495
496 if (error = dmu_objset_hold(strval, FTAG, &os))
497 break;
498
499 /* Must be ZPL and not gzip compressed. */
500
501 if (dmu_objset_type(os) != DMU_OST_ZFS) {
502 error = SET_ERROR(ENOTSUP);
503 } else if ((error =
504 dsl_prop_get_int_ds(dmu_objset_ds(os),
505 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
506 &compress)) == 0 &&
507 !BOOTFS_COMPRESS_VALID(compress)) {
508 error = SET_ERROR(ENOTSUP);
509 } else {
510 objnum = dmu_objset_id(os);
511 }
512 dmu_objset_rele(os, FTAG);
513 }
514 break;
515
516 case ZPOOL_PROP_FAILUREMODE:
517 error = nvpair_value_uint64(elem, &intval);
518 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
519 intval > ZIO_FAILURE_MODE_PANIC))
520 error = SET_ERROR(EINVAL);
521
522 /*
523 * This is a special case which only occurs when
524 * the pool has completely failed. This allows
525 * the user to change the in-core failmode property
526 * without syncing it out to disk (I/Os might
527 * currently be blocked). We do this by returning
528 * EIO to the caller (spa_prop_set) to trick it
529 * into thinking we encountered a property validation
530 * error.
531 */
532 if (!error && spa_suspended(spa)) {
533 spa->spa_failmode = intval;
534 error = SET_ERROR(EIO);
535 }
536 break;
537
538 case ZPOOL_PROP_CACHEFILE:
539 if ((error = nvpair_value_string(elem, &strval)) != 0)
540 break;
541
542 if (strval[0] == '\0')
543 break;
544
545 if (strcmp(strval, "none") == 0)
546 break;
547
548 if (strval[0] != '/') {
549 error = SET_ERROR(EINVAL);
550 break;
551 }
552
553 slash = strrchr(strval, '/');
554 ASSERT(slash != NULL);
555
556 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
557 strcmp(slash, "/..") == 0)
558 error = SET_ERROR(EINVAL);
559 break;
560
561 case ZPOOL_PROP_COMMENT:
562 if ((error = nvpair_value_string(elem, &strval)) != 0)
563 break;
564 for (check = strval; *check != '\0'; check++) {
565 /*
566 * The kernel doesn't have an easy isprint()
567 * check. For this kernel check, we merely
568 * check ASCII apart from DEL. Fix this if
569 * there is an easy-to-use kernel isprint().
570 */
571 if (*check >= 0x7f) {
572 error = SET_ERROR(EINVAL);
573 break;
574 }
576 }
577 if (strlen(strval) > ZPROP_MAX_COMMENT)
578				error = SET_ERROR(E2BIG);
579 break;
580
581 case ZPOOL_PROP_DEDUPDITTO:
582 if (spa_version(spa) < SPA_VERSION_DEDUP)
583 error = SET_ERROR(ENOTSUP);
584 else
585 error = nvpair_value_uint64(elem, &intval);
586 if (error == 0 &&
587 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
588 error = SET_ERROR(EINVAL);
589 break;
590 }
591
592 if (error)
593 break;
594 }
595
596 if (!error && reset_bootfs) {
597 error = nvlist_remove(props,
598 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
599
600 if (!error) {
601 error = nvlist_add_uint64(props,
602 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
603 }
604 }
605
606 return (error);
607}
608
609void
610spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
611{
612 char *cachefile;
613 spa_config_dirent_t *dp;
614
615 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
616 &cachefile) != 0)
617 return;
618
619 dp = kmem_alloc(sizeof (spa_config_dirent_t),
620 KM_SLEEP);
621
622 if (cachefile[0] == '\0')
623 dp->scd_path = spa_strdup(spa_config_path);
624 else if (strcmp(cachefile, "none") == 0)
625 dp->scd_path = NULL;
626 else
627 dp->scd_path = spa_strdup(cachefile);
628
629 list_insert_head(&spa->spa_config_list, dp);
630 if (need_sync)
631 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
632}
633
634int
635spa_prop_set(spa_t *spa, nvlist_t *nvp)
636{
637 int error;
638 nvpair_t *elem = NULL;
639 boolean_t need_sync = B_FALSE;
640
641 if ((error = spa_prop_validate(spa, nvp)) != 0)
642 return (error);
643
644 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
645 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
646
647 if (prop == ZPOOL_PROP_CACHEFILE ||
648 prop == ZPOOL_PROP_ALTROOT ||
649 prop == ZPOOL_PROP_READONLY)
650 continue;
651
652 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
653 uint64_t ver;
654
655 if (prop == ZPOOL_PROP_VERSION) {
656 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
657 } else {
658 ASSERT(zpool_prop_feature(nvpair_name(elem)));
659 ver = SPA_VERSION_FEATURES;
660 need_sync = B_TRUE;
661 }
662
663 /* Save time if the version is already set. */
664 if (ver == spa_version(spa))
665 continue;
666
667 /*
668 * In addition to the pool directory object, we might
669 * create the pool properties object, the features for
670 * read object, the features for write object, or the
671 * feature descriptions object.
672 */
673 error = dsl_sync_task(spa->spa_name, NULL,
674 spa_sync_version, &ver, 6);
675 if (error)
676 return (error);
677 continue;
678 }
679
680 need_sync = B_TRUE;
681 break;
682 }
683
684 if (need_sync) {
685 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
686 nvp, 6));
687 }
688
689 return (0);
690}
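/*
 * Usage sketch (illustrative; the property and value are arbitrary
 * examples): a caller builds an nvlist of desired settings and hands it
 * in:
 *
 *	nvlist_t *props;
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 *
 * spa_prop_validate() vets the request first; anything that survives
 * and is not handled purely in core is pushed through spa_sync_props()
 * as a dsl_sync_task().
 */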
691
692/*
693 * If the bootfs property value is dsobj, clear it.
694 */
695void
696spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
697{
698 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
699 VERIFY(zap_remove(spa->spa_meta_objset,
700 spa->spa_pool_props_object,
701 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
702 spa->spa_bootfs = 0;
703 }
704}
705
706/*ARGSUSED*/
707static int
708spa_change_guid_check(void *arg, dmu_tx_t *tx)
709{
710 uint64_t *newguid = arg;
711 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
712 vdev_t *rvd = spa->spa_root_vdev;
713 uint64_t vdev_state;
714
715 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
716 vdev_state = rvd->vdev_state;
717 spa_config_exit(spa, SCL_STATE, FTAG);
718
719 if (vdev_state != VDEV_STATE_HEALTHY)
720 return (SET_ERROR(ENXIO));
721
722 ASSERT3U(spa_guid(spa), !=, *newguid);
723
724 return (0);
725}
726
727static void
728spa_change_guid_sync(void *arg, dmu_tx_t *tx)
729{
730 uint64_t *newguid = arg;
731 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
732 uint64_t oldguid;
733 vdev_t *rvd = spa->spa_root_vdev;
734
735 oldguid = spa_guid(spa);
736
737 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
738 rvd->vdev_guid = *newguid;
739 rvd->vdev_guid_sum += (*newguid - oldguid);
740 vdev_config_dirty(rvd);
741 spa_config_exit(spa, SCL_STATE, FTAG);
742
743 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
744 oldguid, *newguid);
745}
746
747/*
748 * Change the GUID for the pool. This is done so that we can later
749 * re-import a pool built from a clone of our own vdevs. We will modify
750 * the root vdev's guid, our own pool guid, and then mark all of our
751 * vdevs dirty. Note that we must make sure that all our vdevs are
752 * online when we do this, or else any vdevs that weren't present
753 * would be orphaned from our pool. We are also going to issue a
754 * sysevent to update any watchers.
755 */
756int
757spa_change_guid(spa_t *spa)
758{
759 int error;
760 uint64_t guid;
761
762 mutex_enter(&spa_namespace_lock);
763 guid = spa_generate_guid(NULL);
764
765 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
766 spa_change_guid_sync, &guid, 5);
767
768 if (error == 0) {
769 spa_config_sync(spa, B_FALSE, B_TRUE);
770 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
771 }
772
773 mutex_exit(&spa_namespace_lock);
774
775 return (error);
776}
777
778/*
779 * ==========================================================================
780 * SPA state manipulation (open/create/destroy/import/export)
781 * ==========================================================================
782 */
783
784static int
785spa_error_entry_compare(const void *a, const void *b)
786{
787 spa_error_entry_t *sa = (spa_error_entry_t *)a;
788 spa_error_entry_t *sb = (spa_error_entry_t *)b;
789 int ret;
790
791 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
792 sizeof (zbookmark_t));
793
794 if (ret < 0)
795 return (-1);
796 else if (ret > 0)
797 return (1);
798 else
799 return (0);
800}
801
802/*
803 * Utility function which retrieves copies of the current logs and
804 * re-initializes them in the process.
805 */
806void
807spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
808{
809 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
810
811 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
812 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
813
814 avl_create(&spa->spa_errlist_scrub,
815 spa_error_entry_compare, sizeof (spa_error_entry_t),
816 offsetof(spa_error_entry_t, se_avl));
817 avl_create(&spa->spa_errlist_last,
818 spa_error_entry_compare, sizeof (spa_error_entry_t),
819 offsetof(spa_error_entry_t, se_avl));
820}
821
822static taskq_t *
823spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
824 uint_t value)
825{
826 uint_t flags = TASKQ_PREPOPULATE;
827 boolean_t batch = B_FALSE;
828
829 switch (mode) {
830 case zti_mode_null:
831 return (NULL); /* no taskq needed */
832
833 case zti_mode_fixed:
834 ASSERT3U(value, >=, 1);
835 value = MAX(value, 1);
836 break;
837
838 case zti_mode_batch:
839 batch = B_TRUE;
840 flags |= TASKQ_THREADS_CPU_PCT;
841 value = zio_taskq_batch_pct;
842 break;
843
844 case zti_mode_online_percent:
845 flags |= TASKQ_THREADS_CPU_PCT;
846 break;
847
848 default:
849 panic("unrecognized mode for %s taskq (%u:%u) in "
850 "spa_activate()",
851 name, mode, value);
852 break;
853 }
854
855#ifdef SYSDC
856 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
857 if (batch)
858 flags |= TASKQ_DC_BATCH;
859
860 return (taskq_create_sysdc(name, value, 50, INT_MAX,
861 spa->spa_proc, zio_taskq_basedc, flags));
862 }
863#endif
864 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
865 spa->spa_proc, flags));
866}
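/*
 * Illustrative mapping (assuming the non-SYSDC path is taken): a
 * ZTI_FIX(8) entry ends up as
 *
 *	taskq_create_proc(name, 8, maxclsyspri, 50, INT_MAX,
 *	    spa->spa_proc, TASKQ_PREPOPULATE);
 *
 * while ZTI_BATCH adds TASKQ_THREADS_CPU_PCT and substitutes
 * zio_taskq_batch_pct (100 by default) for the thread count.
 */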
867
868static void
869spa_create_zio_taskqs(spa_t *spa)
870{
871 for (int t = 0; t < ZIO_TYPES; t++) {
872 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
873 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
874 enum zti_modes mode = ztip->zti_mode;
875 uint_t value = ztip->zti_value;
876 char name[32];
877
878 (void) snprintf(name, sizeof (name),
879 "%s_%s", zio_type_name[t], zio_taskq_types[q]);
880
881 spa->spa_zio_taskq[t][q] =
882 spa_taskq_create(spa, name, mode, value);
883 }
884 }
885}
886
887#ifdef _KERNEL
888#ifdef SPA_PROCESS
889static void
890spa_thread(void *arg)
891{
892 callb_cpr_t cprinfo;
893
894 spa_t *spa = arg;
895 user_t *pu = PTOU(curproc);
896
897 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
898 spa->spa_name);
899
900 ASSERT(curproc != &p0);
901 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
902 "zpool-%s", spa->spa_name);
903 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
904
905#ifdef PSRSET_BIND
906 /* bind this thread to the requested psrset */
907 if (zio_taskq_psrset_bind != PS_NONE) {
908 pool_lock();
909 mutex_enter(&cpu_lock);
910 mutex_enter(&pidlock);
911 mutex_enter(&curproc->p_lock);
912
913 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
914 0, NULL, NULL) == 0) {
915 curthread->t_bind_pset = zio_taskq_psrset_bind;
916 } else {
917 cmn_err(CE_WARN,
918 "Couldn't bind process for zfs pool \"%s\" to "
919 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
920 }
921
922 mutex_exit(&curproc->p_lock);
923 mutex_exit(&pidlock);
924 mutex_exit(&cpu_lock);
925 pool_unlock();
926 }
927#endif
928
929#ifdef SYSDC
930 if (zio_taskq_sysdc) {
931 sysdc_thread_enter(curthread, 100, 0);
932 }
933#endif
934
935 spa->spa_proc = curproc;
936 spa->spa_did = curthread->t_did;
937
938 spa_create_zio_taskqs(spa);
939
940 mutex_enter(&spa->spa_proc_lock);
941 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
942
943 spa->spa_proc_state = SPA_PROC_ACTIVE;
944 cv_broadcast(&spa->spa_proc_cv);
945
946 CALLB_CPR_SAFE_BEGIN(&cprinfo);
947 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
948 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
949 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
950
951 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
952 spa->spa_proc_state = SPA_PROC_GONE;
953 spa->spa_proc = &p0;
954 cv_broadcast(&spa->spa_proc_cv);
955 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
956
957 mutex_enter(&curproc->p_lock);
958 lwp_exit();
959}
960#endif /* SPA_PROCESS */
961#endif
962
963/*
964 * Activate an uninitialized pool.
965 */
966static void
967spa_activate(spa_t *spa, int mode)
968{
969 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
970
971 spa->spa_state = POOL_STATE_ACTIVE;
972 spa->spa_mode = mode;
973
974 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
975 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
976
977 /* Try to create a covering process */
978 mutex_enter(&spa->spa_proc_lock);
979 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
980 ASSERT(spa->spa_proc == &p0);
981 spa->spa_did = 0;
982
983#ifdef SPA_PROCESS
984	/* Only create a process if we're going to be around for a while. */
985 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
986 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
987 NULL, 0) == 0) {
988 spa->spa_proc_state = SPA_PROC_CREATED;
989 while (spa->spa_proc_state == SPA_PROC_CREATED) {
990 cv_wait(&spa->spa_proc_cv,
991 &spa->spa_proc_lock);
992 }
993 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
994 ASSERT(spa->spa_proc != &p0);
995 ASSERT(spa->spa_did != 0);
996 } else {
997#ifdef _KERNEL
998 cmn_err(CE_WARN,
999 "Couldn't create process for zfs pool \"%s\"\n",
1000 spa->spa_name);
1001#endif
1002 }
1003 }
1004#endif /* SPA_PROCESS */
1005 mutex_exit(&spa->spa_proc_lock);
1006
1007 /* If we didn't create a process, we need to create our taskqs. */
1008 ASSERT(spa->spa_proc == &p0);
1009 if (spa->spa_proc == &p0) {
1010 spa_create_zio_taskqs(spa);
1011 }
1012
1013 /*
1014 * Start TRIM thread.
1015 */
1016 trim_thread_create(spa);
1017
1018 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1019 offsetof(vdev_t, vdev_config_dirty_node));
1020 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1021 offsetof(vdev_t, vdev_state_dirty_node));
1022
1023 txg_list_create(&spa->spa_vdev_txg_list,
1024 offsetof(struct vdev, vdev_txg_node));
1025
1026 avl_create(&spa->spa_errlist_scrub,
1027 spa_error_entry_compare, sizeof (spa_error_entry_t),
1028 offsetof(spa_error_entry_t, se_avl));
1029 avl_create(&spa->spa_errlist_last,
1030 spa_error_entry_compare, sizeof (spa_error_entry_t),
1031 offsetof(spa_error_entry_t, se_avl));
1032}
1033
1034/*
1035 * Opposite of spa_activate().
1036 */
1037static void
1038spa_deactivate(spa_t *spa)
1039{
1040 ASSERT(spa->spa_sync_on == B_FALSE);
1041 ASSERT(spa->spa_dsl_pool == NULL);
1042 ASSERT(spa->spa_root_vdev == NULL);
1043 ASSERT(spa->spa_async_zio_root == NULL);
1044 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1045
1046 /*
1047 * Stop TRIM thread in case spa_unload() wasn't called directly
1048 * before spa_deactivate().
1049 */
1050 trim_thread_destroy(spa);
1051
1052 txg_list_destroy(&spa->spa_vdev_txg_list);
1053
1054 list_destroy(&spa->spa_config_dirty_list);
1055 list_destroy(&spa->spa_state_dirty_list);
1056
1057 for (int t = 0; t < ZIO_TYPES; t++) {
1058 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1059 if (spa->spa_zio_taskq[t][q] != NULL)
1060 taskq_destroy(spa->spa_zio_taskq[t][q]);
1061 spa->spa_zio_taskq[t][q] = NULL;
1062 }
1063 }
1064
1065 metaslab_class_destroy(spa->spa_normal_class);
1066 spa->spa_normal_class = NULL;
1067
1068 metaslab_class_destroy(spa->spa_log_class);
1069 spa->spa_log_class = NULL;
1070
1071 /*
1072 * If this was part of an import or the open otherwise failed, we may
1073 * still have errors left in the queues. Empty them just in case.
1074 */
1075 spa_errlog_drain(spa);
1076
1077 avl_destroy(&spa->spa_errlist_scrub);
1078 avl_destroy(&spa->spa_errlist_last);
1079
1080 spa->spa_state = POOL_STATE_UNINITIALIZED;
1081
1082 mutex_enter(&spa->spa_proc_lock);
1083 if (spa->spa_proc_state != SPA_PROC_NONE) {
1084 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1085 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1086 cv_broadcast(&spa->spa_proc_cv);
1087 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1088 ASSERT(spa->spa_proc != &p0);
1089 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1090 }
1091 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1092 spa->spa_proc_state = SPA_PROC_NONE;
1093 }
1094 ASSERT(spa->spa_proc == &p0);
1095 mutex_exit(&spa->spa_proc_lock);
1096
1097#ifdef SPA_PROCESS
1098 /*
1099 * We want to make sure spa_thread() has actually exited the ZFS
1100 * module, so that the module can't be unloaded out from underneath
1101 * it.
1102 */
1103 if (spa->spa_did != 0) {
1104 thread_join(spa->spa_did);
1105 spa->spa_did = 0;
1106 }
1107#endif /* SPA_PROCESS */
1108}
1109
1110/*
1111 * Verify a pool configuration, and construct the vdev tree appropriately. This
1112 * will create all the necessary vdevs in the appropriate layout, with each vdev
1113 * in the CLOSED state. This will prep the pool before open/creation/import.
1114 * All vdev validation is done by the vdev_alloc() routine.
1115 */
1116static int
1117spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1118 uint_t id, int atype)
1119{
1120 nvlist_t **child;
1121 uint_t children;
1122 int error;
1123
1124 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1125 return (error);
1126
1127 if ((*vdp)->vdev_ops->vdev_op_leaf)
1128 return (0);
1129
1130 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1131 &child, &children);
1132
1133 if (error == ENOENT)
1134 return (0);
1135
1136 if (error) {
1137 vdev_free(*vdp);
1138 *vdp = NULL;
1139 return (SET_ERROR(EINVAL));
1140 }
1141
1142 for (int c = 0; c < children; c++) {
1143 vdev_t *vd;
1144 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1145 atype)) != 0) {
1146 vdev_free(*vdp);
1147 *vdp = NULL;
1148 return (error);
1149 }
1150 }
1151
1152 ASSERT(*vdp != NULL);
1153
1154 return (0);
1155}
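/*
 * Config shape sketch (illustrative; the type and path strings are
 * examples, not requirements): the nvlist handed in mirrors the vdev
 * tree, e.g. for a two-way mirror roughly
 *
 *	type = "root"
 *	  ZPOOL_CONFIG_CHILDREN[0]: type = "mirror"
 *	    ZPOOL_CONFIG_CHILDREN[0]: type = "disk", path = "/dev/da0"
 *	    ZPOOL_CONFIG_CHILDREN[1]: type = "disk", path = "/dev/da1"
 *
 * Each nesting level is the ZPOOL_CONFIG_CHILDREN array consumed by the
 * recursion above; leaves simply have no children entry (the ENOENT
 * case).
 */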
1156
1157/*
1158 * Opposite of spa_load().
1159 */
1160static void
1161spa_unload(spa_t *spa)
1162{
1163 int i;
1164
1165 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1166
1167 /*
1168 * Stop TRIM thread.
1169 */
1170 trim_thread_destroy(spa);
1171
1172 /*
1173 * Stop async tasks.
1174 */
1175 spa_async_suspend(spa);
1176
1177 /*
1178 * Stop syncing.
1179 */
1180 if (spa->spa_sync_on) {
1181 txg_sync_stop(spa->spa_dsl_pool);
1182 spa->spa_sync_on = B_FALSE;
1183 }
1184
1185 /*
1186 * Wait for any outstanding async I/O to complete.
1187 */
1188 if (spa->spa_async_zio_root != NULL) {
1189 (void) zio_wait(spa->spa_async_zio_root);
1190 spa->spa_async_zio_root = NULL;
1191 }
1192
1193 bpobj_close(&spa->spa_deferred_bpobj);
1194
1195 /*
1196 * Close the dsl pool.
1197 */
1198 if (spa->spa_dsl_pool) {
1199 dsl_pool_close(spa->spa_dsl_pool);
1200 spa->spa_dsl_pool = NULL;
1201 spa->spa_meta_objset = NULL;
1202 }
1203
1204 ddt_unload(spa);
1205
1206 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1207
1208 /*
1209 * Drop and purge level 2 cache
1210 */
1211 spa_l2cache_drop(spa);
1212
1213 /*
1214 * Close all vdevs.
1215 */
1216 if (spa->spa_root_vdev)
1217 vdev_free(spa->spa_root_vdev);
1218 ASSERT(spa->spa_root_vdev == NULL);
1219
1220 for (i = 0; i < spa->spa_spares.sav_count; i++)
1221 vdev_free(spa->spa_spares.sav_vdevs[i]);
1222 if (spa->spa_spares.sav_vdevs) {
1223 kmem_free(spa->spa_spares.sav_vdevs,
1224 spa->spa_spares.sav_count * sizeof (void *));
1225 spa->spa_spares.sav_vdevs = NULL;
1226 }
1227 if (spa->spa_spares.sav_config) {
1228 nvlist_free(spa->spa_spares.sav_config);
1229 spa->spa_spares.sav_config = NULL;
1230 }
1231 spa->spa_spares.sav_count = 0;
1232
1233 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1234 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1235 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1236 }
1237 if (spa->spa_l2cache.sav_vdevs) {
1238 kmem_free(spa->spa_l2cache.sav_vdevs,
1239 spa->spa_l2cache.sav_count * sizeof (void *));
1240 spa->spa_l2cache.sav_vdevs = NULL;
1241 }
1242 if (spa->spa_l2cache.sav_config) {
1243 nvlist_free(spa->spa_l2cache.sav_config);
1244 spa->spa_l2cache.sav_config = NULL;
1245 }
1246 spa->spa_l2cache.sav_count = 0;
1247
1248 spa->spa_async_suspended = 0;
1249
1250 if (spa->spa_comment != NULL) {
1251 spa_strfree(spa->spa_comment);
1252 spa->spa_comment = NULL;
1253 }
1254
1255 spa_config_exit(spa, SCL_ALL, FTAG);
1256}
1257
1258/*
1259 * Load (or re-load) the current list of vdevs describing the active spares for
1260 * this pool. When this is called, we have some form of basic information in
1261 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1262 * then re-generate a more complete list including status information.
1263 */
1264static void
1265spa_load_spares(spa_t *spa)
1266{
1267 nvlist_t **spares;
1268 uint_t nspares;
1269 int i;
1270 vdev_t *vd, *tvd;
1271
1272 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1273
1274 /*
1275 * First, close and free any existing spare vdevs.
1276 */
1277 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1278 vd = spa->spa_spares.sav_vdevs[i];
1279
1280 /* Undo the call to spa_activate() below */
1281 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1282 B_FALSE)) != NULL && tvd->vdev_isspare)
1283 spa_spare_remove(tvd);
1284 vdev_close(vd);
1285 vdev_free(vd);
1286 }
1287
1288 if (spa->spa_spares.sav_vdevs)
1289 kmem_free(spa->spa_spares.sav_vdevs,
1290 spa->spa_spares.sav_count * sizeof (void *));
1291
1292 if (spa->spa_spares.sav_config == NULL)
1293 nspares = 0;
1294 else
1295 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1296 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1297
1298 spa->spa_spares.sav_count = (int)nspares;
1299 spa->spa_spares.sav_vdevs = NULL;
1300
1301 if (nspares == 0)
1302 return;
1303
1304 /*
1305 * Construct the array of vdevs, opening them to get status in the
1306 * process. For each spare, there are potentially two different vdev_t
1307 * structures associated with it: one in the list of spares (used only
1308 * for basic validation purposes) and one in the active vdev
1309 * configuration (if it's spared in). During this phase we open and
1310 * validate each vdev on the spare list. If the vdev also exists in the
1311 * active configuration, then we also mark this vdev as an active spare.
1312 */
1313 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1314 KM_SLEEP);
1315 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1316 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1317 VDEV_ALLOC_SPARE) == 0);
1318 ASSERT(vd != NULL);
1319
1320 spa->spa_spares.sav_vdevs[i] = vd;
1321
1322 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1323 B_FALSE)) != NULL) {
1324 if (!tvd->vdev_isspare)
1325 spa_spare_add(tvd);
1326
1327 /*
1328 * We only mark the spare active if we were successfully
1329 * able to load the vdev. Otherwise, importing a pool
1330 * with a bad active spare would result in strange
1331 * behavior, because multiple pools would think the spare
1332 * is actively in use.
1333 *
1334 * There is a vulnerability here to an equally bizarre
1335 * circumstance, where a dead active spare is later
1336 * brought back to life (onlined or otherwise). Given
1337 * the rarity of this scenario, and the extra complexity
1338 * it adds, we ignore the possibility.
1339 */
1340 if (!vdev_is_dead(tvd))
1341 spa_spare_activate(tvd);
1342 }
1343
1344 vd->vdev_top = vd;
1345 vd->vdev_aux = &spa->spa_spares;
1346
1347 if (vdev_open(vd) != 0)
1348 continue;
1349
1350 if (vdev_validate_aux(vd) == 0)
1351 spa_spare_add(vd);
1352 }
1353
1354 /*
1355 * Recompute the stashed list of spares, with status information
1356 * this time.
1357 */
1358 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1359 DATA_TYPE_NVLIST_ARRAY) == 0);
1360
1361 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1362 KM_SLEEP);
1363 for (i = 0; i < spa->spa_spares.sav_count; i++)
1364 spares[i] = vdev_config_generate(spa,
1365 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1366 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1367 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1368 for (i = 0; i < spa->spa_spares.sav_count; i++)
1369 nvlist_free(spares[i]);
1370 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1371}
1372
1373/*
1374 * Load (or re-load) the current list of vdevs describing the active l2cache for
1375 * this pool. When this is called, we have some form of basic information in
1376 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1377 * then re-generate a more complete list including status information.
1378 * Devices which are already active have their details maintained, and are
1379 * not re-opened.
1380 */
1381static void
1382spa_load_l2cache(spa_t *spa)
1383{
1384 nvlist_t **l2cache;
1385 uint_t nl2cache;
1386 int i, j, oldnvdevs;
1387 uint64_t guid;
1388 vdev_t *vd, **oldvdevs, **newvdevs;
1389 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1390
1391 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1392
1393 if (sav->sav_config != NULL) {
1394 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1395 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1396 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1397 } else {
1398 nl2cache = 0;
1399 newvdevs = NULL;
1400 }
1401
1402 oldvdevs = sav->sav_vdevs;
1403 oldnvdevs = sav->sav_count;
1404 sav->sav_vdevs = NULL;
1405 sav->sav_count = 0;
1406
1407 /*
1408 * Process new nvlist of vdevs.
1409 */
1410 for (i = 0; i < nl2cache; i++) {
1411 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1412 &guid) == 0);
1413
1414 newvdevs[i] = NULL;
1415 for (j = 0; j < oldnvdevs; j++) {
1416 vd = oldvdevs[j];
1417 if (vd != NULL && guid == vd->vdev_guid) {
1418 /*
1419 * Retain previous vdev for add/remove ops.
1420 */
1421 newvdevs[i] = vd;
1422 oldvdevs[j] = NULL;
1423 break;
1424 }
1425 }
1426
1427 if (newvdevs[i] == NULL) {
1428 /*
1429 * Create new vdev
1430 */
1431 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1432 VDEV_ALLOC_L2CACHE) == 0);
1433 ASSERT(vd != NULL);
1434 newvdevs[i] = vd;
1435
1436 /*
1437 * Commit this vdev as an l2cache device,
1438 * even if it fails to open.
1439 */
1440 spa_l2cache_add(vd);
1441
1442 vd->vdev_top = vd;
1443 vd->vdev_aux = sav;
1444
1445 spa_l2cache_activate(vd);
1446
1447 if (vdev_open(vd) != 0)
1448 continue;
1449
1450 (void) vdev_validate_aux(vd);
1451
1452 if (!vdev_is_dead(vd))
1453 l2arc_add_vdev(spa, vd);
1454 }
1455 }
1456
1457 /*
1458 * Purge vdevs that were dropped
1459 */
1460 for (i = 0; i < oldnvdevs; i++) {
1461 uint64_t pool;
1462
1463 vd = oldvdevs[i];
1464 if (vd != NULL) {
1465 ASSERT(vd->vdev_isl2cache);
1466
1467 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1468 pool != 0ULL && l2arc_vdev_present(vd))
1469 l2arc_remove_vdev(vd);
1470 vdev_clear_stats(vd);
1471 vdev_free(vd);
1472 }
1473 }
1474
1475 if (oldvdevs)
1476 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1477
1478 if (sav->sav_config == NULL)
1479 goto out;
1480
1481 sav->sav_vdevs = newvdevs;
1482 sav->sav_count = (int)nl2cache;
1483
1484 /*
1485 * Recompute the stashed list of l2cache devices, with status
1486 * information this time.
1487 */
1488 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1489 DATA_TYPE_NVLIST_ARRAY) == 0);
1490
1491 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1492 for (i = 0; i < sav->sav_count; i++)
1493 l2cache[i] = vdev_config_generate(spa,
1494 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1495 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1496 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1497out:
1498 for (i = 0; i < sav->sav_count; i++)
1499 nvlist_free(l2cache[i]);
1500 if (sav->sav_count)
1501 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1502}
1503
1504static int
1505load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1506{
1507 dmu_buf_t *db;
1508 char *packed = NULL;
1509 size_t nvsize = 0;
1510 int error;
1511 *value = NULL;
1512
1513 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1514 nvsize = *(uint64_t *)db->db_data;
1515 dmu_buf_rele(db, FTAG);
1516
1517 packed = kmem_alloc(nvsize, KM_SLEEP);
1518 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1519 DMU_READ_PREFETCH);
1520 if (error == 0)
1521 error = nvlist_unpack(packed, nvsize, value, 0);
1522 kmem_free(packed, nvsize);
1523
1524 return (error);
1525}
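/*
 * Layout note (an assumption stated for readability): the object read
 * here is taken to be a packed nvlist whose bonus buffer holds a single
 * uint64_t with the packed size, conceptually
 *
 *	bonus:	nvsize
 *	data:	packed nvlist bytes [0 .. nvsize)
 *
 * which is why the code above does a dmu_bonus_hold() for the size and
 * then a dmu_read() of exactly nvsize bytes before nvlist_unpack().
 */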
1526
1527/*
1528 * Checks to see if the given vdev could not be opened, in which case we post a
1529 * sysevent to notify the autoreplace code that the device has been removed.
1530 */
1531static void
1532spa_check_removed(vdev_t *vd)
1533{
1534 for (int c = 0; c < vd->vdev_children; c++)
1535 spa_check_removed(vd->vdev_child[c]);
1536
1537 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1538 !vd->vdev_ishole) {
1539 zfs_post_autoreplace(vd->vdev_spa, vd);
1540 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1541 }
1542}
1543
1544/*
1545 * Validate the current config against the MOS config
1546 */
1547static boolean_t
1548spa_config_valid(spa_t *spa, nvlist_t *config)
1549{
1550 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1551 nvlist_t *nv;
1552
1553 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1554
1555 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1556 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1557
1558 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1559
1560 /*
1561 * If we're doing a normal import, then build up any additional
1562 * diagnostic information about missing devices in this config.
1563 * We'll pass this up to the user for further processing.
1564 */
1565 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1566 nvlist_t **child, *nv;
1567 uint64_t idx = 0;
1568
1569 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1570 KM_SLEEP);
1571 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1572
1573 for (int c = 0; c < rvd->vdev_children; c++) {
1574 vdev_t *tvd = rvd->vdev_child[c];
1575 vdev_t *mtvd = mrvd->vdev_child[c];
1576
1577 if (tvd->vdev_ops == &vdev_missing_ops &&
1578 mtvd->vdev_ops != &vdev_missing_ops &&
1579 mtvd->vdev_islog)
1580 child[idx++] = vdev_config_generate(spa, mtvd,
1581 B_FALSE, 0);
1582 }
1583
1584 if (idx) {
1585 VERIFY(nvlist_add_nvlist_array(nv,
1586 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1587 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1588 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1589
1590 for (int i = 0; i < idx; i++)
1591 nvlist_free(child[i]);
1592 }
1593 nvlist_free(nv);
1594		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1595 }
1596
1597 /*
1598 * Compare the root vdev tree with the information we have
1599 * from the MOS config (mrvd). Check each top-level vdev
1600 * with the corresponding MOS config top-level (mtvd).
1601 */
1602 for (int c = 0; c < rvd->vdev_children; c++) {
1603 vdev_t *tvd = rvd->vdev_child[c];
1604 vdev_t *mtvd = mrvd->vdev_child[c];
1605
1606 /*
1607 * Resolve any "missing" vdevs in the current configuration.
1608 * If we find that the MOS config has more accurate information
1609 * about the top-level vdev then use that vdev instead.
1610 */
1611 if (tvd->vdev_ops == &vdev_missing_ops &&
1612 mtvd->vdev_ops != &vdev_missing_ops) {
1613
1614 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1615 continue;
1616
1617 /*
1618 * Device specific actions.
1619 */
1620 if (mtvd->vdev_islog) {
1621 spa_set_log_state(spa, SPA_LOG_CLEAR);
1622 } else {
1623 /*
1624 * XXX - once we have 'readonly' pool
1625 * support we should be able to handle
1626 * missing data devices by transitioning
1627 * the pool to readonly.
1628 */
1629 continue;
1630 }
1631
1632 /*
1633 * Swap the missing vdev with the data we were
1634 * able to obtain from the MOS config.
1635 */
1636 vdev_remove_child(rvd, tvd);
1637 vdev_remove_child(mrvd, mtvd);
1638
1639 vdev_add_child(rvd, mtvd);
1640 vdev_add_child(mrvd, tvd);
1641
1642 spa_config_exit(spa, SCL_ALL, FTAG);
1643 vdev_load(mtvd);
1644 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1645
1646 vdev_reopen(rvd);
1647 } else if (mtvd->vdev_islog) {
1648 /*
1649 * Load the slog device's state from the MOS config
1650 * since it's possible that the label does not
1651 * contain the most up-to-date information.
1652 */
1653 vdev_load_log_state(tvd, mtvd);
1654 vdev_reopen(tvd);
1655 }
1656 }
1657 vdev_free(mrvd);
1658 spa_config_exit(spa, SCL_ALL, FTAG);
1659
1660 /*
1661 * Ensure we were able to validate the config.
1662 */
1663 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1664}
1665
1666/*
1667 * Check for missing log devices
1668 */
1669static boolean_t
1670spa_check_logs(spa_t *spa)
1671{
1672 boolean_t rv = B_FALSE;
1673
1674 switch (spa->spa_log_state) {
1675 case SPA_LOG_MISSING:
1676 /* need to recheck in case slog has been restored */
1677 case SPA_LOG_UNKNOWN:
1678 rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
1679 NULL, DS_FIND_CHILDREN) != 0);
1680 if (rv)
1681 spa_set_log_state(spa, SPA_LOG_MISSING);
1682 break;
1683 }
1684 return (rv);
1685}
1686
1687static boolean_t
1688spa_passivate_log(spa_t *spa)
1689{
1690 vdev_t *rvd = spa->spa_root_vdev;
1691 boolean_t slog_found = B_FALSE;
1692
1693 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1694
1695 if (!spa_has_slogs(spa))
1696 return (B_FALSE);
1697
1698 for (int c = 0; c < rvd->vdev_children; c++) {
1699 vdev_t *tvd = rvd->vdev_child[c];
1700 metaslab_group_t *mg = tvd->vdev_mg;
1701
1702 if (tvd->vdev_islog) {
1703 metaslab_group_passivate(mg);
1704 slog_found = B_TRUE;
1705 }
1706 }
1707
1708 return (slog_found);
1709}
1710
1711static void
1712spa_activate_log(spa_t *spa)
1713{
1714 vdev_t *rvd = spa->spa_root_vdev;
1715
1716 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1717
1718 for (int c = 0; c < rvd->vdev_children; c++) {
1719 vdev_t *tvd = rvd->vdev_child[c];
1720 metaslab_group_t *mg = tvd->vdev_mg;
1721
1722 if (tvd->vdev_islog)
1723 metaslab_group_activate(mg);
1724 }
1725}
1726
1727int
1728spa_offline_log(spa_t *spa)
1729{
1730 int error;
1731
1732 error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1733 NULL, DS_FIND_CHILDREN);
1734 if (error == 0) {
1735 /*
1736 * We successfully offlined the log device, sync out the
1737 * current txg so that the "stubby" block can be removed
1738 * by zil_sync().
1739 */
1740 txg_wait_synced(spa->spa_dsl_pool, 0);
1741 }
1742 return (error);
1743}
1744
1745static void
1746spa_aux_check_removed(spa_aux_vdev_t *sav)
1747{
1748 int i;
1749
1750 for (i = 0; i < sav->sav_count; i++)
1751 spa_check_removed(sav->sav_vdevs[i]);
1752}
1753
1754void
1755spa_claim_notify(zio_t *zio)
1756{
1757 spa_t *spa = zio->io_spa;
1758
1759 if (zio->io_error)
1760 return;
1761
1762 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1763 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1764 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1765 mutex_exit(&spa->spa_props_lock);
1766}
1767
1768typedef struct spa_load_error {
1769 uint64_t sle_meta_count;
1770 uint64_t sle_data_count;
1771} spa_load_error_t;
1772
1773static void
1774spa_load_verify_done(zio_t *zio)
1775{
1776 blkptr_t *bp = zio->io_bp;
1777 spa_load_error_t *sle = zio->io_private;
1778 dmu_object_type_t type = BP_GET_TYPE(bp);
1779 int error = zio->io_error;
1780
1781 if (error) {
1782 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1783 type != DMU_OT_INTENT_LOG)
1784 atomic_add_64(&sle->sle_meta_count, 1);
1785 else
1786 atomic_add_64(&sle->sle_data_count, 1);
1787 }
1788 zio_data_buf_free(zio->io_data, zio->io_size);
1789}
1790
1791/*ARGSUSED*/
1792static int
1793spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1794 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1795{
1796 if (bp != NULL) {
1797 zio_t *rio = arg;
1798 size_t size = BP_GET_PSIZE(bp);
1799 void *data = zio_data_buf_alloc(size);
1800
1801 zio_nowait(zio_read(rio, spa, bp, data, size,
1802 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1803 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1804 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1805 }
1806 return (0);
1807}
1808
1809static int
1810spa_load_verify(spa_t *spa)
1811{
1812 zio_t *rio;
1813 spa_load_error_t sle = { 0 };
1814 zpool_rewind_policy_t policy;
1815 boolean_t verify_ok = B_FALSE;
1816 int error;
1817
1818 zpool_get_rewind_policy(spa->spa_config, &policy);
1819
1820 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1821 return (0);
1822
1823 rio = zio_root(spa, NULL, &sle,
1824 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1825
1826 error = traverse_pool(spa, spa->spa_verify_min_txg,
1827 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1828
1829 (void) zio_wait(rio);
1830
1831 spa->spa_load_meta_errors = sle.sle_meta_count;
1832 spa->spa_load_data_errors = sle.sle_data_count;
1833
1834 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1835 sle.sle_data_count <= policy.zrp_maxdata) {
1836 int64_t loss = 0;
1837
1838 verify_ok = B_TRUE;
1839 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1840 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1841
1842 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1843 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1844 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1845 VERIFY(nvlist_add_int64(spa->spa_load_info,
1846 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1847 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1848 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1849 } else {
1850 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1851 }
1852
1853 if (error) {
1854 if (error != ENXIO && error != EIO)
1855 error = SET_ERROR(EIO);
1856 return (error);
1857 }
1858
1859 return (verify_ok ? 0 : EIO);
1860}
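/*
 * Policy sketch (illustrative summary of the code above): the rewind
 * policy gates verification roughly as follows:
 *
 *	ZPOOL_NEVER_REWIND		-> skip verification, return 0
 *	meta/data error counts within
 *	zrp_maxmeta / zrp_maxdata	-> verify_ok; record the load txg
 *					   and timestamp in spa_load_info
 *	otherwise			-> remember ub_txg as
 *					   spa_load_max_txg and fail (EIO)
 *
 * The thresholds come from zpool_get_rewind_policy(), typically set by
 * rewind-style import requests.
 */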
1861
1862/*
1863 * Find a value in the pool props object.
1864 */
1865static void
1866spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1867{
1868 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1869 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1870}
1871
1872/*
1873 * Find a value in the pool directory object.
1874 */
1875static int
1876spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1877{
1878 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1879 name, sizeof (uint64_t), 1, val));
1880}
1881
1882static int
1883spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1884{
1885 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1886 return (err);
1887}
1888
1889/*
1890 * Fix up config after a partly-completed split. This is done with the
1891 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1892 * pool have that entry in their config, but only the splitting one contains
1893 * a list of all the guids of the vdevs that are being split off.
1894 *
1895 * This function determines what to do with that list: either rejoin
1896 * all the disks to the pool, or complete the splitting process. To attempt
1897 * the rejoin, each disk that is offlined is marked online again, and
1898 * we do a reopen() call. If the vdev label for every disk that was
1899 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1900 * then we call vdev_split() on each disk, and complete the split.
1901 *
1902 * Otherwise we leave the config alone, with all the vdevs in place in
1903 * the original pool.
1904 */
1905static void
1906spa_try_repair(spa_t *spa, nvlist_t *config)
1907{
1908 uint_t extracted;
1909 uint64_t *glist;
1910 uint_t i, gcount;
1911 nvlist_t *nvl;
1912 vdev_t **vd;
1913 boolean_t attempt_reopen;
1914
1915 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1916 return;
1917
1918 /* check that the config is complete */
1919 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1920 &glist, &gcount) != 0)
1921 return;
1922
1923 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1924
1925 /* attempt to online all the vdevs & validate */
1926 attempt_reopen = B_TRUE;
1927 for (i = 0; i < gcount; i++) {
1928 if (glist[i] == 0) /* vdev is hole */
1929 continue;
1930
1931 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1932 if (vd[i] == NULL) {
1933 /*
1934 * Don't bother attempting to reopen the disks;
1935 * just do the split.
1936 */
1937 attempt_reopen = B_FALSE;
1938 } else {
1939 /* attempt to re-online it */
1940 vd[i]->vdev_offline = B_FALSE;
1941 }
1942 }
1943
1944 if (attempt_reopen) {
1945 vdev_reopen(spa->spa_root_vdev);
1946
1947 /* check each device to see what state it's in */
1948 for (extracted = 0, i = 0; i < gcount; i++) {
1949 if (vd[i] != NULL &&
1950 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1951 break;
1952 ++extracted;
1953 }
1954 }
1955
1956 /*
1957 * If every disk has been moved to the new pool, or if we never
1958 * even attempted to look at them, then we split them off for
1959 * good.
1960 */
1961 if (!attempt_reopen || gcount == extracted) {
1962 for (i = 0; i < gcount; i++)
1963 if (vd[i] != NULL)
1964 vdev_split(vd[i]);
1965 vdev_reopen(spa->spa_root_vdev);
1966 }
1967
1968 kmem_free(vd, gcount * sizeof (vdev_t *));
1969}
1970
1971static int
1972spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1973 boolean_t mosconfig)
1974{
1975 nvlist_t *config = spa->spa_config;
1976 char *ereport = FM_EREPORT_ZFS_POOL;
1977 char *comment;
1978 int error;
1979 uint64_t pool_guid;
1980 nvlist_t *nvl;
1981
1982 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1983 return (SET_ERROR(EINVAL));
1984
1985 ASSERT(spa->spa_comment == NULL);
1986 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1987 spa->spa_comment = spa_strdup(comment);
1988
1989 /*
1990 * Versioning wasn't explicitly added to the label until later, so if
1991 * it's not present treat it as the initial version.
1992 */
1993 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1994 &spa->spa_ubsync.ub_version) != 0)
1995 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1996
1997 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1998 &spa->spa_config_txg);
1999
2000 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
2001 spa_guid_exists(pool_guid, 0)) {
2002 error = SET_ERROR(EEXIST);
2003 } else {
2004 spa->spa_config_guid = pool_guid;
2005
2006 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
2007 &nvl) == 0) {
2008 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
2009 KM_SLEEP) == 0);
2010 }
2011
2012 nvlist_free(spa->spa_load_info);
2013 spa->spa_load_info = fnvlist_alloc();
2014
2015 gethrestime(&spa->spa_loaded_ts);
2016 error = spa_load_impl(spa, pool_guid, config, state, type,
2017 mosconfig, &ereport);
2018 }
2019
2020 spa->spa_minref = refcount_count(&spa->spa_refcount);
2021 if (error) {
2022 if (error != EEXIST) {
2023 spa->spa_loaded_ts.tv_sec = 0;
2024 spa->spa_loaded_ts.tv_nsec = 0;
2025 }
2026 if (error != EBADF) {
2027 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2028 }
2029 }
2030 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2031 spa->spa_ena = 0;
2032
2033 return (error);
2034}
2035
2036/*
2037 * Load an existing storage pool, using the pool's builtin spa_config as a
2038 * source of configuration information.
2039 */
2040static int
2041spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
2042 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
2043 char **ereport)
2044{
2045 int error = 0;
2046 nvlist_t *nvroot = NULL;
2047 nvlist_t *label;
2048 vdev_t *rvd;
2049 uberblock_t *ub = &spa->spa_uberblock;
2050 uint64_t children, config_cache_txg = spa->spa_config_txg;
2051 int orig_mode = spa->spa_mode;
2052 int parse;
2053 uint64_t obj;
2054 boolean_t missing_feat_write = B_FALSE;
2055
2056 /*
2057 * If this is an untrusted config, access the pool in read-only mode.
2058 * This prevents things like resilvering recently removed devices.
2059 */
2060 if (!mosconfig)
2061 spa->spa_mode = FREAD;
2062
2063 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2064
2065 spa->spa_load_state = state;
2066
2067 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
2068 return (SET_ERROR(EINVAL));
2069
2070 parse = (type == SPA_IMPORT_EXISTING ?
2071 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2072
2073 /*
2074 * Create "The Godfather" zio to hold all async IOs
2075 */
2076 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2077 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2078
2079 /*
2080 * Parse the configuration into a vdev tree. We explicitly set the
2081 * value that will be returned by spa_version() since parsing the
2082 * configuration requires knowing the version number.
2083 */
2084 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2085 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
2086 spa_config_exit(spa, SCL_ALL, FTAG);
2087
2088 if (error != 0)
2089 return (error);
2090
2091 ASSERT(spa->spa_root_vdev == rvd);
2092
2093 if (type != SPA_IMPORT_ASSEMBLE) {
2094 ASSERT(spa_guid(spa) == pool_guid);
2095 }
2096
2097 /*
2098 * Try to open all vdevs, loading each label in the process.
2099 */
2100 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2101 error = vdev_open(rvd);
2102 spa_config_exit(spa, SCL_ALL, FTAG);
2103 if (error != 0)
2104 return (error);
2105
2106 /*
2107 * We need to validate the vdev labels against the configuration that
2108 * we have in hand, which is dependent on the setting of mosconfig. If
2109 * mosconfig is true then we're validating the vdev labels based on
2110 * that config. Otherwise, we're validating against the cached config
2111 * (zpool.cache) that was read when we loaded the zfs module, and then
2112 * later we will recursively call spa_load() and validate against
2113 * the vdev config.
2114 *
2115 * If we're assembling a new pool that's been split off from an
2116 * existing pool, the labels haven't yet been updated so we skip
2117 * validation for now.
2118 */
2119 if (type != SPA_IMPORT_ASSEMBLE) {
2120 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2121 error = vdev_validate(rvd, mosconfig);
2122 spa_config_exit(spa, SCL_ALL, FTAG);
2123
2124 if (error != 0)
2125 return (error);
2126
2127 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2128 return (SET_ERROR(ENXIO));
2129 }
2130
2131 /*
2132 * Find the best uberblock.
2133 */
2134 vdev_uberblock_load(rvd, ub, &label);
2135
2136 /*
2137 * If we weren't able to find a single valid uberblock, return failure.
2138 */
2139 if (ub->ub_txg == 0) {
2140 nvlist_free(label);
2141 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2142 }
2143
2144 /*
2145 * If the pool has an unsupported version we can't open it.
2146 */
2147 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2148 nvlist_free(label);
2149 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2150 }
2151
2152 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2153 nvlist_t *features;
2154
2155 /*
2156 * If we weren't able to find what's necessary for reading the
2157 * MOS in the label, return failure.
2158 */
2159 if (label == NULL || nvlist_lookup_nvlist(label,
2160 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2161 nvlist_free(label);
2162 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2163 ENXIO));
2164 }
2165
2166 /*
2167 * Update our in-core representation with the definitive values
2168 * from the label.
2169 */
2170 nvlist_free(spa->spa_label_features);
2171 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2172 }
2173
2174 nvlist_free(label);
2175
2176 /*
2177 * Look through entries in the label nvlist's features_for_read. If
2178 * there is a feature listed there which we don't understand then we
2179 * cannot open a pool.
2180 */
2181 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2182 nvlist_t *unsup_feat;
2183
2184 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2185 0);
2186
2187 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2188 NULL); nvp != NULL;
2189 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2190 if (!zfeature_is_supported(nvpair_name(nvp))) {
2191 VERIFY(nvlist_add_string(unsup_feat,
2192 nvpair_name(nvp), "") == 0);
2193 }
2194 }
2195
2196 if (!nvlist_empty(unsup_feat)) {
2197 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2198 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2199 nvlist_free(unsup_feat);
2200 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2201 ENOTSUP));
2202 }
2203
2204 nvlist_free(unsup_feat);
2205 }
2206
2207 /*
2208 * If the vdev guid sum doesn't match the uberblock, we have an
2209 * incomplete configuration. We first check to see if the pool
2210 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2211 * If it is, defer the vdev_guid_sum check till later so we
2212 * can handle missing vdevs.
2213 */
2214 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2215 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2216 rvd->vdev_guid_sum != ub->ub_guid_sum)
2217 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2218
2219 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2220 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2221 spa_try_repair(spa, config);
2222 spa_config_exit(spa, SCL_ALL, FTAG);
2223 nvlist_free(spa->spa_config_splitting);
2224 spa->spa_config_splitting = NULL;
2225 }
2226
2227 /*
2228 * Initialize internal SPA structures.
2229 */
2230 spa->spa_state = POOL_STATE_ACTIVE;
2231 spa->spa_ubsync = spa->spa_uberblock;
2232 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2233 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2234 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2235 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2236 spa->spa_claim_max_txg = spa->spa_first_txg;
2237 spa->spa_prev_software_version = ub->ub_software_version;
2238
2239 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2240 if (error)
2241 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2242 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2243
2244 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2245 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2246
2247 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2248 boolean_t missing_feat_read = B_FALSE;
2249 nvlist_t *unsup_feat, *enabled_feat;
2250
2251 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2252 &spa->spa_feat_for_read_obj) != 0) {
2253 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2254 }
2255
2256 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2257 &spa->spa_feat_for_write_obj) != 0) {
2258 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2259 }
2260
2261 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2262 &spa->spa_feat_desc_obj) != 0) {
2263 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2264 }
2265
2266 enabled_feat = fnvlist_alloc();
2267 unsup_feat = fnvlist_alloc();
2268
2269 if (!feature_is_supported(spa->spa_meta_objset,
2270 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2271 unsup_feat, enabled_feat))
2272 missing_feat_read = B_TRUE;
2273
2274 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2275 if (!feature_is_supported(spa->spa_meta_objset,
2276 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2277 unsup_feat, enabled_feat)) {
2278 missing_feat_write = B_TRUE;
2279 }
2280 }
2281
2282 fnvlist_add_nvlist(spa->spa_load_info,
2283 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2284
2285 if (!nvlist_empty(unsup_feat)) {
2286 fnvlist_add_nvlist(spa->spa_load_info,
2287 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2288 }
2289
2290 fnvlist_free(enabled_feat);
2291 fnvlist_free(unsup_feat);
2292
2293 if (!missing_feat_read) {
2294 fnvlist_add_boolean(spa->spa_load_info,
2295 ZPOOL_CONFIG_CAN_RDONLY);
2296 }
2297
2298 /*
2299 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2300 * twofold: to determine whether the pool is available for
2301 * import in read-write mode and (if it is not) whether the
2302 * pool is available for import in read-only mode. If the pool
2303 * is available for import in read-write mode, it is displayed
2304 * as available in userland; if it is not available for import
2305 * in read-only mode, it is displayed as unavailable in
2306 * userland. If the pool is available for import in read-only
2307 * mode but not read-write mode, it is displayed as unavailable
2308 * in userland with a special note that the pool is actually
2309 * available for open in read-only mode.
2310 *
2311 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2312 * missing a feature for write, we must first determine whether
2313 * the pool can be opened read-only before returning to
2314 * userland in order to know whether to display the
2315 * abovementioned note.
2316 */
2317 if (missing_feat_read || (missing_feat_write &&
2318 spa_writeable(spa))) {
2319 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2320 ENOTSUP));
2321 }
2322 }
2323
2324 spa->spa_is_initializing = B_TRUE;
2325 error = dsl_pool_open(spa->spa_dsl_pool);
2326 spa->spa_is_initializing = B_FALSE;
2327 if (error != 0)
2328 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2329
2330 if (!mosconfig) {
2331 uint64_t hostid;
2332 nvlist_t *policy = NULL, *nvconfig;
2333
2334 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2335 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2336
2337 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2338 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2339 char *hostname;
2340 unsigned long myhostid = 0;
2341
2342 VERIFY(nvlist_lookup_string(nvconfig,
2343 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2344
2345#ifdef _KERNEL
2346 myhostid = zone_get_hostid(NULL);
2347#else /* _KERNEL */
2348 /*
2349 * We're emulating the system's hostid in userland, so
2350 * we can't use zone_get_hostid().
2351 */
2352 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2353#endif /* _KERNEL */
2354 if (check_hostid && hostid != 0 && myhostid != 0 &&
2355 hostid != myhostid) {
2356 nvlist_free(nvconfig);
2357 cmn_err(CE_WARN, "pool '%s' could not be "
2358 "loaded as it was last accessed by "
2359 "another system (host: %s hostid: 0x%lx). "
2360 "See: http://illumos.org/msg/ZFS-8000-EY",
2361 spa_name(spa), hostname,
2362 (unsigned long)hostid);
2363 return (SET_ERROR(EBADF));
2364 }
2365 }
2366 if (nvlist_lookup_nvlist(spa->spa_config,
2367 ZPOOL_REWIND_POLICY, &policy) == 0)
2368 VERIFY(nvlist_add_nvlist(nvconfig,
2369 ZPOOL_REWIND_POLICY, policy) == 0);
2370
2371 spa_config_set(spa, nvconfig);
2372 spa_unload(spa);
2373 spa_deactivate(spa);
2374 spa_activate(spa, orig_mode);
2375
2376 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2377 }
2378
2379 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2380 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2381 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2382 if (error != 0)
2383 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2384
2385 /*
2386 * Load the bit that tells us to use the new accounting function
2387 * (raid-z deflation). If we have an older pool, this will not
2388 * be present.
2389 */
2390 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2391 if (error != 0 && error != ENOENT)
2392 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2393
2394 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2395 &spa->spa_creation_version);
2396 if (error != 0 && error != ENOENT)
2397 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2398
2399 /*
2400 * Load the persistent error log. If we have an older pool, this will
2401 * not be present.
2402 */
2403 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2404 if (error != 0 && error != ENOENT)
2405 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2406
2407 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2408 &spa->spa_errlog_scrub);
2409 if (error != 0 && error != ENOENT)
2410 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2411
2412 /*
2413 * Load the history object. If we have an older pool, this
2414 * will not be present.
2415 */
2416 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2417 if (error != 0 && error != ENOENT)
2418 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2419
2420 /*
2421 * If we're assembling the pool from the split-off vdevs of
2422 * an existing pool, we don't want to attach the spares & cache
2423 * devices.
2424 */
2425
2426 /*
2427 * Load any hot spares for this pool.
2428 */
2429 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2430 if (error != 0 && error != ENOENT)
2431 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2432 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2433 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2434 if (load_nvlist(spa, spa->spa_spares.sav_object,
2435 &spa->spa_spares.sav_config) != 0)
2436 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2437
2438 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2439 spa_load_spares(spa);
2440 spa_config_exit(spa, SCL_ALL, FTAG);
2441 } else if (error == 0) {
2442 spa->spa_spares.sav_sync = B_TRUE;
2443 }
2444
2445 /*
2446 * Load any level 2 ARC devices for this pool.
2447 */
2448 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2449 &spa->spa_l2cache.sav_object);
2450 if (error != 0 && error != ENOENT)
2451 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2452 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2453 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2454 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2455 &spa->spa_l2cache.sav_config) != 0)
2456 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2457
2458 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2459 spa_load_l2cache(spa);
2460 spa_config_exit(spa, SCL_ALL, FTAG);
2461 } else if (error == 0) {
2462 spa->spa_l2cache.sav_sync = B_TRUE;
2463 }
2464
2465 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2466
2467 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2468 if (error && error != ENOENT)
2469 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2470
2471 if (error == 0) {
2472 uint64_t autoreplace;
2473
2474 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2475 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2476 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2477 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2478 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2479 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2480 &spa->spa_dedup_ditto);
2481
2482 spa->spa_autoreplace = (autoreplace != 0);
2483 }
2484
2485 /*
2486 * If the 'autoreplace' property is set, then post a resource notifying
2487 * the ZFS DE that it should not issue any faults for unopenable
2488 * devices. We also iterate over the vdevs, and post a sysevent for any
2489 * unopenable vdevs so that the normal autoreplace handler can take
2490 * over.
2491 */
2492 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2493 spa_check_removed(spa->spa_root_vdev);
2494 /*
2495 * For the import case, this is done in spa_import(), because
2496 * at this point we're using the spare definitions from
2497 * the MOS config, not necessarily from the userland config.
2498 */
2499 if (state != SPA_LOAD_IMPORT) {
2500 spa_aux_check_removed(&spa->spa_spares);
2501 spa_aux_check_removed(&spa->spa_l2cache);
2502 }
2503 }
2504
2505 /*
2506 * Load the vdev state for all toplevel vdevs.
2507 */
2508 vdev_load(rvd);
2509
2510 /*
2511 * Propagate the leaf DTLs we just loaded all the way up the tree.
2512 */
2513 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2514 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2515 spa_config_exit(spa, SCL_ALL, FTAG);
2516
2517 /*
2518 * Load the DDTs (dedup tables).
2519 */
2520 error = ddt_load(spa);
2521 if (error != 0)
2522 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2523
2524 spa_update_dspace(spa);
2525
2526 /*
2527 * Validate the config, using the MOS config to fill in any
2528 * information which might be missing. If we fail to validate
2529 * the config then declare the pool unfit for use. If we're
2530 * assembling a pool from a split, the log is not transferred
2531 * over.
2532 */
2533 if (type != SPA_IMPORT_ASSEMBLE) {
2534 nvlist_t *nvconfig;
2535
2536 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2537 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2538
2539 if (!spa_config_valid(spa, nvconfig)) {
2540 nvlist_free(nvconfig);
2541 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2542 ENXIO));
2543 }
2544 nvlist_free(nvconfig);
2545
2546 /*
2547 * Now that we've validated the config, check the state of the
2548 * root vdev. If it can't be opened, it indicates one or
2549 * more toplevel vdevs are faulted.
2550 */
2551 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2552 return (SET_ERROR(ENXIO));
2553
2554 if (spa_check_logs(spa)) {
2555 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2556 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2557 }
2558 }
2559
2560 if (missing_feat_write) {
2561 ASSERT(state == SPA_LOAD_TRYIMPORT);
2562
2563 /*
2564 * At this point, we know that we can open the pool in
2565 * read-only mode but not read-write mode. We now have enough
2566 * information and can return to userland.
2567 */
2568 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2569 }
2570
2571 /*
2572 * We've successfully opened the pool, verify that we're ready
2573 * to start pushing transactions.
2574 */
2575 if (state != SPA_LOAD_TRYIMPORT) {
2576 if (error = spa_load_verify(spa))
2577 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2578 error));
2579 }
2580
2581 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2582 spa->spa_load_max_txg == UINT64_MAX)) {
2583 dmu_tx_t *tx;
2584 int need_update = B_FALSE;
2585
2586 ASSERT(state != SPA_LOAD_TRYIMPORT);
2587
2588 /*
2589 * Claim log blocks that haven't been committed yet.
2590 * This must all happen in a single txg.
2591 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2592 * invoked from zil_claim_log_block()'s i/o done callback.
2593 * Price of rollback is that we abandon the log.
2594 */
2595 spa->spa_claiming = B_TRUE;
2596
2597 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2598 spa_first_txg(spa));
2599 (void) dmu_objset_find(spa_name(spa),
2600 zil_claim, tx, DS_FIND_CHILDREN);
2601 dmu_tx_commit(tx);
2602
2603 spa->spa_claiming = B_FALSE;
2604
2605 spa_set_log_state(spa, SPA_LOG_GOOD);
2606 spa->spa_sync_on = B_TRUE;
2607 txg_sync_start(spa->spa_dsl_pool);
2608
2609 /*
2610 * Wait for all claims to sync. We sync up to the highest
2611 * claimed log block birth time so that claimed log blocks
2612 * don't appear to be from the future. spa_claim_max_txg
2613 * will have been set for us by either zil_check_log_chain()
2614 * (invoked from spa_check_logs()) or zil_claim() above.
2615 */
2616 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2617
2618 /*
2619 * If the config cache is stale, or we have uninitialized
2620 * metaslabs (see spa_vdev_add()), then update the config.
2621 *
2622 * If this is a verbatim import, trust the current
2623 * in-core spa_config and update the disk labels.
2624 */
2625 if (config_cache_txg != spa->spa_config_txg ||
2626 state == SPA_LOAD_IMPORT ||
2627 state == SPA_LOAD_RECOVER ||
2628 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2629 need_update = B_TRUE;
2630
2631 for (int c = 0; c < rvd->vdev_children; c++)
2632 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2633 need_update = B_TRUE;
2634
2635 /*
2636 * Update the config cache asynchronously in case we're the
2637 * root pool, in which case the config cache isn't writable yet.
2638 */
2639 if (need_update)
2640 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2641
2642 /*
2643 * Check all DTLs to see if anything needs resilvering.
2644 */
2645 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2646 vdev_resilver_needed(rvd, NULL, NULL))
2647 spa_async_request(spa, SPA_ASYNC_RESILVER);
2648
2649 /*
2650 * Log the fact that we booted up (so that we can detect if
2651 * we rebooted in the middle of an operation).
2652 */
2653 spa_history_log_version(spa, "open");
2654
2655 /*
2656 * Delete any inconsistent datasets.
2657 */
2658 (void) dmu_objset_find(spa_name(spa),
2659 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2660
2661 /*
2662 * Clean up any stale temporary dataset userrefs.
2663 */
2664 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2665 }
2666
2667 return (0);
2668}
2669
2670static int
2671spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2672{
2673 int mode = spa->spa_mode;
2674
2675 spa_unload(spa);
2676 spa_deactivate(spa);
2677
2678 spa->spa_load_max_txg--;
2679
2680 spa_activate(spa, mode);
2681 spa_async_suspend(spa);
2682
2683 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2684}
2685
2686/*
2687 * If spa_load() fails this function will try loading prior txg's. If
2688 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2689 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2690 * function will not rewind the pool and will return the same error as
2691 * spa_load().
2692 */
2693static int
2694spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2695 uint64_t max_request, int rewind_flags)
2696{
2697 nvlist_t *loadinfo = NULL;
2698 nvlist_t *config = NULL;
2699 int load_error, rewind_error;
2700 uint64_t safe_rewind_txg;
2701 uint64_t min_txg;
2702
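	/*
	 * If the caller specified an exact rewind-target txg (spa_load_txg)
	 * and we are recovering, load no later than that txg (discarding the
	 * log); otherwise honor the caller-supplied maximum request.
	 */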
2703 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2704 spa->spa_load_max_txg = spa->spa_load_txg;
2705 spa_set_log_state(spa, SPA_LOG_CLEAR);
2706 } else {
2707 spa->spa_load_max_txg = max_request;
2708 }
2709
2710 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2711 mosconfig);
2712 if (load_error == 0)
2713 return (0);
2714
2715 if (spa->spa_root_vdev != NULL)
2716 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2717
2718 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2719 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2720
2721 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2722 nvlist_free(config);
2723 return (load_error);
2724 }
2725
2726 if (state == SPA_LOAD_RECOVER) {
2727 /* Price of rolling back is discarding txgs, including log */
2728 spa_set_log_state(spa, SPA_LOG_CLEAR);
2729 } else {
2730 /*
2731 * If we aren't rolling back save the load info from our first
2732 * import attempt so that we can restore it after attempting
2733 * to rewind.
2734 */
2735 loadinfo = spa->spa_load_info;
2736 spa->spa_load_info = fnvlist_alloc();
2737 }
2738
2739 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2740 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2741 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2742 TXG_INITIAL : safe_rewind_txg;
2743
2744 /*
2745 * Continue as long as we're finding errors, we're still within
2746 * the acceptable rewind range, and we're still finding uberblocks.
2747 */
2748 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2749 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2750 if (spa->spa_load_max_txg < safe_rewind_txg)
2751 spa->spa_extreme_rewind = B_TRUE;
2752 rewind_error = spa_load_retry(spa, state, mosconfig);
2753 }
2754
2755 spa->spa_extreme_rewind = B_FALSE;
2756 spa->spa_load_max_txg = UINT64_MAX;
2757
2758 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2759 spa_config_set(spa, config);
2760
2761 if (state == SPA_LOAD_RECOVER) {
2762 ASSERT3P(loadinfo, ==, NULL);
2763 return (rewind_error);
2764 } else {
2765 /* Store the rewind info as part of the initial load info */
2766 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2767 spa->spa_load_info);
2768
2769 /* Restore the initial load info */
2770 fnvlist_free(spa->spa_load_info);
2771 spa->spa_load_info = loadinfo;
2772
2773 return (load_error);
2774 }
2775}
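
/*
 * Editorial example (not part of the original source), assuming
 * TXG_DEFER_SIZE is 2: if the last synced uberblock was txg 100, then after
 * the initial load fails spa_load_max_txg is reset to 100 and safe_rewind_txg
 * to 98, so a normal rewind request will only retry uberblocks back to
 * txg 98. Only a ZPOOL_EXTREME_REWIND request lowers min_txg to TXG_INITIAL
 * and lets the retry loop in spa_load_best() (which also sets
 * spa_extreme_rewind once it drops below safe_rewind_txg) walk further back.
 */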
2776
2777/*
2778 * Pool Open/Import
2779 *
2780 * The import case is identical to an open except that the configuration is sent
2781 * down from userland, instead of grabbed from the configuration cache. For the
2782 * case of an open, the pool configuration will exist in the
2783 * POOL_STATE_UNINITIALIZED state.
2784 *
2785 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2786 * the same time we open the pool, without having to keep around the spa_t in some
2787 * ambiguous state.
2788 */
2789static int
2790spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2791 nvlist_t **config)
2792{
2793 spa_t *spa;
2794 spa_load_state_t state = SPA_LOAD_OPEN;
2795 int error;
2796 int locked = B_FALSE;
2797 int firstopen = B_FALSE;
2798
2799 *spapp = NULL;
2800
2801 /*
2802 * As disgusting as this is, we need to support recursive calls to this
2803 * function because dsl_dir_open() is called during spa_load(), and ends
2804 * up calling spa_open() again. The real fix is to figure out how to
2805 * avoid dsl_dir_open() calling this in the first place.
2806 */
2807 if (mutex_owner(&spa_namespace_lock) != curthread) {
2808 mutex_enter(&spa_namespace_lock);
2809 locked = B_TRUE;
2810 }
2811
2812 if ((spa = spa_lookup(pool)) == NULL) {
2813 if (locked)
2814 mutex_exit(&spa_namespace_lock);
2815 return (SET_ERROR(ENOENT));
2816 }
2817
2818 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2819 zpool_rewind_policy_t policy;
2820
2821 firstopen = B_TRUE;
2822
2823 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2824 &policy);
2825 if (policy.zrp_request & ZPOOL_DO_REWIND)
2826 state = SPA_LOAD_RECOVER;
2827
2828 spa_activate(spa, spa_mode_global);
2829
2830 if (state != SPA_LOAD_RECOVER)
2831 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2832
2833 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2834 policy.zrp_request);
2835
2836 if (error == EBADF) {
2837 /*
2838 * If vdev_validate() returns failure (indicated by
2839 * EBADF), it means that one of the vdevs indicates
2840 * that the pool has been exported or destroyed. If
2841 * this is the case, the config cache is out of sync and
2842 * we should remove the pool from the namespace.
2843 */
2844 spa_unload(spa);
2845 spa_deactivate(spa);
2846 spa_config_sync(spa, B_TRUE, B_TRUE);
2847 spa_remove(spa);
2848 if (locked)
2849 mutex_exit(&spa_namespace_lock);
2850 return (SET_ERROR(ENOENT));
2851 }
2852
2853 if (error) {
2854 /*
2855 * We can't open the pool, but we still have useful
2856 * information: the state of each vdev after the
2857 * attempted vdev_open(). Return this to the user.
2858 */
2859 if (config != NULL && spa->spa_config) {
2860 VERIFY(nvlist_dup(spa->spa_config, config,
2861 KM_SLEEP) == 0);
2862 VERIFY(nvlist_add_nvlist(*config,
2863 ZPOOL_CONFIG_LOAD_INFO,
2864 spa->spa_load_info) == 0);
2865 }
2866 spa_unload(spa);
2867 spa_deactivate(spa);
2868 spa->spa_last_open_failed = error;
2869 if (locked)
2870 mutex_exit(&spa_namespace_lock);
2871 *spapp = NULL;
2872 return (error);
2873 }
2874 }
2875
2876 spa_open_ref(spa, tag);
2877
2878 if (config != NULL)
2879 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2880
2881 /*
2882 * If we've recovered the pool, pass back any information we
2883 * gathered while doing the load.
2884 */
2885 if (state == SPA_LOAD_RECOVER) {
2886 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2887 spa->spa_load_info) == 0);
2888 }
2889
2890 if (locked) {
2891 spa->spa_last_open_failed = 0;
2892 spa->spa_last_ubsync_txg = 0;
2893 spa->spa_load_txg = 0;
2894 mutex_exit(&spa_namespace_lock);
2895#ifdef __FreeBSD__
2896#ifdef _KERNEL
2897 if (firstopen)
2898 zvol_create_minors(spa->spa_name);
2899#endif
2900#endif
2901 }
2902
2903 *spapp = spa;
2904
2905 return (0);
2906}
2907
2908int
2909spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2910 nvlist_t **config)
2911{
2912 return (spa_open_common(name, spapp, tag, policy, config));
2913}
2914
2915int
2916spa_open(const char *name, spa_t **spapp, void *tag)
2917{
2918 return (spa_open_common(name, spapp, tag, NULL, NULL));
2919}
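
/*
 * Editorial illustration (not part of the original file): a minimal sketch of
 * how a consumer pairs spa_open() with spa_close(). The pool name "tank" is a
 * placeholder, and FTAG is the usual per-file reference tag used throughout
 * this code.
 */
#if 0
static int
example_spa_open_close(void)
{
	spa_t *spa;
	int error;

	if ((error = spa_open("tank", &spa, FTAG)) != 0)
		return (error);

	/* ... inspect or operate on the opened pool ... */

	spa_close(spa, FTAG);
	return (0);
}
#endif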
2920
2921/*
2922 * Lookup the given spa_t, incrementing the inject count in the process,
2923 * preventing it from being exported or destroyed.
2924 */
2925spa_t *
2926spa_inject_addref(char *name)
2927{
2928 spa_t *spa;
2929
2930 mutex_enter(&spa_namespace_lock);
2931 if ((spa = spa_lookup(name)) == NULL) {
2932 mutex_exit(&spa_namespace_lock);
2933 return (NULL);
2934 }
2935 spa->spa_inject_ref++;
2936 mutex_exit(&spa_namespace_lock);
2937
2938 return (spa);
2939}
2940
2941void
2942spa_inject_delref(spa_t *spa)
2943{
2944 mutex_enter(&spa_namespace_lock);
2945 spa->spa_inject_ref--;
2946 mutex_exit(&spa_namespace_lock);
2947}
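
/*
 * Editorial note (not part of the original source): each successful
 * spa_inject_addref() must be balanced by a spa_inject_delref() so that
 * spa_inject_ref drops back to zero and the pool can once again be exported
 * or destroyed.
 */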
2948
2949/*
2950 * Add spares device information to the nvlist.
2951 */
2952static void
2953spa_add_spares(spa_t *spa, nvlist_t *config)
2954{
2955 nvlist_t **spares;
2956 uint_t i, nspares;
2957 nvlist_t *nvroot;
2958 uint64_t guid;
2959 vdev_stat_t *vs;
2960 uint_t vsc;
2961 uint64_t pool;
2962
2963 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2964
2965 if (spa->spa_spares.sav_count == 0)
2966 return;
2967
2968 VERIFY(nvlist_lookup_nvlist(config,
2969 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2970 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2971 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2972 if (nspares != 0) {
2973 VERIFY(nvlist_add_nvlist_array(nvroot,
2974 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2975 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2976 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2977
2978 /*
2979 * Go through and find any spares which have since been
2980 * repurposed as active spares. If this is the case, update
2981 * their status appropriately.
2982 */
2983 for (i = 0; i < nspares; i++) {
2984 VERIFY(nvlist_lookup_uint64(spares[i],
2985 ZPOOL_CONFIG_GUID, &guid) == 0);
2986 if (spa_spare_exists(guid, &pool, NULL) &&
2987 pool != 0ULL) {
2988 VERIFY(nvlist_lookup_uint64_array(
2989 spares[i], ZPOOL_CONFIG_VDEV_STATS,
2990 (uint64_t **)&vs, &vsc) == 0);
2991 vs->vs_state = VDEV_STATE_CANT_OPEN;
2992 vs->vs_aux = VDEV_AUX_SPARED;
2993 }
2994 }
2995 }
2996}
2997
2998/*
2999 * Add l2cache device information to the nvlist, including vdev stats.
3000 */
3001static void
3002spa_add_l2cache(spa_t *spa, nvlist_t *config)
3003{
3004 nvlist_t **l2cache;
3005 uint_t i, j, nl2cache;
3006 nvlist_t *nvroot;
3007 uint64_t guid;
3008 vdev_t *vd;
3009 vdev_stat_t *vs;
3010 uint_t vsc;
3011
3012 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3013
3014 if (spa->spa_l2cache.sav_count == 0)
3015 return;
3016
3017 VERIFY(nvlist_lookup_nvlist(config,
3018 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
3019 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3020 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3021 if (nl2cache != 0) {
3022 VERIFY(nvlist_add_nvlist_array(nvroot,
3023 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3024 VERIFY(nvlist_lookup_nvlist_array(nvroot,
3025 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
3026
3027 /*
3028 * Update level 2 cache device stats.
3029 */
3030
3031 for (i = 0; i < nl2cache; i++) {
3032 VERIFY(nvlist_lookup_uint64(l2cache[i],
3033 ZPOOL_CONFIG_GUID, &guid) == 0);
3034
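			/*
			 * Match this config entry to the in-core l2cache vdev
			 * with the same guid so we can refresh its stats below.
			 */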
3035 vd = NULL;
3036 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
3037 if (guid ==
3038 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
3039 vd = spa->spa_l2cache.sav_vdevs[j];
3040 break;
3041 }
3042 }
3043 ASSERT(vd != NULL);
3044
3045 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
3046 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
3047 == 0);
3048 vdev_get_stats(vd, vs);
3049 }
3050 }
3051}
3052
3053static void
3054spa_add_feature_stats(spa_t *spa, nvlist_t *config)
3055{
3056 nvlist_t *features;
3057 zap_cursor_t zc;
3058 zap_attribute_t za;
3059
3060 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
3061 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3062
3063 if (spa->spa_feat_for_read_obj != 0) {
3064 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3065 spa->spa_feat_for_read_obj);
3066 zap_cursor_retrieve(&zc, &za) == 0;
3067 zap_cursor_advance(&zc)) {
3068 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3069 za.za_num_integers == 1);
3070 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3071 za.za_first_integer));
3072 }
3073 zap_cursor_fini(&zc);
3074 }
3075
3076 if (spa->spa_feat_for_write_obj != 0) {
3077 for (zap_cursor_init(&zc, spa->spa_meta_objset,
3078 spa->spa_feat_for_write_obj);
3079 zap_cursor_retrieve(&zc, &za) == 0;
3080 zap_cursor_advance(&zc)) {
3081 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3082 za.za_num_integers == 1);
3083 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3084 za.za_first_integer));
3085 }
3086 zap_cursor_fini(&zc);
3087 }
3088
3089 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3090 features) == 0);
3091 nvlist_free(features);
3092}
3093
3094int
3095spa_get_stats(const char *name, nvlist_t **config,
3096 char *altroot, size_t buflen)
3097{
3098 int error;
3099 spa_t *spa;
3100
3101 *config = NULL;
3102 error = spa_open_common(name, &spa, FTAG, NULL, config);
3103
3104 if (spa != NULL) {
3105 /*
3106 * This still leaves a window of inconsistency where the spares
3107 * or l2cache devices could change and the config would be
3108 * self-inconsistent.
3109 */
3110 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3111
3112 if (*config != NULL) {
3113 uint64_t loadtimes[2];
3114
3115 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3116 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3117 VERIFY(nvlist_add_uint64_array(*config,
3118 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3119
3120 VERIFY(nvlist_add_uint64(*config,
3121 ZPOOL_CONFIG_ERRCOUNT,
3122 spa_get_errlog_size(spa)) == 0);
3123
3124 if (spa_suspended(spa))
3125 VERIFY(nvlist_add_uint64(*config,
3126 ZPOOL_CONFIG_SUSPENDED,
3127 spa->spa_failmode) == 0);
3128
3129 spa_add_spares(spa, *config);
3130 spa_add_l2cache(spa, *config);
3131 spa_add_feature_stats(spa, *config);
3132 }
3133 }
3134
3135 /*
3136 * We want to get the alternate root even for faulted pools, so we cheat
3137 * and call spa_lookup() directly.
3138 */
3139 if (altroot) {
3140 if (spa == NULL) {
3141 mutex_enter(&spa_namespace_lock);
3142 spa = spa_lookup(name);
3143 if (spa)
3144 spa_altroot(spa, altroot, buflen);
3145 else
3146 altroot[0] = '\0';
3147 spa = NULL;
3148 mutex_exit(&spa_namespace_lock);
3149 } else {
3150 spa_altroot(spa, altroot, buflen);
3151 }
3152 }
3153
3154 if (spa != NULL) {
3155 spa_config_exit(spa, SCL_CONFIG, FTAG);
3156 spa_close(spa, FTAG);
3157 }
3158
3159 return (error);
3160}
3161
3162/*
3163 * Validate that the auxiliary device array is well formed. We must have an
3164 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3165 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3166 * specified, as long as they are well-formed.
3167 */
3168static int
3169spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3170 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3171 vdev_labeltype_t label)
3172{
3173 nvlist_t **dev;
3174 uint_t i, ndev;
3175 vdev_t *vd;
3176 int error;
3177
3178 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3179
3180 /*
3181 * It's acceptable to have no devs specified.
3182 */
3183 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3184 return (0);
3185
3186 if (ndev == 0)
3187 return (SET_ERROR(EINVAL));
3188
3189 /*
3190 * Make sure the pool is formatted with a version that supports this
3191 * device type.
3192 */
3193 if (spa_version(spa) < version)
3194 return (SET_ERROR(ENOTSUP));
3195
3196 /*
3197 * Set the pending device list so we correctly handle device in-use
3198 * checking.
3199 */
3200 sav->sav_pending = dev;
3201 sav->sav_npending = ndev;
3202
3203 for (i = 0; i < ndev; i++) {
3204 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3205 mode)) != 0)
3206 goto out;
3207
3208 if (!vd->vdev_ops->vdev_op_leaf) {
3209 vdev_free(vd);
3210 error = SET_ERROR(EINVAL);
3211 goto out;
3212 }
3213
3214 /*
3215 * The L2ARC currently only supports disk devices in
3216 * kernel context. For user-level testing, we allow it.
3217 */
3218#ifdef _KERNEL
3219 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3220 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3221 error = SET_ERROR(ENOTBLK);
3222 vdev_free(vd);
3223 goto out;
3224 }
3225#endif
3226 vd->vdev_top = vd;
3227
3228 if ((error = vdev_open(vd)) == 0 &&
3229 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3230 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3231 vd->vdev_guid) == 0);
3232 }
3233
3234 vdev_free(vd);
3235
3236 if (error &&
3237 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3238 goto out;
3239 else
3240 error = 0;
3241 }
3242
3243out:
3244 sav->sav_pending = NULL;
3245 sav->sav_npending = 0;
3246 return (error);
3247}
3248
3249static int
3250spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3251{
3252 int error;
3253
3254 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3255
3256 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3257 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3258 VDEV_LABEL_SPARE)) != 0) {
3259 return (error);
3260 }
3261
3262 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3263 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3264 VDEV_LABEL_L2CACHE));
3265}
3266
3267static void
3268spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3269 const char *config)
3270{
3271 int i;
3272
3273 if (sav->sav_config != NULL) {
3274 nvlist_t **olddevs;
3275 uint_t oldndevs;
3276 nvlist_t **newdevs;
3277
3278 /*
3279 * Generate a new dev list by concatenating with the
3280 * current dev list.
3281 */
3282 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3283 &olddevs, &oldndevs) == 0);
3284
3285 newdevs = kmem_alloc(sizeof (void *) *
3286 (ndevs + oldndevs), KM_SLEEP);
3287 for (i = 0; i < oldndevs; i++)
3288 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3289 KM_SLEEP) == 0);
3290 for (i = 0; i < ndevs; i++)
3291 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3292 KM_SLEEP) == 0);
3293
3294 VERIFY(nvlist_remove(sav->sav_config, config,
3295 DATA_TYPE_NVLIST_ARRAY) == 0);
3296
3297 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3298 config, newdevs, ndevs + oldndevs) == 0);
3299 for (i = 0; i < oldndevs + ndevs; i++)
3300 nvlist_free(newdevs[i]);
3301 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3302 } else {
3303 /*
3304 * Generate a new dev list.
3305 */
3306 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3307 KM_SLEEP) == 0);
3308 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3309 devs, ndevs) == 0);
3310 }
3311}
3312
3313/*
3314 * Stop and drop level 2 ARC devices
3315 */
3316void
3317spa_l2cache_drop(spa_t *spa)
3318{
3319 vdev_t *vd;
3320 int i;
3321 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3322
3323 for (i = 0; i < sav->sav_count; i++) {
3324 uint64_t pool;
3325
3326 vd = sav->sav_vdevs[i];
3327 ASSERT(vd != NULL);
3328
3329 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3330 pool != 0ULL && l2arc_vdev_present(vd))
3331 l2arc_remove_vdev(vd);
3332 }
3333}
3334
3335/*
3336 * Pool Creation
3337 */
3338int
3339spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3340 nvlist_t *zplprops)
3341{
3342 spa_t *spa;
3343 char *altroot = NULL;
3344 vdev_t *rvd;
3345 dsl_pool_t *dp;
3346 dmu_tx_t *tx;
3347 int error = 0;
3348 uint64_t txg = TXG_INITIAL;
3349 nvlist_t **spares, **l2cache;
3350 uint_t nspares, nl2cache;
3351 uint64_t version, obj;
3352 boolean_t has_features;
3353
3354 /*
3355 * If this pool already exists, return failure.
3356 */
3357 mutex_enter(&spa_namespace_lock);
3358 if (spa_lookup(pool) != NULL) {
3359 mutex_exit(&spa_namespace_lock);
3360 return (SET_ERROR(EEXIST));
3361 }
3362
3363 /*
3364 * Allocate a new spa_t structure.
3365 */
3366 (void) nvlist_lookup_string(props,
3367 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3368 spa = spa_add(pool, NULL, altroot);
3369 spa_activate(spa, spa_mode_global);
3370
3371 if (props && (error = spa_prop_validate(spa, props))) {
3372 spa_deactivate(spa);
3373 spa_remove(spa);
3374 mutex_exit(&spa_namespace_lock);
3375 return (error);
3376 }
3377
3378 has_features = B_FALSE;
3379 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3380 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3381 if (zpool_prop_feature(nvpair_name(elem)))
3382 has_features = B_TRUE;
3383 }
3384
3385 if (has_features || nvlist_lookup_uint64(props,
3386 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3387 version = SPA_VERSION;
3388 }
3389 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3390
3391 spa->spa_first_txg = txg;
3392 spa->spa_uberblock.ub_txg = txg - 1;
3393 spa->spa_uberblock.ub_version = version;
3394 spa->spa_ubsync = spa->spa_uberblock;
3395
3396 /*
3397 * Create "The Godfather" zio to hold all async IOs
3398 */
3399 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3400 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3401
3402 /*
3403 * Create the root vdev.
3404 */
3405 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3406
3407 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3408
3409 ASSERT(error != 0 || rvd != NULL);
3410 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
3411
3412 if (error == 0 && !zfs_allocatable_devs(nvroot))
3413 error = SET_ERROR(EINVAL);
3414
3415 if (error == 0 &&
3416 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
3417 (error = spa_validate_aux(spa, nvroot, txg,
3418 VDEV_ALLOC_ADD)) == 0) {
3419 for (int c = 0; c < rvd->vdev_children; c++) {
3420 vdev_metaslab_set_size(rvd->vdev_child[c]);
3421 vdev_expand(rvd->vdev_child[c], txg);
3422 }
3423 }
3424
3425 spa_config_exit(spa, SCL_ALL, FTAG);
3426
3427 if (error != 0) {
3428 spa_unload(spa);
3429 spa_deactivate(spa);
3430 spa_remove(spa);
3431 mutex_exit(&spa_namespace_lock);
3432 return (error);
3433 }
3434
3435 /*
3436 * Get the list of spares, if specified.
3437 */
3438 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3439 &spares, &nspares) == 0) {
3440 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3441 KM_SLEEP) == 0);
3442 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3443 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3444 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3445 spa_load_spares(spa);
3446 spa_config_exit(spa, SCL_ALL, FTAG);
3447 spa->spa_spares.sav_sync = B_TRUE;
3448 }
3449
3450 /*
3451 * Get the list of level 2 cache devices, if specified.
3452 */
3453 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3454 &l2cache, &nl2cache) == 0) {
3455 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3456 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3457 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3458 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3459 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3460 spa_load_l2cache(spa);
3461 spa_config_exit(spa, SCL_ALL, FTAG);
3462 spa->spa_l2cache.sav_sync = B_TRUE;
3463 }
3464
3465 spa->spa_is_initializing = B_TRUE;
3466 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3467 spa->spa_meta_objset = dp->dp_meta_objset;
3468 spa->spa_is_initializing = B_FALSE;
3469
3470 /*
3471 * Create DDTs (dedup tables).
3472 */
3473 ddt_create(spa);
3474
3475 spa_update_dspace(spa);
3476
3477 tx = dmu_tx_create_assigned(dp, txg);
3478
3479 /*
3480 * Create the pool config object.
3481 */
3482 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3483 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3484 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3485
3486 if (zap_add(spa->spa_meta_objset,
3487 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3488 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3489 cmn_err(CE_PANIC, "failed to add pool config");
3490 }
3491
3492 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3493 spa_feature_create_zap_objects(spa, tx);
3494
3495 if (zap_add(spa->spa_meta_objset,
3496 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3497 sizeof (uint64_t), 1, &version, tx) != 0) {
3498 cmn_err(CE_PANIC, "failed to add pool version");
3499 }
3500
3501 /* Newly created pools with the right version are always deflated. */
3502 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3503 spa->spa_deflate = TRUE;
3504 if (zap_add(spa->spa_meta_objset,
3505 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3506 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3507 cmn_err(CE_PANIC, "failed to add deflate");
3508 }
3509 }
3510
3511 /*
3512 * Create the deferred-free bpobj. Turn off compression
3513 * because sync-to-convergence takes longer if the blocksize
3514 * keeps changing.
3515 */
3516 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3517 dmu_object_set_compress(spa->spa_meta_objset, obj,
3518 ZIO_COMPRESS_OFF, tx);
3519 if (zap_add(spa->spa_meta_objset,
3520 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3521 sizeof (uint64_t), 1, &obj, tx) != 0) {
3522 cmn_err(CE_PANIC, "failed to add bpobj");
3523 }
3524 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3525 spa->spa_meta_objset, obj));
3526
3527 /*
3528 * Create the pool's history object.
3529 */
3530 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3531 spa_history_create_obj(spa, tx);
3532
3533 /*
3534 * Set pool properties.
3535 */
3536 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3537 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3538 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3539 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3540
3541 if (props != NULL) {
3542 spa_configfile_set(spa, props, B_FALSE);
3543 spa_sync_props(props, tx);
3544 }
3545
3546 dmu_tx_commit(tx);
3547
3548 spa->spa_sync_on = B_TRUE;
3549 txg_sync_start(spa->spa_dsl_pool);
3550
3551 /*
3552 * We explicitly wait for the first transaction to complete so that our
3553 * bean counters are appropriately updated.
3554 */
3555 txg_wait_synced(spa->spa_dsl_pool, txg);
3556
3557 spa_config_sync(spa, B_FALSE, B_TRUE);
3558
3559 spa_history_log_version(spa, "create");
3560
3561 spa->spa_minref = refcount_count(&spa->spa_refcount);
3562
3563 mutex_exit(&spa_namespace_lock);
3564
3565 return (0);
3566}
3567
3568#ifdef _KERNEL
3569#if defined(sun)
3570/*
3571 * Get the root pool information from the root disk, then import the root pool
3572 * during system boot.
3573 */
3574extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3575
3576static nvlist_t *
3577spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3578{
3579 nvlist_t *config;
3580 nvlist_t *nvtop, *nvroot;
3581 uint64_t pgid;
3582
3583 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3584 return (NULL);
3585
3586 /*
3587 * Add this top-level vdev to the child array.
3588 */
3589 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3590 &nvtop) == 0);
3591 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3592 &pgid) == 0);
3593 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3594
3595 /*
3596 * Put this pool's top-level vdevs into a root vdev.
3597 */
3598 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3599 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3600 VDEV_TYPE_ROOT) == 0);
3601 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3602 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3603 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3604 &nvtop, 1) == 0);
3605
3606 /*
3607 * Replace the existing vdev_tree with the new root vdev in
3608 * this pool's configuration (remove the old, add the new).
3609 */
3610 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3611 nvlist_free(nvroot);
3612 return (config);
3613}
3614
3615/*
3616 * Walk the vdev tree and see if we can find a device with "better"
3617 * configuration. A configuration is "better" if the label on that
3618 * device has a more recent txg.
3619 */
3620static void
3621spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3622{
3623 for (int c = 0; c < vd->vdev_children; c++)
3624 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3625
3626 if (vd->vdev_ops->vdev_op_leaf) {
3627 nvlist_t *label;
3628 uint64_t label_txg;
3629
3630 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3631 &label) != 0)
3632 return;
3633
3634 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3635 &label_txg) == 0);
3636
3637 /*
3638 * Do we have a better boot device?
3639 */
3640 if (label_txg > *txg) {
3641 *txg = label_txg;
3642 *avd = vd;
3643 }
3644 nvlist_free(label);
3645 }
3646}
3647
3648/*
3649 * Import a root pool.
3650 *
3651 * For x86, devpath_list will consist of devid and/or physpath name of
3652 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3653 * The GRUB "findroot" command will return the vdev we should boot.
3654 *
3655 * For SPARC, devpath_list consists of the physpath name of the booting device,
3656 * no matter whether the root pool is a single device pool or a mirrored pool.
3657 * e.g.
3658 * "/pci@1f,0/ide@d/disk@0,0:a"
3659 */
3660int
3661spa_import_rootpool(char *devpath, char *devid)
3662{
3663 spa_t *spa;
3664 vdev_t *rvd, *bvd, *avd = NULL;
3665 nvlist_t *config, *nvtop;
3666 uint64_t guid, txg;
3667 char *pname;
3668 int error;
3669
3670 /*
3671 * Read the label from the boot device and generate a configuration.
3672 */
3673 config = spa_generate_rootconf(devpath, devid, &guid);
3674#if defined(_OBP) && defined(_KERNEL)
3675 if (config == NULL) {
3676 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3677 /* iscsi boot */
3678 get_iscsi_bootpath_phy(devpath);
3679 config = spa_generate_rootconf(devpath, devid, &guid);
3680 }
3681 }
3682#endif
3683 if (config == NULL) {
3684 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3685 devpath);
3686 return (SET_ERROR(EIO));
3687 }
3688
3689 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3690 &pname) == 0);
3691 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3692
3693 mutex_enter(&spa_namespace_lock);
3694 if ((spa = spa_lookup(pname)) != NULL) {
3695 /*
3696 * Remove the existing root pool from the namespace so that we
3697 * can replace it with the correct config we just read in.
3698 */
3699 spa_remove(spa);
3700 }
3701
3702 spa = spa_add(pname, config, NULL);
3703 spa->spa_is_root = B_TRUE;
3704 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3705
3706 /*
3707 * Build up a vdev tree based on the boot device's label config.
3708 */
3709 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3710 &nvtop) == 0);
3711 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3712 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3713 VDEV_ALLOC_ROOTPOOL);
3714 spa_config_exit(spa, SCL_ALL, FTAG);
3715 if (error) {
3716 mutex_exit(&spa_namespace_lock);
3717 nvlist_free(config);
3718 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3719 pname);
3720 return (error);
3721 }
3722
3723 /*
3724 * Get the boot vdev.
3725 */
3726 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3727 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3728 (u_longlong_t)guid);
3729 error = SET_ERROR(ENOENT);
3730 goto out;
3731 }
3732
3733 /*
3734 * Determine if there is a better boot device.
3735 */
3736 avd = bvd;
3737 spa_alt_rootvdev(rvd, &avd, &txg);
3738 if (avd != bvd) {
3739 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3740 "try booting from '%s'", avd->vdev_path);
3741 error = SET_ERROR(EINVAL);
3742 goto out;
3743 }
3744
3745 /*
3746 * If the boot device is part of a spare vdev then ensure that
3747 * we're booting off the active spare.
3748 */
3749 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3750 !bvd->vdev_isspare) {
3751 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3752 "try booting from '%s'",
3753 bvd->vdev_parent->
3754 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3755 error = SET_ERROR(EINVAL);
3756 goto out;
3757 }
3758
3759 error = 0;
3760out:
3761 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3762 vdev_free(rvd);
3763 spa_config_exit(spa, SCL_ALL, FTAG);
3764 mutex_exit(&spa_namespace_lock);
3765
3766 nvlist_free(config);
3767 return (error);
3768}
3769
3770#else
3771
3772extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
3773 uint64_t *count);
3774
3775static nvlist_t *
3776spa_generate_rootconf(const char *name)
3777{
3778 nvlist_t **configs, **tops;
3779 nvlist_t *config;
3780 nvlist_t *best_cfg, *nvtop, *nvroot;
3781 uint64_t *holes;
3782 uint64_t best_txg;
3783 uint64_t nchildren;
3784 uint64_t pgid;
3785 uint64_t count;
3786 uint64_t i;
3787 uint_t nholes;
3788
3789 if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
3790 return (NULL);
3791
3792 ASSERT3U(count, !=, 0);
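/*
 * Use the label with the highest txg, as it describes the most
 * recently synced state of the pool.
 */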
3793 best_txg = 0;
3794 for (i = 0; i < count; i++) {
3795 uint64_t txg;
3796
3797 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
3798 &txg) == 0);
3799 if (txg > best_txg) {
3800 best_txg = txg;
3801 best_cfg = configs[i];
3802 }
3803 }
3804
3805 /*
3806 * Multi-vdev root pool configuration discovery is not supported yet.
3807 */
3808 nchildren = 1;
3809 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
3810 holes = NULL;
3811 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
3812 &holes, &nholes);
3813
3814 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
3815 for (i = 0; i < nchildren; i++) {
3816 if (i >= count)
3817 break;
3818 if (configs[i] == NULL)
3819 continue;
3820 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
3821 &nvtop) == 0);
3822 nvlist_dup(nvtop, &tops[i], KM_SLEEP);
3823 }
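/*
 * Insert a hole vdev at each top-level slot recorded in the hole
 * array for which no label was found.
 */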
3824 for (i = 0; holes != NULL && i < nholes; i++) {
3825 if (i >= nchildren)
3826 continue;
3827 if (tops[holes[i]] != NULL)
3828 continue;
3829 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
3830 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
3831 VDEV_TYPE_HOLE) == 0);
3832 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
3833 holes[i]) == 0);
3834 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
3835 0) == 0);
3836 }
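/*
 * Any remaining top-level slots without a label are represented
 * by missing vdevs.
 */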
3837 for (i = 0; i < nchildren; i++) {
3838 if (tops[i] != NULL)
3839 continue;
3840 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
3841 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
3842 VDEV_TYPE_MISSING) == 0);
3843 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
3844 i) == 0);
3845 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
3846 0) == 0);
3847 }
3848
3849 /*
3850 * Create pool config based on the best vdev config.
3851 */
3852 nvlist_dup(best_cfg, &config, KM_SLEEP);
3853
3854 /*
3855 * Put this pool's top-level vdevs into a root vdev.
3856 */
3857 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3858 &pgid) == 0);
3859 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3860 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3861 VDEV_TYPE_ROOT) == 0);
3862 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3863 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3864 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3865 tops, nchildren) == 0);
3866
3867 /*
3868 * Replace the existing vdev_tree with the new root vdev in
3869 * this pool's configuration (remove the old, add the new).
3870 */
3871 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3872
3873 /*
3874 * Drop vdev config elements that should not be present at pool level.
3875 */
3876 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
3877 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
3878
3879 for (i = 0; i < count; i++)
3880 nvlist_free(configs[i]);
3881 kmem_free(configs, count * sizeof(void *));
3882 for (i = 0; i < nchildren; i++)
3883 nvlist_free(tops[i]);
3884 kmem_free(tops, nchildren * sizeof(void *));
3885 nvlist_free(nvroot);
3886 return (config);
3887}
3888
3889int
3890spa_import_rootpool(const char *name)
3891{
3892 spa_t *spa;
3893 vdev_t *rvd, *bvd, *avd = NULL;
3894 nvlist_t *config, *nvtop;
3895 uint64_t txg;
3896 char *pname;
3897 int error;
3898
3899 /*
3900 * Read the label from the boot device and generate a configuration.
3901 */
3902 config = spa_generate_rootconf(name);
3903
3904 mutex_enter(&spa_namespace_lock);
3905 if (config != NULL) {
3906 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3907 &pname) == 0 && strcmp(name, pname) == 0);
3908 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
3909 == 0);
3910
3911 if ((spa = spa_lookup(pname)) != NULL) {
3912 /*
3913 * Remove the existing root pool from the namespace so
3914 * that we can replace it with the correct config
3915 * we just read in.
3916 */
3917 spa_remove(spa);
3918 }
3919 spa = spa_add(pname, config, NULL);
3920
3921 /*
3922 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
3923 * via spa_version().
3924 */
3925 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3926 &spa->spa_ubsync.ub_version) != 0)
3927 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3928 } else if ((spa = spa_lookup(name)) == NULL) {
3929 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
3930 name);
3931 return (EIO);
3932 } else {
3933 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
3934 }
3935 spa->spa_is_root = B_TRUE;
3936 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3937
3938 /*
3939 * Build up a vdev tree based on the boot device's label config.
3940 */
3941 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3942 &nvtop) == 0);
3943 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3944 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3945 VDEV_ALLOC_ROOTPOOL);
3946 spa_config_exit(spa, SCL_ALL, FTAG);
3947 if (error) {
3948 mutex_exit(&spa_namespace_lock);
3949 nvlist_free(config);
3950 cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
3951 pname);
3952 return (error);
3953 }
3954
3955 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3956 vdev_free(rvd);
3957 spa_config_exit(spa, SCL_ALL, FTAG);
3958 mutex_exit(&spa_namespace_lock);
3959
3960 nvlist_free(config);
3961 return (0);
3962}
3963
3964#endif /* sun */
3965#endif
3966
3967/*
3968 * Import a non-root pool into the system.
3969 */
3970int
3971spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3972{
3973 spa_t *spa;
3974 char *altroot = NULL;
3975 spa_load_state_t state = SPA_LOAD_IMPORT;
3976 zpool_rewind_policy_t policy;
3977 uint64_t mode = spa_mode_global;
3978 uint64_t readonly = B_FALSE;
3979 int error;
3980 nvlist_t *nvroot;
3981 nvlist_t **spares, **l2cache;
3982 uint_t nspares, nl2cache;
3983
3984 /*
3985 * If a pool with this name exists, return failure.
3986 */
3987 mutex_enter(&spa_namespace_lock);
3988 if (spa_lookup(pool) != NULL) {
3989 mutex_exit(&spa_namespace_lock);
3990 return (SET_ERROR(EEXIST));
3991 }
3992
3993 /*
3994 * Create and initialize the spa structure.
3995 */
3996 (void) nvlist_lookup_string(props,
3997 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3998 (void) nvlist_lookup_uint64(props,
3999 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
4000 if (readonly)
4001 mode = FREAD;
4002 spa = spa_add(pool, config, altroot);
4003 spa->spa_import_flags = flags;
4004
4005 /*
4006 * Verbatim import - Take a pool and insert it into the namespace
4007 * as if it had been loaded at boot.
4008 */
4009 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
4010 if (props != NULL)
4011 spa_configfile_set(spa, props, B_FALSE);
4012
4013 spa_config_sync(spa, B_FALSE, B_TRUE);
4014
4015 mutex_exit(&spa_namespace_lock);
4016 spa_history_log_version(spa, "import");
4017
4018 return (0);
4019 }
4020
4021 spa_activate(spa, mode);
4022
4023 /*
4024 * Don't start async tasks until we know everything is healthy.
4025 */
4026 spa_async_suspend(spa);
4027
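/*
 * If the caller requested a rewind, load the pool in recovery mode.
 */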
4028 zpool_get_rewind_policy(config, &policy);
4029 if (policy.zrp_request & ZPOOL_DO_REWIND)
4030 state = SPA_LOAD_RECOVER;
4031
4032 /*
4033 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
4034 * because the user-supplied config is actually the one to trust when
4035 * doing an import.
4036 */
4037 if (state != SPA_LOAD_RECOVER)
4038 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4039
4040 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
4041 policy.zrp_request);
4042
4043 /*
4044 * Propagate anything learned while loading the pool and pass it
4045 * back to the caller (i.e. rewind info, missing devices, etc.).
4046 */
4047 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4048 spa->spa_load_info) == 0);
4049
4050 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4051 /*
4052 * Toss any existing sparelist, as it doesn't have any validity
4053 * anymore, and conflicts with spa_has_spare().
4054 */
4055 if (spa->spa_spares.sav_config) {
4056 nvlist_free(spa->spa_spares.sav_config);
4057 spa->spa_spares.sav_config = NULL;
4058 spa_load_spares(spa);
4059 }
4060 if (spa->spa_l2cache.sav_config) {
4061 nvlist_free(spa->spa_l2cache.sav_config);
4062 spa->spa_l2cache.sav_config = NULL;
4063 spa_load_l2cache(spa);
4064 }
4065
4066 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4067 &nvroot) == 0);
4068 if (error == 0)
4069 error = spa_validate_aux(spa, nvroot, -1ULL,
4070 VDEV_ALLOC_SPARE);
4071 if (error == 0)
4072 error = spa_validate_aux(spa, nvroot, -1ULL,
4073 VDEV_ALLOC_L2CACHE);
4074 spa_config_exit(spa, SCL_ALL, FTAG);
4075
4076 if (props != NULL)
4077 spa_configfile_set(spa, props, B_FALSE);
4078
4079 if (error != 0 || (props && spa_writeable(spa) &&
4080 (error = spa_prop_set(spa, props)))) {
4081 spa_unload(spa);
4082 spa_deactivate(spa);
4083 spa_remove(spa);
4084 mutex_exit(&spa_namespace_lock);
4085 return (error);
4086 }
4087
4088 spa_async_resume(spa);
4089
4090 /*
4091 * Override any spares and level 2 cache devices as specified by
4092 * the user, as these may have correct device names/devids, etc.
4093 */
4094 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4095 &spares, &nspares) == 0) {
4096 if (spa->spa_spares.sav_config)
4097 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
4098 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
4099 else
4100 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
4101 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4102 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4103 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4104 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4105 spa_load_spares(spa);
4106 spa_config_exit(spa, SCL_ALL, FTAG);
4107 spa->spa_spares.sav_sync = B_TRUE;
4108 }
4109 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4110 &l2cache, &nl2cache) == 0) {
4111 if (spa->spa_l2cache.sav_config)
4112 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
4113 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
4114 else
4115 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4116 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4117 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4118 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4119 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4120 spa_load_l2cache(spa);
4121 spa_config_exit(spa, SCL_ALL, FTAG);
4122 spa->spa_l2cache.sav_sync = B_TRUE;
4123 }
4124
4125 /*
4126 * Check for any removed devices.
4127 */
4128 if (spa->spa_autoreplace) {
4129 spa_aux_check_removed(&spa->spa_spares);
4130 spa_aux_check_removed(&spa->spa_l2cache);
4131 }
4132
4133 if (spa_writeable(spa)) {
4134 /*
4135 * Update the config cache to include the newly-imported pool.
4136 */
4137 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4138 }
4139
4140 /*
4141 * It's possible that the pool was expanded while it was exported.
4142 * We kick off an async task to handle this for us.
4143 */
4144 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4145
4146 mutex_exit(&spa_namespace_lock);
4147 spa_history_log_version(spa, "import");
4148
4149#ifdef __FreeBSD__
4150#ifdef _KERNEL
4151 zvol_create_minors(pool);
4152#endif
4153#endif
4154 return (0);
4155}
4156
4157nvlist_t *
4158spa_tryimport(nvlist_t *tryconfig)
4159{
4160 nvlist_t *config = NULL;
4161 char *poolname;
4162 spa_t *spa;
4163 uint64_t state;
4164 int error;
4165
4166 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
4167 return (NULL);
4168
4169 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
4170 return (NULL);
4171
4172 /*
4173 * Create and initialize the spa structure.
4174 */
4175 mutex_enter(&spa_namespace_lock);
4176 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
4177 spa_activate(spa, FREAD);
4178
4179 /*
4180 * Pass off the heavy lifting to spa_load().
4181 * Pass TRUE for mosconfig because the user-supplied config
4182 * is actually the one to trust when doing an import.
4183 */
4184 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
4185
4186 /*
4187 * If 'tryconfig' was at least parsable, return the current config.
4188 */
4189 if (spa->spa_root_vdev != NULL) {
4190 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4191 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
4192 poolname) == 0);
4193 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4194 state) == 0);
4195 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
4196 spa->spa_uberblock.ub_timestamp) == 0);
4197 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
4198 spa->spa_load_info) == 0);
4199
4200 /*
4201 * If the bootfs property exists on this pool then we
4202 * copy it out so that external consumers can tell which
4203 * pools are bootable.
4204 */
4205 if ((!error || error == EEXIST) && spa->spa_bootfs) {
4206 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4207
4208 /*
4209 * We have to play games with the name since the
4210 * pool was opened as TRYIMPORT_NAME.
4211 */
4212 if (dsl_dsobj_to_dsname(spa_name(spa),
4213 spa->spa_bootfs, tmpname) == 0) {
4214 char *cp;
4215 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4216
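/*
 * Swap the temporary TRYIMPORT_NAME prefix in tmpname for the
 * pool's real name.
 */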
4217 cp = strchr(tmpname, '/');
4218 if (cp == NULL) {
4219 (void) strlcpy(dsname, tmpname,
4220 MAXPATHLEN);
4221 } else {
4222 (void) snprintf(dsname, MAXPATHLEN,
4223 "%s/%s", poolname, ++cp);
4224 }
4225 VERIFY(nvlist_add_string(config,
4226 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
4227 kmem_free(dsname, MAXPATHLEN);
4228 }
4229 kmem_free(tmpname, MAXPATHLEN);
4230 }
4231
4232 /*
4233 * Add the list of hot spares and level 2 cache devices.
4234 */
4235 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4236 spa_add_spares(spa, config);
4237 spa_add_l2cache(spa, config);
4238 spa_config_exit(spa, SCL_CONFIG, FTAG);
4239 }
4240
4241 spa_unload(spa);
4242 spa_deactivate(spa);
4243 spa_remove(spa);
4244 mutex_exit(&spa_namespace_lock);
4245
4246 return (config);
4247}
4248
4249/*
4250 * Pool export/destroy
4251 *
4252 * The act of destroying or exporting a pool is very simple. We make sure there
4253 * is no more pending I/O and any references to the pool are gone. Then, we
4254 * update the pool state and sync all the labels to disk, removing the
4255 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
4256 * we don't sync the labels or remove the configuration cache.
4257 */
4258static int
4259spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
4260 boolean_t force, boolean_t hardforce)
4261{
4262 spa_t *spa;
4263
4264 if (oldconfig)
4265 *oldconfig = NULL;
4266
4267 if (!(spa_mode_global & FWRITE))
4268 return (SET_ERROR(EROFS));
4269
4270 mutex_enter(&spa_namespace_lock);
4271 if ((spa = spa_lookup(pool)) == NULL) {
4272 mutex_exit(&spa_namespace_lock);
4273 return (SET_ERROR(ENOENT));
4274 }
4275
4276 /*
4277 * Put a hold on the pool, drop the namespace lock, stop async tasks,
4278 * reacquire the namespace lock, and see if we can export.
4279 */
4280 spa_open_ref(spa, FTAG);
4281 mutex_exit(&spa_namespace_lock);
4282 spa_async_suspend(spa);
4283 mutex_enter(&spa_namespace_lock);
4284 spa_close(spa, FTAG);
4285
4286 /*
4287 * The pool will be in core if it's openable,
4288 * in which case we can modify its state.
4289 */
4290 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
4291 /*
4292 * Objsets may be open only because they're dirty, so we
4293 * have to force the pool to sync before checking spa_refcount.
4294 */
4295 txg_wait_synced(spa->spa_dsl_pool, 0);
4296
4297 /*
4298 * A pool cannot be exported or destroyed if there are active
4299 * references. If we are resetting a pool, allow references by
4300 * fault injection handlers.
4301 */
4302 if (!spa_refcount_zero(spa) ||
4303 (spa->spa_inject_ref != 0 &&
4304 new_state != POOL_STATE_UNINITIALIZED)) {
4305 spa_async_resume(spa);
4306 mutex_exit(&spa_namespace_lock);
4307 return (SET_ERROR(EBUSY));
4308 }
4309
4310 /*
4311 * A pool cannot be exported if it has an active shared spare.
4312 * This is to prevent other pools stealing the active spare
4313 * from an exported pool. At the user's explicit request, such a pool
4314 * can still be forcibly exported.
4315 */
4316 if (!force && new_state == POOL_STATE_EXPORTED &&
4317 spa_has_active_shared_spare(spa)) {
4318 spa_async_resume(spa);
4319 mutex_exit(&spa_namespace_lock);
4320 return (SET_ERROR(EXDEV));
4321 }
4322
4323 /*
4324 * We want this to be reflected on every label,
4325 * so mark them all dirty. spa_unload() will do the
4326 * final sync that pushes these changes out.
4327 */
4328 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4329 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4330 spa->spa_state = new_state;
4331 spa->spa_final_txg = spa_last_synced_txg(spa) +
4332 TXG_DEFER_SIZE + 1;
4333 vdev_config_dirty(spa->spa_root_vdev);
4334 spa_config_exit(spa, SCL_ALL, FTAG);
4335 }
4336 }
4337
4338 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4339
4340 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4341 spa_unload(spa);
4342 spa_deactivate(spa);
4343 }
4344
4345 if (oldconfig && spa->spa_config)
4346 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4347
4348 if (new_state != POOL_STATE_UNINITIALIZED) {
4349 if (!hardforce)
4350 spa_config_sync(spa, B_TRUE, B_TRUE);
4351 spa_remove(spa);
4352 }
4353 mutex_exit(&spa_namespace_lock);
4354
4355 return (0);
4356}
4357
4358/*
4359 * Destroy a storage pool.
4360 */
4361int
4362spa_destroy(char *pool)
4363{
4364 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4365 B_FALSE, B_FALSE));
4366}
4367
4368/*
4369 * Export a storage pool.
4370 */
4371int
4372spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4373 boolean_t hardforce)
4374{
4375 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4376 force, hardforce));
4377}
4378
4379/*
4380 * Similar to spa_export(), this unloads the spa_t without actually removing it
4381 * from the namespace in any way.
4382 */
4383int
4384spa_reset(char *pool)
4385{
4386 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4387 B_FALSE, B_FALSE));
4388}
4389
4390/*
4391 * ==========================================================================
4392 * Device manipulation
4393 * ==========================================================================
4394 */
4395
4396/*
4397 * Add a device to a storage pool.
4398 */
4399int
4400spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
4401{
4402 uint64_t txg, id;
4403 int error;
4404 vdev_t *rvd = spa->spa_root_vdev;
4405 vdev_t *vd, *tvd;
4406 nvlist_t **spares, **l2cache;
4407 uint_t nspares, nl2cache;
4408
4409 ASSERT(spa_writeable(spa));
4410
4411 txg = spa_vdev_enter(spa);
4412
4413 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
4414 VDEV_ALLOC_ADD)) != 0)
4415 return (spa_vdev_exit(spa, NULL, txg, error));
4416
4417 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
4418
4419 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
4420 &nspares) != 0)
4421 nspares = 0;
4422
4423 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
4424 &nl2cache) != 0)
4425 nl2cache = 0;
4426
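/*
 * The request must add at least one new top-level vdev, spare, or
 * cache device.
 */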
4427 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
4428 return (spa_vdev_exit(spa, vd, txg, EINVAL));
4429
4430 if (vd->vdev_children != 0 &&
4431 (error = vdev_create(vd, txg, B_FALSE)) != 0)
4432 return (spa_vdev_exit(spa, vd, txg, error));
4433
4434 /*
4435 * We must validate the spares and l2cache devices after checking the
4436 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
4437 */
4438 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
4439 return (spa_vdev_exit(spa, vd, txg, error));
4440
4441 /*
4442 * Transfer each new top-level vdev from vd to rvd.
4443 */
4444 for (int c = 0; c < vd->vdev_children; c++) {
4445
4446 /*
4447 * Set the vdev id to the first hole, if one exists.
4448 */
4449 for (id = 0; id < rvd->vdev_children; id++) {
4450 if (rvd->vdev_child[id]->vdev_ishole) {
4451 vdev_free(rvd->vdev_child[id]);
4452 break;
4453 }
4454 }
4455 tvd = vd->vdev_child[c];
4456 vdev_remove_child(vd, tvd);
4457 tvd->vdev_id = id;
4458 vdev_add_child(rvd, tvd);
4459 vdev_config_dirty(tvd);
4460 }
4461
4462 if (nspares != 0) {
4463 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
4464 ZPOOL_CONFIG_SPARES);
4465 spa_load_spares(spa);
4466 spa->spa_spares.sav_sync = B_TRUE;
4467 }
4468
4469 if (nl2cache != 0) {
4470 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
4471 ZPOOL_CONFIG_L2CACHE);
4472 spa_load_l2cache(spa);
4473 spa->spa_l2cache.sav_sync = B_TRUE;
4474 }
4475
4476 /*
4477 * We have to be careful when adding new vdevs to an existing pool.
4478 * If other threads start allocating from these vdevs before we
4479 * sync the config cache, and we lose power, then upon reboot we may
4480 * fail to open the pool because there are DVAs that the config cache
4481 * can't translate. Therefore, we first add the vdevs without
4482 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
4483 * and then let spa_config_update() initialize the new metaslabs.
4484 *
4485 * spa_load() checks for added-but-not-initialized vdevs, so that
4486 * if we lose power at any point in this sequence, the remaining
4487 * steps will be completed the next time we load the pool.
4488 */
4489 (void) spa_vdev_exit(spa, vd, txg, 0);
4490
4491 mutex_enter(&spa_namespace_lock);
4492 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
4493 mutex_exit(&spa_namespace_lock);
4494
4495 return (0);
4496}
4497
4498/*
4499 * Attach a device to a mirror. The arguments are the path to any device
4500 * in the mirror, and the nvroot for the new device. If the path specifies
4501 * a device that is not mirrored, we automatically insert the mirror vdev.
4502 *
4503 * If 'replacing' is specified, the new device is intended to replace the
4504 * existing device; in this case the two devices are made into their own
4505 * mirror using the 'replacing' vdev, which is functionally identical to
4506 * the mirror vdev (it actually reuses all the same ops) but has a few
4507 * extra rules: you can't attach to it after it's been created, and upon
4508 * completion of resilvering, the first disk (the one being replaced)
4509 * is automatically detached.
4510 */
4511int
4512spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
4513{
4514 uint64_t txg, dtl_max_txg;
4515 vdev_t *rvd = spa->spa_root_vdev;
4516 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
4517 vdev_ops_t *pvops;
4518 char *oldvdpath, *newvdpath;
4519 int newvd_isspare;
4520 int error;
4521
4522 ASSERT(spa_writeable(spa));
4523
4524 txg = spa_vdev_enter(spa);
4525
4526 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
4527
4528 if (oldvd == NULL)
4529 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4530
4531 if (!oldvd->vdev_ops->vdev_op_leaf)
4532 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4533
4534 pvd = oldvd->vdev_parent;
4535
4536 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
4537 VDEV_ALLOC_ATTACH)) != 0)
4538 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4539
4540 if (newrootvd->vdev_children != 1)
4541 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4542
4543 newvd = newrootvd->vdev_child[0];
4544
4545 if (!newvd->vdev_ops->vdev_op_leaf)
4546 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
4547
4548 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
4549 return (spa_vdev_exit(spa, newrootvd, txg, error));
4550
4551 /*
4552 * Spares can't replace logs
4553 */
4554 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
4555 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4556
4557 if (!replacing) {
4558 /*
4559 * For attach, the only allowable parent is a mirror or the root
4560 * vdev.
4561 */
4562 if (pvd->vdev_ops != &vdev_mirror_ops &&
4563 pvd->vdev_ops != &vdev_root_ops)
4564 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4565
4566 pvops = &vdev_mirror_ops;
4567 } else {
4568 /*
4569 * Active hot spares can only be replaced by inactive hot
4570 * spares.
4571 */
4572 if (pvd->vdev_ops == &vdev_spare_ops &&
4573 oldvd->vdev_isspare &&
4574 !spa_has_spare(spa, newvd->vdev_guid))
4575 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4576
4577 /*
4578 * If the source is a hot spare, and the parent isn't already a
4579 * spare, then we want to create a new hot spare. Otherwise, we
4580 * want to create a replacing vdev. The user is not allowed to
4581 * attach to a spared vdev child unless the 'isspare' state is
4582 * the same (spare replaces spare, non-spare replaces
4583 * non-spare).
4584 */
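/*
 * In addition, attaching to a vdev that is itself in the middle of a
 * replacement is only allowed on pools that support multiple
 * replacements (SPA_VERSION_MULTI_REPLACE).
 */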
4585 if (pvd->vdev_ops == &vdev_replacing_ops &&
4586 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
4587 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4588 } else if (pvd->vdev_ops == &vdev_spare_ops &&
4589 newvd->vdev_isspare != oldvd->vdev_isspare) {
4590 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
4591 }
4592
4593 if (newvd->vdev_isspare)
4594 pvops = &vdev_spare_ops;
4595 else
4596 pvops = &vdev_replacing_ops;
4597 }
4598
4599 /*
4600 * Make sure the new device is big enough.
4601 */
4602 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
4603 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
4604
4605 /*
4606 * The new device cannot have a higher alignment requirement
4607 * than the top-level vdev.
4608 */
4609 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
4610 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
4611
4612 /*
4613 * If this is an in-place replacement, update oldvd's path and devid
4614 * to make it distinguishable from newvd, and unopenable from now on.
4615 */
4616 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
4617 spa_strfree(oldvd->vdev_path);
4618 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
4619 KM_SLEEP);
4620 (void) sprintf(oldvd->vdev_path, "%s/%s",
4621 newvd->vdev_path, "old");
4622 if (oldvd->vdev_devid != NULL) {
4623 spa_strfree(oldvd->vdev_devid);
4624 oldvd->vdev_devid = NULL;
4625 }
4626 }
4627
4628 /* mark the device being resilvered */
4629 /* mark the device as being resilvered */
4630
4631 /*
4632 * If the parent is not a mirror, or if we're replacing, insert the new
4633 * mirror/replacing/spare vdev above oldvd.
4634 */
4635 if (pvd->vdev_ops != pvops)
4636 pvd = vdev_add_parent(oldvd, pvops);
4637
4638 ASSERT(pvd->vdev_top->vdev_parent == rvd);
4639 ASSERT(pvd->vdev_ops == pvops);
4640 ASSERT(oldvd->vdev_parent == pvd);
4641
4642 /*
4643 * Extract the new device from its root and add it to pvd.
4644 */
4645 vdev_remove_child(newrootvd, newvd);
4646 newvd->vdev_id = pvd->vdev_children;
4647 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4648 vdev_add_child(pvd, newvd);
4649
4650 tvd = newvd->vdev_top;
4651 ASSERT(pvd->vdev_top == tvd);
4652 ASSERT(tvd->vdev_parent == rvd);
4653
4654 vdev_config_dirty(tvd);
4655
4656 /*
4657 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4658 * for any dmu_sync-ed blocks. It will propagate upward when
4659 * spa_vdev_exit() calls vdev_dtl_reassess().
4660 */
4661 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4662
4663 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4664 dtl_max_txg - TXG_INITIAL);
4665
4666 if (newvd->vdev_isspare) {
4667 spa_spare_activate(newvd);
4668 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4669 }
4670
4671 oldvdpath = spa_strdup(oldvd->vdev_path);
4672 newvdpath = spa_strdup(newvd->vdev_path);
4673 newvd_isspare = newvd->vdev_isspare;
4674
4675 /*
4676 * Mark newvd's DTL dirty in this txg.
4677 */
4678 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4679
4680 /*
4681 * Restart the resilver
4682 */
4683 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4684
4685 /*
4686 * Commit the config
4687 */
4688 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4689
4690 spa_history_log_internal(spa, "vdev attach", NULL,
4691 "%s vdev=%s %s vdev=%s",
4692 replacing && newvd_isspare ? "spare in" :
4693 replacing ? "replace" : "attach", newvdpath,
4694 replacing ? "for" : "to", oldvdpath);
4695
4696 spa_strfree(oldvdpath);
4697 spa_strfree(newvdpath);
4698
4699 if (spa->spa_bootfs)
4700 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4701
4702 return (0);
4703}
4704
4705/*
4706 * Detach a device from a mirror or replacing vdev.
4707 *
4708 * If 'replace_done' is specified, only detach if the parent
4709 * is a replacing vdev.
4710 */
4711int
4712spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4713{
4714 uint64_t txg;
4715 int error;
4716 vdev_t *rvd = spa->spa_root_vdev;
4717 vdev_t *vd, *pvd, *cvd, *tvd;
4718 boolean_t unspare = B_FALSE;
4719 uint64_t unspare_guid = 0;
4720 char *vdpath;
4721
4722 ASSERT(spa_writeable(spa));
4723
4724 txg = spa_vdev_enter(spa);
4725
4726 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4727
4728 if (vd == NULL)
4729 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4730
4731 if (!vd->vdev_ops->vdev_op_leaf)
4732 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4733
4734 pvd = vd->vdev_parent;
4735
4736 /*
4737 * If the parent/child relationship is not as expected, don't do it.
4738 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4739 * vdev that's replacing B with C. The user's intent in replacing
4740 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4741 * the replace by detaching C, the expected behavior is to end up
4742 * M(A,B). But suppose that right after deciding to detach C,
4743 * the replacement of B completes. We would have M(A,C), and then
4744 * ask to detach C, which would leave us with just A -- not what
4745 * the user wanted. To prevent this, we make sure that the
4746 * parent/child relationship hasn't changed -- in this example,
4747 * that C's parent is still the replacing vdev R.
4748 */
4749 if (pvd->vdev_guid != pguid && pguid != 0)
4750 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4751
4752 /*
4753 * Only 'replacing' or 'spare' vdevs can be replaced.
4754 */
4755 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4756 pvd->vdev_ops != &vdev_spare_ops)
4757 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4758
4759 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4760 spa_version(spa) >= SPA_VERSION_SPARES);
4761
4762 /*
4763 * Only mirror, replacing, and spare vdevs support detach.
4764 */
4765 if (pvd->vdev_ops != &vdev_replacing_ops &&
4766 pvd->vdev_ops != &vdev_mirror_ops &&
4767 pvd->vdev_ops != &vdev_spare_ops)
4768 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4769
4770 /*
4771 * If this device has the only valid copy of some data,
4772 * we cannot safely detach it.
4773 */
4774 if (vdev_dtl_required(vd))
4775 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4776
4777 ASSERT(pvd->vdev_children >= 2);
4778
4779 /*
4780 * If we are detaching the second disk from a replacing vdev, then
4781 * check to see if we changed the original vdev's path to have "/old"
4782 * at the end in spa_vdev_attach(). If so, undo that change now.
4783 */
4784 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4785 vd->vdev_path != NULL) {
4786 size_t len = strlen(vd->vdev_path);
4787
4788 for (int c = 0; c < pvd->vdev_children; c++) {
4789 cvd = pvd->vdev_child[c];
4790
4791 if (cvd == vd || cvd->vdev_path == NULL)
4792 continue;
4793
4794 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4795 strcmp(cvd->vdev_path + len, "/old") == 0) {
4796 spa_strfree(cvd->vdev_path);
4797 cvd->vdev_path = spa_strdup(vd->vdev_path);
4798 break;
4799 }
4800 }
4801 }
4802
4803 /*
4804 * If we are detaching the original disk from a spare, then it implies
4805 * that the spare should become a real disk, and be removed from the
4806 * active spare list for the pool.
4807 */
4808 if (pvd->vdev_ops == &vdev_spare_ops &&
4809 vd->vdev_id == 0 &&
4810 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4811 unspare = B_TRUE;
4812
4813 /*
4814 * Erase the disk labels so the disk can be used for other things.
4815 * This must be done after all other error cases are handled,
4816 * but before we disembowel vd (so we can still do I/O to it).
4817 * But if we can't do it, don't treat the error as fatal --
4818 * it may be that the unwritability of the disk is the reason
4819 * it's being detached!
4820 */
4821 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4822
4823 /*
4824 * Remove vd from its parent and compact the parent's children.
4825 */
4826 vdev_remove_child(pvd, vd);
4827 vdev_compact_children(pvd);
4828
4829 /*
4830 * Remember one of the remaining children so we can get tvd below.
4831 */
4832 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4833
4834 /*
4835 * If we need to remove the remaining child from the list of hot spares,
4836 * do it now, marking the vdev as no longer a spare in the process.
4837 * We must do this before vdev_remove_parent(), because that can
4838 * change the GUID if it creates a new toplevel GUID. For a similar
4839 * reason, we must remove the spare now, in the same txg as the detach;
4840 * otherwise someone could attach a new sibling, change the GUID, and
4841 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4842 */
4843 if (unspare) {
4844 ASSERT(cvd->vdev_isspare);
4845 spa_spare_remove(cvd);
4846 unspare_guid = cvd->vdev_guid;
4847 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4848 cvd->vdev_unspare = B_TRUE;
4849 }
4850
4851 /*
4852 * If the parent mirror/replacing vdev only has one child,
4853 * the parent is no longer needed. Remove it from the tree.
4854 */
4855 if (pvd->vdev_children == 1) {
4856 if (pvd->vdev_ops == &vdev_spare_ops)
4857 cvd->vdev_unspare = B_FALSE;
4858 vdev_remove_parent(cvd);
4859 cvd->vdev_resilvering = B_FALSE;
4860 }
4861
4862
4863 /*
4864 * We don't set tvd until now because the parent we just removed
4865 * may have been the previous top-level vdev.
4866 */
4867 tvd = cvd->vdev_top;
4868 ASSERT(tvd->vdev_parent == rvd);
4869
4870 /*
4871 * Reevaluate the parent vdev state.
4872 */
4873 vdev_propagate_state(cvd);
4874
4875 /*
4876 * If the 'autoexpand' property is set on the pool then automatically
4877 * try to expand the size of the pool. For example if the device we
4878 * just detached was smaller than the others, it may be possible to
4879 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4880 * first so that we can obtain the updated sizes of the leaf vdevs.
4881 */
4882 if (spa->spa_autoexpand) {
4883 vdev_reopen(tvd);
4884 vdev_expand(tvd, txg);
4885 }
4886
4887 vdev_config_dirty(tvd);
4888
4889 /*
4890 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4891 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4892 * But first make sure we're not on any *other* txg's DTL list, to
4893 * prevent vd from being accessed after it's freed.
4894 */
4895 vdpath = spa_strdup(vd->vdev_path);
4896 for (int t = 0; t < TXG_SIZE; t++)
4897 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4898 vd->vdev_detached = B_TRUE;
4899 vdev_dirty(tvd, VDD_DTL, vd, txg);
4900
4901 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4902
4903 /* hang on to the spa before we release the lock */
4904 spa_open_ref(spa, FTAG);
4905
4906 error = spa_vdev_exit(spa, vd, txg, 0);
4907
4908 spa_history_log_internal(spa, "detach", NULL,
4909 "vdev=%s", vdpath);
4910 spa_strfree(vdpath);
4911
4912 /*
4913 * If this was the removal of the original device in a hot spare vdev,
4914 * then we want to go through and remove the device from the hot spare
4915 * list of every other pool.
4916 */
4917 if (unspare) {
4918 spa_t *altspa = NULL;
4919
4920 mutex_enter(&spa_namespace_lock);
4921 while ((altspa = spa_next(altspa)) != NULL) {
4922 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4923 altspa == spa)
4924 continue;
4925
4926 spa_open_ref(altspa, FTAG);
4927 mutex_exit(&spa_namespace_lock);
4928 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4929 mutex_enter(&spa_namespace_lock);
4930 spa_close(altspa, FTAG);
4931 }
4932 mutex_exit(&spa_namespace_lock);
4933
4934 /* search the rest of the vdevs for spares to remove */
4935 spa_vdev_resilver_done(spa);
4936 }
4937
4938 /* all done with the spa; OK to release */
4939 mutex_enter(&spa_namespace_lock);
4940 spa_close(spa, FTAG);
4941 mutex_exit(&spa_namespace_lock);
4942
4943 return (error);
4944}
4945
4946/*
4947 * Split a set of devices from their mirrors, and create a new pool from them.
4948 */
4949int
4950spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4951 nvlist_t *props, boolean_t exp)
4952{
4953 int error = 0;
4954 uint64_t txg, *glist;
4955 spa_t *newspa;
4956 uint_t c, children, lastlog;
4957 nvlist_t **child, *nvl, *tmp;
4958 dmu_tx_t *tx;
4959 char *altroot = NULL;
4960 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4961 boolean_t activate_slog;
4962
4963 ASSERT(spa_writeable(spa));
4964
4965 txg = spa_vdev_enter(spa);
4966
4967 /* clear the log and flush everything up to now */
4968 activate_slog = spa_passivate_log(spa);
4969 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4970 error = spa_offline_log(spa);
4971 txg = spa_vdev_config_enter(spa);
4972
4973 if (activate_slog)
4974 spa_activate_log(spa);
4975
4976 if (error != 0)
4977 return (spa_vdev_exit(spa, NULL, txg, error));
4978
4979 /* check new spa name before going any further */
4980 if (spa_lookup(newname) != NULL)
4981 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4982
4983 /*
4984 * scan through all the children to ensure they're all mirrors
4985 */
4986 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4987 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4988 &children) != 0)
4989 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4990
4991 /* first, check to ensure we've got the right child count */
4992 rvd = spa->spa_root_vdev;
4993 lastlog = 0;
4994 for (c = 0; c < rvd->vdev_children; c++) {
4995 vdev_t *vd = rvd->vdev_child[c];
4996
4997 /* don't count the holes & logs as children */
4998 if (vd->vdev_islog || vd->vdev_ishole) {
4999 if (lastlog == 0)
5000 lastlog = c;
5001 continue;
5002 }
5003
5004 lastlog = 0;
5005 }
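/*
 * The request must name one disk for each top-level vdev, not
 * counting any trailing log or hole vdevs.
 */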
5006 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
5007 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5008
5009 /* next, ensure no spare or cache devices are part of the split */
5010 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
5011 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
5012 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5013
5014 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
5015 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
5016
5017 /* then, loop over each vdev and validate it */
5018 for (c = 0; c < children; c++) {
5019 uint64_t is_hole = 0;
5020
5021 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
5022 &is_hole);
5023
5024 if (is_hole != 0) {
5025 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
5026 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
5027 continue;
5028 } else {
5029 error = SET_ERROR(EINVAL);
5030 break;
5031 }
5032 }
5033
5034 /* which disk is going to be split? */
5035 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
5036 &glist[c]) != 0) {
5037 error = SET_ERROR(EINVAL);
5038 break;
5039 }
5040
5041 /* look it up in the spa */
5042 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
5043 if (vml[c] == NULL) {
5044 error = SET_ERROR(ENODEV);
5045 break;
5046 }
5047
5048 /* make sure there's nothing stopping the split */
5049 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
5050 vml[c]->vdev_islog ||
5051 vml[c]->vdev_ishole ||
5052 vml[c]->vdev_isspare ||
5053 vml[c]->vdev_isl2cache ||
5054 !vdev_writeable(vml[c]) ||
5055 vml[c]->vdev_children != 0 ||
5056 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
5057 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
5058 error = SET_ERROR(EINVAL);
5059 break;
5060 }
5061
5062 if (vdev_dtl_required(vml[c])) {
5063 error = SET_ERROR(EBUSY);
5064 break;
5065 }
5066
5067 /* we need certain info from the top level */
5068 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
5069 vml[c]->vdev_top->vdev_ms_array) == 0);
5070 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
5071 vml[c]->vdev_top->vdev_ms_shift) == 0);
5072 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
5073 vml[c]->vdev_top->vdev_asize) == 0);
5074 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
5075 vml[c]->vdev_top->vdev_ashift) == 0);
5076 }
5077
5078 if (error != 0) {
5079 kmem_free(vml, children * sizeof (vdev_t *));
5080 kmem_free(glist, children * sizeof (uint64_t));
5081 return (spa_vdev_exit(spa, NULL, txg, error));
5082 }
5083
5084 /* stop writers from using the disks */
5085 for (c = 0; c < children; c++) {
5086 if (vml[c] != NULL)
5087 vml[c]->vdev_offline = B_TRUE;
5088 }
5089 vdev_reopen(spa->spa_root_vdev);
5090
5091 /*
5092 * Temporarily record the splitting vdevs in the spa config. This
5093 * will disappear once the config is regenerated.
5094 */
5095 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5096 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
5097 glist, children) == 0);
5098 kmem_free(glist, children * sizeof (uint64_t));
5099
5100 mutex_enter(&spa->spa_props_lock);
5101 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
5102 nvl) == 0);
5103 mutex_exit(&spa->spa_props_lock);
5104 spa->spa_config_splitting = nvl;
5105 vdev_config_dirty(spa->spa_root_vdev);
5106
5107 /* configure and create the new pool */
5108 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
5109 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5110 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
5111 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5112 spa_version(spa)) == 0);
5113 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
5114 spa->spa_config_txg) == 0);
5115 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
5116 spa_generate_guid(NULL)) == 0);
5117 (void) nvlist_lookup_string(props,
5118 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5119
5120 /* add the new pool to the namespace */
5121 newspa = spa_add(newname, config, altroot);
5122 newspa->spa_config_txg = spa->spa_config_txg;
5123 spa_set_log_state(newspa, SPA_LOG_CLEAR);
5124
5125 /* release the spa config lock, retaining the namespace lock */
5126 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5127
5128 if (zio_injection_enabled)
5129 zio_handle_panic_injection(spa, FTAG, 1);
5130
5131 spa_activate(newspa, spa_mode_global);
5132 spa_async_suspend(newspa);
5133
5134#ifndef sun
5135 /* mark that we are creating new spa by splitting */
5136 newspa->spa_splitting_newspa = B_TRUE;
5137#endif
5138 /* create the new pool from the disks of the original pool */
5139 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
5140#ifndef sun
5141 newspa->spa_splitting_newspa = B_FALSE;
5142#endif
5143 if (error)
5144 goto out;
5145
5146 /* if that worked, generate a real config for the new pool */
5147 if (newspa->spa_root_vdev != NULL) {
5148 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
5149 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5150 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
5151 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
5152 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
5153 B_TRUE));
5154 }
5155
5156 /* set the props */
5157 if (props != NULL) {
5158 spa_configfile_set(newspa, props, B_FALSE);
5159 error = spa_prop_set(newspa, props);
5160 if (error)
5161 goto out;
5162 }
5163
5164 /* flush everything */
5165 txg = spa_vdev_config_enter(newspa);
5166 vdev_config_dirty(newspa->spa_root_vdev);
5167 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
5168
5169 if (zio_injection_enabled)
5170 zio_handle_panic_injection(spa, FTAG, 2);
5171
5172 spa_async_resume(newspa);
5173
5174 /* finally, update the original pool's config */
5175 txg = spa_vdev_config_enter(spa);
5176 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
5177 error = dmu_tx_assign(tx, TXG_WAIT);
5178 if (error != 0)
5179 dmu_tx_abort(tx);
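/*
 * Remove each split leaf vdev from the original pool's tree and
 * record the detach in the pool history.
 */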
5180 for (c = 0; c < children; c++) {
5181 if (vml[c] != NULL) {
5182 vdev_split(vml[c]);
5183 if (error == 0)
5184 spa_history_log_internal(spa, "detach", tx,
5185 "vdev=%s", vml[c]->vdev_path);
5186 vdev_free(vml[c]);
5187 }
5188 }
5189 vdev_config_dirty(spa->spa_root_vdev);
5190 spa->spa_config_splitting = NULL;
5191 nvlist_free(nvl);
5192 if (error == 0)
5193 dmu_tx_commit(tx);
5194 (void) spa_vdev_exit(spa, NULL, txg, 0);
5195
5196 if (zio_injection_enabled)
5197 zio_handle_panic_injection(spa, FTAG, 3);
5198
5199 /* split is complete; log a history record */
5200 spa_history_log_internal(newspa, "split", NULL,
5201 "from pool %s", spa_name(spa));
5202
5203 kmem_free(vml, children * sizeof (vdev_t *));
5204
5205 /* if we're not going to mount the filesystems in userland, export */
5206 if (exp)
5207 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
5208 B_FALSE, B_FALSE);
5209
5210 return (error);
5211
5212out:
5213 spa_unload(newspa);
5214 spa_deactivate(newspa);
5215 spa_remove(newspa);
5216
5217 txg = spa_vdev_config_enter(spa);
5218
5219 /* re-online all offlined disks */
5220 for (c = 0; c < children; c++) {
5221 if (vml[c] != NULL)
5222 vml[c]->vdev_offline = B_FALSE;
5223 }
5224 vdev_reopen(spa->spa_root_vdev);
5225
5226 nvlist_free(spa->spa_config_splitting);
5227 spa->spa_config_splitting = NULL;
5228 (void) spa_vdev_exit(spa, NULL, txg, error);
5229
5230 kmem_free(vml, children * sizeof (vdev_t *));
5231 return (error);
5232}
5233
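/*
 * Search an array of device nvlists for the one with the given vdev
 * guid; return NULL if no entry matches.
 */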
5234static nvlist_t *
5235spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
5236{
5237 for (int i = 0; i < count; i++) {
5238 uint64_t guid;
5239
5240 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
5241 &guid) == 0);
5242
5243 if (guid == target_guid)
5244 return (nvpp[i]);
5245 }
5246
5247 return (NULL);
5248}
5249
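/*
 * Remove the given device from the nvlist array stored under 'name'
 * in the pool config; used for the spare and l2cache arrays.
 */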
5250static void
5251spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
5252 nvlist_t *dev_to_remove)
5253{
5254 nvlist_t **newdev = NULL;
5255
5256 if (count > 1)
5257 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
5258
5259 for (int i = 0, j = 0; i < count; i++) {
5260 if (dev[i] == dev_to_remove)
5261 continue;
5262 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
5263 }
5264
5265 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
5266 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
5267
5268 for (int i = 0; i < count - 1; i++)
5269 nvlist_free(newdev[i]);
5270
5271 if (count > 1)
5272 kmem_free(newdev, (count - 1) * sizeof (void *));
5273}
5274
5275/*
5276 * Evacuate the device.
5277 */
5278static int
5279spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
5280{
5281 uint64_t txg;
5282 int error = 0;
5283
5284 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5285 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5286 ASSERT(vd == vd->vdev_top);
5287
5288 /*
5289 * Evacuate the device. We don't hold the config lock as writer
5290 * since we need to do I/O but we do keep the
5291 * spa_namespace_lock held. Once this completes the device
5292 * should no longer have any blocks allocated on it.
5293 */
5294 if (vd->vdev_islog) {
5295 if (vd->vdev_stat.vs_alloc != 0)
5296 error = spa_offline_log(spa);
5297 } else {
5298 error = SET_ERROR(ENOTSUP);
5299 }
5300
5301 if (error)
5302 return (error);
5303
5304 /*
5305 * The evacuation succeeded. Remove any remaining MOS metadata
5306 * associated with this vdev, and wait for these changes to sync.
5307 */
5308 ASSERT0(vd->vdev_stat.vs_alloc);
5309 txg = spa_vdev_config_enter(spa);
5310 vd->vdev_removing = B_TRUE;
5311 vdev_dirty(vd, 0, NULL, txg);
5312 vdev_config_dirty(vd);
5313 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
5314
5315 return (0);
5316}
5317
5318/*
5319 * Complete the removal by cleaning up the namespace.
5320 */
5321static void
5322spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
5323{
5324 vdev_t *rvd = spa->spa_root_vdev;
5325 uint64_t id = vd->vdev_id;
5326 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
5327
5328 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5329 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5330 ASSERT(vd == vd->vdev_top);
5331
5332 /*
5333 * Only remove the device if it is empty (no allocated space).
5334 */
5335 if (vd->vdev_stat.vs_alloc != 0)
5336 return;
5337
5338 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5339
5340 if (list_link_active(&vd->vdev_state_dirty_node))
5341 vdev_state_clean(vd);
5342 if (list_link_active(&vd->vdev_config_dirty_node))
5343 vdev_config_clean(vd);
5344
5345 vdev_free(vd);
5346
5347 if (last_vdev) {
5348 vdev_compact_children(rvd);
5349 } else {
5350 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
5351 vdev_add_child(rvd, vd);
5352 }
5353 vdev_config_dirty(rvd);
5354
5355 /*
5356 * Reassess the health of our root vdev.
5357 */
5358 vdev_reopen(rvd);
5359}
5360
5361/*
5362 * Remove a device from the pool -
5363 *
5364 * Removing a device from the vdev namespace requires several steps
5365 * and can take a significant amount of time. As a result we use
5366 * the spa_vdev_config_[enter/exit] functions which allow us to
5367 * grab and release the spa_config_lock while still holding the namespace
5368 * lock. During each step the configuration is synced out.
5369 *
5370 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
5371 * devices.
5372 */
5373int
5374spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
5375{
5376 vdev_t *vd;
5377 metaslab_group_t *mg;
5378 nvlist_t **spares, **l2cache, *nv;
5379 uint64_t txg = 0;
5380 uint_t nspares, nl2cache;
5381 int error = 0;
5382 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
5383
5384 ASSERT(spa_writeable(spa));
5385
5386 if (!locked)
5387 txg = spa_vdev_enter(spa);
5388
5389 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5390
5391 if (spa->spa_spares.sav_vdevs != NULL &&
5392 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5393 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
5394 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
5395 /*
5396 * Only remove the hot spare if it's not currently in use
5397 * in this pool.
5398 */
5399 if (vd == NULL || unspare) {
5400 spa_vdev_remove_aux(spa->spa_spares.sav_config,
5401 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
5402 spa_load_spares(spa);
5403 spa->spa_spares.sav_sync = B_TRUE;
5404 } else {
5405 error = SET_ERROR(EBUSY);
5406 }
5407 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
5408 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
5409 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
5410 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
5411 /*
5412 * Cache devices can always be removed.
5413 */
5414 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
5415 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
5416 spa_load_l2cache(spa);
5417 spa->spa_l2cache.sav_sync = B_TRUE;
5418 } else if (vd != NULL && vd->vdev_islog) {
5419 ASSERT(!locked);
5420 ASSERT(vd == vd->vdev_top);
5421
5422 /*
5423 * XXX - Once we have bp-rewrite this should
5424 * become the common case.
5425 */
5426
5427 mg = vd->vdev_mg;
5428
5429 /*
5430 * Stop allocating from this vdev.
5431 */
5432 metaslab_group_passivate(mg);
5433
5434 /*
5435 * Wait for the youngest allocations and frees to sync,
5436 * and then wait for the deferral of those frees to finish.
5437 */
5438 spa_vdev_config_exit(spa, NULL,
5439 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
5440
5441 /*
5442 * Attempt to evacuate the vdev.
5443 */
5444 error = spa_vdev_remove_evacuate(spa, vd);
5445
5446 txg = spa_vdev_config_enter(spa);
5447
5448 /*
5449 * If we couldn't evacuate the vdev, unwind.
5450 */
5451 if (error) {
5452 metaslab_group_activate(mg);
5453 return (spa_vdev_exit(spa, NULL, txg, error));
5454 }
5455
5456 /*
5457 * Clean up the vdev namespace.
5458 */
5459 spa_vdev_remove_from_namespace(spa, vd);
5460
5461 } else if (vd != NULL) {
5462 /*
5463 * Normal vdevs cannot be removed (yet).
5464 */
5465 error = SET_ERROR(ENOTSUP);
5466 } else {
5467 /*
5468 * There is no vdev of any kind with the specified guid.
5469 */
5470 error = SET_ERROR(ENOENT);
5471 }
5472
5473 if (!locked)
5474 return (spa_vdev_exit(spa, NULL, txg, error));
5475
5476 return (error);
5477}
5478
5479/*
5480 * Find any device that's done replacing, or a vdev marked 'unspare' that's
5481 * currently spared, so we can detach it.
5482 */
5483static vdev_t *
5484spa_vdev_resilver_done_hunt(vdev_t *vd)
5485{
5486 vdev_t *newvd, *oldvd;
5487
5488 for (int c = 0; c < vd->vdev_children; c++) {
5489 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
5490 if (oldvd != NULL)
5491 return (oldvd);
5492 }
5493
5494 /*
5495 * Check for a completed replacement. We always consider the first
5496 * vdev in the list to be the oldest vdev, and the last one to be
5497 * the newest (see spa_vdev_attach() for how that works). In
5498 * the case where the newest vdev is faulted, we will not automatically
5499 * remove it after a resilver completes. This is OK as it will require
5500 * user intervention to determine which disk the admin wishes to keep.
5501 */
5502 if (vd->vdev_ops == &vdev_replacing_ops) {
5503 ASSERT(vd->vdev_children > 1);
5504
5505 newvd = vd->vdev_child[vd->vdev_children - 1];
5506 oldvd = vd->vdev_child[0];
5507
5508 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
5509 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5510 !vdev_dtl_required(oldvd))
5511 return (oldvd);
5512 }
5513
5514 /*
5515 * Check for a completed resilver with the 'unspare' flag set.
5516 */
5517 if (vd->vdev_ops == &vdev_spare_ops) {
5518 vdev_t *first = vd->vdev_child[0];
5519 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
5520
5521 if (last->vdev_unspare) {
5522 oldvd = first;
5523 newvd = last;
5524 } else if (first->vdev_unspare) {
5525 oldvd = last;
5526 newvd = first;
5527 } else {
5528 oldvd = NULL;
5529 }
5530
5531 if (oldvd != NULL &&
5532 vdev_dtl_empty(newvd, DTL_MISSING) &&
5533 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
5534 !vdev_dtl_required(oldvd))
5535 return (oldvd);
5536
5537 /*
5538 * If there are more than two spares attached to a disk,
5539 * and those spares are not required, then we want to
5540 * attempt to free them up now so that they can be used
5541 * by other pools. Once we're back down to a single
5542 * disk+spare, we stop removing them.
5543 */
5544 if (vd->vdev_children > 2) {
5545 newvd = vd->vdev_child[1];
5546
5547 if (newvd->vdev_isspare && last->vdev_isspare &&
5548 vdev_dtl_empty(last, DTL_MISSING) &&
5549 vdev_dtl_empty(last, DTL_OUTAGE) &&
5550 !vdev_dtl_required(newvd))
5551 return (newvd);
5552 }
5553 }
5554
5555 return (NULL);
5556}
5557
5558static void
5559spa_vdev_resilver_done(spa_t *spa)
5560{
5561 vdev_t *vd, *pvd, *ppvd;
5562 uint64_t guid, sguid, pguid, ppguid;
5563
5564 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5565
5566 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
5567 pvd = vd->vdev_parent;
5568 ppvd = pvd->vdev_parent;
5569 guid = vd->vdev_guid;
5570 pguid = pvd->vdev_guid;
5571 ppguid = ppvd->vdev_guid;
5572 sguid = 0;
5573 /*
5574 * If we have just finished replacing a hot spared device, then
5575 * we need to detach the parent's first child (the original hot
5576 * spare) as well.
5577 */
5578 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
5579 ppvd->vdev_children == 2) {
5580 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
5581 sguid = ppvd->vdev_child[1]->vdev_guid;
5582 }
5583 spa_config_exit(spa, SCL_ALL, FTAG);
5584 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
5585 return;
5586 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
5587 return;
5588 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5589 }
5590
5591 spa_config_exit(spa, SCL_ALL, FTAG);
5592}
5593
5594/*
5595 * Update the stored path or FRU for this vdev.
5596 */
5597int
5598spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
5599 boolean_t ispath)
5600{
5601 vdev_t *vd;
5602 boolean_t sync = B_FALSE;
5603
5604 ASSERT(spa_writeable(spa));
5605
5606 spa_vdev_state_enter(spa, SCL_ALL);
5607
5608 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
5609 return (spa_vdev_state_exit(spa, NULL, ENOENT));
5610
5611 if (!vd->vdev_ops->vdev_op_leaf)
5612 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
5613
5614 if (ispath) {
5615 if (strcmp(value, vd->vdev_path) != 0) {
5616 spa_strfree(vd->vdev_path);
5617 vd->vdev_path = spa_strdup(value);
5618 sync = B_TRUE;
5619 }
5620 } else {
5621 if (vd->vdev_fru == NULL) {
5622 vd->vdev_fru = spa_strdup(value);
5623 sync = B_TRUE;
5624 } else if (strcmp(value, vd->vdev_fru) != 0) {
5625 spa_strfree(vd->vdev_fru);
5626 vd->vdev_fru = spa_strdup(value);
5627 sync = B_TRUE;
5628 }
5629 }
5630
5631 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
5632}
5633
5634int
5635spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
5636{
5637 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5638}
5639
5640int
5641spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5642{
5643 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5644}
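
/*
 * Illustrative sketch only (not part of the original file): a caller that
 * wants to record a new device path is expected to pass the leaf vdev's
 * guid to the wrapper above, e.g.
 *
 *	error = spa_vdev_setpath(spa, vd->vdev_guid, newpath);
 *
 * where 'vd' and 'newpath' are placeholders supplied by that caller.
 */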
5645
5646/*
5647 * ==========================================================================
5648 * SPA Scanning
5649 * ==========================================================================
5650 */
5651
5652int
5653spa_scan_stop(spa_t *spa)
5654{
5655 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5656 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5657 return (SET_ERROR(EBUSY));
5658 return (dsl_scan_cancel(spa->spa_dsl_pool));
5659}
5660
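/*
 * Kick off a scan of the requested type.  A resilver request is completed
 * immediately (via SPA_ASYNC_RESILVER_DONE) when no writeable leaf vdev
 * has a dirty DTL; otherwise the work is handed to the DSL scan code.
 */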
5661int
5662spa_scan(spa_t *spa, pool_scan_func_t func)
5663{
5664 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5665
5666 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5667 return (SET_ERROR(ENOTSUP));
5668
5669 /*
5670 * If a resilver was requested, but there is no DTL on a
5671 * writeable leaf device, we have nothing to do.
5672 */
5673 if (func == POOL_SCAN_RESILVER &&
5674 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5675 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5676 return (0);
5677 }
5678
5679 return (dsl_scan(spa->spa_dsl_pool, func));
5680}
5681
5682/*
5683 * ==========================================================================
5684 * SPA async task processing
5685 * ==========================================================================
5686 */
5687
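/*
 * Recursively transition any vdev flagged for removal to the REMOVED
 * state, clearing its error counters along the way (see comment below).
 */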
5688static void
5689spa_async_remove(spa_t *spa, vdev_t *vd)
5690{
5691 if (vd->vdev_remove_wanted) {
5692 vd->vdev_remove_wanted = B_FALSE;
5693 vd->vdev_delayed_close = B_FALSE;
5694 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5695
5696 /*
5697 * We want to clear the stats, but we don't want to do a full
5698 * vdev_clear() as that will cause us to throw away
5699 * degraded/faulted state as well as attempt to reopen the
5700 * device, all of which is a waste.
5701 */
5702 vd->vdev_stat.vs_read_errors = 0;
5703 vd->vdev_stat.vs_write_errors = 0;
5704 vd->vdev_stat.vs_checksum_errors = 0;
5705
5706 vdev_state_dirty(vd->vdev_top);
5707 }
5708
5709 for (int c = 0; c < vd->vdev_children; c++)
5710 spa_async_remove(spa, vd->vdev_child[c]);
5711}
5712
5713static void
5714spa_async_probe(spa_t *spa, vdev_t *vd)
5715{
5716 if (vd->vdev_probe_wanted) {
5717 vd->vdev_probe_wanted = B_FALSE;
5718 vdev_reopen(vd); /* vdev_open() does the actual probe */
5719 }
5720
5721 for (int c = 0; c < vd->vdev_children; c++)
5722 spa_async_probe(spa, vd->vdev_child[c]);
5723}
5724
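/*
 * Recursively post an ESC_ZFS_VDEV_AUTOEXPAND sysevent for every leaf
 * vdev with a known physical path, provided the pool's autoexpand
 * property is set.
 */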
5725static void
5726spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5727{
5728 sysevent_id_t eid;
5729 nvlist_t *attr;
5730 char *physpath;
5731
5732 if (!spa->spa_autoexpand)
5733 return;
5734
5735 for (int c = 0; c < vd->vdev_children; c++) {
5736 vdev_t *cvd = vd->vdev_child[c];
5737 spa_async_autoexpand(spa, cvd);
5738 }
5739
5740 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5741 return;
5742
5743 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5744 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5745
5746 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5747 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5748
5749 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5750 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
5751
5752 nvlist_free(attr);
5753 kmem_free(physpath, MAXPATHLEN);
5754}
5755
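/*
 * Worker thread that services the task bits set by spa_async_request():
 * it snapshots and clears spa_async_tasks, handles each outstanding
 * request, and signals spa_async_cv before exiting.
 */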
5756static void
5757spa_async_thread(void *arg)
5758{
5759 spa_t *spa = arg;
5760 int tasks;
5761
5762 ASSERT(spa->spa_sync_on);
5763
5764 mutex_enter(&spa->spa_async_lock);
5765 tasks = spa->spa_async_tasks;
5766 spa->spa_async_tasks = 0;
5767 mutex_exit(&spa->spa_async_lock);
5768
5769 /*
5770 * See if the config needs to be updated.
5771 */
5772 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5773 uint64_t old_space, new_space;
5774
5775 mutex_enter(&spa_namespace_lock);
5776 old_space = metaslab_class_get_space(spa_normal_class(spa));
5777 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5778 new_space = metaslab_class_get_space(spa_normal_class(spa));
5779 mutex_exit(&spa_namespace_lock);
5780
5781 /*
5782 * If the pool grew as a result of the config update,
5783 * then log an internal history event.
5784 */
5785 if (new_space != old_space) {
5786 spa_history_log_internal(spa, "vdev online", NULL,
5787 "pool '%s' size: %llu(+%llu)",
5788 spa_name(spa), new_space, new_space - old_space);
5789 }
5790 }
5791
5792 /*
5793 * See if any devices need to be marked REMOVED.
5794 */
5795 if (tasks & SPA_ASYNC_REMOVE) {
5796 spa_vdev_state_enter(spa, SCL_NONE);
5797 spa_async_remove(spa, spa->spa_root_vdev);
5798 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5799 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5800 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5801 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5802 (void) spa_vdev_state_exit(spa, NULL, 0);
5803 }
5804
5805 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5806 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5807 spa_async_autoexpand(spa, spa->spa_root_vdev);
5808 spa_config_exit(spa, SCL_CONFIG, FTAG);
5809 }
5810
5811 /*
5812 * See if any devices need to be probed.
5813 */
5814 if (tasks & SPA_ASYNC_PROBE) {
5815 spa_vdev_state_enter(spa, SCL_NONE);
5816 spa_async_probe(spa, spa->spa_root_vdev);
5817 (void) spa_vdev_state_exit(spa, NULL, 0);
5818 }
5819
5820 /*
5821 * If any devices are done replacing, detach them.
5822 */
5823 if (tasks & SPA_ASYNC_RESILVER_DONE)
5824 spa_vdev_resilver_done(spa);
5825
5826 /*
5827 * Kick off a resilver.
5828 */
5829 if (tasks & SPA_ASYNC_RESILVER)
5830 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5831
5832 /*
5833 * Let the world know that we're done.
5834 */
5835 mutex_enter(&spa->spa_async_lock);
5836 spa->spa_async_thread = NULL;
5837 cv_broadcast(&spa->spa_async_cv);
5838 mutex_exit(&spa->spa_async_lock);
5839 thread_exit();
5840}
5841
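/*
 * Prevent new async work from being dispatched and wait for any running
 * async thread to exit.  Suspensions nest; spa_async_resume() undoes one
 * level.
 */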
5842void
5843spa_async_suspend(spa_t *spa)
5844{
5845 mutex_enter(&spa->spa_async_lock);
5846 spa->spa_async_suspended++;
5847 while (spa->spa_async_thread != NULL)
5848 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5849 mutex_exit(&spa->spa_async_lock);
5850}
5851
5852void
5853spa_async_resume(spa_t *spa)
5854{
5855 mutex_enter(&spa->spa_async_lock);
5856 ASSERT(spa->spa_async_suspended != 0);
5857 spa->spa_async_suspended--;
5858 mutex_exit(&spa->spa_async_lock);
5859}
5860
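/*
 * Returns B_TRUE if spa_async_thread() has work to do.  A pending config
 * cache update is not counted while the most recent cache-file write
 * failure is younger than zfs_ccw_retry_interval, so that we don't retry
 * the write in a tight loop.
 */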
5861static boolean_t
5862spa_async_tasks_pending(spa_t *spa)
5863{
5864 uint_t non_config_tasks;
5865 uint_t config_task;
5866 boolean_t config_task_suspended;
5867
5868 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
5869 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
5870 if (spa->spa_ccw_fail_time == 0) {
5871 config_task_suspended = B_FALSE;
5872 } else {
5873 config_task_suspended =
5874 (gethrtime() - spa->spa_ccw_fail_time) <
5875 (zfs_ccw_retry_interval * NANOSEC);
5876 }
5877
5878 return (non_config_tasks || (config_task && !config_task_suspended));
5879}
5880
5855static void
5856spa_async_dispatch(spa_t *spa)
5857{
5858	mutex_enter(&spa->spa_async_lock);
5859	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5860	    spa->spa_async_thread == NULL &&
5861	    rootdir != NULL && !vn_is_readonly(rootdir))
5881static void
5882spa_async_dispatch(spa_t *spa)
5883{
5884	mutex_enter(&spa->spa_async_lock);
5885	if (spa_async_tasks_pending(spa) &&
5886	    !spa->spa_async_suspended &&
5887	    spa->spa_async_thread == NULL &&
5888	    rootdir != NULL)
5862 spa->spa_async_thread = thread_create(NULL, 0,
5863 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5864 mutex_exit(&spa->spa_async_lock);
5865}
5866
5867void
5868spa_async_request(spa_t *spa, int task)
5869{
5870 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5871 mutex_enter(&spa->spa_async_lock);
5872 spa->spa_async_tasks |= task;
5873 mutex_exit(&spa->spa_async_lock);
5874}
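
/*
 * Illustrative sketch only (not part of the original file): other parts
 * of the SPA request background work by OR-ing in a task bit and letting
 * the next spa_async_dispatch() (called at the end of spa_sync()) pick it
 * up, e.g.
 *
 *	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 */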
5875
5876/*
5877 * ==========================================================================
5878 * SPA syncing routines
5879 * ==========================================================================
5880 */
5881
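/*
 * Block-pointer iteration callbacks used by spa_sync(): bpobj_enqueue_cb()
 * defers a freed block into the given bpobj, while spa_free_sync_cb()
 * frees it immediately as part of the caller's zio tree.
 */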
5882static int
5883bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5884{
5885 bpobj_t *bpo = arg;
5886 bpobj_enqueue(bpo, bp, tx);
5887 return (0);
5888}
5889
5890static int
5891spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5892{
5893 zio_t *zio = arg;
5894
5895 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5896 BP_GET_PSIZE(bp), zio->io_flags));
5897 return (0);
5898}
5899
5900static void
5901spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5902{
5903 char *packed = NULL;
5904 size_t bufsize;
5905 size_t nvsize = 0;
5906 dmu_buf_t *db;
5907
5908 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5909
5910 /*
5911 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5912 * information. This avoids the dbuf_will_dirty() path and
5913 * saves us a pre-read to get data we don't actually care about.
5914 */
5915 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5916 packed = kmem_alloc(bufsize, KM_SLEEP);
5917
5918 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5919 KM_SLEEP) == 0);
5920 bzero(packed + nvsize, bufsize - nvsize);
5921
5922 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5923
5924 kmem_free(packed, bufsize);
5925
5926 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5927 dmu_buf_will_dirty(db, tx);
5928 *(uint64_t *)db->db_data = nvsize;
5929 dmu_buf_rele(db, FTAG);
5930}
5931
5932static void
5933spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5934 const char *config, const char *entry)
5935{
5936 nvlist_t *nvroot;
5937 nvlist_t **list;
5938 int i;
5939
5940 if (!sav->sav_sync)
5941 return;
5942
5943 /*
5944 * Update the MOS nvlist describing the list of available devices.
5945 * spa_validate_aux() will have already made sure this nvlist is
5946 * valid and the vdevs are labeled appropriately.
5947 */
5948 if (sav->sav_object == 0) {
5949 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5950 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5951 sizeof (uint64_t), tx);
5952 VERIFY(zap_update(spa->spa_meta_objset,
5953 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5954 &sav->sav_object, tx) == 0);
5955 }
5956
5957 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5958 if (sav->sav_count == 0) {
5959 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5960 } else {
5961 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5962 for (i = 0; i < sav->sav_count; i++)
5963 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5964 B_FALSE, VDEV_CONFIG_L2CACHE);
5965 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5966 sav->sav_count) == 0);
5967 for (i = 0; i < sav->sav_count; i++)
5968 nvlist_free(list[i]);
5969 kmem_free(list, sav->sav_count * sizeof (void *));
5970 }
5971
5972 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5973 nvlist_free(nvroot);
5974
5975 sav->sav_sync = B_FALSE;
5976}
5977
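/*
 * If the vdev configuration is dirty, regenerate it (stamping the new
 * on-disk version during an upgrade), remember it in spa_config_syncing
 * until the txg commits, and write it to the MOS config object.
 */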
5978static void
5979spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5980{
5981 nvlist_t *config;
5982
5983 if (list_is_empty(&spa->spa_config_dirty_list))
5984 return;
5985
5986 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5987
5988 config = spa_config_generate(spa, spa->spa_root_vdev,
5989 dmu_tx_get_txg(tx), B_FALSE);
5990
5991 /*
5992 * If we're upgrading the spa version then make sure that
5993 * the config object gets updated with the correct version.
5994 */
5995 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
5996 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
5997 spa->spa_uberblock.ub_version);
5998
5999 spa_config_exit(spa, SCL_STATE, FTAG);
6000
6001 if (spa->spa_config_syncing)
6002 nvlist_free(spa->spa_config_syncing);
6003 spa->spa_config_syncing = config;
6004
6005 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
6006}
6007
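/*
 * Sync-task callback that commits a new on-disk SPA version.  The version
 * is pushed out ahead of the other property updates (see ZPOOL_PROP_VERSION
 * in spa_sync_props() below).
 */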
6008static void
6009spa_sync_version(void *arg, dmu_tx_t *tx)
6010{
6011 uint64_t *versionp = arg;
6012 uint64_t version = *versionp;
6013 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6014
6015 /*
6016 * Setting the version is special cased when first creating the pool.
6017 */
6018 ASSERT(tx->tx_txg != TXG_INITIAL);
6019
6020 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6021 ASSERT(version >= spa_version(spa));
6022
6023 spa->spa_uberblock.ub_version = version;
6024 vdev_config_dirty(spa->spa_root_vdev);
6025 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
6026}
6027
6028/*
6029 * Set zpool properties.
6030 */
6031static void
6032spa_sync_props(void *arg, dmu_tx_t *tx)
6033{
6034 nvlist_t *nvp = arg;
6035 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
6036 objset_t *mos = spa->spa_meta_objset;
6037 nvpair_t *elem = NULL;
6038
6039 mutex_enter(&spa->spa_props_lock);
6040
6041 while ((elem = nvlist_next_nvpair(nvp, elem))) {
6042 uint64_t intval;
6043 char *strval, *fname;
6044 zpool_prop_t prop;
6045 const char *propname;
6046 zprop_type_t proptype;
6047 zfeature_info_t *feature;
6048
6049 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
6050 case ZPROP_INVAL:
6051 /*
6052 * We checked this earlier in spa_prop_validate().
6053 */
6054 ASSERT(zpool_prop_feature(nvpair_name(elem)));
6055
6056 fname = strchr(nvpair_name(elem), '@') + 1;
6057 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
6058
6059 spa_feature_enable(spa, feature, tx);
6060 spa_history_log_internal(spa, "set", tx,
6061 "%s=enabled", nvpair_name(elem));
6062 break;
6063
6064 case ZPOOL_PROP_VERSION:
6065 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
6066 /*
6067			 * The version is synced separately before other
6068 * properties and should be correct by now.
6069 */
6070 ASSERT3U(spa_version(spa), >=, intval);
6071 break;
6072
6073 case ZPOOL_PROP_ALTROOT:
6074 /*
6075 * 'altroot' is a non-persistent property. It should
6076 * have been set temporarily at creation or import time.
6077 */
6078 ASSERT(spa->spa_root != NULL);
6079 break;
6080
6081 case ZPOOL_PROP_READONLY:
6082 case ZPOOL_PROP_CACHEFILE:
6083 /*
6084			 * 'readonly' and 'cachefile' are also non-persistent
6085 * properties.
6086 */
6087 break;
6088 case ZPOOL_PROP_COMMENT:
6089 VERIFY(nvpair_value_string(elem, &strval) == 0);
6090 if (spa->spa_comment != NULL)
6091 spa_strfree(spa->spa_comment);
6092 spa->spa_comment = spa_strdup(strval);
6093 /*
6094 * We need to dirty the configuration on all the vdevs
6095 * so that their labels get updated. It's unnecessary
6096 * to do this for pool creation since the vdev's
6097			 * configuration has already been dirtied.
6098 */
6099 if (tx->tx_txg != TXG_INITIAL)
6100 vdev_config_dirty(spa->spa_root_vdev);
6101 spa_history_log_internal(spa, "set", tx,
6102 "%s=%s", nvpair_name(elem), strval);
6103 break;
6104 default:
6105 /*
6106 * Set pool property values in the poolprops mos object.
6107 */
6108 if (spa->spa_pool_props_object == 0) {
6109 spa->spa_pool_props_object =
6110 zap_create_link(mos, DMU_OT_POOL_PROPS,
6111 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6112 tx);
6113 }
6114
6115 /* normalize the property name */
6116 propname = zpool_prop_to_name(prop);
6117 proptype = zpool_prop_get_type(prop);
6118
6119 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6120 ASSERT(proptype == PROP_TYPE_STRING);
6121 VERIFY(nvpair_value_string(elem, &strval) == 0);
6122 VERIFY(zap_update(mos,
6123 spa->spa_pool_props_object, propname,
6124 1, strlen(strval) + 1, strval, tx) == 0);
6125 spa_history_log_internal(spa, "set", tx,
6126 "%s=%s", nvpair_name(elem), strval);
6127 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6128 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
6129
6130 if (proptype == PROP_TYPE_INDEX) {
6131 const char *unused;
6132 VERIFY(zpool_prop_index_to_string(
6133 prop, intval, &unused) == 0);
6134 }
6135 VERIFY(zap_update(mos,
6136 spa->spa_pool_props_object, propname,
6137 8, 1, &intval, tx) == 0);
6138 spa_history_log_internal(spa, "set", tx,
6139 "%s=%lld", nvpair_name(elem), intval);
6140 } else {
6141 ASSERT(0); /* not allowed */
6142 }
6143
6144 switch (prop) {
6145 case ZPOOL_PROP_DELEGATION:
6146 spa->spa_delegation = intval;
6147 break;
6148 case ZPOOL_PROP_BOOTFS:
6149 spa->spa_bootfs = intval;
6150 break;
6151 case ZPOOL_PROP_FAILUREMODE:
6152 spa->spa_failmode = intval;
6153 break;
6154 case ZPOOL_PROP_AUTOEXPAND:
6155 spa->spa_autoexpand = intval;
6156 if (tx->tx_txg != TXG_INITIAL)
6157 spa_async_request(spa,
6158 SPA_ASYNC_AUTOEXPAND);
6159 break;
6160 case ZPOOL_PROP_DEDUPDITTO:
6161 spa->spa_dedup_ditto = intval;
6162 break;
6163 default:
6164 break;
6165 }
6166 }
6167
6168 }
6169
6170 mutex_exit(&spa->spa_props_lock);
6171}
6172
6173/*
6174 * Perform one-time upgrade on-disk changes. spa_version() does not
6175 * reflect the new version this txg, so there must be no changes this
6176 * txg to anything that the upgrade code depends on after it executes.
6177 * Therefore this must be called after dsl_pool_sync() does the sync
6178 * tasks.
6179 */
6180static void
6181spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6182{
6183 dsl_pool_t *dp = spa->spa_dsl_pool;
6184
6185 ASSERT(spa->spa_sync_pass == 1);
6186
6187 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6188
6189 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6190 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6191 dsl_pool_create_origin(dp, tx);
6192
6193 /* Keeping the origin open increases spa_minref */
6194 spa->spa_minref += 3;
6195 }
6196
6197 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6198 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6199 dsl_pool_upgrade_clones(dp, tx);
6200 }
6201
6202 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6203 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6204 dsl_pool_upgrade_dir_clones(dp, tx);
6205
6206 /* Keeping the freedir open increases spa_minref */
6207 spa->spa_minref += 3;
6208 }
6209
6210 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6211 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6212 spa_feature_create_zap_objects(spa, tx);
6213 }
6214 rrw_exit(&dp->dp_config_rwlock, FTAG);
6215}
6216
6217/*
6218 * Sync the specified transaction group. New blocks may be dirtied as
6219 * part of the process, so we iterate until it converges.
6220 */
6221void
6222spa_sync(spa_t *spa, uint64_t txg)
6223{
6224 dsl_pool_t *dp = spa->spa_dsl_pool;
6225 objset_t *mos = spa->spa_meta_objset;
6226 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
6227 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6228 vdev_t *rvd = spa->spa_root_vdev;
6229 vdev_t *vd;
6230 dmu_tx_t *tx;
6231 int error;
6232
6233 VERIFY(spa_writeable(spa));
6234
6235 /*
6236 * Lock out configuration changes.
6237 */
6238 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6239
6240 spa->spa_syncing_txg = txg;
6241 spa->spa_sync_pass = 0;
6242
6243 /*
6244 * If there are any pending vdev state changes, convert them
6245 * into config changes that go out with this transaction group.
6246 */
6247 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6248 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6249 /*
6250 * We need the write lock here because, for aux vdevs,
6251 * calling vdev_config_dirty() modifies sav_config.
6252 * This is ugly and will become unnecessary when we
6253 * eliminate the aux vdev wart by integrating all vdevs
6254 * into the root vdev tree.
6255 */
6256 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6257 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6258 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6259 vdev_state_clean(vd);
6260 vdev_config_dirty(vd);
6261 }
6262 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6263 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6264 }
6265 spa_config_exit(spa, SCL_STATE, FTAG);
6266
6267 tx = dmu_tx_create_assigned(dp, txg);
6268
6269 spa->spa_sync_starttime = gethrtime();
6270#ifdef illumos
6271 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6272 spa->spa_sync_starttime + spa->spa_deadman_synctime));
6273#else /* FreeBSD */
6274#ifdef _KERNEL
6275 callout_reset(&spa->spa_deadman_cycid,
6276 hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
6277#endif
6278#endif
6279
6280 /*
6281 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6282 * set spa_deflate if we have no raid-z vdevs.
6283 */
6284 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6285 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6286 int i;
6287
6288 for (i = 0; i < rvd->vdev_children; i++) {
6289 vd = rvd->vdev_child[i];
6290 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6291 break;
6292 }
6293 if (i == rvd->vdev_children) {
6294 spa->spa_deflate = TRUE;
6295 VERIFY(0 == zap_add(spa->spa_meta_objset,
6296 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6297 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6298 }
6299 }
6300
6301 /*
6302 * If anything has changed in this txg, or if someone is waiting
6303 * for this txg to sync (eg, spa_vdev_remove()), push the
6304 * deferred frees from the previous txg. If not, leave them
6305 * alone so that we don't generate work on an otherwise idle
6306 * system.
6307 */
6308 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6309 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6310 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6311 ((dsl_scan_active(dp->dp_scan) ||
6312 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6313 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6314 VERIFY3U(bpobj_iterate(defer_bpo,
6315 spa_free_sync_cb, zio, tx), ==, 0);
6316 VERIFY0(zio_wait(zio));
6317 }
6318
6319 /*
6320 * Iterate to convergence.
6321 */
6322 do {
6323 int pass = ++spa->spa_sync_pass;
6324
6325 spa_sync_config_object(spa, tx);
6326 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6327 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6328 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6329 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6330 spa_errlog_sync(spa, txg);
6331 dsl_pool_sync(dp, txg);
6332
6333 if (pass < zfs_sync_pass_deferred_free) {
6334 zio_t *zio = zio_root(spa, NULL, NULL, 0);
6335 bplist_iterate(free_bpl, spa_free_sync_cb,
6336 zio, tx);
6337 VERIFY(zio_wait(zio) == 0);
6338 } else {
6339 bplist_iterate(free_bpl, bpobj_enqueue_cb,
6340 defer_bpo, tx);
6341 }
6342
6343 ddt_sync(spa, txg);
6344 dsl_scan_sync(dp, tx);
6345
6346 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6347 vdev_sync(vd, txg);
6348
6349 if (pass == 1)
6350 spa_sync_upgrades(spa, tx);
6351
6352 } while (dmu_objset_is_dirty(mos, txg));
6353
6354 /*
6355 * Rewrite the vdev configuration (which includes the uberblock)
6356 * to commit the transaction group.
6357 *
6358 * If there are no dirty vdevs, we sync the uberblock to a few
6359 * random top-level vdevs that are known to be visible in the
6360 * config cache (see spa_vdev_add() for a complete description).
6361 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6362 */
6363 for (;;) {
6364 /*
6365 * We hold SCL_STATE to prevent vdev open/close/etc.
6366 * while we're attempting to write the vdev labels.
6367 */
6368 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6369
6370 if (list_is_empty(&spa->spa_config_dirty_list)) {
6371 vdev_t *svd[SPA_DVAS_PER_BP];
6372 int svdcount = 0;
6373 int children = rvd->vdev_children;
6374 int c0 = spa_get_random(children);
6375
6376 for (int c = 0; c < children; c++) {
6377 vd = rvd->vdev_child[(c0 + c) % children];
6378 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
6379 continue;
6380 svd[svdcount++] = vd;
6381 if (svdcount == SPA_DVAS_PER_BP)
6382 break;
6383 }
6384 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
6385 if (error != 0)
6386 error = vdev_config_sync(svd, svdcount, txg,
6387 B_TRUE);
6388 } else {
6389 error = vdev_config_sync(rvd->vdev_child,
6390 rvd->vdev_children, txg, B_FALSE);
6391 if (error != 0)
6392 error = vdev_config_sync(rvd->vdev_child,
6393 rvd->vdev_children, txg, B_TRUE);
6394 }
6395
6396 if (error == 0)
6397 spa->spa_last_synced_guid = rvd->vdev_guid;
6398
6399 spa_config_exit(spa, SCL_STATE, FTAG);
6400
6401 if (error == 0)
6402 break;
6403 zio_suspend(spa, NULL);
6404 zio_resume_wait(spa);
6405 }
6406 dmu_tx_commit(tx);
6407
6408#ifdef illumos
6409 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
6410#else /* FreeBSD */
6411#ifdef _KERNEL
6412 callout_drain(&spa->spa_deadman_cycid);
6413#endif
6414#endif
6415
6416 /*
6417 * Clear the dirty config list.
6418 */
6419 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
6420 vdev_config_clean(vd);
6421
6422 /*
6423 * Now that the new config has synced transactionally,
6424 * let it become visible to the config cache.
6425 */
6426 if (spa->spa_config_syncing != NULL) {
6427 spa_config_set(spa, spa->spa_config_syncing);
6428 spa->spa_config_txg = txg;
6429 spa->spa_config_syncing = NULL;
6430 }
6431
6432 spa->spa_ubsync = spa->spa_uberblock;
6433
6434 dsl_pool_sync_done(dp, txg);
6435
6436 /*
6437 * Update usable space statistics.
6438 */
6439 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
6440 vdev_sync_done(vd, txg);
6441
6442 spa_update_dspace(spa);
6443
6444 /*
6445 * It had better be the case that we didn't dirty anything
6446 * since vdev_config_sync().
6447 */
6448 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
6449 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
6450 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
6451
6452 spa->spa_sync_pass = 0;
6453
6454 spa_config_exit(spa, SCL_CONFIG, FTAG);
6455
6456 spa_handle_ignored_writes(spa);
6457
6458 /*
6459 * If any async tasks have been requested, kick them off.
6460 */
6461 spa_async_dispatch(spa);
6462}
6463
6464/*
6465 * Sync all pools. We don't want to hold the namespace lock across these
6466 * operations, so we take a reference on the spa_t and drop the lock during the
6467 * sync.
6468 */
6469void
6470spa_sync_allpools(void)
6471{
6472 spa_t *spa = NULL;
6473 mutex_enter(&spa_namespace_lock);
6474 while ((spa = spa_next(spa)) != NULL) {
6475 if (spa_state(spa) != POOL_STATE_ACTIVE ||
6476 !spa_writeable(spa) || spa_suspended(spa))
6477 continue;
6478 spa_open_ref(spa, FTAG);
6479 mutex_exit(&spa_namespace_lock);
6480 txg_wait_synced(spa_get_dsl(spa), 0);
6481 mutex_enter(&spa_namespace_lock);
6482 spa_close(spa, FTAG);
6483 }
6484 mutex_exit(&spa_namespace_lock);
6485}
6486
6487/*
6488 * ==========================================================================
6489 * Miscellaneous routines
6490 * ==========================================================================
6491 */
6492
6493/*
6494 * Remove all pools in the system.
6495 */
6496void
6497spa_evict_all(void)
6498{
6499 spa_t *spa;
6500
6501 /*
6502 * Remove all cached state. All pools should be closed now,
6503 * so every spa in the AVL tree should be unreferenced.
6504 */
6505 mutex_enter(&spa_namespace_lock);
6506 while ((spa = spa_next(NULL)) != NULL) {
6507 /*
6508 * Stop async tasks. The async thread may need to detach
6509 * a device that's been replaced, which requires grabbing
6510 * spa_namespace_lock, so we must drop it here.
6511 */
6512 spa_open_ref(spa, FTAG);
6513 mutex_exit(&spa_namespace_lock);
6514 spa_async_suspend(spa);
6515 mutex_enter(&spa_namespace_lock);
6516 spa_close(spa, FTAG);
6517
6518 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
6519 spa_unload(spa);
6520 spa_deactivate(spa);
6521 }
6522 spa_remove(spa);
6523 }
6524 mutex_exit(&spa_namespace_lock);
6525}
6526
6527vdev_t *
6528spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
6529{
6530 vdev_t *vd;
6531 int i;
6532
6533 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
6534 return (vd);
6535
6536 if (aux) {
6537 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
6538 vd = spa->spa_l2cache.sav_vdevs[i];
6539 if (vd->vdev_guid == guid)
6540 return (vd);
6541 }
6542
6543 for (i = 0; i < spa->spa_spares.sav_count; i++) {
6544 vd = spa->spa_spares.sav_vdevs[i];
6545 if (vd->vdev_guid == guid)
6546 return (vd);
6547 }
6548 }
6549
6550 return (NULL);
6551}
6552
6553void
6554spa_upgrade(spa_t *spa, uint64_t version)
6555{
6556 ASSERT(spa_writeable(spa));
6557
6558 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6559
6560 /*
6561 * This should only be called for a non-faulted pool, and since a
6562 * future version would result in an unopenable pool, this shouldn't be
6563 * possible.
6564 */
6565 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6566 ASSERT(version >= spa->spa_uberblock.ub_version);
6567
6568 spa->spa_uberblock.ub_version = version;
6569 vdev_config_dirty(spa->spa_root_vdev);
6570
6571 spa_config_exit(spa, SCL_ALL, FTAG);
6572
6573 txg_wait_synced(spa_get_dsl(spa), 0);
6574}
6575
6576boolean_t
6577spa_has_spare(spa_t *spa, uint64_t guid)
6578{
6579 int i;
6580 uint64_t spareguid;
6581 spa_aux_vdev_t *sav = &spa->spa_spares;
6582
6583 for (i = 0; i < sav->sav_count; i++)
6584 if (sav->sav_vdevs[i]->vdev_guid == guid)
6585 return (B_TRUE);
6586
6587 for (i = 0; i < sav->sav_npending; i++) {
6588 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6589 &spareguid) == 0 && spareguid == guid)
6590 return (B_TRUE);
6591 }
6592
6593 return (B_FALSE);
6594}
6595
6596/*
6597 * Check if a pool has an active shared spare device.
6599 * Note: the reference count of an active spare is 2, as a spare and as a replacement
6599 */
6600static boolean_t
6601spa_has_active_shared_spare(spa_t *spa)
6602{
6603 int i, refcnt;
6604 uint64_t pool;
6605 spa_aux_vdev_t *sav = &spa->spa_spares;
6606
6607 for (i = 0; i < sav->sav_count; i++) {
6608 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6609 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6610 refcnt > 2)
6611 return (B_TRUE);
6612 }
6613
6614 return (B_FALSE);
6615}
6616
6617/*
6618 * Post a sysevent corresponding to the given event. The 'name' must be one of
6619 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
6620 * filled in from the spa and (optionally) the vdev. This doesn't do anything
6621 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6622 * or zdb as real changes.
6623 */
6624void
6625spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6626{
6627#ifdef _KERNEL
6628 sysevent_t *ev;
6629 sysevent_attr_list_t *attr = NULL;
6630 sysevent_value_t value;
6631 sysevent_id_t eid;
6632
6633 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6634 SE_SLEEP);
6635
6636 value.value_type = SE_DATA_TYPE_STRING;
6637 value.value.sv_string = spa_name(spa);
6638 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6639 goto done;
6640
6641 value.value_type = SE_DATA_TYPE_UINT64;
6642 value.value.sv_uint64 = spa_guid(spa);
6643 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6644 goto done;
6645
6646 if (vd) {
6647 value.value_type = SE_DATA_TYPE_UINT64;
6648 value.value.sv_uint64 = vd->vdev_guid;
6649 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6650 SE_SLEEP) != 0)
6651 goto done;
6652
6653 if (vd->vdev_path) {
6654 value.value_type = SE_DATA_TYPE_STRING;
6655 value.value.sv_string = vd->vdev_path;
6656 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6657 &value, SE_SLEEP) != 0)
6658 goto done;
6659 }
6660 }
6661
6662 if (sysevent_attach_attributes(ev, attr) != 0)
6663 goto done;
6664 attr = NULL;
6665
6666 (void) log_sysevent(ev, SE_SLEEP, &eid);
6667
6668done:
6669 if (attr)
6670 sysevent_free_attr(attr);
6671 sysevent_free(ev);
6672#endif
6673}