spa.c revision 299441
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
    ZTI_MODE_FIXED,     /* value is # of threads (min 1) */
    ZTI_MODE_BATCH,     /* cpu-intensive; value is ignored */
    ZTI_MODE_NULL,      /* don't create a taskq */
    ZTI_NMODES
} zti_modes_t;

#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_BATCH   { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_NULL    { ZTI_MODE_NULL, 0, 0 }

#define ZTI_N(n)    ZTI_P(n, 1)
#define ZTI_ONE     ZTI_N(1)

typedef struct zio_taskq_info {
    zti_modes_t zti_mode;
    uint_t zti_value;
    uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
    "issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case,
 * the particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
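/*
 * An illustrative reading of the table below (not additional configuration):
 * the READ row's INTR entry, ZTI_P(12, 8), asks for 8 discrete taskqs of
 * 12 threads each, and each READ interrupt is dispatched to one of those
 * 8 taskqs chosen at random.  ZTI_N(8) is shorthand for ZTI_P(8, 1), i.e.
 * a single taskq with 8 threads, and ZTI_BATCH creates one throughput-
 * oriented taskq sized as a percentage of the CPUs (see zio_taskq_batch_pct
 * below).
 */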
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
    /* ISSUE        ISSUE_HIGH  INTR            INTR_HIGH */
    { ZTI_ONE,      ZTI_NULL,   ZTI_ONE,        ZTI_NULL }, /* NULL */
    { ZTI_N(8),     ZTI_NULL,   ZTI_P(12, 8),   ZTI_NULL }, /* READ */
    { ZTI_BATCH,    ZTI_N(5),   ZTI_N(8),       ZTI_N(5) }, /* WRITE */
    { ZTI_P(12, 8), ZTI_NULL,   ZTI_ONE,        ZTI_NULL }, /* FREE */
    { ZTI_ONE,      ZTI_NULL,   ZTI_ONE,        ZTI_NULL }, /* CLAIM */
    { ZTI_ONE,      ZTI_NULL,   ZTI_ONE,        ZTI_NULL }, /* IOCTL */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t      zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t        zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t   zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
#endif
uint_t      zio_taskq_basedc = 80;          /* base duty cycle */

boolean_t   spa_create_process = B_TRUE;    /* no process ==> no sysdc */
extern int  zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME  "$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
    const char *propname = zpool_prop_to_name(prop);
    nvlist_t *propval;

    VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

    if (strval != NULL)
        VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
    else
        VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

    VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
    nvlist_free(propval);
}
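/*
 * An illustrative sketch (not part of this file's control flow): a caller
 * allocates the outer nvlist and lets spa_prop_add_list() attach one
 * (source, value) sub-nvlist per property.  "nvl" and "size" here are
 * hypothetical:
 *
 *    nvlist_t *nvl;
 *
 *    VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *    spa_prop_add_list(nvl, ZPOOL_PROP_NAME, spa_name(spa), 0,
 *        ZPROP_SRC_NONE);                      (string-valued property)
 *    spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, size,
 *        ZPROP_SRC_NONE);                      (integer-valued property)
 *    nvlist_free(nvl);
 */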
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
    vdev_t *rvd = spa->spa_root_vdev;
    dsl_pool_t *pool = spa->spa_dsl_pool;
    uint64_t size, alloc, cap, version;
    zprop_source_t src = ZPROP_SRC_NONE;
    spa_config_dirent_t *dp;
    metaslab_class_t *mc = spa_normal_class(spa);

    ASSERT(MUTEX_HELD(&spa->spa_props_lock));

    if (rvd != NULL) {
        alloc = metaslab_class_get_alloc(spa_normal_class(spa));
        size = metaslab_class_get_space(spa_normal_class(spa));
        spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
            size - alloc, src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
            metaslab_class_fragmentation(mc), src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
            metaslab_class_expandable_space(mc), src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
            (spa_mode(spa) == FREAD), src);

        cap = (size == 0) ? 0 : (alloc * 100 / size);
        spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
            ddt_get_pool_dedup_ratio(spa), src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
            rvd->vdev_state, src);

        version = spa_version(spa);
        if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
            src = ZPROP_SRC_DEFAULT;
        else
            src = ZPROP_SRC_LOCAL;
        spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
    }

    if (pool != NULL) {
        /*
         * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
         * when opening pools created before this version, dp_free_dir
         * will be NULL.
         */
        if (pool->dp_free_dir != NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
                dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
                src);
        } else {
            spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                NULL, 0, src);
        }

        if (pool->dp_leak_dir != NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
                dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
                src);
        } else {
            spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
                NULL, 0, src);
        }
    }

    spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

    if (spa->spa_comment != NULL) {
        spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
            0, ZPROP_SRC_LOCAL);
    }

    if (spa->spa_root != NULL)
        spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
            0, ZPROP_SRC_LOCAL);

    if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
        spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
            MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
    } else {
        spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
            SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
    }

    if ((dp = list_head(&spa->spa_config_list)) != NULL) {
        if (dp->scd_path == NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                "none", 0, ZPROP_SRC_LOCAL);
        } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                dp->scd_path, 0, ZPROP_SRC_LOCAL);
        }
    }
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
    objset_t *mos = spa->spa_meta_objset;
    zap_cursor_t zc;
    zap_attribute_t za;
    int err;

    VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

    mutex_enter(&spa->spa_props_lock);

    /*
     * Get properties from the spa config.
     */
    spa_prop_get_config(spa, nvp);

    /* If no pool property object, no more props to get. */
    if (mos == NULL || spa->spa_pool_props_object == 0) {
        mutex_exit(&spa->spa_props_lock);
        return (0);
    }

    /*
     * Get properties from the MOS pool property object.
     */
    for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
        (err = zap_cursor_retrieve(&zc, &za)) == 0;
        zap_cursor_advance(&zc)) {
        uint64_t intval = 0;
        char *strval = NULL;
        zprop_source_t src = ZPROP_SRC_DEFAULT;
        zpool_prop_t prop;

        if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
            continue;

        switch (za.za_integer_length) {
        case 8:
            /* integer property */
            if (za.za_first_integer !=
                zpool_prop_default_numeric(prop))
                src = ZPROP_SRC_LOCAL;

            if (prop == ZPOOL_PROP_BOOTFS) {
                dsl_pool_t *dp;
                dsl_dataset_t *ds = NULL;

                dp = spa_get_dsl(spa);
                dsl_pool_config_enter(dp, FTAG);
                if (err = dsl_dataset_hold_obj(dp,
                    za.za_first_integer, FTAG, &ds)) {
                    dsl_pool_config_exit(dp, FTAG);
                    break;
                }

                strval = kmem_alloc(
                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
                    KM_SLEEP);
                dsl_dataset_name(ds, strval);
                dsl_dataset_rele(ds, FTAG);
                dsl_pool_config_exit(dp, FTAG);
            } else {
                strval = NULL;
                intval = za.za_first_integer;
            }

            spa_prop_add_list(*nvp, prop, strval, intval, src);

            if (strval != NULL)
                kmem_free(strval,
                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

            break;

        case 1:
            /* string property */
            strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
            err = zap_lookup(mos, spa->spa_pool_props_object,
                za.za_name, 1, za.za_num_integers, strval);
            if (err) {
                kmem_free(strval, za.za_num_integers);
                break;
            }
            spa_prop_add_list(*nvp, prop, strval, 0, src);
            kmem_free(strval, za.za_num_integers);
            break;

        default:
            break;
        }
    }
    zap_cursor_fini(&zc);
    mutex_exit(&spa->spa_props_lock);
out:
    if (err && err != ENOENT) {
        nvlist_free(*nvp);
        *nvp = NULL;
        return (err);
    }

    return (0);
}
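/*
 * A minimal sketch of the zap_cursor_*() iteration pattern used above,
 * assuming "os" and "zapobj" refer to an already-held objset and ZAP
 * object; the cursor must always be closed with zap_cursor_fini():
 *
 *    zap_cursor_t zc;
 *    zap_attribute_t za;
 *    int err;
 *
 *    for (zap_cursor_init(&zc, os, zapobj);
 *        (err = zap_cursor_retrieve(&zc, &za)) == 0;
 *        zap_cursor_advance(&zc))
 *        process(za.za_name, za.za_first_integer);
 *    zap_cursor_fini(&zc);
 *
 * where process() stands in for per-entry handling, and err == ENOENT on
 * loop exit simply means the cursor ran past the last entry.
 */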
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
    nvpair_t *elem;
    int error = 0, reset_bootfs = 0;
    uint64_t objnum = 0;
    boolean_t has_feature = B_FALSE;

    elem = NULL;
    while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
        uint64_t intval;
        char *strval, *slash, *check, *fname;
        const char *propname = nvpair_name(elem);
        zpool_prop_t prop = zpool_name_to_prop(propname);

        switch (prop) {
        case ZPROP_INVAL:
            if (!zpool_prop_feature(propname)) {
                error = SET_ERROR(EINVAL);
                break;
            }

            /*
             * Sanitize the input.
             */
            if (nvpair_type(elem) != DATA_TYPE_UINT64) {
                error = SET_ERROR(EINVAL);
                break;
            }

            if (nvpair_value_uint64(elem, &intval) != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            if (intval != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            fname = strchr(propname, '@') + 1;
            if (zfeature_lookup_name(fname, NULL) != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            has_feature = B_TRUE;
            break;

        case ZPOOL_PROP_VERSION:
            error = nvpair_value_uint64(elem, &intval);
            if (!error &&
                (intval < spa_version(spa) ||
                intval > SPA_VERSION_BEFORE_FEATURES ||
                has_feature))
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_DELEGATION:
        case ZPOOL_PROP_AUTOREPLACE:
        case ZPOOL_PROP_LISTSNAPS:
        case ZPOOL_PROP_AUTOEXPAND:
            error = nvpair_value_uint64(elem, &intval);
            if (!error && intval > 1)
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_BOOTFS:
            /*
             * If the pool version is less than SPA_VERSION_BOOTFS,
             * or the pool is still being created (version == 0),
             * the bootfs property cannot be set.
             */
            if (spa_version(spa) < SPA_VERSION_BOOTFS) {
                error = SET_ERROR(ENOTSUP);
                break;
            }

            /*
             * Make sure the vdev config is bootable
             */
            if (!vdev_is_bootable(spa->spa_root_vdev)) {
                error = SET_ERROR(ENOTSUP);
                break;
            }

            reset_bootfs = 1;

            error = nvpair_value_string(elem, &strval);

            if (!error) {
                objset_t *os;
                uint64_t propval;

                if (strval == NULL || strval[0] == '\0') {
                    objnum = zpool_prop_default_numeric(
                        ZPOOL_PROP_BOOTFS);
                    break;
                }

                if (error = dmu_objset_hold(strval, FTAG, &os))
                    break;

                /*
                 * Must be ZPL, and its property settings
                 * must be supported by GRUB (compression
                 * is not gzip, and large blocks are not used).
                 */

                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                    error = SET_ERROR(ENOTSUP);
                } else if ((error =
                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                    &propval)) == 0 &&
                    !BOOTFS_COMPRESS_VALID(propval)) {
                    error = SET_ERROR(ENOTSUP);
                } else if ((error =
                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
                    &propval)) == 0 &&
                    propval > SPA_OLD_MAXBLOCKSIZE) {
                    error = SET_ERROR(ENOTSUP);
                } else {
                    objnum = dmu_objset_id(os);
                }
                dmu_objset_rele(os, FTAG);
            }
            break;

        case ZPOOL_PROP_FAILUREMODE:
            error = nvpair_value_uint64(elem, &intval);
            if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
                intval > ZIO_FAILURE_MODE_PANIC))
                error = SET_ERROR(EINVAL);

            /*
             * This is a special case which only occurs when
             * the pool has completely failed. This allows
             * the user to change the in-core failmode property
             * without syncing it out to disk (I/Os might
             * currently be blocked). We do this by returning
             * EIO to the caller (spa_prop_set) to trick it
             * into thinking we encountered a property validation
             * error.
             */
            if (!error && spa_suspended(spa)) {
                spa->spa_failmode = intval;
                error = SET_ERROR(EIO);
            }
            break;

        case ZPOOL_PROP_CACHEFILE:
            if ((error = nvpair_value_string(elem, &strval)) != 0)
                break;

            if (strval[0] == '\0')
                break;

            if (strcmp(strval, "none") == 0)
                break;

            if (strval[0] != '/') {
                error = SET_ERROR(EINVAL);
                break;
            }

            slash = strrchr(strval, '/');
            ASSERT(slash != NULL);

            if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
                strcmp(slash, "/..") == 0)
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_COMMENT:
            if ((error = nvpair_value_string(elem, &strval)) != 0)
                break;
            for (check = strval; *check != '\0'; check++) {
                /*
                 * The kernel doesn't have an easy isprint()
                 * check. For this kernel check, we merely
                 * check ASCII apart from DEL. Fix this if
                 * there is an easy-to-use kernel isprint().
                 */
                if (*check >= 0x7f) {
                    error = SET_ERROR(EINVAL);
                    break;
                }
            }
            if (strlen(strval) > ZPROP_MAX_COMMENT)
                error = E2BIG;
            break;

        case ZPOOL_PROP_DEDUPDITTO:
            if (spa_version(spa) < SPA_VERSION_DEDUP)
                error = SET_ERROR(ENOTSUP);
            else
                error = nvpair_value_uint64(elem, &intval);
            if (error == 0 &&
                intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
                error = SET_ERROR(EINVAL);
            break;
        }

        if (error)
            break;
    }

    if (!error && reset_bootfs) {
        error = nvlist_remove(props,
            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

        if (!error) {
            error = nvlist_add_uint64(props,
                zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
        }
    }

    return (error);
}
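/*
 * An illustrative sketch of an input that passes the validation above,
 * assuming an in-core spa_t "spa"; the names are the usual
 * zpool_prop_to_name() strings and the nvlist itself is hypothetical:
 *
 *    nvlist_t *props;
 *
 *    VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *    VERIFY(nvlist_add_uint64(props,
 *        zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1) == 0);
 *    VERIFY(nvlist_add_string(props,
 *        zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 *        "/etc/zfs/alt.cache") == 0);
 *    error = spa_prop_validate(spa, props);
 *
 * Note that a ZPOOL_PROP_BOOTFS entry is rewritten in place: the string
 * dataset name is replaced by the dataset's object number before the
 * props are handed to the sync task.
 */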
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
    char *cachefile;
    spa_config_dirent_t *dp;

    if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
        &cachefile) != 0)
        return;

    dp = kmem_alloc(sizeof (spa_config_dirent_t),
        KM_SLEEP);

    if (cachefile[0] == '\0')
        dp->scd_path = spa_strdup(spa_config_path);
    else if (strcmp(cachefile, "none") == 0)
        dp->scd_path = NULL;
    else
        dp->scd_path = spa_strdup(cachefile);

    list_insert_head(&spa->spa_config_list, dp);
    if (need_sync)
        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
    int error;
    nvpair_t *elem = NULL;
    boolean_t need_sync = B_FALSE;

    if ((error = spa_prop_validate(spa, nvp)) != 0)
        return (error);

    while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
        zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

        if (prop == ZPOOL_PROP_CACHEFILE ||
            prop == ZPOOL_PROP_ALTROOT ||
            prop == ZPOOL_PROP_READONLY)
            continue;

        if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
            uint64_t ver;

            if (prop == ZPOOL_PROP_VERSION) {
                VERIFY(nvpair_value_uint64(elem, &ver) == 0);
            } else {
                ASSERT(zpool_prop_feature(nvpair_name(elem)));
                ver = SPA_VERSION_FEATURES;
                need_sync = B_TRUE;
            }

            /* Save time if the version is already set. */
            if (ver == spa_version(spa))
                continue;

            /*
             * In addition to the pool directory object, we might
             * create the pool properties object, the features for
             * read object, the features for write object, or the
             * feature descriptions object.
             */
            error = dsl_sync_task(spa->spa_name, NULL,
                spa_sync_version, &ver,
                6, ZFS_SPACE_CHECK_RESERVED);
            if (error)
                return (error);
            continue;
        }

        need_sync = B_TRUE;
        break;
    }

    if (need_sync) {
        return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
            nvp, 6, ZFS_SPACE_CHECK_RESERVED));
    }

    return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
    if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
        VERIFY(zap_remove(spa->spa_meta_objset,
            spa->spa_pool_props_object,
            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
        spa->spa_bootfs = 0;
    }
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
    uint64_t *newguid = arg;
    spa_t *spa = dmu_tx_pool(tx)->dp_spa;
    vdev_t *rvd = spa->spa_root_vdev;
    uint64_t vdev_state;

    spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    vdev_state = rvd->vdev_state;
    spa_config_exit(spa, SCL_STATE, FTAG);

    if (vdev_state != VDEV_STATE_HEALTHY)
        return (SET_ERROR(ENXIO));

    ASSERT3U(spa_guid(spa), !=, *newguid);

    return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
    uint64_t *newguid = arg;
    spa_t *spa = dmu_tx_pool(tx)->dp_spa;
    uint64_t oldguid;
    vdev_t *rvd = spa->spa_root_vdev;

    oldguid = spa_guid(spa);

    spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    rvd->vdev_guid = *newguid;
    rvd->vdev_guid_sum += (*newguid - oldguid);
    vdev_config_dirty(rvd);
    spa_config_exit(spa, SCL_STATE, FTAG);

    spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
        oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
    int error;
    uint64_t guid;

    mutex_enter(&spa->spa_vdev_top_lock);
    mutex_enter(&spa_namespace_lock);
    guid = spa_generate_guid(NULL);

    error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
        spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

    if (error == 0) {
        spa_config_sync(spa, B_FALSE, B_TRUE);
        spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
    }

    mutex_exit(&spa_namespace_lock);
    mutex_exit(&spa->spa_vdev_top_lock);

    return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
    spa_error_entry_t *sa = (spa_error_entry_t *)a;
    spa_error_entry_t *sb = (spa_error_entry_t *)b;
    int ret;

    ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
        sizeof (zbookmark_phys_t));

    if (ret < 0)
        return (-1);
    else if (ret > 0)
        return (1);
    else
        return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
    ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

    bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
    const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
    enum zti_modes mode = ztip->zti_mode;
    uint_t value = ztip->zti_value;
    uint_t count = ztip->zti_count;
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
    char name[32];
    uint_t flags = 0;
    boolean_t batch = B_FALSE;

    if (mode == ZTI_MODE_NULL) {
        tqs->stqs_count = 0;
        tqs->stqs_taskq = NULL;
        return;
    }

    ASSERT3U(count, >, 0);

    tqs->stqs_count = count;
    tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

    switch (mode) {
    case ZTI_MODE_FIXED:
        ASSERT3U(value, >=, 1);
        value = MAX(value, 1);
        break;

    case ZTI_MODE_BATCH:
        batch = B_TRUE;
        flags |= TASKQ_THREADS_CPU_PCT;
        value = zio_taskq_batch_pct;
        break;

    default:
        panic("unrecognized mode for %s_%s taskq (%u:%u) in "
            "spa_activate()",
            zio_type_name[t], zio_taskq_types[q], mode, value);
        break;
    }

    for (uint_t i = 0; i < count; i++) {
        taskq_t *tq;

        if (count > 1) {
            (void) snprintf(name, sizeof (name), "%s_%s_%u",
                zio_type_name[t], zio_taskq_types[q], i);
        } else {
            (void) snprintf(name, sizeof (name), "%s_%s",
                zio_type_name[t], zio_taskq_types[q]);
        }

#ifdef SYSDC
        if (zio_taskq_sysdc && spa->spa_proc != &p0) {
            if (batch)
                flags |= TASKQ_DC_BATCH;

            tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                spa->spa_proc, zio_taskq_basedc, flags);
        } else {
#endif
            pri_t pri = maxclsyspri;
            /*
             * The write issue taskq can be extremely CPU
             * intensive.  Run it at slightly lower priority
             * than the other taskqs.
             */
            if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
                pri++;

            tq = taskq_create_proc(name, value, pri, 50,
                INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
        }
#endif

        tqs->stqs_taskq[i] = tq;
    }
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

    if (tqs->stqs_taskq == NULL) {
        ASSERT0(tqs->stqs_count);
        return;
    }

    for (uint_t i = 0; i < tqs->stqs_count; i++) {
        ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
        taskq_destroy(tqs->stqs_taskq[i]);
    }

    kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
    tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself.  In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
    taskq_t *tq;

    ASSERT3P(tqs->stqs_taskq, !=, NULL);
    ASSERT3U(tqs->stqs_count, !=, 0);

    if (tqs->stqs_count == 1) {
        tq = tqs->stqs_taskq[0];
    } else {
#ifdef _KERNEL
        tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
        tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
    }

    taskq_dispatch_ent(tq, func, arg, flags, ent);
}
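/*
 * A minimal dispatch sketch through the routine above, assuming a caller
 * that embeds a taskq_ent_t in its own structure (as zio_t does via
 * io_tqent); "my_work" and "myobj" are hypothetical:
 *
 *    static void
 *    my_work(void *arg)
 *    {
 *        ... per-task handling of arg ...
 *    }
 *
 *    taskq_ent_t *ent = &myobj->obj_tqent;
 *    spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTR,
 *        my_work, myobj, TQ_SLEEP, ent);
 *
 * With the table above, READ/INTR maps to ZTI_P(12, 8), so the entry is
 * queued on one of eight 12-thread taskqs selected by the low bits of a
 * fast clock (cpu_ticks() in the kernel, gethrtime() otherwise).
 */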
static void
spa_create_zio_taskqs(spa_t *spa)
{
    for (int t = 0; t < ZIO_TYPES; t++) {
        for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
            spa_taskqs_init(spa, t, q);
        }
    }
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
    callb_cpr_t cprinfo;

    spa_t *spa = arg;
    user_t *pu = PTOU(curproc);

    CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
        spa->spa_name);

    ASSERT(curproc != &p0);
    (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
        "zpool-%s", spa->spa_name);
    (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
    /* bind this thread to the requested psrset */
    if (zio_taskq_psrset_bind != PS_NONE) {
        pool_lock();
        mutex_enter(&cpu_lock);
        mutex_enter(&pidlock);
        mutex_enter(&curproc->p_lock);

        if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
            0, NULL, NULL) == 0) {
            curthread->t_bind_pset = zio_taskq_psrset_bind;
        } else {
            cmn_err(CE_WARN,
                "Couldn't bind process for zfs pool \"%s\" to "
                "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
        }

        mutex_exit(&curproc->p_lock);
        mutex_exit(&pidlock);
        mutex_exit(&cpu_lock);
        pool_unlock();
    }
#endif

#ifdef SYSDC
    if (zio_taskq_sysdc) {
        sysdc_thread_enter(curthread, 100, 0);
    }
#endif

    spa->spa_proc = curproc;
    spa->spa_did = curthread->t_did;

    spa_create_zio_taskqs(spa);

    mutex_enter(&spa->spa_proc_lock);
    ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

    spa->spa_proc_state = SPA_PROC_ACTIVE;
    cv_broadcast(&spa->spa_proc_cv);

    CALLB_CPR_SAFE_BEGIN(&cprinfo);
    while (spa->spa_proc_state == SPA_PROC_ACTIVE)
        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
    CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

    ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
    spa->spa_proc_state = SPA_PROC_GONE;
    spa->spa_proc = &p0;
    cv_broadcast(&spa->spa_proc_cv);
    CALLB_CPR_EXIT(&cprinfo);   /* drops spa_proc_lock */

    mutex_enter(&curproc->p_lock);
    lwp_exit();
}
#endif  /* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
    ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

    spa->spa_state = POOL_STATE_ACTIVE;
    spa->spa_mode = mode;

    spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
    spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

    /* Try to create a covering process */
    mutex_enter(&spa->spa_proc_lock);
    ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
    ASSERT(spa->spa_proc == &p0);
    spa->spa_did = 0;

#ifdef SPA_PROCESS
    /* Only create a process if we're going to be around a while. */
    if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
        if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
            NULL, 0) == 0) {
            spa->spa_proc_state = SPA_PROC_CREATED;
            while (spa->spa_proc_state == SPA_PROC_CREATED) {
                cv_wait(&spa->spa_proc_cv,
                    &spa->spa_proc_lock);
            }
            ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
            ASSERT(spa->spa_proc != &p0);
            ASSERT(spa->spa_did != 0);
        } else {
#ifdef _KERNEL
            cmn_err(CE_WARN,
                "Couldn't create process for zfs pool \"%s\"\n",
                spa->spa_name);
#endif
        }
    }
#endif  /* SPA_PROCESS */
    mutex_exit(&spa->spa_proc_lock);

    /* If we didn't create a process, we need to create our taskqs. */
    ASSERT(spa->spa_proc == &p0);
    if (spa->spa_proc == &p0) {
        spa_create_zio_taskqs(spa);
    }

    /*
     * Start TRIM thread.
     */
    trim_thread_create(spa);

    list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_config_dirty_node));
    list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
        offsetof(objset_t, os_evicting_node));
    list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_state_dirty_node));

    txg_list_create(&spa->spa_vdev_txg_list,
        offsetof(struct vdev, vdev_txg_node));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
    ASSERT(spa->spa_sync_on == B_FALSE);
    ASSERT(spa->spa_dsl_pool == NULL);
    ASSERT(spa->spa_root_vdev == NULL);
    ASSERT(spa->spa_async_zio_root == NULL);
    ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

    /*
     * Stop TRIM thread in case spa_unload() wasn't called directly
     * before spa_deactivate().
     */
    trim_thread_destroy(spa);

    spa_evicting_os_wait(spa);

    txg_list_destroy(&spa->spa_vdev_txg_list);

    list_destroy(&spa->spa_config_dirty_list);
    list_destroy(&spa->spa_evicting_os_list);
    list_destroy(&spa->spa_state_dirty_list);

    for (int t = 0; t < ZIO_TYPES; t++) {
        for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
            spa_taskqs_fini(spa, t, q);
        }
    }

    metaslab_class_destroy(spa->spa_normal_class);
    spa->spa_normal_class = NULL;

    metaslab_class_destroy(spa->spa_log_class);
    spa->spa_log_class = NULL;

    /*
     * If this was part of an import or the open otherwise failed, we may
     * still have errors left in the queues.  Empty them just in case.
     */
    spa_errlog_drain(spa);

    avl_destroy(&spa->spa_errlist_scrub);
    avl_destroy(&spa->spa_errlist_last);

    spa->spa_state = POOL_STATE_UNINITIALIZED;

    mutex_enter(&spa->spa_proc_lock);
    if (spa->spa_proc_state != SPA_PROC_NONE) {
        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
        spa->spa_proc_state = SPA_PROC_DEACTIVATE;
        cv_broadcast(&spa->spa_proc_cv);
        while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
            ASSERT(spa->spa_proc != &p0);
            cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        }
        ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
        spa->spa_proc_state = SPA_PROC_NONE;
    }
    ASSERT(spa->spa_proc == &p0);
    mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
    /*
     * We want to make sure spa_thread() has actually exited the ZFS
     * module, so that the module can't be unloaded out from underneath
     * it.
     */
    if (spa->spa_did != 0) {
        thread_join(spa->spa_did);
        spa->spa_did = 0;
    }
#endif  /* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
    nvlist_t **child;
    uint_t children;
    int error;

    if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
        return (error);

    if ((*vdp)->vdev_ops->vdev_op_leaf)
        return (0);

    error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
        &child, &children);

    if (error == ENOENT)
        return (0);

    if (error) {
        vdev_free(*vdp);
        *vdp = NULL;
        return (SET_ERROR(EINVAL));
    }

    for (int c = 0; c < children; c++) {
        vdev_t *vd;
        if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
            atype)) != 0) {
            vdev_free(*vdp);
            *vdp = NULL;
            return (error);
        }
    }

    ASSERT(*vdp != NULL);

    return (0);
}
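/*
 * For orientation, a sketch of the nvlist shape spa_config_parse() walks
 * (a two-way mirror, heavily abbreviated; the exact set of pairs comes
 * from vdev_config_generate(), and the device paths are hypothetical):
 *
 *    vdev_tree = {
 *        ZPOOL_CONFIG_TYPE = "root"
 *        ZPOOL_CONFIG_CHILDREN = [ {
 *            ZPOOL_CONFIG_TYPE = "mirror"
 *            ZPOOL_CONFIG_CHILDREN = [
 *                { ZPOOL_CONFIG_TYPE = "disk",
 *                    ZPOOL_CONFIG_PATH = "/dev/da0" },
 *                { ZPOOL_CONFIG_TYPE = "disk",
 *                    ZPOOL_CONFIG_PATH = "/dev/da1" }
 *            ]
 *        } ]
 *    }
 *
 * The recursion above allocates one vdev_t per nvlist node, so the
 * in-core tree mirrors this nesting.
 */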
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
    int i;

    ASSERT(MUTEX_HELD(&spa_namespace_lock));

    /*
     * Stop TRIM thread.
     */
    trim_thread_destroy(spa);

    /*
     * Stop async tasks.
     */
    spa_async_suspend(spa);

    /*
     * Stop syncing.
     */
    if (spa->spa_sync_on) {
        txg_sync_stop(spa->spa_dsl_pool);
        spa->spa_sync_on = B_FALSE;
    }

    /*
     * Wait for any outstanding async I/O to complete.
     */
    if (spa->spa_async_zio_root != NULL) {
        for (int i = 0; i < max_ncpus; i++)
            (void) zio_wait(spa->spa_async_zio_root[i]);
        kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
        spa->spa_async_zio_root = NULL;
    }

    bpobj_close(&spa->spa_deferred_bpobj);

    spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

    /*
     * Close all vdevs.
     */
    if (spa->spa_root_vdev)
        vdev_free(spa->spa_root_vdev);
    ASSERT(spa->spa_root_vdev == NULL);

    /*
     * Close the dsl pool.
     */
    if (spa->spa_dsl_pool) {
        dsl_pool_close(spa->spa_dsl_pool);
        spa->spa_dsl_pool = NULL;
        spa->spa_meta_objset = NULL;
    }

    ddt_unload(spa);

    /*
     * Drop and purge level 2 cache
     */
    spa_l2cache_drop(spa);

    for (i = 0; i < spa->spa_spares.sav_count; i++)
        vdev_free(spa->spa_spares.sav_vdevs[i]);
    if (spa->spa_spares.sav_vdevs) {
        kmem_free(spa->spa_spares.sav_vdevs,
            spa->spa_spares.sav_count * sizeof (void *));
        spa->spa_spares.sav_vdevs = NULL;
    }
    if (spa->spa_spares.sav_config) {
        nvlist_free(spa->spa_spares.sav_config);
        spa->spa_spares.sav_config = NULL;
    }
    spa->spa_spares.sav_count = 0;

    for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
        vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
        vdev_free(spa->spa_l2cache.sav_vdevs[i]);
    }
    if (spa->spa_l2cache.sav_vdevs) {
        kmem_free(spa->spa_l2cache.sav_vdevs,
            spa->spa_l2cache.sav_count * sizeof (void *));
        spa->spa_l2cache.sav_vdevs = NULL;
    }
    if (spa->spa_l2cache.sav_config) {
        nvlist_free(spa->spa_l2cache.sav_config);
        spa->spa_l2cache.sav_config = NULL;
    }
    spa->spa_l2cache.sav_count = 0;

    spa->spa_async_suspended = 0;

    if (spa->spa_comment != NULL) {
        spa_strfree(spa->spa_comment);
        spa->spa_comment = NULL;
    }

    spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
    nvlist_t **spares;
    uint_t nspares;
    int i;
    vdev_t *vd, *tvd;

    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

    /*
     * First, close and free any existing spare vdevs.
     */
    for (i = 0; i < spa->spa_spares.sav_count; i++) {
        vd = spa->spa_spares.sav_vdevs[i];

        /* Undo the call to spa_activate() below */
        if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
            B_FALSE)) != NULL && tvd->vdev_isspare)
            spa_spare_remove(tvd);
        vdev_close(vd);
        vdev_free(vd);
    }

    if (spa->spa_spares.sav_vdevs)
        kmem_free(spa->spa_spares.sav_vdevs,
            spa->spa_spares.sav_count * sizeof (void *));

    if (spa->spa_spares.sav_config == NULL)
        nspares = 0;
    else
        VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

    spa->spa_spares.sav_count = (int)nspares;
    spa->spa_spares.sav_vdevs = NULL;

    if (nspares == 0)
        return;

    /*
     * Construct the array of vdevs, opening them to get status in the
     * process.  For each spare, there are potentially two different vdev_t
     * structures associated with it: one in the list of spares (used only
     * for basic validation purposes) and one in the active vdev
     * configuration (if it's spared in).  During this phase we open and
     * validate each vdev on the spare list.  If the vdev also exists in the
     * active configuration, then we also mark this vdev as an active spare.
     */
    spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
        KM_SLEEP);
    for (i = 0; i < spa->spa_spares.sav_count; i++) {
        VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
            VDEV_ALLOC_SPARE) == 0);
        ASSERT(vd != NULL);

        spa->spa_spares.sav_vdevs[i] = vd;

        if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
            B_FALSE)) != NULL) {
            if (!tvd->vdev_isspare)
                spa_spare_add(tvd);

            /*
             * We only mark the spare active if we were successfully
             * able to load the vdev.  Otherwise, importing a pool
             * with a bad active spare would result in strange
             * behavior, because multiple pools would think the
             * spare is actively in use.
             *
             * There is a vulnerability here to an equally bizarre
             * circumstance, where a dead active spare is later
             * brought back to life (onlined or otherwise).  Given
             * the rarity of this scenario, and the extra complexity
             * it adds, we ignore the possibility.
             */
            if (!vdev_is_dead(tvd))
                spa_spare_activate(tvd);
        }

        vd->vdev_top = vd;
        vd->vdev_aux = &spa->spa_spares;

        if (vdev_open(vd) != 0)
            continue;

        if (vdev_validate_aux(vd) == 0)
            spa_spare_add(vd);
    }

    /*
     * Recompute the stashed list of spares, with status information
     * this time.
     */
    VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
        DATA_TYPE_NVLIST_ARRAY) == 0);

    spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
        KM_SLEEP);
    for (i = 0; i < spa->spa_spares.sav_count; i++)
        spares[i] = vdev_config_generate(spa,
            spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
    VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
        ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
    for (i = 0; i < spa->spa_spares.sav_count; i++)
        nvlist_free(spares[i]);
    kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
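/*
 * For reference, the regenerated sav_config above has roughly this shape
 * (abbreviated; one nvlist per spare, each produced by
 * vdev_config_generate(), with a hypothetical device path):
 *
 *    sav_config = {
 *        ZPOOL_CONFIG_SPARES = [
 *            { ZPOOL_CONFIG_TYPE = "disk",
 *                ZPOOL_CONFIG_PATH = "/dev/da2",
 *                ZPOOL_CONFIG_GUID = <guid>, ... },
 *            ...
 *        ]
 *    }
 */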
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
    nvlist_t **l2cache;
    uint_t nl2cache;
    int i, j, oldnvdevs;
    uint64_t guid;
    vdev_t *vd, **oldvdevs, **newvdevs;
    spa_aux_vdev_t *sav = &spa->spa_l2cache;

    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

    if (sav->sav_config != NULL) {
        VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
            ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
        newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
    } else {
        nl2cache = 0;
        newvdevs = NULL;
    }

    oldvdevs = sav->sav_vdevs;
    oldnvdevs = sav->sav_count;
    sav->sav_vdevs = NULL;
    sav->sav_count = 0;

    /*
     * Process new nvlist of vdevs.
     */
    for (i = 0; i < nl2cache; i++) {
        VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
            &guid) == 0);

        newvdevs[i] = NULL;
        for (j = 0; j < oldnvdevs; j++) {
            vd = oldvdevs[j];
            if (vd != NULL && guid == vd->vdev_guid) {
                /*
                 * Retain previous vdev for add/remove ops.
                 */
                newvdevs[i] = vd;
                oldvdevs[j] = NULL;
                break;
            }
        }

        if (newvdevs[i] == NULL) {
            /*
             * Create new vdev
             */
            VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
                VDEV_ALLOC_L2CACHE) == 0);
            ASSERT(vd != NULL);
            newvdevs[i] = vd;

            /*
             * Commit this vdev as an l2cache device,
             * even if it fails to open.
             */
            spa_l2cache_add(vd);

            vd->vdev_top = vd;
            vd->vdev_aux = sav;

            spa_l2cache_activate(vd);

            if (vdev_open(vd) != 0)
                continue;

            (void) vdev_validate_aux(vd);

            if (!vdev_is_dead(vd))
                l2arc_add_vdev(spa, vd);
        }
    }

    /*
     * Purge vdevs that were dropped
     */
    for (i = 0; i < oldnvdevs; i++) {
        uint64_t pool;

        vd = oldvdevs[i];
        if (vd != NULL) {
            ASSERT(vd->vdev_isl2cache);

            if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
                pool != 0ULL && l2arc_vdev_present(vd))
                l2arc_remove_vdev(vd);
            vdev_clear_stats(vd);
            vdev_free(vd);
        }
    }

    if (oldvdevs)
        kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

    if (sav->sav_config == NULL)
        goto out;

    sav->sav_vdevs = newvdevs;
    sav->sav_count = (int)nl2cache;

    /*
     * Recompute the stashed list of l2cache devices, with status
     * information this time.
     */
    VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
        DATA_TYPE_NVLIST_ARRAY) == 0);

    l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
    for (i = 0; i < sav->sav_count; i++)
        l2cache[i] = vdev_config_generate(spa,
            sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
    VERIFY(nvlist_add_nvlist_array(sav->sav_config,
        ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
    for (i = 0; i < sav->sav_count; i++)
        nvlist_free(l2cache[i]);
    if (sav->sav_count)
        kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
    dmu_buf_t *db;
    char *packed = NULL;
    size_t nvsize = 0;
    int error;
    *value = NULL;

    error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
    if (error != 0)
        return (error);

    nvsize = *(uint64_t *)db->db_data;
    dmu_buf_rele(db, FTAG);

    packed = kmem_alloc(nvsize, KM_SLEEP);
    error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
        DMU_READ_PREFETCH);
    if (error == 0)
        error = nvlist_unpack(packed, nvsize, value, 0);
    kmem_free(packed, nvsize);

    return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
    for (int c = 0; c < vd->vdev_children; c++)
        spa_check_removed(vd->vdev_child[c]);

    if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
        !vd->vdev_ishole) {
        zfs_post_autoreplace(vd->vdev_spa, vd);
        spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
    }
}

static void
spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
{
    ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);

    vd->vdev_top_zap = mvd->vdev_top_zap;
    vd->vdev_leaf_zap = mvd->vdev_leaf_zap;

    for (uint64_t i = 0; i < vd->vdev_children; i++) {
        spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
    }
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
    vdev_t *mrvd, *rvd = spa->spa_root_vdev;
    nvlist_t *nv;

    VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

    spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
    VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

    ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

    /*
     * If we're doing a normal import, then build up any additional
     * diagnostic information about missing devices in this config.
     * We'll pass this up to the user for further processing.
     */
    if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
        nvlist_t **child, *nv;
        uint64_t idx = 0;

        child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
            KM_SLEEP);
        VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        for (int c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];
            vdev_t *mtvd = mrvd->vdev_child[c];

            if (tvd->vdev_ops == &vdev_missing_ops &&
                mtvd->vdev_ops != &vdev_missing_ops &&
                mtvd->vdev_islog)
                child[idx++] = vdev_config_generate(spa, mtvd,
                    B_FALSE, 0);
        }

        if (idx) {
            VERIFY(nvlist_add_nvlist_array(nv,
                ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
            VERIFY(nvlist_add_nvlist(spa->spa_load_info,
                ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

            for (int i = 0; i < idx; i++)
                nvlist_free(child[i]);
        }
        nvlist_free(nv);
        kmem_free(child, rvd->vdev_children * sizeof (char **));
    }

    /*
     * Compare the root vdev tree with the information we have
     * from the MOS config (mrvd). Check each top-level vdev
     * with the corresponding MOS config top-level (mtvd).
     */
    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        vdev_t *mtvd = mrvd->vdev_child[c];

        /*
         * Resolve any "missing" vdevs in the current configuration.
         * If we find that the MOS config has more accurate information
         * about the top-level vdev then use that vdev instead.
         */
        if (tvd->vdev_ops == &vdev_missing_ops &&
            mtvd->vdev_ops != &vdev_missing_ops) {

            if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
                continue;

            /*
             * Device specific actions.
             */
            if (mtvd->vdev_islog) {
                spa_set_log_state(spa, SPA_LOG_CLEAR);
            } else {
                /*
                 * XXX - once we have 'readonly' pool
                 * support we should be able to handle
                 * missing data devices by transitioning
                 * the pool to readonly.
                 */
                continue;
            }

            /*
             * Swap the missing vdev with the data we were
             * able to obtain from the MOS config.
             */
            vdev_remove_child(rvd, tvd);
            vdev_remove_child(mrvd, mtvd);

            vdev_add_child(rvd, mtvd);
            vdev_add_child(mrvd, tvd);

            spa_config_exit(spa, SCL_ALL, FTAG);
            vdev_load(mtvd);
            spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

            vdev_reopen(rvd);
        } else {
            if (mtvd->vdev_islog) {
                /*
                 * Load the slog device's state from the MOS
                 * config since it's possible that the label
                 * does not contain the most up-to-date
                 * information.
                 */
                vdev_load_log_state(tvd, mtvd);
                vdev_reopen(tvd);
            }

            /*
             * Per-vdev ZAP info is stored exclusively in the MOS.
             */
            spa_config_valid_zaps(tvd, mtvd);
        }
    }

    vdev_free(mrvd);
    spa_config_exit(spa, SCL_ALL, FTAG);

    /*
     * Ensure we were able to validate the config.
     */
    return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
    boolean_t rv = B_FALSE;
    dsl_pool_t *dp = spa_get_dsl(spa);

    switch (spa->spa_log_state) {
    case SPA_LOG_MISSING:
        /* need to recheck in case slog has been restored */
    case SPA_LOG_UNKNOWN:
        rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
            zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
        if (rv)
            spa_set_log_state(spa, SPA_LOG_MISSING);
        break;
    }
    return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
    vdev_t *rvd = spa->spa_root_vdev;
    boolean_t slog_found = B_FALSE;

    ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

    if (!spa_has_slogs(spa))
        return (B_FALSE);

    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        metaslab_group_t *mg = tvd->vdev_mg;

        if (tvd->vdev_islog) {
            metaslab_group_passivate(mg);
            slog_found = B_TRUE;
        }
    }

    return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
    vdev_t *rvd = spa->spa_root_vdev;

    ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        metaslab_group_t *mg = tvd->vdev_mg;

        if (tvd->vdev_islog)
            metaslab_group_activate(mg);
    }
}

int
spa_offline_log(spa_t *spa)
{
    int error;

    error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
        NULL, DS_FIND_CHILDREN);
    if (error == 0) {
        /*
         * We successfully offlined the log device, sync out the
         * current txg so that the "stubby" block can be removed
         * by zil_sync().
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);
    }
    return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
    int i;

    for (i = 0; i < sav->sav_count; i++)
        spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
    spa_t *spa = zio->io_spa;

    if (zio->io_error)
        return;

    mutex_enter(&spa->spa_props_lock);  /* any mutex will do */
    if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
        spa->spa_claim_max_txg = zio->io_bp->blk_birth;
    mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
    uint64_t    sle_meta_count;
    uint64_t    sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    spa_load_error_t *sle = zio->io_private;
    dmu_object_type_t type = BP_GET_TYPE(bp);
    int error = zio->io_error;
    spa_t *spa = zio->io_spa;

    if (error) {
        if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
            type != DMU_OT_INTENT_LOG)
            atomic_inc_64(&sle->sle_meta_count);
        else
            atomic_inc_64(&sle->sle_data_count);
    }
    zio_data_buf_free(zio->io_data, zio->io_size);

    mutex_enter(&spa->spa_scrub_lock);
    spa->spa_scrub_inflight--;
    cv_broadcast(&spa->spa_scrub_io_cv);
    mutex_exit(&spa->spa_scrub_lock);
}
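/*
 * The issue path in spa_load_verify_cb() below and the completion path
 * above form a simple bounded-producer pattern: the issuer sleeps on
 * spa_scrub_io_cv while spa_scrub_inflight is at the cap, and every
 * completion decrements the count and broadcasts.  A generic sketch of
 * the same pattern (names "cap", "inflight", "lock", and "cv" are
 * hypothetical):
 *
 *    mutex_enter(&lock);                 (issue side)
 *    while (inflight >= cap)
 *        cv_wait(&cv, &lock);
 *    inflight++;
 *    mutex_exit(&lock);
 *
 *    mutex_enter(&lock);                 (completion side)
 *    inflight--;
 *    cv_broadcast(&cv);
 *    mutex_exit(&lock);
 */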
/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool while importing it.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
    &spa_load_verify_maxinflight, 0,
    "Maximum number of concurrent scrub I/Os to create while verifying a "
    "pool while importing it");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
    &spa_load_verify_metadata, 0,
    "Check metadata on import?");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
    &spa_load_verify_data, 0,
    "Check user data on import?");

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
    if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
        return (0);
    /*
     * Note: normally this routine will not be called if
     * spa_load_verify_metadata is not set.  However, it may be useful
     * to manually set the flag after the traversal has begun.
     */
    if (!spa_load_verify_metadata)
        return (0);
    if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
        return (0);

    zio_t *rio = arg;
    size_t size = BP_GET_PSIZE(bp);
    void *data = zio_data_buf_alloc(size);

    mutex_enter(&spa->spa_scrub_lock);
    while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
        cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
    spa->spa_scrub_inflight++;
    mutex_exit(&spa->spa_scrub_lock);

    zio_nowait(zio_read(rio, spa, bp, data, size,
        spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
        ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
        ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
    return (0);
}

static int
spa_load_verify(spa_t *spa)
{
    zio_t *rio;
    spa_load_error_t sle = { 0 };
    zpool_rewind_policy_t policy;
    boolean_t verify_ok = B_FALSE;
    int error = 0;

    zpool_get_rewind_policy(spa->spa_config, &policy);

    if (policy.zrp_request & ZPOOL_NEVER_REWIND)
        return (0);

    rio = zio_root(spa, NULL, &sle,
        ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

    if (spa_load_verify_metadata) {
        error = traverse_pool(spa, spa->spa_verify_min_txg,
            TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
            spa_load_verify_cb, rio);
    }

    (void) zio_wait(rio);

    spa->spa_load_meta_errors = sle.sle_meta_count;
    spa->spa_load_data_errors = sle.sle_data_count;

    if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
        sle.sle_data_count <= policy.zrp_maxdata) {
        int64_t loss = 0;

        verify_ok = B_TRUE;
        spa->spa_load_txg = spa->spa_uberblock.ub_txg;
        spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

        loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
        VERIFY(nvlist_add_uint64(spa->spa_load_info,
            ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
        VERIFY(nvlist_add_int64(spa->spa_load_info,
            ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
        VERIFY(nvlist_add_uint64(spa->spa_load_info,
            ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
    } else {
        spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
    }

    if (error) {
        if (error != ENXIO && error != EIO)
            error = SET_ERROR(EIO);
        return (error);
    }

    return (verify_ok ? 0 : EIO);
}
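/*
 * A note on the policy interplay above, assuming the usual defaults from
 * zpool_get_rewind_policy(): zrp_maxmeta defaults to 0, so any metadata
 * checksum error fails verification, while data errors are tallied and
 * surfaced through ZPOOL_CONFIG_LOAD_DATA_ERRORS.  A rewinding import
 * (e.g. zpool import -F) loosens these thresholds so spa_load() can fall
 * back to an older txg rather than failing outright.
 */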
0 : EIO); 2053} 2054 2055/* 2056 * Find a value in the pool props object. 2057 */ 2058static void 2059spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2060{ 2061 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2062 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2063} 2064 2065/* 2066 * Find a value in the pool directory object. 2067 */ 2068static int 2069spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2070{ 2071 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2072 name, sizeof (uint64_t), 1, val)); 2073} 2074 2075static int 2076spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2077{ 2078 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2079 return (err); 2080} 2081 2082/* 2083 * Fix up config after a partly-completed split. This is done with the 2084 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2085 * pool have that entry in their config, but only the splitting one contains 2086 * a list of all the guids of the vdevs that are being split off. 2087 * 2088 * This function determines what to do with that list: either rejoin 2089 * all the disks to the pool, or complete the splitting process. To attempt 2090 * the rejoin, each disk that is offlined is marked online again, and 2091 * we do a reopen() call. If the vdev label for every disk that was 2092 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2093 * then we call vdev_split() on each disk, and complete the split. 2094 * 2095 * Otherwise we leave the config alone, with all the vdevs in place in 2096 * the original pool. 2097 */ 2098static void 2099spa_try_repair(spa_t *spa, nvlist_t *config) 2100{ 2101 uint_t extracted; 2102 uint64_t *glist; 2103 uint_t i, gcount; 2104 nvlist_t *nvl; 2105 vdev_t **vd; 2106 boolean_t attempt_reopen; 2107 2108 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2109 return; 2110 2111 /* check that the config is complete */ 2112 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2113 &glist, &gcount) != 0) 2114 return; 2115 2116 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2117 2118 /* attempt to online all the vdevs & validate */ 2119 attempt_reopen = B_TRUE; 2120 for (i = 0; i < gcount; i++) { 2121 if (glist[i] == 0) /* vdev is hole */ 2122 continue; 2123 2124 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2125 if (vd[i] == NULL) { 2126 /* 2127 * Don't bother attempting to reopen the disks; 2128 * just do the split. 2129 */ 2130 attempt_reopen = B_FALSE; 2131 } else { 2132 /* attempt to re-online it */ 2133 vd[i]->vdev_offline = B_FALSE; 2134 } 2135 } 2136 2137 if (attempt_reopen) { 2138 vdev_reopen(spa->spa_root_vdev); 2139 2140 /* check each device to see what state it's in */ 2141 for (extracted = 0, i = 0; i < gcount; i++) { 2142 if (vd[i] != NULL && 2143 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2144 break; 2145 ++extracted; 2146 } 2147 } 2148 2149 /* 2150 * If every disk has been moved to the new pool, or if we never 2151 * even attempted to look at them, then we split them off for 2152 * good. 
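 *
 * Summarizing the three possible outcomes of the logic above:
 *
 *	all labels show VDEV_AUX_SPLIT_POOL -> vdev_split() each vdev, reopen
 *	any reopened label disagrees        -> leave the config untouched
 *	some vdevs not found (no reopen)    -> split unconditionally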
2153 */ 2154 if (!attempt_reopen || gcount == extracted) { 2155 for (i = 0; i < gcount; i++) 2156 if (vd[i] != NULL) 2157 vdev_split(vd[i]); 2158 vdev_reopen(spa->spa_root_vdev); 2159 } 2160 2161 kmem_free(vd, gcount * sizeof (vdev_t *)); 2162} 2163 2164static int 2165spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2166 boolean_t mosconfig) 2167{ 2168 nvlist_t *config = spa->spa_config; 2169 char *ereport = FM_EREPORT_ZFS_POOL; 2170 char *comment; 2171 int error; 2172 uint64_t pool_guid; 2173 nvlist_t *nvl; 2174 2175 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2176 return (SET_ERROR(EINVAL)); 2177 2178 ASSERT(spa->spa_comment == NULL); 2179 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2180 spa->spa_comment = spa_strdup(comment); 2181 2182 /* 2183 * Versioning wasn't explicitly added to the label until later, so if 2184 * it's not present treat it as the initial version. 2185 */ 2186 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2187 &spa->spa_ubsync.ub_version) != 0) 2188 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2189 2190 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2191 &spa->spa_config_txg); 2192 2193 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2194 spa_guid_exists(pool_guid, 0)) { 2195 error = SET_ERROR(EEXIST); 2196 } else { 2197 spa->spa_config_guid = pool_guid; 2198 2199 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2200 &nvl) == 0) { 2201 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2202 KM_SLEEP) == 0); 2203 } 2204 2205 nvlist_free(spa->spa_load_info); 2206 spa->spa_load_info = fnvlist_alloc(); 2207 2208 gethrestime(&spa->spa_loaded_ts); 2209 error = spa_load_impl(spa, pool_guid, config, state, type, 2210 mosconfig, &ereport); 2211 } 2212 2213 /* 2214 * Don't count references from objsets that are already closed 2215 * and are making their way through the eviction process. 2216 */ 2217 spa_evicting_os_wait(spa); 2218 spa->spa_minref = refcount_count(&spa->spa_refcount); 2219 if (error) { 2220 if (error != EEXIST) { 2221 spa->spa_loaded_ts.tv_sec = 0; 2222 spa->spa_loaded_ts.tv_nsec = 0; 2223 } 2224 if (error != EBADF) { 2225 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2226 } 2227 } 2228 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2229 spa->spa_ena = 0; 2230 2231 return (error); 2232} 2233 2234/* 2235 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2236 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2237 * spa's per-vdev ZAP list. 2238 */ 2239static uint64_t 2240vdev_count_verify_zaps(vdev_t *vd) 2241{ 2242 spa_t *spa = vd->vdev_spa; 2243 uint64_t total = 0; 2244 if (vd->vdev_top_zap != 0) { 2245 total++; 2246 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2247 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2248 } 2249 if (vd->vdev_leaf_zap != 0) { 2250 total++; 2251 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2252 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2253 } 2254 2255 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2256 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2257 } 2258 2259 return (total); 2260} 2261 2262/* 2263 * Load an existing storage pool, using the pool's builtin spa_config as a 2264 * source of configuration information. 
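 *
 * Note that spa_load_impl() may effectively run twice per load: the first
 * pass (mosconfig == B_FALSE) works from the untrusted cached config in
 * read-only mode, and once the authoritative config has been read from
 * the MOS the function restarts the load against it, roughly:
 *
 *	spa_config_set(spa, nvconfig);
 *	spa_unload(spa);
 *	spa_deactivate(spa);
 *	spa_activate(spa, orig_mode);
 *	return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));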
2265 */ 2266static int 2267spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2268 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2269 char **ereport) 2270{ 2271 int error = 0; 2272 nvlist_t *nvroot = NULL; 2273 nvlist_t *label; 2274 vdev_t *rvd; 2275 uberblock_t *ub = &spa->spa_uberblock; 2276 uint64_t children, config_cache_txg = spa->spa_config_txg; 2277 int orig_mode = spa->spa_mode; 2278 int parse; 2279 uint64_t obj; 2280 boolean_t missing_feat_write = B_FALSE; 2281 2282 /* 2283 * If this is an untrusted config, access the pool in read-only mode. 2284 * This prevents things like resilvering recently removed devices. 2285 */ 2286 if (!mosconfig) 2287 spa->spa_mode = FREAD; 2288 2289 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2290 2291 spa->spa_load_state = state; 2292 2293 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2294 return (SET_ERROR(EINVAL)); 2295 2296 parse = (type == SPA_IMPORT_EXISTING ? 2297 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2298 2299 /* 2300 * Create "The Godfather" zio to hold all async IOs 2301 */ 2302 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2303 KM_SLEEP); 2304 for (int i = 0; i < max_ncpus; i++) { 2305 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2306 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2307 ZIO_FLAG_GODFATHER); 2308 } 2309 2310 /* 2311 * Parse the configuration into a vdev tree. We explicitly set the 2312 * value that will be returned by spa_version() since parsing the 2313 * configuration requires knowing the version number. 2314 */ 2315 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2316 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2317 spa_config_exit(spa, SCL_ALL, FTAG); 2318 2319 if (error != 0) 2320 return (error); 2321 2322 ASSERT(spa->spa_root_vdev == rvd); 2323 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2324 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2325 2326 if (type != SPA_IMPORT_ASSEMBLE) { 2327 ASSERT(spa_guid(spa) == pool_guid); 2328 } 2329 2330 /* 2331 * Try to open all vdevs, loading each label in the process. 2332 */ 2333 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2334 error = vdev_open(rvd); 2335 spa_config_exit(spa, SCL_ALL, FTAG); 2336 if (error != 0) 2337 return (error); 2338 2339 /* 2340 * We need to validate the vdev labels against the configuration that 2341 * we have in hand, which is dependent on the setting of mosconfig. If 2342 * mosconfig is true then we're validating the vdev labels based on 2343 * that config. Otherwise, we're validating against the cached config 2344 * (zpool.cache) that was read when we loaded the zfs module, and then 2345 * later we will recursively call spa_load() and validate against 2346 * the vdev config. 2347 * 2348 * If we're assembling a new pool that's been split off from an 2349 * existing pool, the labels haven't yet been updated so we skip 2350 * validation for now. 2351 */ 2352 if (type != SPA_IMPORT_ASSEMBLE) { 2353 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2354 error = vdev_validate(rvd, mosconfig); 2355 spa_config_exit(spa, SCL_ALL, FTAG); 2356 2357 if (error != 0) 2358 return (error); 2359 2360 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2361 return (SET_ERROR(ENXIO)); 2362 } 2363 2364 /* 2365 * Find the best uberblock. 2366 */ 2367 vdev_uberblock_load(rvd, ub, &label); 2368 2369 /* 2370 * If we weren't able to find a single valid uberblock, return failure. 
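 * (vdev_uberblock_load() hands back a zeroed-out uberblock when no valid
 * copy was found, so the ub_txg == 0 test below serves as the "nothing
 * found" sentinel.)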
2371 */ 2372 if (ub->ub_txg == 0) { 2373 nvlist_free(label); 2374 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2375 } 2376 2377 /* 2378 * If the pool has an unsupported version we can't open it. 2379 */ 2380 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2381 nvlist_free(label); 2382 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2383 } 2384 2385 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2386 nvlist_t *features; 2387 2388 /* 2389 * If we weren't able to find what's necessary for reading the 2390 * MOS in the label, return failure. 2391 */ 2392 if (label == NULL || nvlist_lookup_nvlist(label, 2393 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2394 nvlist_free(label); 2395 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2396 ENXIO)); 2397 } 2398 2399 /* 2400 * Update our in-core representation with the definitive values 2401 * from the label. 2402 */ 2403 nvlist_free(spa->spa_label_features); 2404 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2405 } 2406 2407 nvlist_free(label); 2408 2409 /* 2410 * Look through entries in the label nvlist's features_for_read. If 2411 * there is a feature listed there which we don't understand then we 2412 * cannot open a pool. 2413 */ 2414 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2415 nvlist_t *unsup_feat; 2416 2417 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2418 0); 2419 2420 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2421 NULL); nvp != NULL; 2422 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2423 if (!zfeature_is_supported(nvpair_name(nvp))) { 2424 VERIFY(nvlist_add_string(unsup_feat, 2425 nvpair_name(nvp), "") == 0); 2426 } 2427 } 2428 2429 if (!nvlist_empty(unsup_feat)) { 2430 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2431 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2432 nvlist_free(unsup_feat); 2433 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2434 ENOTSUP)); 2435 } 2436 2437 nvlist_free(unsup_feat); 2438 } 2439 2440 /* 2441 * If the vdev guid sum doesn't match the uberblock, we have an 2442 * incomplete configuration. We first check to see if the pool 2443 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2444 * If it is, defer the vdev_guid_sum check till later so we 2445 * can handle missing vdevs. 2446 */ 2447 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2448 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2449 rvd->vdev_guid_sum != ub->ub_guid_sum) 2450 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2451 2452 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2453 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2454 spa_try_repair(spa, config); 2455 spa_config_exit(spa, SCL_ALL, FTAG); 2456 nvlist_free(spa->spa_config_splitting); 2457 spa->spa_config_splitting = NULL; 2458 } 2459 2460 /* 2461 * Initialize internal SPA structures. 2462 */ 2463 spa->spa_state = POOL_STATE_ACTIVE; 2464 spa->spa_ubsync = spa->spa_uberblock; 2465 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2466 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2467 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2468 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2469 spa->spa_claim_max_txg = spa->spa_first_txg; 2470 spa->spa_prev_software_version = ub->ub_software_version; 2471 2472 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2473 if (error) 2474 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2475 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2476 2477 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2478 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2479 2480 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2481 boolean_t missing_feat_read = B_FALSE; 2482 nvlist_t *unsup_feat, *enabled_feat; 2483 2484 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2485 &spa->spa_feat_for_read_obj) != 0) { 2486 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2487 } 2488 2489 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2490 &spa->spa_feat_for_write_obj) != 0) { 2491 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2492 } 2493 2494 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2495 &spa->spa_feat_desc_obj) != 0) { 2496 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2497 } 2498 2499 enabled_feat = fnvlist_alloc(); 2500 unsup_feat = fnvlist_alloc(); 2501 2502 if (!spa_features_check(spa, B_FALSE, 2503 unsup_feat, enabled_feat)) 2504 missing_feat_read = B_TRUE; 2505 2506 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2507 if (!spa_features_check(spa, B_TRUE, 2508 unsup_feat, enabled_feat)) { 2509 missing_feat_write = B_TRUE; 2510 } 2511 } 2512 2513 fnvlist_add_nvlist(spa->spa_load_info, 2514 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2515 2516 if (!nvlist_empty(unsup_feat)) { 2517 fnvlist_add_nvlist(spa->spa_load_info, 2518 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2519 } 2520 2521 fnvlist_free(enabled_feat); 2522 fnvlist_free(unsup_feat); 2523 2524 if (!missing_feat_read) { 2525 fnvlist_add_boolean(spa->spa_load_info, 2526 ZPOOL_CONFIG_CAN_RDONLY); 2527 } 2528 2529 /* 2530 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2531 * twofold: to determine whether the pool is available for 2532 * import in read-write mode and (if it is not) whether the 2533 * pool is available for import in read-only mode. If the pool 2534 * is available for import in read-write mode, it is displayed 2535 * as available in userland; if it is not available for import 2536 * in read-only mode, it is displayed as unavailable in 2537 * userland. If the pool is available for import in read-only 2538 * mode but not read-write mode, it is displayed as unavailable 2539 * in userland with a special note that the pool is actually 2540 * available for open in read-only mode. 2541 * 2542 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2543 * missing a feature for write, we must first determine whether 2544 * the pool can be opened read-only before returning to 2545 * userland in order to know whether to display the 2546 * abovementioned note. 2547 */ 2548 if (missing_feat_read || (missing_feat_write && 2549 spa_writeable(spa))) { 2550 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2551 ENOTSUP)); 2552 } 2553 2554 /* 2555 * Load refcounts for ZFS features from disk into an in-memory 2556 * cache during SPA initialization. 
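 *
 * Caching the refcounts lets later feature queries be answered from
 * memory rather than with a ZAP lookup, e.g. the check a few lines
 * further down:
 *
 *	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG))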
2557 */ 2558 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2559 uint64_t refcount; 2560 2561 error = feature_get_refcount_from_disk(spa, 2562 &spa_feature_table[i], &refcount); 2563 if (error == 0) { 2564 spa->spa_feat_refcount_cache[i] = refcount; 2565 } else if (error == ENOTSUP) { 2566 spa->spa_feat_refcount_cache[i] = 2567 SPA_FEATURE_DISABLED; 2568 } else { 2569 return (spa_vdev_err(rvd, 2570 VDEV_AUX_CORRUPT_DATA, EIO)); 2571 } 2572 } 2573 } 2574 2575 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2576 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2577 &spa->spa_feat_enabled_txg_obj) != 0) 2578 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2579 } 2580 2581 spa->spa_is_initializing = B_TRUE; 2582 error = dsl_pool_open(spa->spa_dsl_pool); 2583 spa->spa_is_initializing = B_FALSE; 2584 if (error != 0) 2585 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2586 2587 if (!mosconfig) { 2588 uint64_t hostid; 2589 nvlist_t *policy = NULL, *nvconfig; 2590 2591 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2592 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2593 2594 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2595 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2596 char *hostname; 2597 unsigned long myhostid = 0; 2598 2599 VERIFY(nvlist_lookup_string(nvconfig, 2600 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2601 2602#ifdef _KERNEL 2603 myhostid = zone_get_hostid(NULL); 2604#else /* _KERNEL */ 2605 /* 2606 * We're emulating the system's hostid in userland, so 2607 * we can't use zone_get_hostid(). 2608 */ 2609 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2610#endif /* _KERNEL */ 2611 if (check_hostid && hostid != 0 && myhostid != 0 && 2612 hostid != myhostid) { 2613 nvlist_free(nvconfig); 2614 cmn_err(CE_WARN, "pool '%s' could not be " 2615 "loaded as it was last accessed by " 2616 "another system (host: %s hostid: 0x%lx). " 2617 "See: http://illumos.org/msg/ZFS-8000-EY", 2618 spa_name(spa), hostname, 2619 (unsigned long)hostid); 2620 return (SET_ERROR(EBADF)); 2621 } 2622 } 2623 if (nvlist_lookup_nvlist(spa->spa_config, 2624 ZPOOL_REWIND_POLICY, &policy) == 0) 2625 VERIFY(nvlist_add_nvlist(nvconfig, 2626 ZPOOL_REWIND_POLICY, policy) == 0); 2627 2628 spa_config_set(spa, nvconfig); 2629 spa_unload(spa); 2630 spa_deactivate(spa); 2631 spa_activate(spa, orig_mode); 2632 2633 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2634 } 2635 2636 /* Grab the secret checksum salt from the MOS. */ 2637 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2638 DMU_POOL_CHECKSUM_SALT, 1, 2639 sizeof (spa->spa_cksum_salt.zcs_bytes), 2640 spa->spa_cksum_salt.zcs_bytes); 2641 if (error == ENOENT) { 2642 /* Generate a new salt for subsequent use */ 2643 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2644 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2645 } else if (error != 0) { 2646 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2647 } 2648 2649 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2650 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2651 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2652 if (error != 0) 2653 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2654 2655 /* 2656 * Load the bit that tells us to use the new accounting function 2657 * (raid-z deflation). If we have an older pool, this will not 2658 * be present. 
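 *
 * The same pattern recurs for every optional MOS directory entry below:
 * ENOENT simply means the entry predates the feature and is tolerated,
 * while any other lookup error condemns the pool, as instantiated here:
 *
 *	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
 *	if (error != 0 && error != ENOENT)
 *		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));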
2659 */ 2660 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2661 if (error != 0 && error != ENOENT) 2662 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2663 2664 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2665 &spa->spa_creation_version); 2666 if (error != 0 && error != ENOENT) 2667 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2668 2669 /* 2670 * Load the persistent error log. If we have an older pool, this will 2671 * not be present. 2672 */ 2673 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2674 if (error != 0 && error != ENOENT) 2675 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2676 2677 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2678 &spa->spa_errlog_scrub); 2679 if (error != 0 && error != ENOENT) 2680 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2681 2682 /* 2683 * Load the history object. If we have an older pool, this 2684 * will not be present. 2685 */ 2686 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2687 if (error != 0 && error != ENOENT) 2688 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2689 2690 /* 2691 * Load the per-vdev ZAP map. If we have an older pool, this will not 2692 * be present; in this case, defer its creation to a later time to 2693 * avoid dirtying the MOS this early / out of sync context. See 2694 * spa_sync_config_object. 2695 */ 2696 2697 /* The sentinel is only available in the MOS config. */ 2698 nvlist_t *mos_config; 2699 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) 2700 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2701 2702 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 2703 &spa->spa_all_vdev_zaps); 2704 2705 if (error != ENOENT && error != 0) { 2706 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2707 } else if (error == 0 && !nvlist_exists(mos_config, 2708 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 2709 /* 2710 * An older version of ZFS overwrote the sentinel value, so 2711 * we have orphaned per-vdev ZAPs in the MOS. Defer their 2712 * destruction to later; see spa_sync_config_object. 2713 */ 2714 spa->spa_avz_action = AVZ_ACTION_DESTROY; 2715 /* 2716 * We're assuming that no vdevs have had their ZAPs created 2717 * before this. Better be sure of it. 2718 */ 2719 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 2720 } 2721 nvlist_free(mos_config); 2722 2723 /* 2724 * If we're assembling the pool from the split-off vdevs of 2725 * an existing pool, we don't want to attach the spares & cache 2726 * devices. 2727 */ 2728 2729 /* 2730 * Load any hot spares for this pool. 2731 */ 2732 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2733 if (error != 0 && error != ENOENT) 2734 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2735 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2736 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2737 if (load_nvlist(spa, spa->spa_spares.sav_object, 2738 &spa->spa_spares.sav_config) != 0) 2739 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2740 2741 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2742 spa_load_spares(spa); 2743 spa_config_exit(spa, SCL_ALL, FTAG); 2744 } else if (error == 0) { 2745 spa->spa_spares.sav_sync = B_TRUE; 2746 } 2747 2748 /* 2749 * Load any level 2 ARC devices for this pool. 
2750 */ 2751 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2752 &spa->spa_l2cache.sav_object); 2753 if (error != 0 && error != ENOENT) 2754 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2755 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2756 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2757 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2758 &spa->spa_l2cache.sav_config) != 0) 2759 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2760 2761 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2762 spa_load_l2cache(spa); 2763 spa_config_exit(spa, SCL_ALL, FTAG); 2764 } else if (error == 0) { 2765 spa->spa_l2cache.sav_sync = B_TRUE; 2766 } 2767 2768 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2769 2770 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2771 if (error && error != ENOENT) 2772 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2773 2774 if (error == 0) { 2775 uint64_t autoreplace; 2776 2777 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2778 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2779 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2780 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2781 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2782 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2783 &spa->spa_dedup_ditto); 2784 2785 spa->spa_autoreplace = (autoreplace != 0); 2786 } 2787 2788 /* 2789 * If the 'autoreplace' property is set, then post a resource notifying 2790 * the ZFS DE that it should not issue any faults for unopenable 2791 * devices. We also iterate over the vdevs, and post a sysevent for any 2792 * unopenable vdevs so that the normal autoreplace handler can take 2793 * over. 2794 */ 2795 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2796 spa_check_removed(spa->spa_root_vdev); 2797 /* 2798 * For the import case, this is done in spa_import(), because 2799 * at this point we're using the spare definitions from 2800 * the MOS config, not necessarily from the userland config. 2801 */ 2802 if (state != SPA_LOAD_IMPORT) { 2803 spa_aux_check_removed(&spa->spa_spares); 2804 spa_aux_check_removed(&spa->spa_l2cache); 2805 } 2806 } 2807 2808 /* 2809 * Load the vdev state for all toplevel vdevs. 2810 */ 2811 vdev_load(rvd); 2812 2813 /* 2814 * Propagate the leaf DTLs we just loaded all the way up the tree. 2815 */ 2816 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2817 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2818 spa_config_exit(spa, SCL_ALL, FTAG); 2819 2820 /* 2821 * Load the DDTs (dedup tables). 2822 */ 2823 error = ddt_load(spa); 2824 if (error != 0) 2825 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2826 2827 spa_update_dspace(spa); 2828 2829 /* 2830 * Validate the config, using the MOS config to fill in any 2831 * information which might be missing. If we fail to validate 2832 * the config then declare the pool unfit for use. If we're 2833 * assembling a pool from a split, the log is not transferred 2834 * over. 2835 */ 2836 if (type != SPA_IMPORT_ASSEMBLE) { 2837 nvlist_t *nvconfig; 2838 2839 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2840 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2841 2842 if (!spa_config_valid(spa, nvconfig)) { 2843 nvlist_free(nvconfig); 2844 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2845 ENXIO)); 2846 } 2847 nvlist_free(nvconfig); 2848 2849 /* 2850 * Now that we've validated the config, check the state of the 2851 * root vdev. 
If it can't be opened, it indicates one or
2852		 * more toplevel vdevs are faulted.
2853		 */
2854		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2855			return (SET_ERROR(ENXIO));
2856
2857		if (spa_writeable(spa) && spa_check_logs(spa)) {
2858			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2859			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2860		}
2861	}
2862
2863	if (missing_feat_write) {
2864		ASSERT(state == SPA_LOAD_TRYIMPORT);
2865
2866		/*
2867		 * At this point, we know that we can open the pool in
2868		 * read-only mode but not read-write mode. We now have enough
2869		 * information and can return to userland.
2870		 */
2871		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2872	}
2873
2874	/*
2875	 * We've successfully opened the pool, verify that we're ready
2876	 * to start pushing transactions.
2877	 */
2878	if (state != SPA_LOAD_TRYIMPORT) {
2879		if (error = spa_load_verify(spa))
2880			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2881			    error));
2882	}
2883
2884	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2885	    spa->spa_load_max_txg == UINT64_MAX)) {
2886		dmu_tx_t *tx;
2887		int need_update = B_FALSE;
2888		dsl_pool_t *dp = spa_get_dsl(spa);
2889
2890		ASSERT(state != SPA_LOAD_TRYIMPORT);
2891
2892		/*
2893		 * Claim log blocks that haven't been committed yet.
2894		 * This must all happen in a single txg.
2895		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2896		 * invoked from zil_claim_log_block()'s i/o done callback.
2897		 * Price of rollback is that we abandon the log.
2898		 */
2899		spa->spa_claiming = B_TRUE;
2900
2901		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
2902		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2903		    zil_claim, tx, DS_FIND_CHILDREN);
2904		dmu_tx_commit(tx);
2905
2906		spa->spa_claiming = B_FALSE;
2907
2908		spa_set_log_state(spa, SPA_LOG_GOOD);
2909		spa->spa_sync_on = B_TRUE;
2910		txg_sync_start(spa->spa_dsl_pool);
2911
2912		/*
2913		 * Wait for all claims to sync. We sync up to the highest
2914		 * claimed log block birth time so that claimed log blocks
2915		 * don't appear to be from the future. spa_claim_max_txg
2916		 * will have been set for us by either zil_check_log_chain()
2917		 * (invoked from spa_check_logs()) or zil_claim() above.
2918		 */
2919		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2920
2921		/*
2922		 * If the config cache is stale, or we have uninitialized
2923		 * metaslabs (see spa_vdev_add()), then update the config.
2924		 *
2925		 * If this is a verbatim import, trust the current
2926		 * in-core spa_config and update the disk labels.
2927		 */
2928		if (config_cache_txg != spa->spa_config_txg ||
2929		    state == SPA_LOAD_IMPORT ||
2930		    state == SPA_LOAD_RECOVER ||
2931		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2932			need_update = B_TRUE;
2933
2934		for (int c = 0; c < rvd->vdev_children; c++)
2935			if (rvd->vdev_child[c]->vdev_ms_array == 0)
2936				need_update = B_TRUE;
2937
2938		/*
2939		 * Update the config cache asynchronously in case we're the
2940		 * root pool, in which case the config cache isn't writable yet.
2941		 */
2942		if (need_update)
2943			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2944
2945		/*
2946		 * Check all DTLs to see if anything needs resilvering.
2947		 */
2948		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2949		    vdev_resilver_needed(rvd, NULL, NULL))
2950			spa_async_request(spa, SPA_ASYNC_RESILVER);
2951
2952		/*
2953		 * Log the fact that we booted up (so that we can detect if
2954		 * we rebooted in the middle of an operation).
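		 * The entry lands in the pool history, so it can later be
		 * inspected from userland with something like (pool name
		 * illustrative):
		 *
		 *	zpool history -i tank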
2955 */ 2956 spa_history_log_version(spa, "open"); 2957 2958 /* 2959 * Delete any inconsistent datasets. 2960 */ 2961 (void) dmu_objset_find(spa_name(spa), 2962 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2963 2964 /* 2965 * Clean up any stale temporary dataset userrefs. 2966 */ 2967 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2968 } 2969 2970 return (0); 2971} 2972 2973static int 2974spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2975{ 2976 int mode = spa->spa_mode; 2977 2978 spa_unload(spa); 2979 spa_deactivate(spa); 2980 2981 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2982 2983 spa_activate(spa, mode); 2984 spa_async_suspend(spa); 2985 2986 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2987} 2988 2989/* 2990 * If spa_load() fails this function will try loading prior txg's. If 2991 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2992 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2993 * function will not rewind the pool and will return the same error as 2994 * spa_load(). 2995 */ 2996static int 2997spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2998 uint64_t max_request, int rewind_flags) 2999{ 3000 nvlist_t *loadinfo = NULL; 3001 nvlist_t *config = NULL; 3002 int load_error, rewind_error; 3003 uint64_t safe_rewind_txg; 3004 uint64_t min_txg; 3005 3006 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 3007 spa->spa_load_max_txg = spa->spa_load_txg; 3008 spa_set_log_state(spa, SPA_LOG_CLEAR); 3009 } else { 3010 spa->spa_load_max_txg = max_request; 3011 if (max_request != UINT64_MAX) 3012 spa->spa_extreme_rewind = B_TRUE; 3013 } 3014 3015 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 3016 mosconfig); 3017 if (load_error == 0) 3018 return (0); 3019 3020 if (spa->spa_root_vdev != NULL) 3021 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3022 3023 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 3024 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 3025 3026 if (rewind_flags & ZPOOL_NEVER_REWIND) { 3027 nvlist_free(config); 3028 return (load_error); 3029 } 3030 3031 if (state == SPA_LOAD_RECOVER) { 3032 /* Price of rolling back is discarding txgs, including log */ 3033 spa_set_log_state(spa, SPA_LOG_CLEAR); 3034 } else { 3035 /* 3036 * If we aren't rolling back save the load info from our first 3037 * import attempt so that we can restore it after attempting 3038 * to rewind. 3039 */ 3040 loadinfo = spa->spa_load_info; 3041 spa->spa_load_info = fnvlist_alloc(); 3042 } 3043 3044 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 3045 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 3046 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
3047	    TXG_INITIAL : safe_rewind_txg;
3048
3049	/*
3050	 * Continue as long as we're finding errors, we're still within
3051	 * the acceptable rewind range, and we're still finding uberblocks
3052	 */
3053	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3054	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3055		if (spa->spa_load_max_txg < safe_rewind_txg)
3056			spa->spa_extreme_rewind = B_TRUE;
3057		rewind_error = spa_load_retry(spa, state, mosconfig);
3058	}
3059
3060	spa->spa_extreme_rewind = B_FALSE;
3061	spa->spa_load_max_txg = UINT64_MAX;
3062
3063	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3064		spa_config_set(spa, config);
3065
3066	if (state == SPA_LOAD_RECOVER) {
3067		ASSERT3P(loadinfo, ==, NULL);
3068		return (rewind_error);
3069	} else {
3070		/* Store the rewind info as part of the initial load info */
3071		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
3072		    spa->spa_load_info);
3073
3074		/* Restore the initial load info */
3075		fnvlist_free(spa->spa_load_info);
3076		spa->spa_load_info = loadinfo;
3077
3078		return (load_error);
3079	}
3080}
3081
3082/*
3083 * Pool Open/Import
3084 *
3085 * The import case is identical to an open except that the configuration is sent
3086 * down from userland, instead of grabbed from the configuration cache. For the
3087 * case of an open, the pool configuration will exist in the
3088 * POOL_STATE_UNINITIALIZED state.
3089 *
3090 * The stats information (gen/count/ustats) is used to gather vdev statistics at
3091 * the same time we open the pool, without having to keep around the spa_t in
3092 * some ambiguous state.
3093 */
3094static int
3095spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
3096    nvlist_t **config)
3097{
3098	spa_t *spa;
3099	spa_load_state_t state = SPA_LOAD_OPEN;
3100	int error;
3101	int locked = B_FALSE;
3102	int firstopen = B_FALSE;
3103
3104	*spapp = NULL;
3105
3106	/*
3107	 * As disgusting as this is, we need to support recursive calls to this
3108	 * function because dsl_dir_open() is called during spa_load(), and ends
3109	 * up calling spa_open() again. The real fix is to figure out how to
3110	 * avoid dsl_dir_open() calling this in the first place.
3111	 */
3112	if (mutex_owner(&spa_namespace_lock) != curthread) {
3113		mutex_enter(&spa_namespace_lock);
3114		locked = B_TRUE;
3115	}
3116
3117	if ((spa = spa_lookup(pool)) == NULL) {
3118		if (locked)
3119			mutex_exit(&spa_namespace_lock);
3120		return (SET_ERROR(ENOENT));
3121	}
3122
3123	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
3124		zpool_rewind_policy_t policy;
3125
3126		firstopen = B_TRUE;
3127
3128		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
3129		    &policy);
3130		if (policy.zrp_request & ZPOOL_DO_REWIND)
3131			state = SPA_LOAD_RECOVER;
3132
3133		spa_activate(spa, spa_mode_global);
3134
3135		if (state != SPA_LOAD_RECOVER)
3136			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3137
3138		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
3139		    policy.zrp_request);
3140
3141		if (error == EBADF) {
3142			/*
3143			 * If vdev_validate() returns failure (indicated by
3144			 * EBADF), one of the vdev labels indicates that
3145			 * the pool has been exported or destroyed. If
3146			 * this is the case, the config cache is out of sync and
3147			 * we should remove the pool from the namespace.
3148 */ 3149 spa_unload(spa); 3150 spa_deactivate(spa); 3151 spa_config_sync(spa, B_TRUE, B_TRUE); 3152 spa_remove(spa); 3153 if (locked) 3154 mutex_exit(&spa_namespace_lock); 3155 return (SET_ERROR(ENOENT)); 3156 } 3157 3158 if (error) { 3159 /* 3160 * We can't open the pool, but we still have useful 3161 * information: the state of each vdev after the 3162 * attempted vdev_open(). Return this to the user. 3163 */ 3164 if (config != NULL && spa->spa_config) { 3165 VERIFY(nvlist_dup(spa->spa_config, config, 3166 KM_SLEEP) == 0); 3167 VERIFY(nvlist_add_nvlist(*config, 3168 ZPOOL_CONFIG_LOAD_INFO, 3169 spa->spa_load_info) == 0); 3170 } 3171 spa_unload(spa); 3172 spa_deactivate(spa); 3173 spa->spa_last_open_failed = error; 3174 if (locked) 3175 mutex_exit(&spa_namespace_lock); 3176 *spapp = NULL; 3177 return (error); 3178 } 3179 } 3180 3181 spa_open_ref(spa, tag); 3182 3183 if (config != NULL) 3184 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3185 3186 /* 3187 * If we've recovered the pool, pass back any information we 3188 * gathered while doing the load. 3189 */ 3190 if (state == SPA_LOAD_RECOVER) { 3191 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3192 spa->spa_load_info) == 0); 3193 } 3194 3195 if (locked) { 3196 spa->spa_last_open_failed = 0; 3197 spa->spa_last_ubsync_txg = 0; 3198 spa->spa_load_txg = 0; 3199 mutex_exit(&spa_namespace_lock); 3200#ifdef __FreeBSD__ 3201#ifdef _KERNEL 3202 if (firstopen) 3203 zvol_create_minors(spa->spa_name); 3204#endif 3205#endif 3206 } 3207 3208 *spapp = spa; 3209 3210 return (0); 3211} 3212 3213int 3214spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3215 nvlist_t **config) 3216{ 3217 return (spa_open_common(name, spapp, tag, policy, config)); 3218} 3219 3220int 3221spa_open(const char *name, spa_t **spapp, void *tag) 3222{ 3223 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3224} 3225 3226/* 3227 * Lookup the given spa_t, incrementing the inject count in the process, 3228 * preventing it from being exported or destroyed. 3229 */ 3230spa_t * 3231spa_inject_addref(char *name) 3232{ 3233 spa_t *spa; 3234 3235 mutex_enter(&spa_namespace_lock); 3236 if ((spa = spa_lookup(name)) == NULL) { 3237 mutex_exit(&spa_namespace_lock); 3238 return (NULL); 3239 } 3240 spa->spa_inject_ref++; 3241 mutex_exit(&spa_namespace_lock); 3242 3243 return (spa); 3244} 3245 3246void 3247spa_inject_delref(spa_t *spa) 3248{ 3249 mutex_enter(&spa_namespace_lock); 3250 spa->spa_inject_ref--; 3251 mutex_exit(&spa_namespace_lock); 3252} 3253 3254/* 3255 * Add spares device information to the nvlist. 3256 */ 3257static void 3258spa_add_spares(spa_t *spa, nvlist_t *config) 3259{ 3260 nvlist_t **spares; 3261 uint_t i, nspares; 3262 nvlist_t *nvroot; 3263 uint64_t guid; 3264 vdev_stat_t *vs; 3265 uint_t vsc; 3266 uint64_t pool; 3267 3268 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3269 3270 if (spa->spa_spares.sav_count == 0) 3271 return; 3272 3273 VERIFY(nvlist_lookup_nvlist(config, 3274 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3275 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3276 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3277 if (nspares != 0) { 3278 VERIFY(nvlist_add_nvlist_array(nvroot, 3279 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3280 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3281 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3282 3283 /* 3284 * Go through and find any spares which have since been 3285 * repurposed as an active spare. 
If this is the case, update 3286 * their status appropriately. 3287 */ 3288 for (i = 0; i < nspares; i++) { 3289 VERIFY(nvlist_lookup_uint64(spares[i], 3290 ZPOOL_CONFIG_GUID, &guid) == 0); 3291 if (spa_spare_exists(guid, &pool, NULL) && 3292 pool != 0ULL) { 3293 VERIFY(nvlist_lookup_uint64_array( 3294 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3295 (uint64_t **)&vs, &vsc) == 0); 3296 vs->vs_state = VDEV_STATE_CANT_OPEN; 3297 vs->vs_aux = VDEV_AUX_SPARED; 3298 } 3299 } 3300 } 3301} 3302 3303/* 3304 * Add l2cache device information to the nvlist, including vdev stats. 3305 */ 3306static void 3307spa_add_l2cache(spa_t *spa, nvlist_t *config) 3308{ 3309 nvlist_t **l2cache; 3310 uint_t i, j, nl2cache; 3311 nvlist_t *nvroot; 3312 uint64_t guid; 3313 vdev_t *vd; 3314 vdev_stat_t *vs; 3315 uint_t vsc; 3316 3317 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3318 3319 if (spa->spa_l2cache.sav_count == 0) 3320 return; 3321 3322 VERIFY(nvlist_lookup_nvlist(config, 3323 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3324 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3325 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3326 if (nl2cache != 0) { 3327 VERIFY(nvlist_add_nvlist_array(nvroot, 3328 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3329 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3330 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3331 3332 /* 3333 * Update level 2 cache device stats. 3334 */ 3335 3336 for (i = 0; i < nl2cache; i++) { 3337 VERIFY(nvlist_lookup_uint64(l2cache[i], 3338 ZPOOL_CONFIG_GUID, &guid) == 0); 3339 3340 vd = NULL; 3341 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3342 if (guid == 3343 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3344 vd = spa->spa_l2cache.sav_vdevs[j]; 3345 break; 3346 } 3347 } 3348 ASSERT(vd != NULL); 3349 3350 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3351 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3352 == 0); 3353 vdev_get_stats(vd, vs); 3354 } 3355 } 3356} 3357 3358static void 3359spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3360{ 3361 nvlist_t *features; 3362 zap_cursor_t zc; 3363 zap_attribute_t za; 3364 3365 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3366 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3367 3368 /* We may be unable to read features if pool is suspended. 
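 * In that case we skip the ZAP walks below and return an empty (but
 * present) ZPOOL_CONFIG_FEATURE_STATS nvlist instead of risking a read
 * that would block on the suspended pool.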
*/ 3369 if (spa_suspended(spa)) 3370 goto out; 3371 3372 if (spa->spa_feat_for_read_obj != 0) { 3373 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3374 spa->spa_feat_for_read_obj); 3375 zap_cursor_retrieve(&zc, &za) == 0; 3376 zap_cursor_advance(&zc)) { 3377 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3378 za.za_num_integers == 1); 3379 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3380 za.za_first_integer)); 3381 } 3382 zap_cursor_fini(&zc); 3383 } 3384 3385 if (spa->spa_feat_for_write_obj != 0) { 3386 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3387 spa->spa_feat_for_write_obj); 3388 zap_cursor_retrieve(&zc, &za) == 0; 3389 zap_cursor_advance(&zc)) { 3390 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3391 za.za_num_integers == 1); 3392 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3393 za.za_first_integer)); 3394 } 3395 zap_cursor_fini(&zc); 3396 } 3397 3398out: 3399 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3400 features) == 0); 3401 nvlist_free(features); 3402} 3403 3404int 3405spa_get_stats(const char *name, nvlist_t **config, 3406 char *altroot, size_t buflen) 3407{ 3408 int error; 3409 spa_t *spa; 3410 3411 *config = NULL; 3412 error = spa_open_common(name, &spa, FTAG, NULL, config); 3413 3414 if (spa != NULL) { 3415 /* 3416 * This still leaves a window of inconsistency where the spares 3417 * or l2cache devices could change and the config would be 3418 * self-inconsistent. 3419 */ 3420 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3421 3422 if (*config != NULL) { 3423 uint64_t loadtimes[2]; 3424 3425 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3426 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3427 VERIFY(nvlist_add_uint64_array(*config, 3428 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3429 3430 VERIFY(nvlist_add_uint64(*config, 3431 ZPOOL_CONFIG_ERRCOUNT, 3432 spa_get_errlog_size(spa)) == 0); 3433 3434 if (spa_suspended(spa)) 3435 VERIFY(nvlist_add_uint64(*config, 3436 ZPOOL_CONFIG_SUSPENDED, 3437 spa->spa_failmode) == 0); 3438 3439 spa_add_spares(spa, *config); 3440 spa_add_l2cache(spa, *config); 3441 spa_add_feature_stats(spa, *config); 3442 } 3443 } 3444 3445 /* 3446 * We want to get the alternate root even for faulted pools, so we cheat 3447 * and call spa_lookup() directly. 3448 */ 3449 if (altroot) { 3450 if (spa == NULL) { 3451 mutex_enter(&spa_namespace_lock); 3452 spa = spa_lookup(name); 3453 if (spa) 3454 spa_altroot(spa, altroot, buflen); 3455 else 3456 altroot[0] = '\0'; 3457 spa = NULL; 3458 mutex_exit(&spa_namespace_lock); 3459 } else { 3460 spa_altroot(spa, altroot, buflen); 3461 } 3462 } 3463 3464 if (spa != NULL) { 3465 spa_config_exit(spa, SCL_CONFIG, FTAG); 3466 spa_close(spa, FTAG); 3467 } 3468 3469 return (error); 3470} 3471 3472/* 3473 * Validate that the auxiliary device array is well formed. We must have an 3474 * array of nvlists, each which describes a valid leaf vdev. If this is an 3475 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3476 * specified, as long as they are well-formed. 3477 */ 3478static int 3479spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3480 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3481 vdev_labeltype_t label) 3482{ 3483 nvlist_t **dev; 3484 uint_t i, ndev; 3485 vdev_t *vd; 3486 int error; 3487 3488 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3489 3490 /* 3491 * It's acceptable to have no devs specified. 
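 * (A present-but-empty array, on the other hand, is rejected with EINVAL
 * just below.)  The nvroot layout expected here is the one userland
 * builds for "zpool create"/"zpool add"; sketched with a hypothetical
 * device path:
 *
 *	ZPOOL_CONFIG_SPARES -> [ { type = "disk", path = "/dev/da2" }, ... ]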
3492	 */
3493	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3494		return (0);
3495
3496	if (ndev == 0)
3497		return (SET_ERROR(EINVAL));
3498
3499	/*
3500	 * Make sure the pool is formatted with a version that supports this
3501	 * device type.
3502	 */
3503	if (spa_version(spa) < version)
3504		return (SET_ERROR(ENOTSUP));
3505
3506	/*
3507	 * Set the pending device list so we correctly handle device in-use
3508	 * checking.
3509	 */
3510	sav->sav_pending = dev;
3511	sav->sav_npending = ndev;
3512
3513	for (i = 0; i < ndev; i++) {
3514		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3515		    mode)) != 0)
3516			goto out;
3517
3518		if (!vd->vdev_ops->vdev_op_leaf) {
3519			vdev_free(vd);
3520			error = SET_ERROR(EINVAL);
3521			goto out;
3522		}
3523
3524		/*
3525		 * The L2ARC currently only supports disk devices in
3526		 * kernel context. For user-level testing, we allow it.
3527		 */
3528	#ifdef _KERNEL
3529		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3530		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3531			error = SET_ERROR(ENOTBLK);
3532			vdev_free(vd);
3533			goto out;
3534		}
3535	#endif
3536		vd->vdev_top = vd;
3537
3538		if ((error = vdev_open(vd)) == 0 &&
3539		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
3540			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3541			    vd->vdev_guid) == 0);
3542		}
3543
3544		vdev_free(vd);
3545
3546		if (error &&
3547		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3548			goto out;
3549		else
3550			error = 0;
3551	}
3552
3553out:
3554	sav->sav_pending = NULL;
3555	sav->sav_npending = 0;
3556	return (error);
3557}
3558
3559static int
3560spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3561{
3562	int error;
3563
3564	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3565
3566	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3567	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3568	    VDEV_LABEL_SPARE)) != 0) {
3569		return (error);
3570	}
3571
3572	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3573	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3574	    VDEV_LABEL_L2CACHE));
3575}
3576
3577static void
3578spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3579    const char *config)
3580{
3581	int i;
3582
3583	if (sav->sav_config != NULL) {
3584		nvlist_t **olddevs;
3585		uint_t oldndevs;
3586		nvlist_t **newdevs;
3587
3588		/*
3589		 * Generate new dev list by concatenating with the
3590		 * current dev list.
3591		 */
3592		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3593		    &olddevs, &oldndevs) == 0);
3594
3595		newdevs = kmem_alloc(sizeof (void *) *
3596		    (ndevs + oldndevs), KM_SLEEP);
3597		for (i = 0; i < oldndevs; i++)
3598			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3599			    KM_SLEEP) == 0);
3600		for (i = 0; i < ndevs; i++)
3601			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3602			    KM_SLEEP) == 0);
3603
3604		VERIFY(nvlist_remove(sav->sav_config, config,
3605		    DATA_TYPE_NVLIST_ARRAY) == 0);
3606
3607		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3608		    config, newdevs, ndevs + oldndevs) == 0);
3609		for (i = 0; i < oldndevs + ndevs; i++)
3610			nvlist_free(newdevs[i]);
3611		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3612	} else {
3613		/*
3614		 * Generate a new dev list.
3615 */ 3616 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3617 KM_SLEEP) == 0); 3618 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3619 devs, ndevs) == 0); 3620 } 3621} 3622 3623/* 3624 * Stop and drop level 2 ARC devices 3625 */ 3626void 3627spa_l2cache_drop(spa_t *spa) 3628{ 3629 vdev_t *vd; 3630 int i; 3631 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3632 3633 for (i = 0; i < sav->sav_count; i++) { 3634 uint64_t pool; 3635 3636 vd = sav->sav_vdevs[i]; 3637 ASSERT(vd != NULL); 3638 3639 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3640 pool != 0ULL && l2arc_vdev_present(vd)) 3641 l2arc_remove_vdev(vd); 3642 } 3643} 3644 3645/* 3646 * Pool Creation 3647 */ 3648int 3649spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3650 nvlist_t *zplprops) 3651{ 3652 spa_t *spa; 3653 char *altroot = NULL; 3654 vdev_t *rvd; 3655 dsl_pool_t *dp; 3656 dmu_tx_t *tx; 3657 int error = 0; 3658 uint64_t txg = TXG_INITIAL; 3659 nvlist_t **spares, **l2cache; 3660 uint_t nspares, nl2cache; 3661 uint64_t version, obj; 3662 boolean_t has_features; 3663 3664 /* 3665 * If this pool already exists, return failure. 3666 */ 3667 mutex_enter(&spa_namespace_lock); 3668 if (spa_lookup(pool) != NULL) { 3669 mutex_exit(&spa_namespace_lock); 3670 return (SET_ERROR(EEXIST)); 3671 } 3672 3673 /* 3674 * Allocate a new spa_t structure. 3675 */ 3676 (void) nvlist_lookup_string(props, 3677 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3678 spa = spa_add(pool, NULL, altroot); 3679 spa_activate(spa, spa_mode_global); 3680 3681 if (props && (error = spa_prop_validate(spa, props))) { 3682 spa_deactivate(spa); 3683 spa_remove(spa); 3684 mutex_exit(&spa_namespace_lock); 3685 return (error); 3686 } 3687 3688 has_features = B_FALSE; 3689 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3690 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3691 if (zpool_prop_feature(nvpair_name(elem))) 3692 has_features = B_TRUE; 3693 } 3694 3695 if (has_features || nvlist_lookup_uint64(props, 3696 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3697 version = SPA_VERSION; 3698 } 3699 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3700 3701 spa->spa_first_txg = txg; 3702 spa->spa_uberblock.ub_txg = txg - 1; 3703 spa->spa_uberblock.ub_version = version; 3704 spa->spa_ubsync = spa->spa_uberblock; 3705 3706 /* 3707 * Create "The Godfather" zio to hold all async IOs 3708 */ 3709 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3710 KM_SLEEP); 3711 for (int i = 0; i < max_ncpus; i++) { 3712 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3713 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3714 ZIO_FLAG_GODFATHER); 3715 } 3716 3717 /* 3718 * Create the root vdev. 
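 *
 * The nvroot handed to spa_create() is the tree userland assembled from
 * the "zpool create" command line; a minimal single-disk pool would
 * arrive roughly as (sketch, path hypothetical):
 *
 *	{ type = "root",
 *	  children = [ { type = "disk", path = "/dev/da0" } ] }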
3719 */ 3720 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3721 3722 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3723 3724 ASSERT(error != 0 || rvd != NULL); 3725 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3726 3727 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3728 error = SET_ERROR(EINVAL); 3729 3730 if (error == 0 && 3731 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3732 (error = spa_validate_aux(spa, nvroot, txg, 3733 VDEV_ALLOC_ADD)) == 0) { 3734 for (int c = 0; c < rvd->vdev_children; c++) { 3735 vdev_ashift_optimize(rvd->vdev_child[c]); 3736 vdev_metaslab_set_size(rvd->vdev_child[c]); 3737 vdev_expand(rvd->vdev_child[c], txg); 3738 } 3739 } 3740 3741 spa_config_exit(spa, SCL_ALL, FTAG); 3742 3743 if (error != 0) { 3744 spa_unload(spa); 3745 spa_deactivate(spa); 3746 spa_remove(spa); 3747 mutex_exit(&spa_namespace_lock); 3748 return (error); 3749 } 3750 3751 /* 3752 * Get the list of spares, if specified. 3753 */ 3754 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3755 &spares, &nspares) == 0) { 3756 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3757 KM_SLEEP) == 0); 3758 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3759 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3760 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3761 spa_load_spares(spa); 3762 spa_config_exit(spa, SCL_ALL, FTAG); 3763 spa->spa_spares.sav_sync = B_TRUE; 3764 } 3765 3766 /* 3767 * Get the list of level 2 cache devices, if specified. 3768 */ 3769 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3770 &l2cache, &nl2cache) == 0) { 3771 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3772 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3773 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3774 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3775 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3776 spa_load_l2cache(spa); 3777 spa_config_exit(spa, SCL_ALL, FTAG); 3778 spa->spa_l2cache.sav_sync = B_TRUE; 3779 } 3780 3781 spa->spa_is_initializing = B_TRUE; 3782 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3783 spa->spa_meta_objset = dp->dp_meta_objset; 3784 spa->spa_is_initializing = B_FALSE; 3785 3786 /* 3787 * Create DDTs (dedup tables). 3788 */ 3789 ddt_create(spa); 3790 3791 spa_update_dspace(spa); 3792 3793 tx = dmu_tx_create_assigned(dp, txg); 3794 3795 /* 3796 * Create the pool config object. 3797 */ 3798 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3799 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3800 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3801 3802 if (zap_add(spa->spa_meta_objset, 3803 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3804 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3805 cmn_err(CE_PANIC, "failed to add pool config"); 3806 } 3807 3808 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3809 spa_feature_create_zap_objects(spa, tx); 3810 3811 if (zap_add(spa->spa_meta_objset, 3812 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3813 sizeof (uint64_t), 1, &version, tx) != 0) { 3814 cmn_err(CE_PANIC, "failed to add pool version"); 3815 } 3816 3817 /* Newly created pools with the right version are always deflated. 
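 * ("Deflated" refers to the raid-z deflation accounting whose presence
 * spa_load_impl() probes via DMU_POOL_DEFLATE above.)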
*/ 3818 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3819 spa->spa_deflate = TRUE; 3820 if (zap_add(spa->spa_meta_objset, 3821 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3822 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3823 cmn_err(CE_PANIC, "failed to add deflate"); 3824 } 3825 } 3826 3827 /* 3828 * Create the deferred-free bpobj. Turn off compression 3829 * because sync-to-convergence takes longer if the blocksize 3830 * keeps changing. 3831 */ 3832 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3833 dmu_object_set_compress(spa->spa_meta_objset, obj, 3834 ZIO_COMPRESS_OFF, tx); 3835 if (zap_add(spa->spa_meta_objset, 3836 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3837 sizeof (uint64_t), 1, &obj, tx) != 0) { 3838 cmn_err(CE_PANIC, "failed to add bpobj"); 3839 } 3840 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3841 spa->spa_meta_objset, obj)); 3842 3843 /* 3844 * Create the pool's history object. 3845 */ 3846 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3847 spa_history_create_obj(spa, tx); 3848 3849 /* 3850 * Generate some random noise for salted checksums to operate on. 3851 */ 3852 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3853 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3854 3855 /* 3856 * Set pool properties. 3857 */ 3858 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3859 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3860 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3861 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3862 3863 if (props != NULL) { 3864 spa_configfile_set(spa, props, B_FALSE); 3865 spa_sync_props(props, tx); 3866 } 3867 3868 dmu_tx_commit(tx); 3869 3870 spa->spa_sync_on = B_TRUE; 3871 txg_sync_start(spa->spa_dsl_pool); 3872 3873 /* 3874 * We explicitly wait for the first transaction to complete so that our 3875 * bean counters are appropriately updated. 3876 */ 3877 txg_wait_synced(spa->spa_dsl_pool, txg); 3878 3879 spa_config_sync(spa, B_FALSE, B_TRUE); 3880 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3881 3882 spa_history_log_version(spa, "create"); 3883 3884 /* 3885 * Don't count references from objsets that are already closed 3886 * and are making their way through the eviction process. 3887 */ 3888 spa_evicting_os_wait(spa); 3889 spa->spa_minref = refcount_count(&spa->spa_refcount); 3890 3891 mutex_exit(&spa_namespace_lock); 3892 3893 return (0); 3894} 3895 3896#ifdef _KERNEL 3897#ifdef illumos 3898/* 3899 * Get the root pool information from the root disk, then import the root pool 3900 * during the system boot up time. 3901 */ 3902extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3903 3904static nvlist_t * 3905spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3906{ 3907 nvlist_t *config; 3908 nvlist_t *nvtop, *nvroot; 3909 uint64_t pgid; 3910 3911 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3912 return (NULL); 3913 3914 /* 3915 * Add this top-level vdev to the child array. 3916 */ 3917 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3918 &nvtop) == 0); 3919 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3920 &pgid) == 0); 3921 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3922 3923 /* 3924 * Put this pool's top-level vdevs into a root vdev. 
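 *
 * The result mirrors the vdev tree of an ordinary config:
 *
 *	{ type = "root", id = 0, guid = <pool guid>,
 *	  children = [ <the top-level vdev from the boot label> ] }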
3925 */ 3926 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3927 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3928 VDEV_TYPE_ROOT) == 0); 3929 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3930 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3931 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3932 &nvtop, 1) == 0); 3933 3934 /* 3935 * Replace the existing vdev_tree with the new root vdev in 3936 * this pool's configuration (remove the old, add the new). 3937 */ 3938 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3939 nvlist_free(nvroot); 3940 return (config); 3941} 3942 3943/* 3944 * Walk the vdev tree and see if we can find a device with "better" 3945 * configuration. A configuration is "better" if the label on that 3946 * device has a more recent txg. 3947 */ 3948static void 3949spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3950{ 3951 for (int c = 0; c < vd->vdev_children; c++) 3952 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3953 3954 if (vd->vdev_ops->vdev_op_leaf) { 3955 nvlist_t *label; 3956 uint64_t label_txg; 3957 3958 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3959 &label) != 0) 3960 return; 3961 3962 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3963 &label_txg) == 0); 3964 3965 /* 3966 * Do we have a better boot device? 3967 */ 3968 if (label_txg > *txg) { 3969 *txg = label_txg; 3970 *avd = vd; 3971 } 3972 nvlist_free(label); 3973 } 3974} 3975 3976/* 3977 * Import a root pool. 3978 * 3979 * For x86, devpath_list will consist of the devid and/or physpath name of 3980 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3981 * The GRUB "findroot" command will return the vdev we should boot. 3982 * 3983 * For SPARC, devpath_list consists of the physpath name of the booting 3984 * device, no matter whether the root pool is a single-device pool or a 3985 * mirrored pool, e.g. 3986 * "/pci@1f,0/ide@d/disk@0,0:a" 3987 */ 3988int 3989spa_import_rootpool(char *devpath, char *devid) 3990{ 3991 spa_t *spa; 3992 vdev_t *rvd, *bvd, *avd = NULL; 3993 nvlist_t *config, *nvtop; 3994 uint64_t guid, txg; 3995 char *pname; 3996 int error; 3997 3998 /* 3999 * Read the label from the boot device and generate a configuration. 4000 */ 4001 config = spa_generate_rootconf(devpath, devid, &guid); 4002#if defined(_OBP) && defined(_KERNEL) 4003 if (config == NULL) { 4004 if (strstr(devpath, "/iscsi/ssd") != NULL) { 4005 /* iscsi boot */ 4006 get_iscsi_bootpath_phy(devpath); 4007 config = spa_generate_rootconf(devpath, devid, &guid); 4008 } 4009 } 4010#endif 4011 if (config == NULL) { 4012 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 4013 devpath); 4014 return (SET_ERROR(EIO)); 4015 } 4016 4017 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4018 &pname) == 0); 4019 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 4020 4021 mutex_enter(&spa_namespace_lock); 4022 if ((spa = spa_lookup(pname)) != NULL) { 4023 /* 4024 * Remove the existing root pool from the namespace so that we 4025 * can replace it with the correct config we just read in. 4026 */ 4027 spa_remove(spa); 4028 } 4029 4030 spa = spa_add(pname, config, NULL); 4031 spa->spa_is_root = B_TRUE; 4032 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4033 4034 /* 4035 * Build up a vdev tree based on the boot device's label config.
4036 */ 4037 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4038 &nvtop) == 0); 4039 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4040 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4041 VDEV_ALLOC_ROOTPOOL); 4042 spa_config_exit(spa, SCL_ALL, FTAG); 4043 if (error) { 4044 mutex_exit(&spa_namespace_lock); 4045 nvlist_free(config); 4046 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4047 pname); 4048 return (error); 4049 } 4050 4051 /* 4052 * Get the boot vdev. 4053 */ 4054 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 4055 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 4056 (u_longlong_t)guid); 4057 error = SET_ERROR(ENOENT); 4058 goto out; 4059 } 4060 4061 /* 4062 * Determine if there is a better boot device. 4063 */ 4064 avd = bvd; 4065 spa_alt_rootvdev(rvd, &avd, &txg); 4066 if (avd != bvd) { 4067 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4068 "try booting from '%s'", avd->vdev_path); 4069 error = SET_ERROR(EINVAL); 4070 goto out; 4071 } 4072 4073 /* 4074 * If the boot device is part of a spare vdev then ensure that 4075 * we're booting off the active spare. 4076 */ 4077 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4078 !bvd->vdev_isspare) { 4079 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 4080 "try booting from '%s'", 4081 bvd->vdev_parent-> 4082 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4083 error = SET_ERROR(EINVAL); 4084 goto out; 4085 } 4086 4087 error = 0; 4088out: 4089 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4090 vdev_free(rvd); 4091 spa_config_exit(spa, SCL_ALL, FTAG); 4092 mutex_exit(&spa_namespace_lock); 4093 4094 nvlist_free(config); 4095 return (error); 4096} 4097 4098#else /* !illumos */ 4099 4100extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 4101 uint64_t *count); 4102 4103static nvlist_t * 4104spa_generate_rootconf(const char *name) 4105{ 4106 nvlist_t **configs, **tops; 4107 nvlist_t *config; 4108 nvlist_t *best_cfg, *nvtop, *nvroot; 4109 uint64_t *holes; 4110 uint64_t best_txg; 4111 uint64_t nchildren; 4112 uint64_t pgid; 4113 uint64_t count; 4114 uint64_t i; 4115 uint_t nholes; 4116 4117 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4118 return (NULL); 4119 4120 ASSERT3U(count, !=, 0); 4121 best_txg = 0; 4122 for (i = 0; i < count; i++) { 4123 uint64_t txg; 4124 4125 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4126 &txg) == 0); 4127 if (txg > best_txg) { 4128 best_txg = txg; 4129 best_cfg = configs[i]; 4130 } 4131 } 4132 4133 nchildren = 1; 4134 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4135 holes = NULL; 4136 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4137 &holes, &nholes); 4138 4139 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4140 for (i = 0; i < nchildren; i++) { 4141 if (i >= count) 4142 break; 4143 if (configs[i] == NULL) 4144 continue; 4145 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4146 &nvtop) == 0); 4147 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4148 } 4149 for (i = 0; holes != NULL && i < nholes; i++) { 4150 if (i >= nchildren) 4151 continue; 4152 if (tops[holes[i]] != NULL) 4153 continue; 4154 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4155 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4156 VDEV_TYPE_HOLE) == 0); 4157 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4158 holes[i]) == 0); 4159 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4160 
0) == 0); 4161 } 4162 for (i = 0; i < nchildren; i++) { 4163 if (tops[i] != NULL) 4164 continue; 4165 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4166 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4167 VDEV_TYPE_MISSING) == 0); 4168 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4169 i) == 0); 4170 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4171 0) == 0); 4172 } 4173 4174 /* 4175 * Create pool config based on the best vdev config. 4176 */ 4177 nvlist_dup(best_cfg, &config, KM_SLEEP); 4178 4179 /* 4180 * Put this pool's top-level vdevs into a root vdev. 4181 */ 4182 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4183 &pgid) == 0); 4184 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4185 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4186 VDEV_TYPE_ROOT) == 0); 4187 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4188 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4189 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4190 tops, nchildren) == 0); 4191 4192 /* 4193 * Replace the existing vdev_tree with the new root vdev in 4194 * this pool's configuration (remove the old, add the new). 4195 */ 4196 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4197 4198 /* 4199 * Drop vdev config elements that should not be present at pool level. 4200 */ 4201 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4202 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4203 4204 for (i = 0; i < count; i++) 4205 nvlist_free(configs[i]); 4206 kmem_free(configs, count * sizeof(void *)); 4207 for (i = 0; i < nchildren; i++) 4208 nvlist_free(tops[i]); 4209 kmem_free(tops, nchildren * sizeof(void *)); 4210 nvlist_free(nvroot); 4211 return (config); 4212} 4213 4214int 4215spa_import_rootpool(const char *name) 4216{ 4217 spa_t *spa; 4218 vdev_t *rvd, *bvd, *avd = NULL; 4219 nvlist_t *config, *nvtop; 4220 uint64_t txg; 4221 char *pname; 4222 int error; 4223 4224 /* 4225 * Read the label from the boot device and generate a configuration. 4226 */ 4227 config = spa_generate_rootconf(name); 4228 4229 mutex_enter(&spa_namespace_lock); 4230 if (config != NULL) { 4231 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4232 &pname) == 0 && strcmp(name, pname) == 0); 4233 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4234 == 0); 4235 4236 if ((spa = spa_lookup(pname)) != NULL) { 4237 /* 4238 * Remove the existing root pool from the namespace so 4239 * that we can replace it with the correct config 4240 * we just read in. 4241 */ 4242 spa_remove(spa); 4243 } 4244 spa = spa_add(pname, config, NULL); 4245 4246 /* 4247 * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4248 * via spa_version(). 4249 */ 4250 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4251 &spa->spa_ubsync.ub_version) != 0) 4252 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4253 } else if ((spa = spa_lookup(name)) == NULL) { 4254 mutex_exit(&spa_namespace_lock); 4255 nvlist_free(config); 4256 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4257 name); 4258 return (EIO); 4259 } else { 4260 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4261 } 4262 spa->spa_is_root = B_TRUE; 4263 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4264 4265 /* 4266 * Build up a vdev tree based on the boot device's label config. 
4267 */ 4268 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4269 &nvtop) == 0); 4270 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4271 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4272 VDEV_ALLOC_ROOTPOOL); 4273 spa_config_exit(spa, SCL_ALL, FTAG); 4274 if (error) { 4275 mutex_exit(&spa_namespace_lock); 4276 nvlist_free(config); 4277 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4278 pname); 4279 return (error); 4280 } 4281 4282 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4283 vdev_free(rvd); 4284 spa_config_exit(spa, SCL_ALL, FTAG); 4285 mutex_exit(&spa_namespace_lock); 4286 4287 nvlist_free(config); 4288 return (0); 4289} 4290 4291#endif /* illumos */ 4292#endif /* _KERNEL */ 4293 4294/* 4295 * Import a non-root pool into the system. 4296 */ 4297int 4298spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4299{ 4300 spa_t *spa; 4301 char *altroot = NULL; 4302 spa_load_state_t state = SPA_LOAD_IMPORT; 4303 zpool_rewind_policy_t policy; 4304 uint64_t mode = spa_mode_global; 4305 uint64_t readonly = B_FALSE; 4306 int error; 4307 nvlist_t *nvroot; 4308 nvlist_t **spares, **l2cache; 4309 uint_t nspares, nl2cache; 4310 4311 /* 4312 * If a pool with this name exists, return failure. 4313 */ 4314 mutex_enter(&spa_namespace_lock); 4315 if (spa_lookup(pool) != NULL) { 4316 mutex_exit(&spa_namespace_lock); 4317 return (SET_ERROR(EEXIST)); 4318 } 4319 4320 /* 4321 * Create and initialize the spa structure. 4322 */ 4323 (void) nvlist_lookup_string(props, 4324 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4325 (void) nvlist_lookup_uint64(props, 4326 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4327 if (readonly) 4328 mode = FREAD; 4329 spa = spa_add(pool, config, altroot); 4330 spa->spa_import_flags = flags; 4331 4332 /* 4333 * Verbatim import - Take a pool and insert it into the namespace 4334 * as if it had been loaded at boot. 4335 */ 4336 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4337 if (props != NULL) 4338 spa_configfile_set(spa, props, B_FALSE); 4339 4340 spa_config_sync(spa, B_FALSE, B_TRUE); 4341 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4342 4343 mutex_exit(&spa_namespace_lock); 4344 return (0); 4345 } 4346 4347 spa_activate(spa, mode); 4348 4349 /* 4350 * Don't start async tasks until we know everything is healthy. 4351 */ 4352 spa_async_suspend(spa); 4353 4354 zpool_get_rewind_policy(config, &policy); 4355 if (policy.zrp_request & ZPOOL_DO_REWIND) 4356 state = SPA_LOAD_RECOVER; 4357 4358 /* 4359 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4360 * because the user-supplied config is actually the one to trust when 4361 * doing an import. 4362 */ 4363 if (state != SPA_LOAD_RECOVER) 4364 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4365 4366 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4367 policy.zrp_request); 4368 4369 /* 4370 * Propagate anything learned while loading the pool and pass it 4371 * back to caller (i.e. rewind info, missing devices, etc). 4372 */ 4373 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4374 spa->spa_load_info) == 0); 4375 4376 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4377 /* 4378 * Toss any existing sparelist, as it doesn't have any validity 4379 * anymore, and conflicts with spa_has_spare(). 
4380 */ 4381 if (spa->spa_spares.sav_config) { 4382 nvlist_free(spa->spa_spares.sav_config); 4383 spa->spa_spares.sav_config = NULL; 4384 spa_load_spares(spa); 4385 } 4386 if (spa->spa_l2cache.sav_config) { 4387 nvlist_free(spa->spa_l2cache.sav_config); 4388 spa->spa_l2cache.sav_config = NULL; 4389 spa_load_l2cache(spa); 4390 } 4391 4392 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4393 &nvroot) == 0); 4394 if (error == 0) 4395 error = spa_validate_aux(spa, nvroot, -1ULL, 4396 VDEV_ALLOC_SPARE); 4397 if (error == 0) 4398 error = spa_validate_aux(spa, nvroot, -1ULL, 4399 VDEV_ALLOC_L2CACHE); 4400 spa_config_exit(spa, SCL_ALL, FTAG); 4401 4402 if (props != NULL) 4403 spa_configfile_set(spa, props, B_FALSE); 4404 4405 if (error != 0 || (props && spa_writeable(spa) && 4406 (error = spa_prop_set(spa, props)))) { 4407 spa_unload(spa); 4408 spa_deactivate(spa); 4409 spa_remove(spa); 4410 mutex_exit(&spa_namespace_lock); 4411 return (error); 4412 } 4413 4414 spa_async_resume(spa); 4415 4416 /* 4417 * Override any spares and level 2 cache devices as specified by 4418 * the user, as these may have correct device names/devids, etc. 4419 */ 4420 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4421 &spares, &nspares) == 0) { 4422 if (spa->spa_spares.sav_config) 4423 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4424 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4425 else 4426 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4427 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4428 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4429 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4430 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4431 spa_load_spares(spa); 4432 spa_config_exit(spa, SCL_ALL, FTAG); 4433 spa->spa_spares.sav_sync = B_TRUE; 4434 } 4435 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4436 &l2cache, &nl2cache) == 0) { 4437 if (spa->spa_l2cache.sav_config) 4438 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4439 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4440 else 4441 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4442 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4443 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4444 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4445 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4446 spa_load_l2cache(spa); 4447 spa_config_exit(spa, SCL_ALL, FTAG); 4448 spa->spa_l2cache.sav_sync = B_TRUE; 4449 } 4450 4451 /* 4452 * Check for any removed devices. 4453 */ 4454 if (spa->spa_autoreplace) { 4455 spa_aux_check_removed(&spa->spa_spares); 4456 spa_aux_check_removed(&spa->spa_l2cache); 4457 } 4458 4459 if (spa_writeable(spa)) { 4460 /* 4461 * Update the config cache to include the newly-imported pool. 4462 */ 4463 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4464 } 4465 4466 /* 4467 * It's possible that the pool was expanded while it was exported. 4468 * We kick off an async task to handle this for us. 
4469 */ 4470 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4471 4472 spa_history_log_version(spa, "import"); 4473 4474 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4475 4476 mutex_exit(&spa_namespace_lock); 4477 4478#ifdef __FreeBSD__ 4479#ifdef _KERNEL 4480 zvol_create_minors(pool); 4481#endif 4482#endif 4483 return (0); 4484} 4485 4486nvlist_t * 4487spa_tryimport(nvlist_t *tryconfig) 4488{ 4489 nvlist_t *config = NULL; 4490 char *poolname; 4491 spa_t *spa; 4492 uint64_t state; 4493 int error; 4494 4495 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4496 return (NULL); 4497 4498 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4499 return (NULL); 4500 4501 /* 4502 * Create and initialize the spa structure. 4503 */ 4504 mutex_enter(&spa_namespace_lock); 4505 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4506 spa_activate(spa, FREAD); 4507 4508 /* 4509 * Pass off the heavy lifting to spa_load(). 4510 * Pass TRUE for mosconfig because the user-supplied config 4511 * is actually the one to trust when doing an import. 4512 */ 4513 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4514 4515 /* 4516 * If 'tryconfig' was at least parsable, return the current config. 4517 */ 4518 if (spa->spa_root_vdev != NULL) { 4519 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4520 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4521 poolname) == 0); 4522 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4523 state) == 0); 4524 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4525 spa->spa_uberblock.ub_timestamp) == 0); 4526 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4527 spa->spa_load_info) == 0); 4528 4529 /* 4530 * If the bootfs property exists on this pool then we 4531 * copy it out so that external consumers can tell which 4532 * pools are bootable. 4533 */ 4534 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4535 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4536 4537 /* 4538 * We have to play games with the name since the 4539 * pool was opened as TRYIMPORT_NAME. 4540 */ 4541 if (dsl_dsobj_to_dsname(spa_name(spa), 4542 spa->spa_bootfs, tmpname) == 0) { 4543 char *cp; 4544 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4545 4546 cp = strchr(tmpname, '/'); 4547 if (cp == NULL) { 4548 (void) strlcpy(dsname, tmpname, 4549 MAXPATHLEN); 4550 } else { 4551 (void) snprintf(dsname, MAXPATHLEN, 4552 "%s/%s", poolname, ++cp); 4553 } 4554 VERIFY(nvlist_add_string(config, 4555 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4556 kmem_free(dsname, MAXPATHLEN); 4557 } 4558 kmem_free(tmpname, MAXPATHLEN); 4559 } 4560 4561 /* 4562 * Add the list of hot spares and level 2 cache devices. 4563 */ 4564 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4565 spa_add_spares(spa, config); 4566 spa_add_l2cache(spa, config); 4567 spa_config_exit(spa, SCL_CONFIG, FTAG); 4568 } 4569 4570 spa_unload(spa); 4571 spa_deactivate(spa); 4572 spa_remove(spa); 4573 mutex_exit(&spa_namespace_lock); 4574 4575 return (config); 4576} 4577 4578/* 4579 * Pool export/destroy 4580 * 4581 * The act of destroying or exporting a pool is very simple. We make sure there 4582 * is no more pending I/O and any references to the pool are gone. Then, we 4583 * update the pool state and sync all the labels to disk, removing the 4584 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4585 * we don't sync the labels or remove the configuration cache. 
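 *
 * The entry points below map onto this common routine as follows:
 *
 *	spa_destroy()	-> new_state = POOL_STATE_DESTROYED
 *	spa_export()	-> new_state = POOL_STATE_EXPORTED
 *	spa_reset()	-> new_state = POOL_STATE_UNINITIALIZED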
4586 */ 4587static int 4588spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4589 boolean_t force, boolean_t hardforce) 4590{ 4591 spa_t *spa; 4592 4593 if (oldconfig) 4594 *oldconfig = NULL; 4595 4596 if (!(spa_mode_global & FWRITE)) 4597 return (SET_ERROR(EROFS)); 4598 4599 mutex_enter(&spa_namespace_lock); 4600 if ((spa = spa_lookup(pool)) == NULL) { 4601 mutex_exit(&spa_namespace_lock); 4602 return (SET_ERROR(ENOENT)); 4603 } 4604 4605 /* 4606 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4607 * reacquire the namespace lock, and see if we can export. 4608 */ 4609 spa_open_ref(spa, FTAG); 4610 mutex_exit(&spa_namespace_lock); 4611 spa_async_suspend(spa); 4612 mutex_enter(&spa_namespace_lock); 4613 spa_close(spa, FTAG); 4614 4615 /* 4616 * The pool will be in core if it's openable, 4617 * in which case we can modify its state. 4618 */ 4619 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4620 /* 4621 * Objsets may be open only because they're dirty, so we 4622 * have to force it to sync before checking spa_refcnt. 4623 */ 4624 txg_wait_synced(spa->spa_dsl_pool, 0); 4625 spa_evicting_os_wait(spa); 4626 4627 /* 4628 * A pool cannot be exported or destroyed if there are active 4629 * references. If we are resetting a pool, allow references by 4630 * fault injection handlers. 4631 */ 4632 if (!spa_refcount_zero(spa) || 4633 (spa->spa_inject_ref != 0 && 4634 new_state != POOL_STATE_UNINITIALIZED)) { 4635 spa_async_resume(spa); 4636 mutex_exit(&spa_namespace_lock); 4637 return (SET_ERROR(EBUSY)); 4638 } 4639 4640 /* 4641 * A pool cannot be exported if it has an active shared spare. 4642 * This is to prevent other pools from stealing the active spare 4643 * from an exported pool. If the user insists, such a pool can 4644 * still be forcibly exported. 4645 */ 4646 if (!force && new_state == POOL_STATE_EXPORTED && 4647 spa_has_active_shared_spare(spa)) { 4648 spa_async_resume(spa); 4649 mutex_exit(&spa_namespace_lock); 4650 return (SET_ERROR(EXDEV)); 4651 } 4652 4653 /* 4654 * We want this to be reflected on every label, 4655 * so mark them all dirty. spa_unload() will do the 4656 * final sync that pushes these changes out. 4657 */ 4658 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4659 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4660 spa->spa_state = new_state; 4661 spa->spa_final_txg = spa_last_synced_txg(spa) + 4662 TXG_DEFER_SIZE + 1; 4663 vdev_config_dirty(spa->spa_root_vdev); 4664 spa_config_exit(spa, SCL_ALL, FTAG); 4665 } 4666 } 4667 4668 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4669 4670 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4671 spa_unload(spa); 4672 spa_deactivate(spa); 4673 } 4674 4675 if (oldconfig && spa->spa_config) 4676 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4677 4678 if (new_state != POOL_STATE_UNINITIALIZED) { 4679 if (!hardforce) 4680 spa_config_sync(spa, B_TRUE, B_TRUE); 4681 spa_remove(spa); 4682 } 4683 mutex_exit(&spa_namespace_lock); 4684 4685 return (0); 4686} 4687 4688/* 4689 * Destroy a storage pool. 4690 */ 4691int 4692spa_destroy(char *pool) 4693{ 4694 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4695 B_FALSE, B_FALSE)); 4696} 4697 4698/* 4699 * Export a storage pool.
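 * 'force' overrides the active-shared-spare check in spa_export_common();
 * 'hardforce' additionally skips dirtying the labels and the final
 * config cache sync.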
4700 */ 4701int 4702spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4703 boolean_t hardforce) 4704{ 4705 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4706 force, hardforce)); 4707} 4708 4709/* 4710 * Similar to spa_export(), this unloads the spa_t without actually removing it 4711 * from the namespace in any way. 4712 */ 4713int 4714spa_reset(char *pool) 4715{ 4716 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4717 B_FALSE, B_FALSE)); 4718} 4719 4720/* 4721 * ========================================================================== 4722 * Device manipulation 4723 * ========================================================================== 4724 */ 4725 4726/* 4727 * Add a device to a storage pool. 4728 */ 4729int 4730spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4731{ 4732 uint64_t txg, id; 4733 int error; 4734 vdev_t *rvd = spa->spa_root_vdev; 4735 vdev_t *vd, *tvd; 4736 nvlist_t **spares, **l2cache; 4737 uint_t nspares, nl2cache; 4738 4739 ASSERT(spa_writeable(spa)); 4740 4741 txg = spa_vdev_enter(spa); 4742 4743 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4744 VDEV_ALLOC_ADD)) != 0) 4745 return (spa_vdev_exit(spa, NULL, txg, error)); 4746 4747 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4748 4749 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4750 &nspares) != 0) 4751 nspares = 0; 4752 4753 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4754 &nl2cache) != 0) 4755 nl2cache = 0; 4756 4757 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4758 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4759 4760 if (vd->vdev_children != 0 && 4761 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4762 return (spa_vdev_exit(spa, vd, txg, error)); 4763 4764 /* 4765 * We must validate the spares and l2cache devices after checking the 4766 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4767 */ 4768 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4769 return (spa_vdev_exit(spa, vd, txg, error)); 4770 4771 /* 4772 * Transfer each new top-level vdev from vd to rvd. 4773 */ 4774 for (int c = 0; c < vd->vdev_children; c++) { 4775 4776 /* 4777 * Set the vdev id to the first hole, if one exists. 4778 */ 4779 for (id = 0; id < rvd->vdev_children; id++) { 4780 if (rvd->vdev_child[id]->vdev_ishole) { 4781 vdev_free(rvd->vdev_child[id]); 4782 break; 4783 } 4784 } 4785 tvd = vd->vdev_child[c]; 4786 vdev_remove_child(vd, tvd); 4787 tvd->vdev_id = id; 4788 vdev_add_child(rvd, tvd); 4789 vdev_config_dirty(tvd); 4790 } 4791 4792 if (nspares != 0) { 4793 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4794 ZPOOL_CONFIG_SPARES); 4795 spa_load_spares(spa); 4796 spa->spa_spares.sav_sync = B_TRUE; 4797 } 4798 4799 if (nl2cache != 0) { 4800 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4801 ZPOOL_CONFIG_L2CACHE); 4802 spa_load_l2cache(spa); 4803 spa->spa_l2cache.sav_sync = B_TRUE; 4804 } 4805 4806 /* 4807 * We have to be careful when adding new vdevs to an existing pool. 4808 * If other threads start allocating from these vdevs before we 4809 * sync the config cache, and we lose power, then upon reboot we may 4810 * fail to open the pool because there are DVAs that the config cache 4811 * can't translate. Therefore, we first add the vdevs without 4812 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4813 * and then let spa_config_update() initialize the new metaslabs. 
4814 * 4815 * spa_load() checks for added-but-not-initialized vdevs, so that 4816 * if we lose power at any point in this sequence, the remaining 4817 * steps will be completed the next time we load the pool. 4818 */ 4819 (void) spa_vdev_exit(spa, vd, txg, 0); 4820 4821 mutex_enter(&spa_namespace_lock); 4822 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4823 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4824 mutex_exit(&spa_namespace_lock); 4825 4826 return (0); 4827} 4828 4829/* 4830 * Attach a device to a mirror. The arguments are the path to any device 4831 * in the mirror, and the nvroot for the new device. If the path specifies 4832 * a device that is not mirrored, we automatically insert the mirror vdev. 4833 * 4834 * If 'replacing' is specified, the new device is intended to replace the 4835 * existing device; in this case the two devices are made into their own 4836 * mirror using the 'replacing' vdev, which is functionally identical to 4837 * the mirror vdev (it actually reuses all the same ops) but has a few 4838 * extra rules: you can't attach to it after it's been created, and upon 4839 * completion of resilvering, the first disk (the one being replaced) 4840 * is automatically detached. 4841 */ 4842int 4843spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4844{ 4845 uint64_t txg, dtl_max_txg; 4846 vdev_t *rvd = spa->spa_root_vdev; 4847 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4848 vdev_ops_t *pvops; 4849 char *oldvdpath, *newvdpath; 4850 int newvd_isspare; 4851 int error; 4852 4853 ASSERT(spa_writeable(spa)); 4854 4855 txg = spa_vdev_enter(spa); 4856 4857 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4858 4859 if (oldvd == NULL) 4860 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4861 4862 if (!oldvd->vdev_ops->vdev_op_leaf) 4863 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4864 4865 pvd = oldvd->vdev_parent; 4866 4867 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4868 VDEV_ALLOC_ATTACH)) != 0) 4869 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4870 4871 if (newrootvd->vdev_children != 1) 4872 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4873 4874 newvd = newrootvd->vdev_child[0]; 4875 4876 if (!newvd->vdev_ops->vdev_op_leaf) 4877 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4878 4879 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4880 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4881 4882 /* 4883 * Spares can't replace logs 4884 */ 4885 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4886 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4887 4888 if (!replacing) { 4889 /* 4890 * For attach, the only allowable parent is a mirror or the root 4891 * vdev. 4892 */ 4893 if (pvd->vdev_ops != &vdev_mirror_ops && 4894 pvd->vdev_ops != &vdev_root_ops) 4895 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4896 4897 pvops = &vdev_mirror_ops; 4898 } else { 4899 /* 4900 * Active hot spares can only be replaced by inactive hot 4901 * spares. 4902 */ 4903 if (pvd->vdev_ops == &vdev_spare_ops && 4904 oldvd->vdev_isspare && 4905 !spa_has_spare(spa, newvd->vdev_guid)) 4906 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4907 4908 /* 4909 * If the source is a hot spare, and the parent isn't already a 4910 * spare, then we want to create a new hot spare. Otherwise, we 4911 * want to create a replacing vdev. The user is not allowed to 4912 * attach to a spared vdev child unless the 'isspare' state is 4913 * the same (spare replaces spare, non-spare replaces 4914 * non-spare). 
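 *
 * The checks below reduce to this (illustrative summary):
 *
 *	parent is 'replacing', pool < SPA_VERSION_MULTI_REPLACE -> ENOTSUP
 *	parent is 'spare', newvd/oldvd differ in 'isspare'	 -> ENOTSUP
 *	otherwise: newvd is a spare -> vdev_spare_ops,
 *		   else		    -> vdev_replacing_ops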
4915 */ 4916 if (pvd->vdev_ops == &vdev_replacing_ops && 4917 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4918 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4919 } else if (pvd->vdev_ops == &vdev_spare_ops && 4920 newvd->vdev_isspare != oldvd->vdev_isspare) { 4921 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4922 } 4923 4924 if (newvd->vdev_isspare) 4925 pvops = &vdev_spare_ops; 4926 else 4927 pvops = &vdev_replacing_ops; 4928 } 4929 4930 /* 4931 * Make sure the new device is big enough. 4932 */ 4933 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4934 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4935 4936 /* 4937 * The new device cannot have a higher alignment requirement 4938 * than the top-level vdev. 4939 */ 4940 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4941 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4942 4943 /* 4944 * If this is an in-place replacement, update oldvd's path and devid 4945 * to make it distinguishable from newvd, and unopenable from now on. 4946 */ 4947 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4948 spa_strfree(oldvd->vdev_path); 4949 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4950 KM_SLEEP); 4951 (void) sprintf(oldvd->vdev_path, "%s/%s", 4952 newvd->vdev_path, "old"); 4953 if (oldvd->vdev_devid != NULL) { 4954 spa_strfree(oldvd->vdev_devid); 4955 oldvd->vdev_devid = NULL; 4956 } 4957 } 4958 4959 /* mark the device being resilvered */ 4960 newvd->vdev_resilver_txg = txg; 4961 4962 /* 4963 * If the parent is not a mirror, or if we're replacing, insert the new 4964 * mirror/replacing/spare vdev above oldvd. 4965 */ 4966 if (pvd->vdev_ops != pvops) 4967 pvd = vdev_add_parent(oldvd, pvops); 4968 4969 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4970 ASSERT(pvd->vdev_ops == pvops); 4971 ASSERT(oldvd->vdev_parent == pvd); 4972 4973 /* 4974 * Extract the new device from its root and add it to pvd. 4975 */ 4976 vdev_remove_child(newrootvd, newvd); 4977 newvd->vdev_id = pvd->vdev_children; 4978 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4979 vdev_add_child(pvd, newvd); 4980 4981 tvd = newvd->vdev_top; 4982 ASSERT(pvd->vdev_top == tvd); 4983 ASSERT(tvd->vdev_parent == rvd); 4984 4985 vdev_config_dirty(tvd); 4986 4987 /* 4988 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4989 * for any dmu_sync-ed blocks. It will propagate upward when 4990 * spa_vdev_exit() calls vdev_dtl_reassess(). 4991 */ 4992 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4993 4994 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4995 dtl_max_txg - TXG_INITIAL); 4996 4997 if (newvd->vdev_isspare) { 4998 spa_spare_activate(newvd); 4999 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 5000 } 5001 5002 oldvdpath = spa_strdup(oldvd->vdev_path); 5003 newvdpath = spa_strdup(newvd->vdev_path); 5004 newvd_isspare = newvd->vdev_isspare; 5005 5006 /* 5007 * Mark newvd's DTL dirty in this txg. 5008 */ 5009 vdev_dirty(tvd, VDD_DTL, newvd, txg); 5010 5011 /* 5012 * Schedule the resilver to restart in the future. We do this to 5013 * ensure that dmu_sync-ed blocks have been stitched into the 5014 * respective datasets. 
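 *
 * As a worked example (TXG_CONCURRENT_STATES is 3 in this codebase):
 * an attach at txg 100 gives dtl_max_txg = 103, so newvd's DTL was
 * dirtied above over [TXG_INITIAL, 103) and the resilver below is
 * deferred until txg 103, covering blocks born in any still-open txg.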
5015 */ 5016 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 5017 5018 if (spa->spa_bootfs) 5019 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 5020 5021 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 5022 5023 /* 5024 * Commit the config 5025 */ 5026 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 5027 5028 spa_history_log_internal(spa, "vdev attach", NULL, 5029 "%s vdev=%s %s vdev=%s", 5030 replacing && newvd_isspare ? "spare in" : 5031 replacing ? "replace" : "attach", newvdpath, 5032 replacing ? "for" : "to", oldvdpath); 5033 5034 spa_strfree(oldvdpath); 5035 spa_strfree(newvdpath); 5036 5037 return (0); 5038} 5039 5040/* 5041 * Detach a device from a mirror or replacing vdev. 5042 * 5043 * If 'replace_done' is specified, only detach if the parent 5044 * is a replacing vdev. 5045 */ 5046int 5047spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 5048{ 5049 uint64_t txg; 5050 int error; 5051 vdev_t *rvd = spa->spa_root_vdev; 5052 vdev_t *vd, *pvd, *cvd, *tvd; 5053 boolean_t unspare = B_FALSE; 5054 uint64_t unspare_guid = 0; 5055 char *vdpath; 5056 5057 ASSERT(spa_writeable(spa)); 5058 5059 txg = spa_vdev_enter(spa); 5060 5061 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5062 5063 if (vd == NULL) 5064 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5065 5066 if (!vd->vdev_ops->vdev_op_leaf) 5067 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5068 5069 pvd = vd->vdev_parent; 5070 5071 /* 5072 * If the parent/child relationship is not as expected, don't do it. 5073 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 5074 * vdev that's replacing B with C. The user's intent in replacing 5075 * is to go from M(A,B) to M(A,C). If the user decides to cancel 5076 * the replace by detaching C, the expected behavior is to end up 5077 * M(A,B). But suppose that right after deciding to detach C, 5078 * the replacement of B completes. We would have M(A,C), and then 5079 * ask to detach C, which would leave us with just A -- not what 5080 * the user wanted. To prevent this, we make sure that the 5081 * parent/child relationship hasn't changed -- in this example, 5082 * that C's parent is still the replacing vdev R. 5083 */ 5084 if (pvd->vdev_guid != pguid && pguid != 0) 5085 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5086 5087 /* 5088 * Only 'replacing' or 'spare' vdevs can be replaced. 5089 */ 5090 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5091 pvd->vdev_ops != &vdev_spare_ops) 5092 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5093 5094 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5095 spa_version(spa) >= SPA_VERSION_SPARES); 5096 5097 /* 5098 * Only mirror, replacing, and spare vdevs support detach. 5099 */ 5100 if (pvd->vdev_ops != &vdev_replacing_ops && 5101 pvd->vdev_ops != &vdev_mirror_ops && 5102 pvd->vdev_ops != &vdev_spare_ops) 5103 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5104 5105 /* 5106 * If this device has the only valid copy of some data, 5107 * we cannot safely detach it. 5108 */ 5109 if (vdev_dtl_required(vd)) 5110 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5111 5112 ASSERT(pvd->vdev_children >= 2); 5113 5114 /* 5115 * If we are detaching the second disk from a replacing vdev, then 5116 * check to see if we changed the original vdev's path to have "/old" 5117 * at the end in spa_vdev_attach(). If so, undo that change now. 
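 *
 * For example (hypothetical device names): an in-place replacement of
 * /dev/da0 renamed the outgoing vdev to "/dev/da0/old"; if the user
 * cancels the replacement by detaching the new disk, the code below
 * restores the survivor's path to "/dev/da0".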
5118 */ 5119 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5120 vd->vdev_path != NULL) { 5121 size_t len = strlen(vd->vdev_path); 5122 5123 for (int c = 0; c < pvd->vdev_children; c++) { 5124 cvd = pvd->vdev_child[c]; 5125 5126 if (cvd == vd || cvd->vdev_path == NULL) 5127 continue; 5128 5129 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5130 strcmp(cvd->vdev_path + len, "/old") == 0) { 5131 spa_strfree(cvd->vdev_path); 5132 cvd->vdev_path = spa_strdup(vd->vdev_path); 5133 break; 5134 } 5135 } 5136 } 5137 5138 /* 5139 * If we are detaching the original disk from a spare, then it implies 5140 * that the spare should become a real disk, and be removed from the 5141 * active spare list for the pool. 5142 */ 5143 if (pvd->vdev_ops == &vdev_spare_ops && 5144 vd->vdev_id == 0 && 5145 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5146 unspare = B_TRUE; 5147 5148 /* 5149 * Erase the disk labels so the disk can be used for other things. 5150 * This must be done after all other error cases are handled, 5151 * but before we disembowel vd (so we can still do I/O to it). 5152 * But if we can't do it, don't treat the error as fatal -- 5153 * it may be that the unwritability of the disk is the reason 5154 * it's being detached! 5155 */ 5156 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5157 5158 /* 5159 * Remove vd from its parent and compact the parent's children. 5160 */ 5161 vdev_remove_child(pvd, vd); 5162 vdev_compact_children(pvd); 5163 5164 /* 5165 * Remember one of the remaining children so we can get tvd below. 5166 */ 5167 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5168 5169 /* 5170 * If we need to remove the remaining child from the list of hot spares, 5171 * do it now, marking the vdev as no longer a spare in the process. 5172 * We must do this before vdev_remove_parent(), because that can 5173 * change the GUID if it creates a new toplevel GUID. For a similar 5174 * reason, we must remove the spare now, in the same txg as the detach; 5175 * otherwise someone could attach a new sibling, change the GUID, and 5176 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5177 */ 5178 if (unspare) { 5179 ASSERT(cvd->vdev_isspare); 5180 spa_spare_remove(cvd); 5181 unspare_guid = cvd->vdev_guid; 5182 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5183 cvd->vdev_unspare = B_TRUE; 5184 } 5185 5186 /* 5187 * If the parent mirror/replacing vdev only has one child, 5188 * the parent is no longer needed. Remove it from the tree. 5189 */ 5190 if (pvd->vdev_children == 1) { 5191 if (pvd->vdev_ops == &vdev_spare_ops) 5192 cvd->vdev_unspare = B_FALSE; 5193 vdev_remove_parent(cvd); 5194 } 5195 5196 5197 /* 5198 * We don't set tvd until now because the parent we just removed 5199 * may have been the previous top-level vdev. 5200 */ 5201 tvd = cvd->vdev_top; 5202 ASSERT(tvd->vdev_parent == rvd); 5203 5204 /* 5205 * Reevaluate the parent vdev state. 5206 */ 5207 vdev_propagate_state(cvd); 5208 5209 /* 5210 * If the 'autoexpand' property is set on the pool then automatically 5211 * try to expand the size of the pool. For example if the device we 5212 * just detached was smaller than the others, it may be possible to 5213 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5214 * first so that we can obtain the updated sizes of the leaf vdevs. 5215 */ 5216 if (spa->spa_autoexpand) { 5217 vdev_reopen(tvd); 5218 vdev_expand(tvd, txg); 5219 } 5220 5221 vdev_config_dirty(tvd); 5222 5223 /* 5224 * Mark vd's DTL as dirty in this txg. 
vdev_dtl_sync() will see that 5225 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5226 * But first make sure we're not on any *other* txg's DTL list, to 5227 * prevent vd from being accessed after it's freed. 5228 */ 5229 vdpath = spa_strdup(vd->vdev_path); 5230 for (int t = 0; t < TXG_SIZE; t++) 5231 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5232 vd->vdev_detached = B_TRUE; 5233 vdev_dirty(tvd, VDD_DTL, vd, txg); 5234 5235 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5236 5237 /* hang on to the spa before we release the lock */ 5238 spa_open_ref(spa, FTAG); 5239 5240 error = spa_vdev_exit(spa, vd, txg, 0); 5241 5242 spa_history_log_internal(spa, "detach", NULL, 5243 "vdev=%s", vdpath); 5244 spa_strfree(vdpath); 5245 5246 /* 5247 * If this was the removal of the original device in a hot spare vdev, 5248 * then we want to go through and remove the device from the hot spare 5249 * list of every other pool. 5250 */ 5251 if (unspare) { 5252 spa_t *altspa = NULL; 5253 5254 mutex_enter(&spa_namespace_lock); 5255 while ((altspa = spa_next(altspa)) != NULL) { 5256 if (altspa->spa_state != POOL_STATE_ACTIVE || 5257 altspa == spa) 5258 continue; 5259 5260 spa_open_ref(altspa, FTAG); 5261 mutex_exit(&spa_namespace_lock); 5262 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5263 mutex_enter(&spa_namespace_lock); 5264 spa_close(altspa, FTAG); 5265 } 5266 mutex_exit(&spa_namespace_lock); 5267 5268 /* search the rest of the vdevs for spares to remove */ 5269 spa_vdev_resilver_done(spa); 5270 } 5271 5272 /* all done with the spa; OK to release */ 5273 mutex_enter(&spa_namespace_lock); 5274 spa_close(spa, FTAG); 5275 mutex_exit(&spa_namespace_lock); 5276 5277 return (error); 5278} 5279 5280/* 5281 * Split a set of devices from their mirrors, and create a new pool from them. 
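 *
 * For example (illustrative): a pool of two mirrors M(A,B) and M(C,D)
 * can be split into a new pool built from B and D, leaving A and C in
 * the original. The validation below enforces exactly one writeable,
 * healthy leaf per top-level mirror.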
5282 */ 5283int 5284spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5285 nvlist_t *props, boolean_t exp) 5286{ 5287 int error = 0; 5288 uint64_t txg, *glist; 5289 spa_t *newspa; 5290 uint_t c, children, lastlog; 5291 nvlist_t **child, *nvl, *tmp; 5292 dmu_tx_t *tx; 5293 char *altroot = NULL; 5294 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5295 boolean_t activate_slog; 5296 5297 ASSERT(spa_writeable(spa)); 5298 5299 txg = spa_vdev_enter(spa); 5300 5301 /* clear the log and flush everything up to now */ 5302 activate_slog = spa_passivate_log(spa); 5303 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5304 error = spa_offline_log(spa); 5305 txg = spa_vdev_config_enter(spa); 5306 5307 if (activate_slog) 5308 spa_activate_log(spa); 5309 5310 if (error != 0) 5311 return (spa_vdev_exit(spa, NULL, txg, error)); 5312 5313 /* check new spa name before going any further */ 5314 if (spa_lookup(newname) != NULL) 5315 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5316 5317 /* 5318 * scan through all the children to ensure they're all mirrors 5319 */ 5320 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5321 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5322 &children) != 0) 5323 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5324 5325 /* first, check to ensure we've got the right child count */ 5326 rvd = spa->spa_root_vdev; 5327 lastlog = 0; 5328 for (c = 0; c < rvd->vdev_children; c++) { 5329 vdev_t *vd = rvd->vdev_child[c]; 5330 5331 /* don't count the holes & logs as children */ 5332 if (vd->vdev_islog || vd->vdev_ishole) { 5333 if (lastlog == 0) 5334 lastlog = c; 5335 continue; 5336 } 5337 5338 lastlog = 0; 5339 } 5340 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 5341 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5342 5343 /* next, ensure no spare or cache devices are part of the split */ 5344 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5345 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5346 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5347 5348 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5349 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5350 5351 /* then, loop over each vdev and validate it */ 5352 for (c = 0; c < children; c++) { 5353 uint64_t is_hole = 0; 5354 5355 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5356 &is_hole); 5357 5358 if (is_hole != 0) { 5359 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5360 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5361 continue; 5362 } else { 5363 error = SET_ERROR(EINVAL); 5364 break; 5365 } 5366 } 5367 5368 /* which disk is going to be split? 
*/ 5369 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5370 &glist[c]) != 0) { 5371 error = SET_ERROR(EINVAL); 5372 break; 5373 } 5374 5375 /* look it up in the spa */ 5376 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5377 if (vml[c] == NULL) { 5378 error = SET_ERROR(ENODEV); 5379 break; 5380 } 5381 5382 /* make sure there's nothing stopping the split */ 5383 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5384 vml[c]->vdev_islog || 5385 vml[c]->vdev_ishole || 5386 vml[c]->vdev_isspare || 5387 vml[c]->vdev_isl2cache || 5388 !vdev_writeable(vml[c]) || 5389 vml[c]->vdev_children != 0 || 5390 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5391 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5392 error = SET_ERROR(EINVAL); 5393 break; 5394 } 5395 5396 if (vdev_dtl_required(vml[c])) { 5397 error = SET_ERROR(EBUSY); 5398 break; 5399 } 5400 5401 /* we need certain info from the top level */ 5402 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5403 vml[c]->vdev_top->vdev_ms_array) == 0); 5404 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5405 vml[c]->vdev_top->vdev_ms_shift) == 0); 5406 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5407 vml[c]->vdev_top->vdev_asize) == 0); 5408 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5409 vml[c]->vdev_top->vdev_ashift) == 0); 5410 5411 /* transfer per-vdev ZAPs */ 5412 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 5413 VERIFY0(nvlist_add_uint64(child[c], 5414 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 5415 5416 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 5417 VERIFY0(nvlist_add_uint64(child[c], 5418 ZPOOL_CONFIG_VDEV_TOP_ZAP, 5419 vml[c]->vdev_parent->vdev_top_zap)); 5420 } 5421 5422 if (error != 0) { 5423 kmem_free(vml, children * sizeof (vdev_t *)); 5424 kmem_free(glist, children * sizeof (uint64_t)); 5425 return (spa_vdev_exit(spa, NULL, txg, error)); 5426 } 5427 5428 /* stop writers from using the disks */ 5429 for (c = 0; c < children; c++) { 5430 if (vml[c] != NULL) 5431 vml[c]->vdev_offline = B_TRUE; 5432 } 5433 vdev_reopen(spa->spa_root_vdev); 5434 5435 /* 5436 * Temporarily record the splitting vdevs in the spa config. This 5437 * will disappear once the config is regenerated. 5438 */ 5439 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5440 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5441 glist, children) == 0); 5442 kmem_free(glist, children * sizeof (uint64_t)); 5443 5444 mutex_enter(&spa->spa_props_lock); 5445 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5446 nvl) == 0); 5447 mutex_exit(&spa->spa_props_lock); 5448 spa->spa_config_splitting = nvl; 5449 vdev_config_dirty(spa->spa_root_vdev); 5450 5451 /* configure and create the new pool */ 5452 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5453 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5454 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5455 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5456 spa_version(spa)) == 0); 5457 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5458 spa->spa_config_txg) == 0); 5459 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5460 spa_generate_guid(NULL)) == 0); 5461 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 5462 (void) nvlist_lookup_string(props, 5463 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5464 5465 /* add the new pool to the namespace */ 5466 newspa = spa_add(newname, config, altroot); 5467 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 5468 newspa->spa_config_txg = spa->spa_config_txg; 5469 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5470 5471 /* release the spa config lock, retaining the namespace lock */ 5472 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5473 5474 if (zio_injection_enabled) 5475 zio_handle_panic_injection(spa, FTAG, 1); 5476 5477 spa_activate(newspa, spa_mode_global); 5478 spa_async_suspend(newspa); 5479 5480#ifndef illumos 5481 /* mark that we are creating new spa by splitting */ 5482 newspa->spa_splitting_newspa = B_TRUE; 5483#endif 5484 /* create the new pool from the disks of the original pool */ 5485 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5486#ifndef illumos 5487 newspa->spa_splitting_newspa = B_FALSE; 5488#endif 5489 if (error) 5490 goto out; 5491 5492 /* if that worked, generate a real config for the new pool */ 5493 if (newspa->spa_root_vdev != NULL) { 5494 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5495 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5496 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5497 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5498 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5499 B_TRUE)); 5500 } 5501 5502 /* set the props */ 5503 if (props != NULL) { 5504 spa_configfile_set(newspa, props, B_FALSE); 5505 error = spa_prop_set(newspa, props); 5506 if (error) 5507 goto out; 5508 } 5509 5510 /* flush everything */ 5511 txg = spa_vdev_config_enter(newspa); 5512 vdev_config_dirty(newspa->spa_root_vdev); 5513 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5514 5515 if (zio_injection_enabled) 5516 zio_handle_panic_injection(spa, FTAG, 2); 5517 5518 spa_async_resume(newspa); 5519 5520 /* finally, update the original pool's config */ 5521 txg = spa_vdev_config_enter(spa); 5522 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5523 error = dmu_tx_assign(tx, TXG_WAIT); 5524 if (error != 0) 5525 dmu_tx_abort(tx); 5526 for (c = 0; c < children; c++) { 5527 if (vml[c] != NULL) { 5528 vdev_split(vml[c]); 5529 if (error == 0) 5530 spa_history_log_internal(spa, "detach", tx, 5531 "vdev=%s", vml[c]->vdev_path); 5532 5533 vdev_free(vml[c]); 5534 } 5535 } 5536 spa->spa_avz_action = AVZ_ACTION_REBUILD; 5537 vdev_config_dirty(spa->spa_root_vdev); 5538 spa->spa_config_splitting = NULL; 5539 nvlist_free(nvl); 5540 if (error == 0) 5541 dmu_tx_commit(tx); 5542 (void) spa_vdev_exit(spa, NULL, txg, 0); 5543 5544 if (zio_injection_enabled) 5545 zio_handle_panic_injection(spa, FTAG, 3); 5546 5547 /* split is complete; log a history record */ 5548 spa_history_log_internal(newspa, "split", NULL, 5549 "from pool %s", spa_name(spa)); 5550 5551 kmem_free(vml, children * sizeof (vdev_t *)); 5552 5553 /* if we're not going to mount the filesystems in userland, export */ 5554 if (exp) 5555 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5556 B_FALSE, B_FALSE); 5557 5558 return (error); 5559 
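	/*
	 * Error path: tear down the partially-created new pool and bring
	 * the original pool's disks back online.
	 */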
5560out: 5561 spa_unload(newspa); 5562 spa_deactivate(newspa); 5563 spa_remove(newspa); 5564 5565 txg = spa_vdev_config_enter(spa); 5566 5567 /* re-online all offlined disks */ 5568 for (c = 0; c < children; c++) { 5569 if (vml[c] != NULL) 5570 vml[c]->vdev_offline = B_FALSE; 5571 } 5572 vdev_reopen(spa->spa_root_vdev); 5573 5574 nvlist_free(spa->spa_config_splitting); 5575 spa->spa_config_splitting = NULL; 5576 (void) spa_vdev_exit(spa, NULL, txg, error); 5577 5578 kmem_free(vml, children * sizeof (vdev_t *)); 5579 return (error); 5580} 5581 5582static nvlist_t * 5583spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5584{ 5585 for (int i = 0; i < count; i++) { 5586 uint64_t guid; 5587 5588 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5589 &guid) == 0); 5590 5591 if (guid == target_guid) 5592 return (nvpp[i]); 5593 } 5594 5595 return (NULL); 5596} 5597 5598static void 5599spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5600 nvlist_t *dev_to_remove) 5601{ 5602 nvlist_t **newdev = NULL; 5603 5604 if (count > 1) 5605 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5606 5607 for (int i = 0, j = 0; i < count; i++) { 5608 if (dev[i] == dev_to_remove) 5609 continue; 5610 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5611 } 5612 5613 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5614 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5615 5616 for (int i = 0; i < count - 1; i++) 5617 nvlist_free(newdev[i]); 5618 5619 if (count > 1) 5620 kmem_free(newdev, (count - 1) * sizeof (void *)); 5621} 5622 5623/* 5624 * Evacuate the device. 5625 */ 5626static int 5627spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5628{ 5629 uint64_t txg; 5630 int error = 0; 5631 5632 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5633 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5634 ASSERT(vd == vd->vdev_top); 5635 5636 /* 5637 * Evacuate the device. We don't hold the config lock as writer 5638 * since we need to do I/O but we do keep the 5639 * spa_namespace_lock held. Once this completes the device 5640 * should no longer have any blocks allocated on it. 5641 */ 5642 if (vd->vdev_islog) { 5643 if (vd->vdev_stat.vs_alloc != 0) 5644 error = spa_offline_log(spa); 5645 } else { 5646 error = SET_ERROR(ENOTSUP); 5647 } 5648 5649 if (error) 5650 return (error); 5651 5652 /* 5653 * The evacuation succeeded. Remove any remaining MOS metadata 5654 * associated with this vdev, and wait for these changes to sync. 5655 */ 5656 ASSERT0(vd->vdev_stat.vs_alloc); 5657 txg = spa_vdev_config_enter(spa); 5658 vd->vdev_removing = B_TRUE; 5659 vdev_dirty_leaves(vd, VDD_DTL, txg); 5660 vdev_config_dirty(vd); 5661 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5662 5663 return (0); 5664} 5665 5666/* 5667 * Complete the removal by cleaning up the namespace. 5668 */ 5669static void 5670spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5671{ 5672 vdev_t *rvd = spa->spa_root_vdev; 5673 uint64_t id = vd->vdev_id; 5674 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5675 5676 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5677 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5678 ASSERT(vd == vd->vdev_top); 5679 5680 /* 5681 * Only remove any devices which are empty. 
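 * A removed vdev that is not the last child is replaced below with a
 * hole vdev, so that the ids of the remaining top-level vdevs stay
 * stable.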
5682 */ 5683 if (vd->vdev_stat.vs_alloc != 0) 5684 return; 5685 5686 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5687 5688 if (list_link_active(&vd->vdev_state_dirty_node)) 5689 vdev_state_clean(vd); 5690 if (list_link_active(&vd->vdev_config_dirty_node)) 5691 vdev_config_clean(vd); 5692 5693 vdev_free(vd); 5694 5695 if (last_vdev) { 5696 vdev_compact_children(rvd); 5697 } else { 5698 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5699 vdev_add_child(rvd, vd); 5700 } 5701 vdev_config_dirty(rvd); 5702 5703 /* 5704 * Reassess the health of our root vdev. 5705 */ 5706 vdev_reopen(rvd); 5707} 5708 5709/* 5710 * Remove a device from the pool - 5711 * 5712 * Removing a device from the vdev namespace requires several steps 5713 * and can take a significant amount of time. As a result we use 5714 * the spa_vdev_config_[enter/exit] functions which allow us to 5715 * grab and release the spa_config_lock while still holding the namespace 5716 * lock. During each step the configuration is synced out. 5717 * 5718 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5719 * devices. 5720 */ 5721int 5722spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5723{ 5724 vdev_t *vd; 5725 metaslab_group_t *mg; 5726 nvlist_t **spares, **l2cache, *nv; 5727 uint64_t txg = 0; 5728 uint_t nspares, nl2cache; 5729 int error = 0; 5730 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5731 5732 ASSERT(spa_writeable(spa)); 5733 5734 if (!locked) 5735 txg = spa_vdev_enter(spa); 5736 5737 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5738 5739 if (spa->spa_spares.sav_vdevs != NULL && 5740 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5741 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5742 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5743 /* 5744 * Only remove the hot spare if it's not currently in use 5745 * in this pool. 5746 */ 5747 if (vd == NULL || unspare) { 5748 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5749 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5750 spa_load_spares(spa); 5751 spa->spa_spares.sav_sync = B_TRUE; 5752 } else { 5753 error = SET_ERROR(EBUSY); 5754 } 5755 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5756 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5757 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5758 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5759 /* 5760 * Cache devices can always be removed. 5761 */ 5762 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5763 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5764 spa_load_l2cache(spa); 5765 spa->spa_l2cache.sav_sync = B_TRUE; 5766 } else if (vd != NULL && vd->vdev_islog) { 5767 ASSERT(!locked); 5768 ASSERT(vd == vd->vdev_top); 5769 5770 mg = vd->vdev_mg; 5771 5772 /* 5773 * Stop allocating from this vdev. 5774 */ 5775 metaslab_group_passivate(mg); 5776 5777 /* 5778 * Wait for the youngest allocations and frees to sync, 5779 * and then wait for the deferral of those frees to finish. 5780 */ 5781 spa_vdev_config_exit(spa, NULL, 5782 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5783 5784 /* 5785 * Attempt to evacuate the vdev. 5786 */ 5787 error = spa_vdev_remove_evacuate(spa, vd); 5788 5789 txg = spa_vdev_config_enter(spa); 5790 5791 /* 5792 * If we couldn't evacuate the vdev, unwind. 5793 */ 5794 if (error) { 5795 metaslab_group_activate(mg); 5796 return (spa_vdev_exit(spa, NULL, txg, error)); 5797 } 5798 5799 /* 5800 * Clean up the vdev namespace. 

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		newvd = vd->vdev_child[vd->vdev_children - 1];
		oldvd = vd->vdev_child[0];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *first = vd->vdev_child[0];
		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

		if (last->vdev_unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->vdev_unspare) {
			oldvd = last;
			newvd = first;
		} else {
			oldvd = NULL;
		}

		if (oldvd != NULL &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Update the stored path or FRU for this vdev.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	boolean_t sync = B_FALSE;

	ASSERT(spa_writeable(spa));

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		if (strcmp(value, vd->vdev_path) != 0) {
			spa_strfree(vd->vdev_path);
			vd->vdev_path = spa_strdup(value);
			sync = B_TRUE;
		}
	} else {
		if (vd->vdev_fru == NULL) {
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		} else if (strcmp(value, vd->vdev_fru) != 0) {
			spa_strfree(vd->vdev_fru);
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		}
	}

	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}

int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (SET_ERROR(EBUSY));
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}

int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOTSUP));

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (func == POOL_SCAN_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	return (dsl_scan(spa->spa_dsl_pool, func));
}
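
/*
 * A minimal sketch of how the scan entry points above are driven (the
 * ioctl plumbing lives elsewhere; the caller shown is hypothetical):
 *
 *	// Kick off a scrub; the config lock must not be held as writer
 *	// here, per the ASSERT above.
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *
 *	// Later, cancel it.  Stopping is refused with EBUSY while a
 *	// resilver is in progress, since interrupting a resilver would
 *	// leave the pool with known-missing data.
 *	if (error == 0)
 *		error = spa_scan_stop(spa);
 */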

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
		/* Tell userspace that the vdev is gone. */
		zfs_post_remove(spa, vd);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = B_FALSE;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}

static void
spa_async_thread(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(spa, "vdev online", NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}
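
/*
 * The pending-task word is split between two worker threads:
 * spa_async_thread above consumes every bit except SPA_ASYNC_REMOVE
 * (its "&= SPA_ASYNC_REMOVE" leaves only that bit behind), while
 * spa_async_thread_vd below consumes only SPA_ASYNC_REMOVE.  A worked
 * example with illustrative bit values (the real ones are the
 * SPA_ASYNC_* defines in sys/spa_impl.h): say REMOVE is 0x02, PROBE is
 * 0x04, and both are pending.
 *
 *	spa_async_tasks == 0x06
 *	tasks = 0x06;			// snapshot taken above
 *	spa_async_tasks &= 0x02;	// now 0x02: REMOVE is left
 *					// behind for the _vd thread
 *
 * This thread then services PROBE from its snapshot, and the REMOVE
 * bit is picked up later by spa_async_thread_vd.
 */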

static void
spa_async_thread_vd(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
retry:
	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	if ((tasks & SPA_ASYNC_REMOVE) != 0)
		goto retry;
	spa->spa_async_thread_vd = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	/*
	 * Wait until *both* async threads have exited; each one does a
	 * cv_broadcast() on its way out.
	 */
	while (spa->spa_async_thread != NULL ||
	    spa->spa_async_thread_vd != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static boolean_t
spa_async_tasks_pending(spa_t *spa)
{
	uint_t non_config_tasks;
	uint_t config_task;
	boolean_t config_task_suspended;

	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
	    SPA_ASYNC_REMOVE);
	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
	if (spa->spa_ccw_fail_time == 0) {
		config_task_suspended = B_FALSE;
	} else {
		config_task_suspended =
		    (gethrtime() - spa->spa_ccw_fail_time) <
		    (zfs_ccw_retry_interval * NANOSEC);
	}

	return (non_config_tasks || (config_task && !config_task_suspended));
}
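
/*
 * Worked example of the retry gate above: gethrtime() and
 * spa_ccw_fail_time are both in nanoseconds, and zfs_ccw_retry_interval
 * defaults to 300 seconds.  If the last configuration cache file write
 * failed at t = 100s and it is now t = 250s, then
 *
 *	gethrtime() - spa_ccw_fail_time == 150 * NANOSEC
 *	zfs_ccw_retry_interval * NANOSEC == 300 * NANOSEC
 *
 * so config_task_suspended is B_TRUE and the config-update task is not
 * redispatched until 300 seconds have elapsed since the failure.
 */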

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa_async_tasks_pending(spa) &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch_vd(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread_vd == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread_vd = thread_create(NULL, 0,
		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
	spa_async_dispatch_vd(spa);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t *bpo = arg;
	bpobj_enqueue(bpo, bp, tx);
	return (0);
}

static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    BP_GET_PSIZE(bp), zio->io_flags));
	return (0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing frees.
 */
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
	VERIFY(zio_wait(zio) == 0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing deferred frees.
 */
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
	    spa_free_sync_cb, zio, tx), ==, 0);
	VERIFY0(zio_wait(zio));
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}
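
/*
 * Worked example of the padding logic above, assuming the usual
 * SPA_CONFIG_BLOCKSIZE of 1 << 14 (16K): for a packed nvlist of
 * nvsize = 20000 bytes,
 *
 *	bufsize = P2ROUNDUP(20000, 16384) = 32768
 *
 * so 32768 bytes are written (the last 12768 zeroed by the bzero()),
 * and the true length, 20000, is recorded in the object's bonus buffer
 * so that readers know how much of the final block to unpack.
 */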

static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

/*
 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
 * The all-vdev ZAP must be empty.
 */
static void
spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	if (vd->vdev_top_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_top_zap, tx));
	}
	if (vd->vdev_leaf_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_leaf_zap, tx));
	}
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_avz_build(vd->vdev_child[i], avz, tx);
	}
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	/*
	 * If the pool is being imported from a pre-per-vdev-ZAP version of
	 * ZFS, its config may not be dirty but we still need to build
	 * per-vdev ZAPs.  Similarly, if the pool is being assembled (e.g.
	 * after a split), we need to rebuild the AVZ although the config
	 * may not be dirty.
	 */
	if (list_is_empty(&spa->spa_config_dirty_list) &&
	    spa->spa_avz_action == AVZ_ACTION_NONE)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
	    spa->spa_all_vdev_zaps != 0);

	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
		/* Make and build the new AVZ */
		uint64_t new_avz = zap_create(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
		spa_avz_build(spa->spa_root_vdev, new_avz, tx);

		/* Diff old AVZ with new one */
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t vdzap = za.za_first_integer;
			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
			    vdzap) == ENOENT) {
				/*
				 * ZAP is listed in old AVZ but not in new one;
				 * destroy it
				 */
				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
				    tx));
			}
		}

		zap_cursor_fini(&zc);

		/* Destroy the old AVZ */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));

		/* Replace the old AVZ in the dir obj with the new one */
		VERIFY0(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
		    sizeof (new_avz), 1, &new_avz, tx));

		spa->spa_all_vdev_zaps = new_avz;
	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
		zap_cursor_t zc;
		zap_attribute_t za;

		/* Walk through the AVZ and destroy all listed ZAPs */
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t zap = za.za_first_integer;
			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
		}

		zap_cursor_fini(&zc);

		/* Destroy and unlink the AVZ itself */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));
		VERIFY0(zap_remove(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
		spa->spa_all_vdev_zaps = 0;
	}

	if (spa->spa_all_vdev_zaps == 0) {
		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_VDEV_ZAP_MAP, tx);
	}
	spa->spa_avz_action = AVZ_ACTION_NONE;

	/* Create ZAPs for vdevs that don't have them. */
	vdev_construct_zaps(spa->spa_root_vdev, tx);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPROP_INVAL:
			/*
			 * We checked this earlier in spa_prop_validate().
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY0(zfeature_lookup_name(fname, &fid));

			spa_feature_enable(spa, fid, tx);
			spa_history_log_internal(spa, "set", tx,
			    "%s=enabled", nvpair_name(elem));
			break;

		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", nvpair_name(elem), intval);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}
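
/*
 * A worked trace of the default case above (hypothetical input, assuming
 * the usual zpool property definitions): setting "failmode=continue"
 * arrives here as a uint64 nvpair.  proptype is PROP_TYPE_INDEX, so the
 * value is sanity-checked by zpool_prop_index_to_string(
 * ZPOOL_PROP_FAILUREMODE, intval, &unused), then stored as an 8-byte
 * integer under the key "failmode" in the pool-props ZAP, and finally
 * cached in spa->spa_failmode by the inner switch.
 */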

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * The LZ4_COMPRESS feature's behavior was changed to
	 * activate_on_enable when the ability to use lz4 compression for
	 * metadata was added.  Old pools that have the feature enabled
	 * must be upgraded to have it active as well.
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now.  Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}
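
/*
 * Each upgrade step above follows the same gate: spa_ubsync holds the
 * last-synced uberblock while spa_uberblock holds the one being synced,
 * so the condition
 *
 *	spa_ubsync.ub_version < SPA_VERSION_X &&
 *	spa_uberblock.ub_version >= SPA_VERSION_X
 *
 * is true exactly once (in the txg that crosses version X), which is
 * what makes these one-time on-disk changes.  A new step for a
 * hypothetical SPA_VERSION_FOO would take the same shape:
 *
 *	if (spa->spa_ubsync.ub_version < SPA_VERSION_FOO &&
 *	    spa->spa_uberblock.ub_version >= SPA_VERSION_FOO)
 *		upgrade_foo_on_disk(dp, tx);	// hypothetical helper
 */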

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* !illumos */
#ifdef _KERNEL
	callout_reset(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
#endif
#endif	/* illumos */

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We cannot defer frees in pass 1, because
			 * we sync the deferred frees later in that
			 * same pass.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			spa_sync_upgrades(spa, tx);
			ASSERT3U(txg, >=,
			    spa->spa_uberblock.ub_rootbp.blk_birth);
			/*
			 * Note: We need to check if the MOS is dirty
			 * because we could have marked the MOS dirty
			 * without updating the uberblock (e.g. if we
			 * have sync tasks but no dirty user data).  We
			 * need to check the uberblock's rootbp because
			 * it is updated if we have synced out dirty
			 * data (though in this case the MOS will most
			 * likely also be dirty due to second order
			 * effects, we don't want to rely on that here).
			 */
			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
			    !dmu_objset_is_dirty(mos, txg)) {
				/*
				 * Nothing changed on the first pass,
				 * therefore this TXG is a no-op.  Avoid
				 * syncing deferred frees, so that we
				 * can keep this TXG as a no-op.
				 */
				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
				    txg));
				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
				break;
			}
			spa_sync_deferred_frees(spa, tx);
		}

	} while (dmu_objset_is_dirty(mos, txg));

	if (!list_is_empty(&spa->spa_config_dirty_list)) {
		/*
		 * Make sure that the number of ZAPs for all the vdevs matches
		 * the number of ZAPs in the per-vdev ZAP list.  This is only
		 * checked if the config is dirty; otherwise there may be
		 * outstanding AVZ operations that weren't completed in
		 * spa_sync_config_object.
		 */
		uint64_t all_vdev_zap_entry_count;
		ASSERT0(zap_count(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
		    all_vdev_zap_entry_count);
	}

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif	/* illumos */

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
}
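
/*
 * spa_sync() itself is driven by the pool's txg sync thread; other code
 * does not call it directly.  The usual way to force the work above to
 * happen is to block until a txg has synced, e.g. (a minimal sketch):
 *
 *	txg_wait_synced(spa_get_dsl(spa), 0);	// wait for the next txg
 *
 * which is exactly what spa_sync_allpools() below does for every active,
 * writeable, unsuspended pool.
 */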

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, counted once as a
 * spare and once as a replacing child.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
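
/*
 * A minimal sketch of how callers post pool events through the function
 * above.  The event names shown are assumed to be among the ESC_ZFS_*
 * definitions in sys/sysevent/eventdefs.h; whether a vdev is passed
 * depends on whether the event concerns the whole pool or one device:
 *
 *	spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);	// pool-wide
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);		// per-vdev
 */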