spa.c revision 168715
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29/* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35#include <sys/zfs_context.h> 36#include <sys/fm/fs/zfs.h> 37#include <sys/spa_impl.h> 38#include <sys/zio.h> 39#include <sys/zio_checksum.h> 40#include <sys/zio_compress.h> 41#include <sys/dmu.h> 42#include <sys/dmu_tx.h> 43#include <sys/zap.h> 44#include <sys/zil.h> 45#include <sys/vdev_impl.h> 46#include <sys/metaslab.h> 47#include <sys/uberblock_impl.h> 48#include <sys/txg.h> 49#include <sys/avl.h> 50#include <sys/dmu_traverse.h> 51#include <sys/dmu_objset.h> 52#include <sys/unique.h> 53#include <sys/dsl_pool.h> 54#include <sys/dsl_dataset.h> 55#include <sys/dsl_dir.h> 56#include <sys/dsl_prop.h> 57#include <sys/dsl_synctask.h> 58#include <sys/fs/zfs.h> 59#include <sys/callb.h> 60 61int zio_taskq_threads = 0; 62SYSCTL_DECL(_vfs_zfs); 63SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 64TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); 65SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, 66 &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); 67 68 69/* 70 * ========================================================================== 71 * SPA state manipulation (open/create/destroy/import/export) 72 * ========================================================================== 73 */ 74 75static int 76spa_error_entry_compare(const void *a, const void *b) 77{ 78 spa_error_entry_t *sa = (spa_error_entry_t *)a; 79 spa_error_entry_t *sb = (spa_error_entry_t *)b; 80 int ret; 81 82 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 83 sizeof (zbookmark_t)); 84 85 if (ret < 0) 86 return (-1); 87 else if (ret > 0) 88 return (1); 89 else 90 return (0); 91} 92 93/* 94 * Utility function which retrieves copies of the current logs and 95 * re-initializes them in the process. 96 */ 97void 98spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 99{ 100 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 101 102 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 103 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 104 105 avl_create(&spa->spa_errlist_scrub, 106 spa_error_entry_compare, sizeof (spa_error_entry_t), 107 offsetof(spa_error_entry_t, se_avl)); 108 avl_create(&spa->spa_errlist_last, 109 spa_error_entry_compare, sizeof (spa_error_entry_t), 110 offsetof(spa_error_entry_t, se_avl)); 111} 112 113/* 114 * Activate an uninitialized pool. 
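 *
 * As an illustrative sketch (not part of the original file), spa_activate()
 * and spa_deactivate() bracket the in-core lifetime of a pool.  Callers in
 * this file pair them roughly as follows, assuming 'spa' came from
 * spa_add() or spa_lookup():
 *
 *	spa_activate(spa);
 *	error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 *	if (error != 0) {
 *		spa_unload(spa);
 *		spa_deactivate(spa);
 *	}
 *
 * spa_deactivate() asserts that spa_unload() has already torn down the
 * DSL pool and vdev tree before it destroys the taskqs and locks.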
115 */ 116static void 117spa_activate(spa_t *spa) 118{ 119 int t; 120 int nthreads = zio_taskq_threads; 121 char name[32]; 122 123 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 124 125 spa->spa_state = POOL_STATE_ACTIVE; 126 127 spa->spa_normal_class = metaslab_class_create(); 128 129 if (nthreads == 0) 130 nthreads = max_ncpus; 131 for (t = 0; t < ZIO_TYPES; t++) { 132 snprintf(name, sizeof(name), "spa_zio_issue %d", t); 133 spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, 134 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 135 snprintf(name, sizeof(name), "spa_zio_intr %d", t); 136 spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, 137 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 138 } 139 140 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 141 142 mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); 143 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 144 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 145 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 146 cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); 147 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 148 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 149 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 150 151 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 152 offsetof(vdev_t, vdev_dirty_node)); 153 154 txg_list_create(&spa->spa_vdev_txg_list, 155 offsetof(struct vdev, vdev_txg_node)); 156 157 avl_create(&spa->spa_errlist_scrub, 158 spa_error_entry_compare, sizeof (spa_error_entry_t), 159 offsetof(spa_error_entry_t, se_avl)); 160 avl_create(&spa->spa_errlist_last, 161 spa_error_entry_compare, sizeof (spa_error_entry_t), 162 offsetof(spa_error_entry_t, se_avl)); 163} 164 165/* 166 * Opposite of spa_activate(). 167 */ 168static void 169spa_deactivate(spa_t *spa) 170{ 171 int t; 172 173 ASSERT(spa->spa_sync_on == B_FALSE); 174 ASSERT(spa->spa_dsl_pool == NULL); 175 ASSERT(spa->spa_root_vdev == NULL); 176 177 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 178 179 txg_list_destroy(&spa->spa_vdev_txg_list); 180 181 list_destroy(&spa->spa_dirty_list); 182 183 for (t = 0; t < ZIO_TYPES; t++) { 184 taskq_destroy(spa->spa_zio_issue_taskq[t]); 185 taskq_destroy(spa->spa_zio_intr_taskq[t]); 186 spa->spa_zio_issue_taskq[t] = NULL; 187 spa->spa_zio_intr_taskq[t] = NULL; 188 } 189 190 metaslab_class_destroy(spa->spa_normal_class); 191 spa->spa_normal_class = NULL; 192 193 /* 194 * If this was part of an import or the open otherwise failed, we may 195 * still have errors left in the queues. Empty them just in case. 196 */ 197 spa_errlog_drain(spa); 198 199 avl_destroy(&spa->spa_errlist_scrub); 200 avl_destroy(&spa->spa_errlist_last); 201 202 rw_destroy(&spa->spa_traverse_lock); 203 mutex_destroy(&spa->spa_uberblock_lock); 204 mutex_destroy(&spa->spa_errlog_lock); 205 mutex_destroy(&spa->spa_errlist_lock); 206 mutex_destroy(&spa->spa_config_lock.scl_lock); 207 cv_destroy(&spa->spa_config_lock.scl_cv); 208 mutex_destroy(&spa->spa_sync_bplist.bpl_lock); 209 mutex_destroy(&spa->spa_history_lock); 210 mutex_destroy(&spa->spa_props_lock); 211 212 spa->spa_state = POOL_STATE_UNINITIALIZED; 213} 214 215/* 216 * Verify a pool configuration, and construct the vdev tree appropriately. This 217 * will create all the necessary vdevs in the appropriate layout, with each vdev 218 * in the CLOSED state. This will prep the pool before open/creation/import. 
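 *
 * For orientation (an illustrative sketch, not part of the original file),
 * the nvlist handed to this routine mirrors the vdev tree it builds: the
 * top-level nvlist describes the root vdev and nests its children under
 * ZPOOL_CONFIG_CHILDREN, conceptually:
 *
 *	root
 *	    children[0]: mirror
 *	        children[0]: disk
 *	        children[1]: disk
 *
 * Leaf vdevs (vdev_op_leaf) carry no ZPOOL_CONFIG_CHILDREN entry, which is
 * what stops the recursion below; a non-leaf vdev without children is
 * rejected with EINVAL.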
219 * All vdev validation is done by the vdev_alloc() routine. 220 */ 221static int 222spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 223 uint_t id, int atype) 224{ 225 nvlist_t **child; 226 uint_t c, children; 227 int error; 228 229 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 230 return (error); 231 232 if ((*vdp)->vdev_ops->vdev_op_leaf) 233 return (0); 234 235 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 236 &child, &children) != 0) { 237 vdev_free(*vdp); 238 *vdp = NULL; 239 return (EINVAL); 240 } 241 242 for (c = 0; c < children; c++) { 243 vdev_t *vd; 244 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 245 atype)) != 0) { 246 vdev_free(*vdp); 247 *vdp = NULL; 248 return (error); 249 } 250 } 251 252 ASSERT(*vdp != NULL); 253 254 return (0); 255} 256 257/* 258 * Opposite of spa_load(). 259 */ 260static void 261spa_unload(spa_t *spa) 262{ 263 int i; 264 265 /* 266 * Stop async tasks. 267 */ 268 spa_async_suspend(spa); 269 270 /* 271 * Stop syncing. 272 */ 273 if (spa->spa_sync_on) { 274 txg_sync_stop(spa->spa_dsl_pool); 275 spa->spa_sync_on = B_FALSE; 276 } 277 278 /* 279 * Wait for any outstanding prefetch I/O to complete. 280 */ 281 spa_config_enter(spa, RW_WRITER, FTAG); 282 spa_config_exit(spa, FTAG); 283 284 /* 285 * Close the dsl pool. 286 */ 287 if (spa->spa_dsl_pool) { 288 dsl_pool_close(spa->spa_dsl_pool); 289 spa->spa_dsl_pool = NULL; 290 } 291 292 /* 293 * Close all vdevs. 294 */ 295 if (spa->spa_root_vdev) 296 vdev_free(spa->spa_root_vdev); 297 ASSERT(spa->spa_root_vdev == NULL); 298 299 for (i = 0; i < spa->spa_nspares; i++) 300 vdev_free(spa->spa_spares[i]); 301 if (spa->spa_spares) { 302 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 303 spa->spa_spares = NULL; 304 } 305 if (spa->spa_sparelist) { 306 nvlist_free(spa->spa_sparelist); 307 spa->spa_sparelist = NULL; 308 } 309 310 spa->spa_async_suspended = 0; 311} 312 313/* 314 * Load (or re-load) the current list of vdevs describing the active spares for 315 * this pool. When this is called, we have some form of basic information in 316 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 317 * re-generate a more complete list including status information. 318 */ 319static void 320spa_load_spares(spa_t *spa) 321{ 322 nvlist_t **spares; 323 uint_t nspares; 324 int i; 325 vdev_t *vd, *tvd; 326 327 /* 328 * First, close and free any existing spare vdevs. 329 */ 330 for (i = 0; i < spa->spa_nspares; i++) { 331 vd = spa->spa_spares[i]; 332 333 /* Undo the call to spa_activate() below */ 334 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 335 tvd->vdev_isspare) 336 spa_spare_remove(tvd); 337 vdev_close(vd); 338 vdev_free(vd); 339 } 340 341 if (spa->spa_spares) 342 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 343 344 if (spa->spa_sparelist == NULL) 345 nspares = 0; 346 else 347 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 348 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 349 350 spa->spa_nspares = (int)nspares; 351 spa->spa_spares = NULL; 352 353 if (nspares == 0) 354 return; 355 356 /* 357 * Construct the array of vdevs, opening them to get status in the 358 * process. For each spare, there is potentially two different vdev_t 359 * structures associated with it: one in the list of spares (used only 360 * for basic validation purposes) and one in the active vdev 361 * configuration (if it's spared in). During this phase we open and 362 * validate each vdev on the spare list. 
If the vdev also exists in the 363 * active configuration, then we also mark this vdev as an active spare. 364 */ 365 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 366 for (i = 0; i < spa->spa_nspares; i++) { 367 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 368 VDEV_ALLOC_SPARE) == 0); 369 ASSERT(vd != NULL); 370 371 spa->spa_spares[i] = vd; 372 373 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 374 if (!tvd->vdev_isspare) 375 spa_spare_add(tvd); 376 377 /* 378 * We only mark the spare active if we were successfully 379 * able to load the vdev. Otherwise, importing a pool 380 * with a bad active spare would result in strange 381 * behavior, because multiple pool would think the spare 382 * is actively in use. 383 * 384 * There is a vulnerability here to an equally bizarre 385 * circumstance, where a dead active spare is later 386 * brought back to life (onlined or otherwise). Given 387 * the rarity of this scenario, and the extra complexity 388 * it adds, we ignore the possibility. 389 */ 390 if (!vdev_is_dead(tvd)) 391 spa_spare_activate(tvd); 392 } 393 394 if (vdev_open(vd) != 0) 395 continue; 396 397 vd->vdev_top = vd; 398 (void) vdev_validate_spare(vd); 399 } 400 401 /* 402 * Recompute the stashed list of spares, with status information 403 * this time. 404 */ 405 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 406 DATA_TYPE_NVLIST_ARRAY) == 0); 407 408 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 409 for (i = 0; i < spa->spa_nspares; i++) 410 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 411 B_TRUE, B_TRUE); 412 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 413 spares, spa->spa_nspares) == 0); 414 for (i = 0; i < spa->spa_nspares; i++) 415 nvlist_free(spares[i]); 416 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 417} 418 419static int 420load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 421{ 422 dmu_buf_t *db; 423 char *packed = NULL; 424 size_t nvsize = 0; 425 int error; 426 *value = NULL; 427 428 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 429 nvsize = *(uint64_t *)db->db_data; 430 dmu_buf_rele(db, FTAG); 431 432 packed = kmem_alloc(nvsize, KM_SLEEP); 433 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 434 if (error == 0) 435 error = nvlist_unpack(packed, nvsize, value, 0); 436 kmem_free(packed, nvsize); 437 438 return (error); 439} 440 441/* 442 * Load an existing storage pool, using the pool's builtin spa_config as a 443 * source of configuration information. 444 */ 445static int 446spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 447{ 448 int error = 0; 449 nvlist_t *nvroot = NULL; 450 vdev_t *rvd; 451 uberblock_t *ub = &spa->spa_uberblock; 452 uint64_t config_cache_txg = spa->spa_config_txg; 453 uint64_t pool_guid; 454 uint64_t version; 455 zio_t *zio; 456 457 spa->spa_load_state = state; 458 459 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 460 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 461 error = EINVAL; 462 goto out; 463 } 464 465 /* 466 * Versioning wasn't explicitly added to the label until later, so if 467 * it's not present treat it as the initial version. 
468 */ 469 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 470 version = ZFS_VERSION_INITIAL; 471 472 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 473 &spa->spa_config_txg); 474 475 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 476 spa_guid_exists(pool_guid, 0)) { 477 error = EEXIST; 478 goto out; 479 } 480 481 spa->spa_load_guid = pool_guid; 482 483 /* 484 * Parse the configuration into a vdev tree. We explicitly set the 485 * value that will be returned by spa_version() since parsing the 486 * configuration requires knowing the version number. 487 */ 488 spa_config_enter(spa, RW_WRITER, FTAG); 489 spa->spa_ubsync.ub_version = version; 490 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 491 spa_config_exit(spa, FTAG); 492 493 if (error != 0) 494 goto out; 495 496 ASSERT(spa->spa_root_vdev == rvd); 497 ASSERT(spa_guid(spa) == pool_guid); 498 499 /* 500 * Try to open all vdevs, loading each label in the process. 501 */ 502 if (vdev_open(rvd) != 0) { 503 error = ENXIO; 504 goto out; 505 } 506 507 /* 508 * Validate the labels for all leaf vdevs. We need to grab the config 509 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 510 * flag. 511 */ 512 spa_config_enter(spa, RW_READER, FTAG); 513 error = vdev_validate(rvd); 514 spa_config_exit(spa, FTAG); 515 516 if (error != 0) { 517 error = EBADF; 518 goto out; 519 } 520 521 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 522 error = ENXIO; 523 goto out; 524 } 525 526 /* 527 * Find the best uberblock. 528 */ 529 bzero(ub, sizeof (uberblock_t)); 530 531 zio = zio_root(spa, NULL, NULL, 532 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 533 vdev_uberblock_load(zio, rvd, ub); 534 error = zio_wait(zio); 535 536 /* 537 * If we weren't able to find a single valid uberblock, return failure. 538 */ 539 if (ub->ub_txg == 0) { 540 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 541 VDEV_AUX_CORRUPT_DATA); 542 error = ENXIO; 543 goto out; 544 } 545 546 /* 547 * If the pool is newer than the code, we can't open it. 548 */ 549 if (ub->ub_version > ZFS_VERSION) { 550 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 551 VDEV_AUX_VERSION_NEWER); 552 error = ENOTSUP; 553 goto out; 554 } 555 556 /* 557 * If the vdev guid sum doesn't match the uberblock, we have an 558 * incomplete configuration. 559 */ 560 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 561 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 562 VDEV_AUX_BAD_GUID_SUM); 563 error = ENXIO; 564 goto out; 565 } 566 567 /* 568 * Initialize internal SPA structures. 
569 */ 570 spa->spa_state = POOL_STATE_ACTIVE; 571 spa->spa_ubsync = spa->spa_uberblock; 572 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 573 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 574 if (error) { 575 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 576 VDEV_AUX_CORRUPT_DATA); 577 goto out; 578 } 579 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 580 581 if (zap_lookup(spa->spa_meta_objset, 582 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 583 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 584 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 585 VDEV_AUX_CORRUPT_DATA); 586 error = EIO; 587 goto out; 588 } 589 590 if (!mosconfig) { 591 nvlist_t *newconfig; 592 uint64_t hostid; 593 594 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 595 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 596 VDEV_AUX_CORRUPT_DATA); 597 error = EIO; 598 goto out; 599 } 600 601 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 602 &hostid) == 0) { 603 char *hostname; 604 unsigned long myhostid = 0; 605 606 VERIFY(nvlist_lookup_string(newconfig, 607 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 608 609 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 610 if ((unsigned long)hostid != myhostid) { 611 cmn_err(CE_WARN, "pool '%s' could not be " 612 "loaded as it was last accessed by " 613 "another system (host: %s hostid: 0x%lx). " 614 "See: http://www.sun.com/msg/ZFS-8000-EY", 615 spa->spa_name, hostname, 616 (unsigned long)hostid); 617 error = EBADF; 618 goto out; 619 } 620 } 621 622 spa_config_set(spa, newconfig); 623 spa_unload(spa); 624 spa_deactivate(spa); 625 spa_activate(spa); 626 627 return (spa_load(spa, newconfig, state, B_TRUE)); 628 } 629 630 if (zap_lookup(spa->spa_meta_objset, 631 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 632 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 633 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 634 VDEV_AUX_CORRUPT_DATA); 635 error = EIO; 636 goto out; 637 } 638 639 /* 640 * Load the bit that tells us to use the new accounting function 641 * (raid-z deflation). If we have an older pool, this will not 642 * be present. 643 */ 644 error = zap_lookup(spa->spa_meta_objset, 645 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 646 sizeof (uint64_t), 1, &spa->spa_deflate); 647 if (error != 0 && error != ENOENT) { 648 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 649 VDEV_AUX_CORRUPT_DATA); 650 error = EIO; 651 goto out; 652 } 653 654 /* 655 * Load the persistent error log. If we have an older pool, this will 656 * not be present. 657 */ 658 error = zap_lookup(spa->spa_meta_objset, 659 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 660 sizeof (uint64_t), 1, &spa->spa_errlog_last); 661 if (error != 0 && error != ENOENT) { 662 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 663 VDEV_AUX_CORRUPT_DATA); 664 error = EIO; 665 goto out; 666 } 667 668 error = zap_lookup(spa->spa_meta_objset, 669 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 670 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 671 if (error != 0 && error != ENOENT) { 672 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 673 VDEV_AUX_CORRUPT_DATA); 674 error = EIO; 675 goto out; 676 } 677 678 /* 679 * Load the history object. If we have an older pool, this 680 * will not be present. 
681	 */
682	error = zap_lookup(spa->spa_meta_objset,
683	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
684	    sizeof (uint64_t), 1, &spa->spa_history);
685	if (error != 0 && error != ENOENT) {
686		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
687		    VDEV_AUX_CORRUPT_DATA);
688		error = EIO;
689		goto out;
690	}
691
692	/*
693	 * Load any hot spares for this pool.
694	 */
695	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
696	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
697	if (error != 0 && error != ENOENT) {
698		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
699		    VDEV_AUX_CORRUPT_DATA);
700		error = EIO;
701		goto out;
702	}
703	if (error == 0) {
704		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
705		if (load_nvlist(spa, spa->spa_spares_object,
706		    &spa->spa_sparelist) != 0) {
707			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
708			    VDEV_AUX_CORRUPT_DATA);
709			error = EIO;
710			goto out;
711		}
712
713		spa_config_enter(spa, RW_WRITER, FTAG);
714		spa_load_spares(spa);
715		spa_config_exit(spa, FTAG);
716	}
717
718	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
719	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
720
721	if (error && error != ENOENT) {
722		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
723		    VDEV_AUX_CORRUPT_DATA);
724		error = EIO;
725		goto out;
726	}
727
728	if (error == 0) {
729		(void) zap_lookup(spa->spa_meta_objset,
730		    spa->spa_pool_props_object,
731		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
732		    sizeof (uint64_t), 1, &spa->spa_bootfs);
733	}
734
735	/*
736	 * Load the vdev state for all toplevel vdevs.
737	 */
738	vdev_load(rvd);
739
740	/*
741	 * Propagate the leaf DTLs we just loaded all the way up the tree.
742	 */
743	spa_config_enter(spa, RW_WRITER, FTAG);
744	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
745	spa_config_exit(spa, FTAG);
746
747	/*
748	 * Check the state of the root vdev. If it can't be opened, it
749	 * indicates one or more toplevel vdevs are faulted.
750	 */
751	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
752		error = ENXIO;
753		goto out;
754	}
755
756	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
757		dmu_tx_t *tx;
758		int need_update = B_FALSE;
759		int c;
760
761		/*
762		 * Claim log blocks that haven't been committed yet.
763		 * This must all happen in a single txg.
764		 */
765		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
766		    spa_first_txg(spa));
767		(void) dmu_objset_find(spa->spa_name,
768		    zil_claim, tx, DS_FIND_CHILDREN);
769		dmu_tx_commit(tx);
770
771		spa->spa_sync_on = B_TRUE;
772		txg_sync_start(spa->spa_dsl_pool);
773
774		/*
775		 * Wait for all claims to sync.
776		 */
777		txg_wait_synced(spa->spa_dsl_pool, 0);
778
779		/*
780		 * If the config cache is stale, or we have uninitialized
781		 * metaslabs (see spa_vdev_add()), then update the config.
782		 */
783		if (config_cache_txg != spa->spa_config_txg ||
784		    state == SPA_LOAD_IMPORT)
785			need_update = B_TRUE;
786
787		for (c = 0; c < rvd->vdev_children; c++)
788			if (rvd->vdev_child[c]->vdev_ms_array == 0)
789				need_update = B_TRUE;
790
791		/*
792		 * Update the config cache asynchronously in case we're the
793		 * root pool, in which case the config cache isn't writable yet.
794		 */
795		if (need_update)
796			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
797	}
798
799	error = 0;
800 out:
801	if (error && error != EBADF)
802		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
803	spa->spa_load_state = SPA_LOAD_NONE;
804	spa->spa_ena = 0;
805
806	return (error);
807 }
808
809 /*
810  * Pool Open/Import
811  *
812  * The import case is identical to an open except that the configuration is sent
813  * down from userland, instead of grabbed from the configuration cache. For the
814  * case of an open, the pool configuration will exist in the
815  * POOL_STATE_UNINITIALIZED state.
816  *
817  * The stats information (gen/count/ustats) is used to gather vdev statistics at
818  * the same time we open the pool, without having to keep around the spa_t in
819  * some ambiguous state.
820  */
821 static int
822 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
823 {
824	spa_t *spa;
825	int error;
826	int loaded = B_FALSE;
827	int locked = B_FALSE;
828
829	*spapp = NULL;
830
831	/*
832	 * As disgusting as this is, we need to support recursive calls to this
833	 * function because dsl_dir_open() is called during spa_load(), and ends
834	 * up calling spa_open() again. The real fix is to figure out how to
835	 * avoid dsl_dir_open() calling this in the first place.
836	 */
837	if (mutex_owner(&spa_namespace_lock) != curthread) {
838		mutex_enter(&spa_namespace_lock);
839		locked = B_TRUE;
840	}
841
842	if ((spa = spa_lookup(pool)) == NULL) {
843		if (locked)
844			mutex_exit(&spa_namespace_lock);
845		return (ENOENT);
846	}
847	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
848
849		spa_activate(spa);
850
851		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
852
853		if (error == EBADF) {
854			/*
855			 * If vdev_validate() returns failure (indicated by
856			 * EBADF), it means that one of the vdevs indicates
857			 * that the pool has been exported or destroyed. If
858			 * this is the case, the config cache is out of sync and
859			 * we should remove the pool from the namespace.
860			 */
861			zfs_post_ok(spa, NULL);
862			spa_unload(spa);
863			spa_deactivate(spa);
864			spa_remove(spa);
865			spa_config_sync();
866			if (locked)
867				mutex_exit(&spa_namespace_lock);
868			return (ENOENT);
869		}
870
871		if (error) {
872			/*
873			 * We can't open the pool, but we still have useful
874			 * information: the state of each vdev after the
875			 * attempted vdev_open(). Return this to the user.
876			 */
877			if (config != NULL && spa->spa_root_vdev != NULL) {
878				spa_config_enter(spa, RW_READER, FTAG);
879				*config = spa_config_generate(spa, NULL, -1ULL,
880				    B_TRUE);
881				spa_config_exit(spa, FTAG);
882			}
883			spa_unload(spa);
884			spa_deactivate(spa);
885			spa->spa_last_open_failed = B_TRUE;
886			if (locked)
887				mutex_exit(&spa_namespace_lock);
888			*spapp = NULL;
889			return (error);
890		} else {
891			zfs_post_ok(spa, NULL);
892			spa->spa_last_open_failed = B_FALSE;
893		}
894
895		loaded = B_TRUE;
896	}
897
898	spa_open_ref(spa, tag);
899	if (locked)
900		mutex_exit(&spa_namespace_lock);
901
902	*spapp = spa;
903
904	if (config != NULL) {
905		spa_config_enter(spa, RW_READER, FTAG);
906		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
907		spa_config_exit(spa, FTAG);
908	}
909
910	/*
911	 * If we just loaded the pool, resilver anything that's out of date.
912 */ 913 if (loaded && (spa_mode & FWRITE)) 914 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 915 916 return (0); 917} 918 919int 920spa_open(const char *name, spa_t **spapp, void *tag) 921{ 922 return (spa_open_common(name, spapp, tag, NULL)); 923} 924 925/* 926 * Lookup the given spa_t, incrementing the inject count in the process, 927 * preventing it from being exported or destroyed. 928 */ 929spa_t * 930spa_inject_addref(char *name) 931{ 932 spa_t *spa; 933 934 mutex_enter(&spa_namespace_lock); 935 if ((spa = spa_lookup(name)) == NULL) { 936 mutex_exit(&spa_namespace_lock); 937 return (NULL); 938 } 939 spa->spa_inject_ref++; 940 mutex_exit(&spa_namespace_lock); 941 942 return (spa); 943} 944 945void 946spa_inject_delref(spa_t *spa) 947{ 948 mutex_enter(&spa_namespace_lock); 949 spa->spa_inject_ref--; 950 mutex_exit(&spa_namespace_lock); 951} 952 953static void 954spa_add_spares(spa_t *spa, nvlist_t *config) 955{ 956 nvlist_t **spares; 957 uint_t i, nspares; 958 nvlist_t *nvroot; 959 uint64_t guid; 960 vdev_stat_t *vs; 961 uint_t vsc; 962 uint64_t pool; 963 964 if (spa->spa_nspares == 0) 965 return; 966 967 VERIFY(nvlist_lookup_nvlist(config, 968 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 969 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 970 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 971 if (nspares != 0) { 972 VERIFY(nvlist_add_nvlist_array(nvroot, 973 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 974 VERIFY(nvlist_lookup_nvlist_array(nvroot, 975 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 976 977 /* 978 * Go through and find any spares which have since been 979 * repurposed as an active spare. If this is the case, update 980 * their status appropriately. 981 */ 982 for (i = 0; i < nspares; i++) { 983 VERIFY(nvlist_lookup_uint64(spares[i], 984 ZPOOL_CONFIG_GUID, &guid) == 0); 985 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 986 VERIFY(nvlist_lookup_uint64_array( 987 spares[i], ZPOOL_CONFIG_STATS, 988 (uint64_t **)&vs, &vsc) == 0); 989 vs->vs_state = VDEV_STATE_CANT_OPEN; 990 vs->vs_aux = VDEV_AUX_SPARED; 991 } 992 } 993 } 994} 995 996int 997spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 998{ 999 int error; 1000 spa_t *spa; 1001 1002 *config = NULL; 1003 error = spa_open_common(name, &spa, FTAG, config); 1004 1005 if (spa && *config != NULL) { 1006 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1007 spa_get_errlog_size(spa)) == 0); 1008 1009 spa_add_spares(spa, *config); 1010 } 1011 1012 /* 1013 * We want to get the alternate root even for faulted pools, so we cheat 1014 * and call spa_lookup() directly. 1015 */ 1016 if (altroot) { 1017 if (spa == NULL) { 1018 mutex_enter(&spa_namespace_lock); 1019 spa = spa_lookup(name); 1020 if (spa) 1021 spa_altroot(spa, altroot, buflen); 1022 else 1023 altroot[0] = '\0'; 1024 spa = NULL; 1025 mutex_exit(&spa_namespace_lock); 1026 } else { 1027 spa_altroot(spa, altroot, buflen); 1028 } 1029 } 1030 1031 if (spa != NULL) 1032 spa_close(spa, FTAG); 1033 1034 return (error); 1035} 1036 1037/* 1038 * Validate that the 'spares' array is well formed. We must have an array of 1039 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1040 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1041 * as they are well-formed. 
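 *
 * As an illustrative sketch (not part of the original file), a well-formed
 * spares entry is just an nvlist array of leaf vdev descriptions stored
 * under ZPOOL_CONFIG_SPARES in nvroot, built by userland roughly like:
 *
 *	nvlist_t *spare;
 *	VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, 0) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_DISK) == 0);
 *	VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH, "...") == 0);
 *	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *	    &spare, 1) == 0);
 *
 * Anything that does not parse to a leaf vdev fails the checks below with
 * EINVAL.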
1042 */ 1043static int 1044spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1045{ 1046 nvlist_t **spares; 1047 uint_t i, nspares; 1048 vdev_t *vd; 1049 int error; 1050 1051 /* 1052 * It's acceptable to have no spares specified. 1053 */ 1054 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1055 &spares, &nspares) != 0) 1056 return (0); 1057 1058 if (nspares == 0) 1059 return (EINVAL); 1060 1061 /* 1062 * Make sure the pool is formatted with a version that supports hot 1063 * spares. 1064 */ 1065 if (spa_version(spa) < ZFS_VERSION_SPARES) 1066 return (ENOTSUP); 1067 1068 /* 1069 * Set the pending spare list so we correctly handle device in-use 1070 * checking. 1071 */ 1072 spa->spa_pending_spares = spares; 1073 spa->spa_pending_nspares = nspares; 1074 1075 for (i = 0; i < nspares; i++) { 1076 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1077 mode)) != 0) 1078 goto out; 1079 1080 if (!vd->vdev_ops->vdev_op_leaf) { 1081 vdev_free(vd); 1082 error = EINVAL; 1083 goto out; 1084 } 1085 1086 vd->vdev_top = vd; 1087 1088 if ((error = vdev_open(vd)) == 0 && 1089 (error = vdev_label_init(vd, crtxg, 1090 VDEV_LABEL_SPARE)) == 0) { 1091 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1092 vd->vdev_guid) == 0); 1093 } 1094 1095 vdev_free(vd); 1096 1097 if (error && mode != VDEV_ALLOC_SPARE) 1098 goto out; 1099 else 1100 error = 0; 1101 } 1102 1103out: 1104 spa->spa_pending_spares = NULL; 1105 spa->spa_pending_nspares = 0; 1106 return (error); 1107} 1108 1109/* 1110 * Pool Creation 1111 */ 1112int 1113spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1114{ 1115 spa_t *spa; 1116 vdev_t *rvd; 1117 dsl_pool_t *dp; 1118 dmu_tx_t *tx; 1119 int c, error = 0; 1120 uint64_t txg = TXG_INITIAL; 1121 nvlist_t **spares; 1122 uint_t nspares; 1123 1124 /* 1125 * If this pool already exists, return failure. 1126 */ 1127 mutex_enter(&spa_namespace_lock); 1128 if (spa_lookup(pool) != NULL) { 1129 mutex_exit(&spa_namespace_lock); 1130 return (EEXIST); 1131 } 1132 1133 /* 1134 * Allocate a new spa_t structure. 1135 */ 1136 spa = spa_add(pool, altroot); 1137 spa_activate(spa); 1138 1139 spa->spa_uberblock.ub_txg = txg - 1; 1140 spa->spa_uberblock.ub_version = ZFS_VERSION; 1141 spa->spa_ubsync = spa->spa_uberblock; 1142 1143 /* 1144 * Create the root vdev. 1145 */ 1146 spa_config_enter(spa, RW_WRITER, FTAG); 1147 1148 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1149 1150 ASSERT(error != 0 || rvd != NULL); 1151 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1152 1153 if (error == 0 && rvd->vdev_children == 0) 1154 error = EINVAL; 1155 1156 if (error == 0 && 1157 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1158 (error = spa_validate_spares(spa, nvroot, txg, 1159 VDEV_ALLOC_ADD)) == 0) { 1160 for (c = 0; c < rvd->vdev_children; c++) 1161 vdev_init(rvd->vdev_child[c], txg); 1162 vdev_config_dirty(rvd); 1163 } 1164 1165 spa_config_exit(spa, FTAG); 1166 1167 if (error != 0) { 1168 spa_unload(spa); 1169 spa_deactivate(spa); 1170 spa_remove(spa); 1171 mutex_exit(&spa_namespace_lock); 1172 return (error); 1173 } 1174 1175 /* 1176 * Get the list of spares, if specified. 
1177 */ 1178 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1179 &spares, &nspares) == 0) { 1180 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1181 KM_SLEEP) == 0); 1182 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1183 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1184 spa_config_enter(spa, RW_WRITER, FTAG); 1185 spa_load_spares(spa); 1186 spa_config_exit(spa, FTAG); 1187 spa->spa_sync_spares = B_TRUE; 1188 } 1189 1190 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1191 spa->spa_meta_objset = dp->dp_meta_objset; 1192 1193 tx = dmu_tx_create_assigned(dp, txg); 1194 1195 /* 1196 * Create the pool config object. 1197 */ 1198 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1199 DMU_OT_PACKED_NVLIST, 1 << 14, 1200 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1201 1202 if (zap_add(spa->spa_meta_objset, 1203 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1204 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1205 cmn_err(CE_PANIC, "failed to add pool config"); 1206 } 1207 1208 /* Newly created pools are always deflated. */ 1209 spa->spa_deflate = TRUE; 1210 if (zap_add(spa->spa_meta_objset, 1211 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1212 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1213 cmn_err(CE_PANIC, "failed to add deflate"); 1214 } 1215 1216 /* 1217 * Create the deferred-free bplist object. Turn off compression 1218 * because sync-to-convergence takes longer if the blocksize 1219 * keeps changing. 1220 */ 1221 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1222 1 << 14, tx); 1223 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1224 ZIO_COMPRESS_OFF, tx); 1225 1226 if (zap_add(spa->spa_meta_objset, 1227 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1228 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1229 cmn_err(CE_PANIC, "failed to add bplist"); 1230 } 1231 1232 /* 1233 * Create the pool's history object. 1234 */ 1235 spa_history_create_obj(spa, tx); 1236 1237 dmu_tx_commit(tx); 1238 1239 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1240 spa->spa_sync_on = B_TRUE; 1241 txg_sync_start(spa->spa_dsl_pool); 1242 1243 /* 1244 * We explicitly wait for the first transaction to complete so that our 1245 * bean counters are appropriately updated. 1246 */ 1247 txg_wait_synced(spa->spa_dsl_pool, txg); 1248 1249 spa_config_sync(); 1250 1251 mutex_exit(&spa_namespace_lock); 1252 1253 return (0); 1254} 1255 1256/* 1257 * Import the given pool into the system. We set up the necessary spa_t and 1258 * then call spa_load() to do the dirty work. 1259 */ 1260int 1261spa_import(const char *pool, nvlist_t *config, const char *altroot) 1262{ 1263 spa_t *spa; 1264 int error; 1265 nvlist_t *nvroot; 1266 nvlist_t **spares; 1267 uint_t nspares; 1268 1269 if (!(spa_mode & FWRITE)) 1270 return (EROFS); 1271 1272 /* 1273 * If a pool with this name exists, return failure. 1274 */ 1275 mutex_enter(&spa_namespace_lock); 1276 if (spa_lookup(pool) != NULL) { 1277 mutex_exit(&spa_namespace_lock); 1278 return (EEXIST); 1279 } 1280 1281 /* 1282 * Create and initialize the spa structure. 1283 */ 1284 spa = spa_add(pool, altroot); 1285 spa_activate(spa); 1286 1287 /* 1288 * Pass off the heavy lifting to spa_load(). 1289 * Pass TRUE for mosconfig because the user-supplied config 1290 * is actually the one to trust when doing an import. 
1291 */ 1292 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1293 1294 spa_config_enter(spa, RW_WRITER, FTAG); 1295 /* 1296 * Toss any existing sparelist, as it doesn't have any validity anymore, 1297 * and conflicts with spa_has_spare(). 1298 */ 1299 if (spa->spa_sparelist) { 1300 nvlist_free(spa->spa_sparelist); 1301 spa->spa_sparelist = NULL; 1302 spa_load_spares(spa); 1303 } 1304 1305 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1306 &nvroot) == 0); 1307 if (error == 0) 1308 error = spa_validate_spares(spa, nvroot, -1ULL, 1309 VDEV_ALLOC_SPARE); 1310 spa_config_exit(spa, FTAG); 1311 1312 if (error != 0) { 1313 spa_unload(spa); 1314 spa_deactivate(spa); 1315 spa_remove(spa); 1316 mutex_exit(&spa_namespace_lock); 1317 return (error); 1318 } 1319 1320 /* 1321 * Override any spares as specified by the user, as these may have 1322 * correct device names/devids, etc. 1323 */ 1324 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1325 &spares, &nspares) == 0) { 1326 if (spa->spa_sparelist) 1327 VERIFY(nvlist_remove(spa->spa_sparelist, 1328 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1329 else 1330 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1331 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1332 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1333 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1334 spa_config_enter(spa, RW_WRITER, FTAG); 1335 spa_load_spares(spa); 1336 spa_config_exit(spa, FTAG); 1337 spa->spa_sync_spares = B_TRUE; 1338 } 1339 1340 /* 1341 * Update the config cache to include the newly-imported pool. 1342 */ 1343 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1344 1345 mutex_exit(&spa_namespace_lock); 1346 1347 /* 1348 * Resilver anything that's out of date. 1349 */ 1350 if (spa_mode & FWRITE) 1351 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1352 1353 return (0); 1354} 1355 1356/* 1357 * This (illegal) pool name is used when temporarily importing a spa_t in order 1358 * to get the vdev stats associated with the imported devices. 1359 */ 1360#define TRYIMPORT_NAME "$import" 1361 1362nvlist_t * 1363spa_tryimport(nvlist_t *tryconfig) 1364{ 1365 nvlist_t *config = NULL; 1366 char *poolname; 1367 spa_t *spa; 1368 uint64_t state; 1369 1370 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1371 return (NULL); 1372 1373 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1374 return (NULL); 1375 1376 /* 1377 * Create and initialize the spa structure. 1378 */ 1379 mutex_enter(&spa_namespace_lock); 1380 spa = spa_add(TRYIMPORT_NAME, NULL); 1381 spa_activate(spa); 1382 1383 /* 1384 * Pass off the heavy lifting to spa_load(). 1385 * Pass TRUE for mosconfig because the user-supplied config 1386 * is actually the one to trust when doing an import. 1387 */ 1388 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1389 1390 /* 1391 * If 'tryconfig' was at least parsable, return the current config. 1392 */ 1393 if (spa->spa_root_vdev != NULL) { 1394 spa_config_enter(spa, RW_READER, FTAG); 1395 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1396 spa_config_exit(spa, FTAG); 1397 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1398 poolname) == 0); 1399 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1400 state) == 0); 1401 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1402 spa->spa_uberblock.ub_timestamp) == 0); 1403 1404 /* 1405 * Add the list of hot spares. 
1406 */ 1407 spa_add_spares(spa, config); 1408 } 1409 1410 spa_unload(spa); 1411 spa_deactivate(spa); 1412 spa_remove(spa); 1413 mutex_exit(&spa_namespace_lock); 1414 1415 return (config); 1416} 1417 1418/* 1419 * Pool export/destroy 1420 * 1421 * The act of destroying or exporting a pool is very simple. We make sure there 1422 * is no more pending I/O and any references to the pool are gone. Then, we 1423 * update the pool state and sync all the labels to disk, removing the 1424 * configuration from the cache afterwards. 1425 */ 1426static int 1427spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1428{ 1429 spa_t *spa; 1430 1431 if (oldconfig) 1432 *oldconfig = NULL; 1433 1434 if (!(spa_mode & FWRITE)) 1435 return (EROFS); 1436 1437 mutex_enter(&spa_namespace_lock); 1438 if ((spa = spa_lookup(pool)) == NULL) { 1439 mutex_exit(&spa_namespace_lock); 1440 return (ENOENT); 1441 } 1442 1443 /* 1444 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1445 * reacquire the namespace lock, and see if we can export. 1446 */ 1447 spa_open_ref(spa, FTAG); 1448 mutex_exit(&spa_namespace_lock); 1449 spa_async_suspend(spa); 1450 mutex_enter(&spa_namespace_lock); 1451 spa_close(spa, FTAG); 1452 1453 /* 1454 * The pool will be in core if it's openable, 1455 * in which case we can modify its state. 1456 */ 1457 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1458 /* 1459 * Objsets may be open only because they're dirty, so we 1460 * have to force it to sync before checking spa_refcnt. 1461 */ 1462 spa_scrub_suspend(spa); 1463 txg_wait_synced(spa->spa_dsl_pool, 0); 1464 1465 /* 1466 * A pool cannot be exported or destroyed if there are active 1467 * references. If we are resetting a pool, allow references by 1468 * fault injection handlers. 1469 */ 1470 if (!spa_refcount_zero(spa) || 1471 (spa->spa_inject_ref != 0 && 1472 new_state != POOL_STATE_UNINITIALIZED)) { 1473 spa_scrub_resume(spa); 1474 spa_async_resume(spa); 1475 mutex_exit(&spa_namespace_lock); 1476 return (EBUSY); 1477 } 1478 1479 spa_scrub_resume(spa); 1480 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1481 1482 /* 1483 * We want this to be reflected on every label, 1484 * so mark them all dirty. spa_unload() will do the 1485 * final sync that pushes these changes out. 1486 */ 1487 if (new_state != POOL_STATE_UNINITIALIZED) { 1488 spa_config_enter(spa, RW_WRITER, FTAG); 1489 spa->spa_state = new_state; 1490 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1491 vdev_config_dirty(spa->spa_root_vdev); 1492 spa_config_exit(spa, FTAG); 1493 } 1494 } 1495 1496 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1497 spa_unload(spa); 1498 spa_deactivate(spa); 1499 } 1500 1501 if (oldconfig && spa->spa_config) 1502 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1503 1504 if (new_state != POOL_STATE_UNINITIALIZED) { 1505 spa_remove(spa); 1506 spa_config_sync(); 1507 } 1508 mutex_exit(&spa_namespace_lock); 1509 1510 return (0); 1511} 1512 1513/* 1514 * Destroy a storage pool. 1515 */ 1516int 1517spa_destroy(char *pool) 1518{ 1519 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1520} 1521 1522/* 1523 * Export a storage pool. 1524 */ 1525int 1526spa_export(char *pool, nvlist_t **oldconfig) 1527{ 1528 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1529} 1530 1531/* 1532 * Similar to spa_export(), this unloads the spa_t without actually removing it 1533 * from the namespace in any way. 
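 *
 * For comparison (an illustrative summary, not part of the original file),
 * all three entry points funnel into spa_export_common() and differ only
 * in the new_state they request:
 *
 *	spa_destroy(pool)		POOL_STATE_DESTROYED
 *	spa_export(pool, &oldconfig)	POOL_STATE_EXPORTED
 *	spa_reset(pool)			POOL_STATE_UNINITIALIZED
 *
 * Only the first two mark the labels and remove the pool's entry from the
 * config cache; spa_reset() simply unloads the pool in place.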
1534 */ 1535int 1536spa_reset(char *pool) 1537{ 1538 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1539} 1540 1541 1542/* 1543 * ========================================================================== 1544 * Device manipulation 1545 * ========================================================================== 1546 */ 1547 1548/* 1549 * Add capacity to a storage pool. 1550 */ 1551int 1552spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1553{ 1554 uint64_t txg; 1555 int c, error; 1556 vdev_t *rvd = spa->spa_root_vdev; 1557 vdev_t *vd, *tvd; 1558 nvlist_t **spares; 1559 uint_t i, nspares; 1560 1561 txg = spa_vdev_enter(spa); 1562 1563 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1564 VDEV_ALLOC_ADD)) != 0) 1565 return (spa_vdev_exit(spa, NULL, txg, error)); 1566 1567 spa->spa_pending_vdev = vd; 1568 1569 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1570 &spares, &nspares) != 0) 1571 nspares = 0; 1572 1573 if (vd->vdev_children == 0 && nspares == 0) { 1574 spa->spa_pending_vdev = NULL; 1575 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1576 } 1577 1578 if (vd->vdev_children != 0) { 1579 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1580 spa->spa_pending_vdev = NULL; 1581 return (spa_vdev_exit(spa, vd, txg, error)); 1582 } 1583 } 1584 1585 /* 1586 * We must validate the spares after checking the children. Otherwise, 1587 * vdev_inuse() will blindly overwrite the spare. 1588 */ 1589 if ((error = spa_validate_spares(spa, nvroot, txg, 1590 VDEV_ALLOC_ADD)) != 0) { 1591 spa->spa_pending_vdev = NULL; 1592 return (spa_vdev_exit(spa, vd, txg, error)); 1593 } 1594 1595 spa->spa_pending_vdev = NULL; 1596 1597 /* 1598 * Transfer each new top-level vdev from vd to rvd. 1599 */ 1600 for (c = 0; c < vd->vdev_children; c++) { 1601 tvd = vd->vdev_child[c]; 1602 vdev_remove_child(vd, tvd); 1603 tvd->vdev_id = rvd->vdev_children; 1604 vdev_add_child(rvd, tvd); 1605 vdev_config_dirty(tvd); 1606 } 1607 1608 if (nspares != 0) { 1609 if (spa->spa_sparelist != NULL) { 1610 nvlist_t **oldspares; 1611 uint_t oldnspares; 1612 nvlist_t **newspares; 1613 1614 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1615 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1616 1617 newspares = kmem_alloc(sizeof (void *) * 1618 (nspares + oldnspares), KM_SLEEP); 1619 for (i = 0; i < oldnspares; i++) 1620 VERIFY(nvlist_dup(oldspares[i], 1621 &newspares[i], KM_SLEEP) == 0); 1622 for (i = 0; i < nspares; i++) 1623 VERIFY(nvlist_dup(spares[i], 1624 &newspares[i + oldnspares], 1625 KM_SLEEP) == 0); 1626 1627 VERIFY(nvlist_remove(spa->spa_sparelist, 1628 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1629 1630 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1631 ZPOOL_CONFIG_SPARES, newspares, 1632 nspares + oldnspares) == 0); 1633 for (i = 0; i < oldnspares + nspares; i++) 1634 nvlist_free(newspares[i]); 1635 kmem_free(newspares, (oldnspares + nspares) * 1636 sizeof (void *)); 1637 } else { 1638 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1639 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1640 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1641 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1642 } 1643 1644 spa_load_spares(spa); 1645 spa->spa_sync_spares = B_TRUE; 1646 } 1647 1648 /* 1649 * We have to be careful when adding new vdevs to an existing pool. 
1650	 * If other threads start allocating from these vdevs before we
1651	 * sync the config cache, and we lose power, then upon reboot we may
1652	 * fail to open the pool because there are DVAs that the config cache
1653	 * can't translate. Therefore, we first add the vdevs without
1654	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
1655	 * and then let spa_config_update() initialize the new metaslabs.
1656	 *
1657	 * spa_load() checks for added-but-not-initialized vdevs, so that
1658	 * if we lose power at any point in this sequence, the remaining
1659	 * steps will be completed the next time we load the pool.
1660	 */
1661	(void) spa_vdev_exit(spa, vd, txg, 0);
1662
1663	mutex_enter(&spa_namespace_lock);
1664	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1665	mutex_exit(&spa_namespace_lock);
1666
1667	return (0);
1668 }
1669
1670 /*
1671  * Attach a device to a mirror. The arguments are the guid of any device
1672  * in the mirror, and the nvroot for the new device. If the guid specifies
1673  * a device that is not mirrored, we automatically insert the mirror vdev.
1674  *
1675  * If 'replacing' is specified, the new device is intended to replace the
1676  * existing device; in this case the two devices are made into their own
1677  * mirror using the 'replacing' vdev, which is functionally identical to
1678  * the mirror vdev (it actually reuses all the same ops) but has a few
1679  * extra rules: you can't attach to it after it's been created, and upon
1680  * completion of resilvering, the first disk (the one being replaced)
1681  * is automatically detached.
1682  */
1683 int
1684 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1685 {
1686	uint64_t txg, open_txg;
1687	int error;
1688	vdev_t *rvd = spa->spa_root_vdev;
1689	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
1690	vdev_ops_t *pvops;
1691
1692	txg = spa_vdev_enter(spa);
1693
1694	oldvd = vdev_lookup_by_guid(rvd, guid);
1695
1696	if (oldvd == NULL)
1697		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1698
1699	if (!oldvd->vdev_ops->vdev_op_leaf)
1700		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1701
1702	pvd = oldvd->vdev_parent;
1703
1704	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
1705	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1706		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1707
1708	newvd = newrootvd->vdev_child[0];
1709
1710	if (!newvd->vdev_ops->vdev_op_leaf)
1711		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1712
1713	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1714		return (spa_vdev_exit(spa, newrootvd, txg, error));
1715
1716	if (!replacing) {
1717		/*
1718		 * For attach, the only allowable parent is a mirror or the root
1719		 * vdev.
1720		 */
1721		if (pvd->vdev_ops != &vdev_mirror_ops &&
1722		    pvd->vdev_ops != &vdev_root_ops)
1723			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1724
1725		pvops = &vdev_mirror_ops;
1726	} else {
1727		/*
1728		 * Active hot spares can only be replaced by inactive hot
1729		 * spares.
1730		 */
1731		if (pvd->vdev_ops == &vdev_spare_ops &&
1732		    pvd->vdev_child[1] == oldvd &&
1733		    !spa_has_spare(spa, newvd->vdev_guid))
1734			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1735
1736		/*
1737		 * If the source is a hot spare, and the parent isn't already a
1738		 * spare, then we want to create a new hot spare. Otherwise, we
1739		 * want to create a replacing vdev.
The user is not allowed to 1740 * attach to a spared vdev child unless the 'isspare' state is 1741 * the same (spare replaces spare, non-spare replaces 1742 * non-spare). 1743 */ 1744 if (pvd->vdev_ops == &vdev_replacing_ops) 1745 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1746 else if (pvd->vdev_ops == &vdev_spare_ops && 1747 newvd->vdev_isspare != oldvd->vdev_isspare) 1748 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1749 else if (pvd->vdev_ops != &vdev_spare_ops && 1750 newvd->vdev_isspare) 1751 pvops = &vdev_spare_ops; 1752 else 1753 pvops = &vdev_replacing_ops; 1754 } 1755 1756 /* 1757 * Compare the new device size with the replaceable/attachable 1758 * device size. 1759 */ 1760 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1761 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1762 1763 /* 1764 * The new device cannot have a higher alignment requirement 1765 * than the top-level vdev. 1766 */ 1767 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1768 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1769 1770 /* 1771 * If this is an in-place replacement, update oldvd's path and devid 1772 * to make it distinguishable from newvd, and unopenable from now on. 1773 */ 1774 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1775 spa_strfree(oldvd->vdev_path); 1776 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1777 KM_SLEEP); 1778 (void) sprintf(oldvd->vdev_path, "%s/%s", 1779 newvd->vdev_path, "old"); 1780 if (oldvd->vdev_devid != NULL) { 1781 spa_strfree(oldvd->vdev_devid); 1782 oldvd->vdev_devid = NULL; 1783 } 1784 } 1785 1786 /* 1787 * If the parent is not a mirror, or if we're replacing, insert the new 1788 * mirror/replacing/spare vdev above oldvd. 1789 */ 1790 if (pvd->vdev_ops != pvops) 1791 pvd = vdev_add_parent(oldvd, pvops); 1792 1793 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1794 ASSERT(pvd->vdev_ops == pvops); 1795 ASSERT(oldvd->vdev_parent == pvd); 1796 1797 /* 1798 * Extract the new device from its root and add it to pvd. 1799 */ 1800 vdev_remove_child(newrootvd, newvd); 1801 newvd->vdev_id = pvd->vdev_children; 1802 vdev_add_child(pvd, newvd); 1803 1804 /* 1805 * If newvd is smaller than oldvd, but larger than its rsize, 1806 * the addition of newvd may have decreased our parent's asize. 1807 */ 1808 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1809 1810 tvd = newvd->vdev_top; 1811 ASSERT(pvd->vdev_top == tvd); 1812 ASSERT(tvd->vdev_parent == rvd); 1813 1814 vdev_config_dirty(tvd); 1815 1816 /* 1817 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1818 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1819 */ 1820 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1821 1822 mutex_enter(&newvd->vdev_dtl_lock); 1823 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1824 open_txg - TXG_INITIAL + 1); 1825 mutex_exit(&newvd->vdev_dtl_lock); 1826 1827 if (newvd->vdev_isspare) 1828 spa_spare_activate(newvd); 1829 1830 /* 1831 * Mark newvd's DTL dirty in this txg. 1832 */ 1833 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1834 1835 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1836 1837 /* 1838 * Kick off a resilver to update newvd. 1839 */ 1840 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1841 1842 return (0); 1843} 1844 1845/* 1846 * Detach a device from a mirror or replacing vdev. 1847 * If 'replace_done' is specified, only detach if the parent 1848 * is a replacing vdev. 
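 *
 * An illustrative note (not part of the original file): this is also the
 * path by which a completed replacement cleans itself up. Once the new
 * child's DTL is empty, spa_vdev_replace_done() calls
 *
 *	(void) spa_vdev_detach(spa, guid, B_TRUE);
 *
 * with the guid of the first child (the device that was replaced), so the
 * old disk drops out of the replacing vdev automatically.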
1849 */ 1850int 1851spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1852{ 1853 uint64_t txg; 1854 int c, t, error; 1855 vdev_t *rvd = spa->spa_root_vdev; 1856 vdev_t *vd, *pvd, *cvd, *tvd; 1857 boolean_t unspare = B_FALSE; 1858 uint64_t unspare_guid; 1859 1860 txg = spa_vdev_enter(spa); 1861 1862 vd = vdev_lookup_by_guid(rvd, guid); 1863 1864 if (vd == NULL) 1865 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1866 1867 if (!vd->vdev_ops->vdev_op_leaf) 1868 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1869 1870 pvd = vd->vdev_parent; 1871 1872 /* 1873 * If replace_done is specified, only remove this device if it's 1874 * the first child of a replacing vdev. For the 'spare' vdev, either 1875 * disk can be removed. 1876 */ 1877 if (replace_done) { 1878 if (pvd->vdev_ops == &vdev_replacing_ops) { 1879 if (vd->vdev_id != 0) 1880 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1881 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1882 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1883 } 1884 } 1885 1886 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1887 spa_version(spa) >= ZFS_VERSION_SPARES); 1888 1889 /* 1890 * Only mirror, replacing, and spare vdevs support detach. 1891 */ 1892 if (pvd->vdev_ops != &vdev_replacing_ops && 1893 pvd->vdev_ops != &vdev_mirror_ops && 1894 pvd->vdev_ops != &vdev_spare_ops) 1895 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1896 1897 /* 1898 * If there's only one replica, you can't detach it. 1899 */ 1900 if (pvd->vdev_children <= 1) 1901 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1902 1903 /* 1904 * If all siblings have non-empty DTLs, this device may have the only 1905 * valid copy of the data, which means we cannot safely detach it. 1906 * 1907 * XXX -- as in the vdev_offline() case, we really want a more 1908 * precise DTL check. 1909 */ 1910 for (c = 0; c < pvd->vdev_children; c++) { 1911 uint64_t dirty; 1912 1913 cvd = pvd->vdev_child[c]; 1914 if (cvd == vd) 1915 continue; 1916 if (vdev_is_dead(cvd)) 1917 continue; 1918 mutex_enter(&cvd->vdev_dtl_lock); 1919 dirty = cvd->vdev_dtl_map.sm_space | 1920 cvd->vdev_dtl_scrub.sm_space; 1921 mutex_exit(&cvd->vdev_dtl_lock); 1922 if (!dirty) 1923 break; 1924 } 1925 1926 /* 1927 * If we are a replacing or spare vdev, then we can always detach the 1928 * latter child, as that is how one cancels the operation. 1929 */ 1930 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1931 c == pvd->vdev_children) 1932 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1933 1934 /* 1935 * If we are detaching the original disk from a spare, then it implies 1936 * that the spare should become a real disk, and be removed from the 1937 * active spare list for the pool. 1938 */ 1939 if (pvd->vdev_ops == &vdev_spare_ops && 1940 vd->vdev_id == 0) 1941 unspare = B_TRUE; 1942 1943 /* 1944 * Erase the disk labels so the disk can be used for other things. 1945 * This must be done after all other error cases are handled, 1946 * but before we disembowel vd (so we can still do I/O to it). 1947 * But if we can't do it, don't treat the error as fatal -- 1948 * it may be that the unwritability of the disk is the reason 1949 * it's being detached! 1950 */ 1951 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1952 1953 /* 1954 * Remove vd from its parent and compact the parent's children. 1955 */ 1956 vdev_remove_child(pvd, vd); 1957 vdev_compact_children(pvd); 1958 1959 /* 1960 * Remember one of the remaining children so we can get tvd below. 
1961 */ 1962 cvd = pvd->vdev_child[0]; 1963 1964 /* 1965 * If we need to remove the remaining child from the list of hot spares, 1966 * do it now, marking the vdev as no longer a spare in the process. We 1967 * must do this before vdev_remove_parent(), because that can change the 1968 * GUID if it creates a new toplevel GUID. 1969 */ 1970 if (unspare) { 1971 ASSERT(cvd->vdev_isspare); 1972 spa_spare_remove(cvd); 1973 unspare_guid = cvd->vdev_guid; 1974 } 1975 1976 /* 1977 * If the parent mirror/replacing vdev only has one child, 1978 * the parent is no longer needed. Remove it from the tree. 1979 */ 1980 if (pvd->vdev_children == 1) 1981 vdev_remove_parent(cvd); 1982 1983 /* 1984 * We don't set tvd until now because the parent we just removed 1985 * may have been the previous top-level vdev. 1986 */ 1987 tvd = cvd->vdev_top; 1988 ASSERT(tvd->vdev_parent == rvd); 1989 1990 /* 1991 * Reevaluate the parent vdev state. 1992 */ 1993 vdev_propagate_state(cvd->vdev_parent); 1994 1995 /* 1996 * If the device we just detached was smaller than the others, it may be 1997 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1998 * can't fail because the existing metaslabs are already in core, so 1999 * there's nothing to read from disk. 2000 */ 2001 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2002 2003 vdev_config_dirty(tvd); 2004 2005 /* 2006 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2007 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2008 * But first make sure we're not on any *other* txg's DTL list, to 2009 * prevent vd from being accessed after it's freed. 2010 */ 2011 for (t = 0; t < TXG_SIZE; t++) 2012 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2013 vd->vdev_detached = B_TRUE; 2014 vdev_dirty(tvd, VDD_DTL, vd, txg); 2015 2016 error = spa_vdev_exit(spa, vd, txg, 0); 2017 2018 /* 2019 * If this was the removal of the original device in a hot spare vdev, 2020 * then we want to go through and remove the device from the hot spare 2021 * list of every other pool. 2022 */ 2023 if (unspare) { 2024 spa = NULL; 2025 mutex_enter(&spa_namespace_lock); 2026 while ((spa = spa_next(spa)) != NULL) { 2027 if (spa->spa_state != POOL_STATE_ACTIVE) 2028 continue; 2029 2030 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2031 } 2032 mutex_exit(&spa_namespace_lock); 2033 } 2034 2035 return (error); 2036} 2037 2038/* 2039 * Remove a device from the pool. Currently, this supports removing only hot 2040 * spares. 2041 */ 2042int 2043spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2044{ 2045 vdev_t *vd; 2046 nvlist_t **spares, *nv, **newspares; 2047 uint_t i, j, nspares; 2048 int ret = 0; 2049 2050 spa_config_enter(spa, RW_WRITER, FTAG); 2051 2052 vd = spa_lookup_by_guid(spa, guid); 2053 2054 nv = NULL; 2055 if (spa->spa_spares != NULL && 2056 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2057 &spares, &nspares) == 0) { 2058 for (i = 0; i < nspares; i++) { 2059 uint64_t theguid; 2060 2061 VERIFY(nvlist_lookup_uint64(spares[i], 2062 ZPOOL_CONFIG_GUID, &theguid) == 0); 2063 if (theguid == guid) { 2064 nv = spares[i]; 2065 break; 2066 } 2067 } 2068 } 2069 2070 /* 2071 * We only support removing a hot spare, and only if it's not currently 2072 * in use in this pool. 
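 *
 * The checks below can be summarized as follows (an illustrative summary,
 * not part of the original file), where 'nv' is the matching entry in the
 * spare list and 'vd' is a matching vdev in the active configuration:
 *
 *	nv == NULL, vd == NULL		no such device		ENOENT
 *	nv == NULL, vd != NULL		not a hot spare		ENOTSUP
 *	nv != NULL, vd != NULL		spare is in use		EBUSY
 *	nv != NULL, vd == NULL		idle spare		removed
 *
 * The EBUSY case is skipped when 'unspare' is set, which is how
 * spa_vdev_detach() removes a spare from every other pool's spare list.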
	 */
	if (nv == NULL && vd == NULL) {
		ret = ENOENT;
		goto out;
	}

	if (nv == NULL && vd != NULL) {
		ret = ENOTSUP;
		goto out;
	}

	if (!unspare && nv != NULL && vd != NULL) {
		ret = EBUSY;
		goto out;
	}

	if (nspares == 1) {
		newspares = NULL;
	} else {
		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
		    KM_SLEEP);
		for (i = 0, j = 0; i < nspares; i++) {
			if (spares[i] != nv)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[j++], KM_SLEEP) == 0);
		}
	}

	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    newspares, nspares - 1) == 0);
	for (i = 0; i < nspares - 1; i++)
		nvlist_free(newspares[i]);
	kmem_free(newspares, (nspares - 1) * sizeof (void *));
	spa_load_spares(spa);
	spa->spa_sync_spares = B_TRUE;

out:
	spa_config_exit(spa, FTAG);

	return (ret);
}

/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	vdev_t *pvd;
	uint64_t guid;
	uint64_t pguid = 0;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		pvd = vd->vdev_parent;
		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_id == 0) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			ASSERT(pvd->vdev_parent->vdev_children == 2);
			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
		}
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		/*
		 * Determine if this is a reference to a hot spare.  In that
		 * case, update the path as stored in the spare list.
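		 * (A hot spare that is not currently attached has no vdev_t
		 * in the vdev tree, so it can only be found in the
		 * spa_sparelist nvlist.)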
		 */
		nvlist_t **spares;
		uint_t i, nspares;
		if (spa->spa_sparelist != NULL) {
			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
			for (i = 0; i < nspares; i++) {
				uint64_t theguid;
				VERIFY(nvlist_lookup_uint64(spares[i],
				    ZPOOL_CONFIG_GUID, &theguid) == 0);
				if (theguid == guid)
					break;
			}

			if (i == nspares)
				return (spa_vdev_exit(spa, NULL, txg, ENOENT));

			VERIFY(nvlist_add_string(spares[i],
			    ZPOOL_CONFIG_PATH, newpath) == 0);
			spa_load_spares(spa);
			spa->spa_sync_spares = B_TRUE;
			return (spa_vdev_exit(spa, NULL, txg, 0));
		} else {
			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
		cv_broadcast(&spa->spa_scrub_io_cv);

	ASSERT(spa->spa_scrub_inflight >= 0);

	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data;

	mutex_enter(&spa->spa_scrub_lock);
	/*
	 * Do not give too much work to vdev(s).
	 */
	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	data = zio_data_buf_alloc(size);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(void *arg)
{
	spa_t *spa = arg;
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
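		 *
		 * For example (hypothetical numbers): if the pool-wide DTL
		 * covers txgs 100-140 and the last synced txg is 150, we get
		 * mintxg = 99 and maxtxg = 141, i.e. the open interval
		 * (99, 141), so txgs 100-140 get resilvered.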
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
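	 * (Clearing spa_async_thread under spa_async_lock is what allows
	 * spa_async_suspend() and spa_async_dispatch() to notice that the
	 * thread has finished.)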
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labelled appropriately.
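	 * (The list is stored as a packed nvlist in a DMU_OT_PACKED_NVLIST
	 * object; spa_sync_nvlist() records the packed size in the object's
	 * bonus buffer.)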
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	nvlist_t *nvp = arg2;
	nvpair_t *nvpair;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zapobj;

	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_pool_props_object == 0) {
		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
		VERIFY(zapobj > 0);

		spa->spa_pool_props_object = zapobj;

		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_PROPS, 8, 1,
		    &spa->spa_pool_props_object, tx) == 0);
	}
	mutex_exit(&spa->spa_props_lock);

	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
		case ZFS_PROP_BOOTFS:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
			    &spa->spa_bootfs, tx) == 0);
			break;
		}
	}
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
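	 * (The loop below treats any top-level vdev whose vdev_deflate_ratio
	 * differs from SPA_MINBLOCKSIZE as raid-z.)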
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
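	 * (The loop below walks the TXG_CLEAN(txg) list and lets each vdev
	 * that took part in this txg finish its metaslab bookkeeping via
	 * vdev_sync_done().)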
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
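	 * (Hence the ASSERT below rather than a run-time check.)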
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
spa_set_props(spa_t *spa, nvlist_t *nvp)
{
	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

int
spa_get_props(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	zfs_source_t src;
	zfs_prop_t prop;
	nvlist_t *propval;
	uint64_t value;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);
	/* If no props object, then just return empty nvlist */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {

		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
			continue;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		switch (za.za_integer_length) {
		case 8:
			if (zfs_prop_default_numeric(prop) ==
			    za.za_first_integer)
				src = ZFS_SRC_DEFAULT;
			else
				src = ZFS_SRC_LOCAL;
			value = za.za_first_integer;

			if (prop == ZFS_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;
				char strval[MAXPATHLEN];

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if ((err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) != 0) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);

				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_string(propval,
				    ZFS_PROP_VALUE, strval) == 0);
			} else {
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_VALUE, value) == 0);
			}
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			break;
		}
		nvlist_free(propval);
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}