spa.c revision 168404
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29/* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35#include <sys/zfs_context.h> 36#include <sys/fm/fs/zfs.h> 37#include <sys/spa_impl.h> 38#include <sys/zio.h> 39#include <sys/zio_checksum.h> 40#include <sys/zio_compress.h> 41#include <sys/dmu.h> 42#include <sys/dmu_tx.h> 43#include <sys/zap.h> 44#include <sys/zil.h> 45#include <sys/vdev_impl.h> 46#include <sys/metaslab.h> 47#include <sys/uberblock_impl.h> 48#include <sys/txg.h> 49#include <sys/avl.h> 50#include <sys/dmu_traverse.h> 51#include <sys/dmu_objset.h> 52#include <sys/unique.h> 53#include <sys/dsl_pool.h> 54#include <sys/dsl_dataset.h> 55#include <sys/dsl_dir.h> 56#include <sys/dsl_prop.h> 57#include <sys/dsl_synctask.h> 58#include <sys/fs/zfs.h> 59#include <sys/callb.h> 60 61int zio_taskq_threads = 8; 62 63/* 64 * ========================================================================== 65 * SPA state manipulation (open/create/destroy/import/export) 66 * ========================================================================== 67 */ 68 69static int 70spa_error_entry_compare(const void *a, const void *b) 71{ 72 spa_error_entry_t *sa = (spa_error_entry_t *)a; 73 spa_error_entry_t *sb = (spa_error_entry_t *)b; 74 int ret; 75 76 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 77 sizeof (zbookmark_t)); 78 79 if (ret < 0) 80 return (-1); 81 else if (ret > 0) 82 return (1); 83 else 84 return (0); 85} 86 87/* 88 * Utility function which retrieves copies of the current logs and 89 * re-initializes them in the process. 90 */ 91void 92spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 93{ 94 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 95 96 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 97 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 98 99 avl_create(&spa->spa_errlist_scrub, 100 spa_error_entry_compare, sizeof (spa_error_entry_t), 101 offsetof(spa_error_entry_t, se_avl)); 102 avl_create(&spa->spa_errlist_last, 103 spa_error_entry_compare, sizeof (spa_error_entry_t), 104 offsetof(spa_error_entry_t, se_avl)); 105} 106 107/* 108 * Activate an uninitialized pool. 
109 */ 110static void 111spa_activate(spa_t *spa) 112{ 113 int t; 114 115 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 116 117 spa->spa_state = POOL_STATE_ACTIVE; 118 119 spa->spa_normal_class = metaslab_class_create(); 120 121 for (t = 0; t < ZIO_TYPES; t++) { 122 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 123 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 124 TASKQ_PREPOPULATE); 125 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 126 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127 TASKQ_PREPOPULATE); 128 } 129 130 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 131 132 mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); 133 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 134 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 136 cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); 137 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 140 141 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 142 offsetof(vdev_t, vdev_dirty_node)); 143 144 txg_list_create(&spa->spa_vdev_txg_list, 145 offsetof(struct vdev, vdev_txg_node)); 146 147 avl_create(&spa->spa_errlist_scrub, 148 spa_error_entry_compare, sizeof (spa_error_entry_t), 149 offsetof(spa_error_entry_t, se_avl)); 150 avl_create(&spa->spa_errlist_last, 151 spa_error_entry_compare, sizeof (spa_error_entry_t), 152 offsetof(spa_error_entry_t, se_avl)); 153} 154 155/* 156 * Opposite of spa_activate(). 157 */ 158static void 159spa_deactivate(spa_t *spa) 160{ 161 int t; 162 163 ASSERT(spa->spa_sync_on == B_FALSE); 164 ASSERT(spa->spa_dsl_pool == NULL); 165 ASSERT(spa->spa_root_vdev == NULL); 166 167 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 168 169 txg_list_destroy(&spa->spa_vdev_txg_list); 170 171 list_destroy(&spa->spa_dirty_list); 172 173 for (t = 0; t < ZIO_TYPES; t++) { 174 taskq_destroy(spa->spa_zio_issue_taskq[t]); 175 taskq_destroy(spa->spa_zio_intr_taskq[t]); 176 spa->spa_zio_issue_taskq[t] = NULL; 177 spa->spa_zio_intr_taskq[t] = NULL; 178 } 179 180 metaslab_class_destroy(spa->spa_normal_class); 181 spa->spa_normal_class = NULL; 182 183 /* 184 * If this was part of an import or the open otherwise failed, we may 185 * still have errors left in the queues. Empty them just in case. 186 */ 187 spa_errlog_drain(spa); 188 189 avl_destroy(&spa->spa_errlist_scrub); 190 avl_destroy(&spa->spa_errlist_last); 191 192 rw_destroy(&spa->spa_traverse_lock); 193 mutex_destroy(&spa->spa_uberblock_lock); 194 mutex_destroy(&spa->spa_errlog_lock); 195 mutex_destroy(&spa->spa_errlist_lock); 196 mutex_destroy(&spa->spa_config_lock.scl_lock); 197 cv_destroy(&spa->spa_config_lock.scl_cv); 198 mutex_destroy(&spa->spa_sync_bplist.bpl_lock); 199 mutex_destroy(&spa->spa_history_lock); 200 mutex_destroy(&spa->spa_props_lock); 201 202 spa->spa_state = POOL_STATE_UNINITIALIZED; 203} 204 205/* 206 * Verify a pool configuration, and construct the vdev tree appropriately. This 207 * will create all the necessary vdevs in the appropriate layout, with each vdev 208 * in the CLOSED state. This will prep the pool before open/creation/import. 209 * All vdev validation is done by the vdev_alloc() routine. 
210 */ 211static int 212spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 213 uint_t id, int atype) 214{ 215 nvlist_t **child; 216 uint_t c, children; 217 int error; 218 219 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 220 return (error); 221 222 if ((*vdp)->vdev_ops->vdev_op_leaf) 223 return (0); 224 225 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 226 &child, &children) != 0) { 227 vdev_free(*vdp); 228 *vdp = NULL; 229 return (EINVAL); 230 } 231 232 for (c = 0; c < children; c++) { 233 vdev_t *vd; 234 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 235 atype)) != 0) { 236 vdev_free(*vdp); 237 *vdp = NULL; 238 return (error); 239 } 240 } 241 242 ASSERT(*vdp != NULL); 243 244 return (0); 245} 246 247/* 248 * Opposite of spa_load(). 249 */ 250static void 251spa_unload(spa_t *spa) 252{ 253 int i; 254 255 /* 256 * Stop async tasks. 257 */ 258 spa_async_suspend(spa); 259 260 /* 261 * Stop syncing. 262 */ 263 if (spa->spa_sync_on) { 264 txg_sync_stop(spa->spa_dsl_pool); 265 spa->spa_sync_on = B_FALSE; 266 } 267 268 /* 269 * Wait for any outstanding prefetch I/O to complete. 270 */ 271 spa_config_enter(spa, RW_WRITER, FTAG); 272 spa_config_exit(spa, FTAG); 273 274 /* 275 * Close the dsl pool. 276 */ 277 if (spa->spa_dsl_pool) { 278 dsl_pool_close(spa->spa_dsl_pool); 279 spa->spa_dsl_pool = NULL; 280 } 281 282 /* 283 * Close all vdevs. 284 */ 285 if (spa->spa_root_vdev) 286 vdev_free(spa->spa_root_vdev); 287 ASSERT(spa->spa_root_vdev == NULL); 288 289 for (i = 0; i < spa->spa_nspares; i++) 290 vdev_free(spa->spa_spares[i]); 291 if (spa->spa_spares) { 292 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 293 spa->spa_spares = NULL; 294 } 295 if (spa->spa_sparelist) { 296 nvlist_free(spa->spa_sparelist); 297 spa->spa_sparelist = NULL; 298 } 299 300 spa->spa_async_suspended = 0; 301} 302 303/* 304 * Load (or re-load) the current list of vdevs describing the active spares for 305 * this pool. When this is called, we have some form of basic information in 306 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 307 * re-generate a more complete list including status information. 308 */ 309static void 310spa_load_spares(spa_t *spa) 311{ 312 nvlist_t **spares; 313 uint_t nspares; 314 int i; 315 vdev_t *vd, *tvd; 316 317 /* 318 * First, close and free any existing spare vdevs. 319 */ 320 for (i = 0; i < spa->spa_nspares; i++) { 321 vd = spa->spa_spares[i]; 322 323 /* Undo the call to spa_activate() below */ 324 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 325 tvd->vdev_isspare) 326 spa_spare_remove(tvd); 327 vdev_close(vd); 328 vdev_free(vd); 329 } 330 331 if (spa->spa_spares) 332 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 333 334 if (spa->spa_sparelist == NULL) 335 nspares = 0; 336 else 337 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 338 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 339 340 spa->spa_nspares = (int)nspares; 341 spa->spa_spares = NULL; 342 343 if (nspares == 0) 344 return; 345 346 /* 347 * Construct the array of vdevs, opening them to get status in the 348 * process. For each spare, there is potentially two different vdev_t 349 * structures associated with it: one in the list of spares (used only 350 * for basic validation purposes) and one in the active vdev 351 * configuration (if it's spared in). During this phase we open and 352 * validate each vdev on the spare list. 
If the vdev also exists in the 353 * active configuration, then we also mark this vdev as an active spare. 354 */ 355 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 356 for (i = 0; i < spa->spa_nspares; i++) { 357 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 358 VDEV_ALLOC_SPARE) == 0); 359 ASSERT(vd != NULL); 360 361 spa->spa_spares[i] = vd; 362 363 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 364 if (!tvd->vdev_isspare) 365 spa_spare_add(tvd); 366 367 /* 368 * We only mark the spare active if we were successfully 369 * able to load the vdev. Otherwise, importing a pool 370 * with a bad active spare would result in strange 371 * behavior, because multiple pool would think the spare 372 * is actively in use. 373 * 374 * There is a vulnerability here to an equally bizarre 375 * circumstance, where a dead active spare is later 376 * brought back to life (onlined or otherwise). Given 377 * the rarity of this scenario, and the extra complexity 378 * it adds, we ignore the possibility. 379 */ 380 if (!vdev_is_dead(tvd)) 381 spa_spare_activate(tvd); 382 } 383 384 if (vdev_open(vd) != 0) 385 continue; 386 387 vd->vdev_top = vd; 388 (void) vdev_validate_spare(vd); 389 } 390 391 /* 392 * Recompute the stashed list of spares, with status information 393 * this time. 394 */ 395 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 396 DATA_TYPE_NVLIST_ARRAY) == 0); 397 398 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 399 for (i = 0; i < spa->spa_nspares; i++) 400 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 401 B_TRUE, B_TRUE); 402 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 403 spares, spa->spa_nspares) == 0); 404 for (i = 0; i < spa->spa_nspares; i++) 405 nvlist_free(spares[i]); 406 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 407} 408 409static int 410load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 411{ 412 dmu_buf_t *db; 413 char *packed = NULL; 414 size_t nvsize = 0; 415 int error; 416 *value = NULL; 417 418 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 419 nvsize = *(uint64_t *)db->db_data; 420 dmu_buf_rele(db, FTAG); 421 422 packed = kmem_alloc(nvsize, KM_SLEEP); 423 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 424 if (error == 0) 425 error = nvlist_unpack(packed, nvsize, value, 0); 426 kmem_free(packed, nvsize); 427 428 return (error); 429} 430 431/* 432 * Load an existing storage pool, using the pool's builtin spa_config as a 433 * source of configuration information. 434 */ 435static int 436spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 437{ 438 int error = 0; 439 nvlist_t *nvroot = NULL; 440 vdev_t *rvd; 441 uberblock_t *ub = &spa->spa_uberblock; 442 uint64_t config_cache_txg = spa->spa_config_txg; 443 uint64_t pool_guid; 444 uint64_t version; 445 zio_t *zio; 446 447 spa->spa_load_state = state; 448 449 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 450 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 451 error = EINVAL; 452 goto out; 453 } 454 455 /* 456 * Versioning wasn't explicitly added to the label until later, so if 457 * it's not present treat it as the initial version. 
458 */ 459 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 460 version = ZFS_VERSION_INITIAL; 461 462 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 463 &spa->spa_config_txg); 464 465 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 466 spa_guid_exists(pool_guid, 0)) { 467 error = EEXIST; 468 goto out; 469 } 470 471 spa->spa_load_guid = pool_guid; 472 473 /* 474 * Parse the configuration into a vdev tree. We explicitly set the 475 * value that will be returned by spa_version() since parsing the 476 * configuration requires knowing the version number. 477 */ 478 spa_config_enter(spa, RW_WRITER, FTAG); 479 spa->spa_ubsync.ub_version = version; 480 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 481 spa_config_exit(spa, FTAG); 482 483 if (error != 0) 484 goto out; 485 486 ASSERT(spa->spa_root_vdev == rvd); 487 ASSERT(spa_guid(spa) == pool_guid); 488 489 /* 490 * Try to open all vdevs, loading each label in the process. 491 */ 492 if (vdev_open(rvd) != 0) { 493 error = ENXIO; 494 goto out; 495 } 496 497 /* 498 * Validate the labels for all leaf vdevs. We need to grab the config 499 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 500 * flag. 501 */ 502 spa_config_enter(spa, RW_READER, FTAG); 503 error = vdev_validate(rvd); 504 spa_config_exit(spa, FTAG); 505 506 if (error != 0) { 507 error = EBADF; 508 goto out; 509 } 510 511 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 512 error = ENXIO; 513 goto out; 514 } 515 516 /* 517 * Find the best uberblock. 518 */ 519 bzero(ub, sizeof (uberblock_t)); 520 521 zio = zio_root(spa, NULL, NULL, 522 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 523 vdev_uberblock_load(zio, rvd, ub); 524 error = zio_wait(zio); 525 526 /* 527 * If we weren't able to find a single valid uberblock, return failure. 528 */ 529 if (ub->ub_txg == 0) { 530 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 531 VDEV_AUX_CORRUPT_DATA); 532 error = ENXIO; 533 goto out; 534 } 535 536 /* 537 * If the pool is newer than the code, we can't open it. 538 */ 539 if (ub->ub_version > ZFS_VERSION) { 540 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 541 VDEV_AUX_VERSION_NEWER); 542 error = ENOTSUP; 543 goto out; 544 } 545 546 /* 547 * If the vdev guid sum doesn't match the uberblock, we have an 548 * incomplete configuration. 549 */ 550 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 551 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 552 VDEV_AUX_BAD_GUID_SUM); 553 error = ENXIO; 554 goto out; 555 } 556 557 /* 558 * Initialize internal SPA structures. 
559 */ 560 spa->spa_state = POOL_STATE_ACTIVE; 561 spa->spa_ubsync = spa->spa_uberblock; 562 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 563 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 564 if (error) { 565 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 566 VDEV_AUX_CORRUPT_DATA); 567 goto out; 568 } 569 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 570 571 if (zap_lookup(spa->spa_meta_objset, 572 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 573 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 574 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 575 VDEV_AUX_CORRUPT_DATA); 576 error = EIO; 577 goto out; 578 } 579 580 if (!mosconfig) { 581 nvlist_t *newconfig; 582 583 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 584 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 585 VDEV_AUX_CORRUPT_DATA); 586 error = EIO; 587 goto out; 588 } 589 590 spa_config_set(spa, newconfig); 591 spa_unload(spa); 592 spa_deactivate(spa); 593 spa_activate(spa); 594 595 return (spa_load(spa, newconfig, state, B_TRUE)); 596 } 597 598 if (zap_lookup(spa->spa_meta_objset, 599 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 600 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 601 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 602 VDEV_AUX_CORRUPT_DATA); 603 error = EIO; 604 goto out; 605 } 606 607 /* 608 * Load the bit that tells us to use the new accounting function 609 * (raid-z deflation). If we have an older pool, this will not 610 * be present. 611 */ 612 error = zap_lookup(spa->spa_meta_objset, 613 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 614 sizeof (uint64_t), 1, &spa->spa_deflate); 615 if (error != 0 && error != ENOENT) { 616 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 617 VDEV_AUX_CORRUPT_DATA); 618 error = EIO; 619 goto out; 620 } 621 622 /* 623 * Load the persistent error log. If we have an older pool, this will 624 * not be present. 625 */ 626 error = zap_lookup(spa->spa_meta_objset, 627 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 628 sizeof (uint64_t), 1, &spa->spa_errlog_last); 629 if (error != 0 && error != ENOENT) { 630 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 631 VDEV_AUX_CORRUPT_DATA); 632 error = EIO; 633 goto out; 634 } 635 636 error = zap_lookup(spa->spa_meta_objset, 637 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 638 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 639 if (error != 0 && error != ENOENT) { 640 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 641 VDEV_AUX_CORRUPT_DATA); 642 error = EIO; 643 goto out; 644 } 645 646 /* 647 * Load the history object. If we have an older pool, this 648 * will not be present. 649 */ 650 error = zap_lookup(spa->spa_meta_objset, 651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 652 sizeof (uint64_t), 1, &spa->spa_history); 653 if (error != 0 && error != ENOENT) { 654 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 655 VDEV_AUX_CORRUPT_DATA); 656 error = EIO; 657 goto out; 658 } 659 660 /* 661 * Load any hot spares for this pool. 
662 */ 663 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 664 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 665 if (error != 0 && error != ENOENT) { 666 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 667 VDEV_AUX_CORRUPT_DATA); 668 error = EIO; 669 goto out; 670 } 671 if (error == 0) { 672 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 673 if (load_nvlist(spa, spa->spa_spares_object, 674 &spa->spa_sparelist) != 0) { 675 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 676 VDEV_AUX_CORRUPT_DATA); 677 error = EIO; 678 goto out; 679 } 680 681 spa_config_enter(spa, RW_WRITER, FTAG); 682 spa_load_spares(spa); 683 spa_config_exit(spa, FTAG); 684 } 685 686 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 687 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 688 689 if (error && error != ENOENT) { 690 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 691 VDEV_AUX_CORRUPT_DATA); 692 error = EIO; 693 goto out; 694 } 695 696 if (error == 0) { 697 (void) zap_lookup(spa->spa_meta_objset, 698 spa->spa_pool_props_object, 699 zpool_prop_to_name(ZFS_PROP_BOOTFS), 700 sizeof (uint64_t), 1, &spa->spa_bootfs); 701 } 702 703 /* 704 * Load the vdev state for all toplevel vdevs. 705 */ 706 vdev_load(rvd); 707 708 /* 709 * Propagate the leaf DTLs we just loaded all the way up the tree. 710 */ 711 spa_config_enter(spa, RW_WRITER, FTAG); 712 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 713 spa_config_exit(spa, FTAG); 714 715 /* 716 * Check the state of the root vdev. If it can't be opened, it 717 * indicates one or more toplevel vdevs are faulted. 718 */ 719 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 720 error = ENXIO; 721 goto out; 722 } 723 724 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 725 dmu_tx_t *tx; 726 int need_update = B_FALSE; 727 int c; 728 729 /* 730 * Claim log blocks that haven't been committed yet. 731 * This must all happen in a single txg. 732 */ 733 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 734 spa_first_txg(spa)); 735 (void) dmu_objset_find(spa->spa_name, 736 zil_claim, tx, DS_FIND_CHILDREN); 737 dmu_tx_commit(tx); 738 739 spa->spa_sync_on = B_TRUE; 740 txg_sync_start(spa->spa_dsl_pool); 741 742 /* 743 * Wait for all claims to sync. 744 */ 745 txg_wait_synced(spa->spa_dsl_pool, 0); 746 747 /* 748 * If the config cache is stale, or we have uninitialized 749 * metaslabs (see spa_vdev_add()), then update the config. 750 */ 751 if (config_cache_txg != spa->spa_config_txg || 752 state == SPA_LOAD_IMPORT) 753 need_update = B_TRUE; 754 755 for (c = 0; c < rvd->vdev_children; c++) 756 if (rvd->vdev_child[c]->vdev_ms_array == 0) 757 need_update = B_TRUE; 758 759 /* 760 * Update the config cache asychronously in case we're the 761 * root pool, in which case the config cache isn't writable yet. 762 */ 763 if (need_update) 764 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 765 } 766 767 error = 0; 768out: 769 if (error && error != EBADF) 770 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 771 spa->spa_load_state = SPA_LOAD_NONE; 772 spa->spa_ena = 0; 773 774 return (error); 775} 776 777/* 778 * Pool Open/Import 779 * 780 * The import case is identical to an open except that the configuration is sent 781 * down from userland, instead of grabbed from the configuration cache. For the 782 * case of an open, the pool configuration will exist in the 783 * POOL_STATE_UNITIALIZED state. 
784 * 785 * The stats information (gen/count/ustats) is used to gather vdev statistics at 786 * the same time open the pool, without having to keep around the spa_t in some 787 * ambiguous state. 788 */ 789static int 790spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 791{ 792 spa_t *spa; 793 int error; 794 int loaded = B_FALSE; 795 int locked = B_FALSE; 796 797 *spapp = NULL; 798 799 /* 800 * As disgusting as this is, we need to support recursive calls to this 801 * function because dsl_dir_open() is called during spa_load(), and ends 802 * up calling spa_open() again. The real fix is to figure out how to 803 * avoid dsl_dir_open() calling this in the first place. 804 */ 805 if (mutex_owner(&spa_namespace_lock) != curthread) { 806 mutex_enter(&spa_namespace_lock); 807 locked = B_TRUE; 808 } 809 810 if ((spa = spa_lookup(pool)) == NULL) { 811 if (locked) 812 mutex_exit(&spa_namespace_lock); 813 return (ENOENT); 814 } 815 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 816 817 spa_activate(spa); 818 819 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 820 821 if (error == EBADF) { 822 /* 823 * If vdev_validate() returns failure (indicated by 824 * EBADF), it indicates that one of the vdevs indicates 825 * that the pool has been exported or destroyed. If 826 * this is the case, the config cache is out of sync and 827 * we should remove the pool from the namespace. 828 */ 829 zfs_post_ok(spa, NULL); 830 spa_unload(spa); 831 spa_deactivate(spa); 832 spa_remove(spa); 833 spa_config_sync(); 834 if (locked) 835 mutex_exit(&spa_namespace_lock); 836 return (ENOENT); 837 } 838 839 if (error) { 840 /* 841 * We can't open the pool, but we still have useful 842 * information: the state of each vdev after the 843 * attempted vdev_open(). Return this to the user. 844 */ 845 if (config != NULL && spa->spa_root_vdev != NULL) { 846 spa_config_enter(spa, RW_READER, FTAG); 847 *config = spa_config_generate(spa, NULL, -1ULL, 848 B_TRUE); 849 spa_config_exit(spa, FTAG); 850 } 851 spa_unload(spa); 852 spa_deactivate(spa); 853 spa->spa_last_open_failed = B_TRUE; 854 if (locked) 855 mutex_exit(&spa_namespace_lock); 856 *spapp = NULL; 857 return (error); 858 } else { 859 zfs_post_ok(spa, NULL); 860 spa->spa_last_open_failed = B_FALSE; 861 } 862 863 loaded = B_TRUE; 864 } 865 866 spa_open_ref(spa, tag); 867 if (locked) 868 mutex_exit(&spa_namespace_lock); 869 870 *spapp = spa; 871 872 if (config != NULL) { 873 spa_config_enter(spa, RW_READER, FTAG); 874 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 875 spa_config_exit(spa, FTAG); 876 } 877 878 /* 879 * If we just loaded the pool, resilver anything that's out of date. 880 */ 881 if (loaded && (spa_mode & FWRITE)) 882 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 883 884 return (0); 885} 886 887int 888spa_open(const char *name, spa_t **spapp, void *tag) 889{ 890 return (spa_open_common(name, spapp, tag, NULL)); 891} 892 893/* 894 * Lookup the given spa_t, incrementing the inject count in the process, 895 * preventing it from being exported or destroyed. 
896 */ 897spa_t * 898spa_inject_addref(char *name) 899{ 900 spa_t *spa; 901 902 mutex_enter(&spa_namespace_lock); 903 if ((spa = spa_lookup(name)) == NULL) { 904 mutex_exit(&spa_namespace_lock); 905 return (NULL); 906 } 907 spa->spa_inject_ref++; 908 mutex_exit(&spa_namespace_lock); 909 910 return (spa); 911} 912 913void 914spa_inject_delref(spa_t *spa) 915{ 916 mutex_enter(&spa_namespace_lock); 917 spa->spa_inject_ref--; 918 mutex_exit(&spa_namespace_lock); 919} 920 921static void 922spa_add_spares(spa_t *spa, nvlist_t *config) 923{ 924 nvlist_t **spares; 925 uint_t i, nspares; 926 nvlist_t *nvroot; 927 uint64_t guid; 928 vdev_stat_t *vs; 929 uint_t vsc; 930 uint64_t pool; 931 932 if (spa->spa_nspares == 0) 933 return; 934 935 VERIFY(nvlist_lookup_nvlist(config, 936 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 937 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 938 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 939 if (nspares != 0) { 940 VERIFY(nvlist_add_nvlist_array(nvroot, 941 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 942 VERIFY(nvlist_lookup_nvlist_array(nvroot, 943 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 944 945 /* 946 * Go through and find any spares which have since been 947 * repurposed as an active spare. If this is the case, update 948 * their status appropriately. 949 */ 950 for (i = 0; i < nspares; i++) { 951 VERIFY(nvlist_lookup_uint64(spares[i], 952 ZPOOL_CONFIG_GUID, &guid) == 0); 953 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 954 VERIFY(nvlist_lookup_uint64_array( 955 spares[i], ZPOOL_CONFIG_STATS, 956 (uint64_t **)&vs, &vsc) == 0); 957 vs->vs_state = VDEV_STATE_CANT_OPEN; 958 vs->vs_aux = VDEV_AUX_SPARED; 959 } 960 } 961 } 962} 963 964int 965spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 966{ 967 int error; 968 spa_t *spa; 969 970 *config = NULL; 971 error = spa_open_common(name, &spa, FTAG, config); 972 973 if (spa && *config != NULL) { 974 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 975 spa_get_errlog_size(spa)) == 0); 976 977 spa_add_spares(spa, *config); 978 } 979 980 /* 981 * We want to get the alternate root even for faulted pools, so we cheat 982 * and call spa_lookup() directly. 983 */ 984 if (altroot) { 985 if (spa == NULL) { 986 mutex_enter(&spa_namespace_lock); 987 spa = spa_lookup(name); 988 if (spa) 989 spa_altroot(spa, altroot, buflen); 990 else 991 altroot[0] = '\0'; 992 spa = NULL; 993 mutex_exit(&spa_namespace_lock); 994 } else { 995 spa_altroot(spa, altroot, buflen); 996 } 997 } 998 999 if (spa != NULL) 1000 spa_close(spa, FTAG); 1001 1002 return (error); 1003} 1004 1005/* 1006 * Validate that the 'spares' array is well formed. We must have an array of 1007 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1008 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1009 * as they are well-formed. 1010 */ 1011static int 1012spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1013{ 1014 nvlist_t **spares; 1015 uint_t i, nspares; 1016 vdev_t *vd; 1017 int error; 1018 1019 /* 1020 * It's acceptable to have no spares specified. 1021 */ 1022 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1023 &spares, &nspares) != 0) 1024 return (0); 1025 1026 if (nspares == 0) 1027 return (EINVAL); 1028 1029 /* 1030 * Make sure the pool is formatted with a version that supports hot 1031 * spares. 
1032 */ 1033 if (spa_version(spa) < ZFS_VERSION_SPARES) 1034 return (ENOTSUP); 1035 1036 /* 1037 * Set the pending spare list so we correctly handle device in-use 1038 * checking. 1039 */ 1040 spa->spa_pending_spares = spares; 1041 spa->spa_pending_nspares = nspares; 1042 1043 for (i = 0; i < nspares; i++) { 1044 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1045 mode)) != 0) 1046 goto out; 1047 1048 if (!vd->vdev_ops->vdev_op_leaf) { 1049 vdev_free(vd); 1050 error = EINVAL; 1051 goto out; 1052 } 1053 1054 vd->vdev_top = vd; 1055 1056 if ((error = vdev_open(vd)) == 0 && 1057 (error = vdev_label_init(vd, crtxg, 1058 VDEV_LABEL_SPARE)) == 0) { 1059 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1060 vd->vdev_guid) == 0); 1061 } 1062 1063 vdev_free(vd); 1064 1065 if (error && mode != VDEV_ALLOC_SPARE) 1066 goto out; 1067 else 1068 error = 0; 1069 } 1070 1071out: 1072 spa->spa_pending_spares = NULL; 1073 spa->spa_pending_nspares = 0; 1074 return (error); 1075} 1076 1077/* 1078 * Pool Creation 1079 */ 1080int 1081spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1082{ 1083 spa_t *spa; 1084 vdev_t *rvd; 1085 dsl_pool_t *dp; 1086 dmu_tx_t *tx; 1087 int c, error = 0; 1088 uint64_t txg = TXG_INITIAL; 1089 nvlist_t **spares; 1090 uint_t nspares; 1091 1092 /* 1093 * If this pool already exists, return failure. 1094 */ 1095 mutex_enter(&spa_namespace_lock); 1096 if (spa_lookup(pool) != NULL) { 1097 mutex_exit(&spa_namespace_lock); 1098 return (EEXIST); 1099 } 1100 1101 /* 1102 * Allocate a new spa_t structure. 1103 */ 1104 spa = spa_add(pool, altroot); 1105 spa_activate(spa); 1106 1107 spa->spa_uberblock.ub_txg = txg - 1; 1108 spa->spa_uberblock.ub_version = ZFS_VERSION; 1109 spa->spa_ubsync = spa->spa_uberblock; 1110 1111 /* 1112 * Create the root vdev. 1113 */ 1114 spa_config_enter(spa, RW_WRITER, FTAG); 1115 1116 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1117 1118 ASSERT(error != 0 || rvd != NULL); 1119 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1120 1121 if (error == 0 && rvd->vdev_children == 0) 1122 error = EINVAL; 1123 1124 if (error == 0 && 1125 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1126 (error = spa_validate_spares(spa, nvroot, txg, 1127 VDEV_ALLOC_ADD)) == 0) { 1128 for (c = 0; c < rvd->vdev_children; c++) 1129 vdev_init(rvd->vdev_child[c], txg); 1130 vdev_config_dirty(rvd); 1131 } 1132 1133 spa_config_exit(spa, FTAG); 1134 1135 if (error != 0) { 1136 spa_unload(spa); 1137 spa_deactivate(spa); 1138 spa_remove(spa); 1139 mutex_exit(&spa_namespace_lock); 1140 return (error); 1141 } 1142 1143 /* 1144 * Get the list of spares, if specified. 1145 */ 1146 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1147 &spares, &nspares) == 0) { 1148 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1149 KM_SLEEP) == 0); 1150 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1151 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1152 spa_config_enter(spa, RW_WRITER, FTAG); 1153 spa_load_spares(spa); 1154 spa_config_exit(spa, FTAG); 1155 spa->spa_sync_spares = B_TRUE; 1156 } 1157 1158 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1159 spa->spa_meta_objset = dp->dp_meta_objset; 1160 1161 tx = dmu_tx_create_assigned(dp, txg); 1162 1163 /* 1164 * Create the pool config object. 
1165 */ 1166 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1167 DMU_OT_PACKED_NVLIST, 1 << 14, 1168 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1169 1170 if (zap_add(spa->spa_meta_objset, 1171 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1172 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1173 cmn_err(CE_PANIC, "failed to add pool config"); 1174 } 1175 1176 /* Newly created pools are always deflated. */ 1177 spa->spa_deflate = TRUE; 1178 if (zap_add(spa->spa_meta_objset, 1179 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1180 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1181 cmn_err(CE_PANIC, "failed to add deflate"); 1182 } 1183 1184 /* 1185 * Create the deferred-free bplist object. Turn off compression 1186 * because sync-to-convergence takes longer if the blocksize 1187 * keeps changing. 1188 */ 1189 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1190 1 << 14, tx); 1191 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1192 ZIO_COMPRESS_OFF, tx); 1193 1194 if (zap_add(spa->spa_meta_objset, 1195 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1196 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1197 cmn_err(CE_PANIC, "failed to add bplist"); 1198 } 1199 1200 /* 1201 * Create the pool's history object. 1202 */ 1203 spa_history_create_obj(spa, tx); 1204 1205 dmu_tx_commit(tx); 1206 1207 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1208 spa->spa_sync_on = B_TRUE; 1209 txg_sync_start(spa->spa_dsl_pool); 1210 1211 /* 1212 * We explicitly wait for the first transaction to complete so that our 1213 * bean counters are appropriately updated. 1214 */ 1215 txg_wait_synced(spa->spa_dsl_pool, txg); 1216 1217 spa_config_sync(); 1218 1219 mutex_exit(&spa_namespace_lock); 1220 1221 return (0); 1222} 1223 1224/* 1225 * Import the given pool into the system. We set up the necessary spa_t and 1226 * then call spa_load() to do the dirty work. 1227 */ 1228int 1229spa_import(const char *pool, nvlist_t *config, const char *altroot) 1230{ 1231 spa_t *spa; 1232 int error; 1233 nvlist_t *nvroot; 1234 nvlist_t **spares; 1235 uint_t nspares; 1236 1237 if (!(spa_mode & FWRITE)) 1238 return (EROFS); 1239 1240 /* 1241 * If a pool with this name exists, return failure. 1242 */ 1243 mutex_enter(&spa_namespace_lock); 1244 if (spa_lookup(pool) != NULL) { 1245 mutex_exit(&spa_namespace_lock); 1246 return (EEXIST); 1247 } 1248 1249 /* 1250 * Create and initialize the spa structure. 1251 */ 1252 spa = spa_add(pool, altroot); 1253 spa_activate(spa); 1254 1255 /* 1256 * Pass off the heavy lifting to spa_load(). 1257 * Pass TRUE for mosconfig because the user-supplied config 1258 * is actually the one to trust when doing an import. 1259 */ 1260 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1261 1262 spa_config_enter(spa, RW_WRITER, FTAG); 1263 /* 1264 * Toss any existing sparelist, as it doesn't have any validity anymore, 1265 * and conflicts with spa_has_spare(). 
1266 */ 1267 if (spa->spa_sparelist) { 1268 nvlist_free(spa->spa_sparelist); 1269 spa->spa_sparelist = NULL; 1270 spa_load_spares(spa); 1271 } 1272 1273 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1274 &nvroot) == 0); 1275 if (error == 0) 1276 error = spa_validate_spares(spa, nvroot, -1ULL, 1277 VDEV_ALLOC_SPARE); 1278 spa_config_exit(spa, FTAG); 1279 1280 if (error != 0) { 1281 spa_unload(spa); 1282 spa_deactivate(spa); 1283 spa_remove(spa); 1284 mutex_exit(&spa_namespace_lock); 1285 return (error); 1286 } 1287 1288 /* 1289 * Override any spares as specified by the user, as these may have 1290 * correct device names/devids, etc. 1291 */ 1292 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1293 &spares, &nspares) == 0) { 1294 if (spa->spa_sparelist) 1295 VERIFY(nvlist_remove(spa->spa_sparelist, 1296 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1297 else 1298 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1299 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1300 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1301 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1302 spa_config_enter(spa, RW_WRITER, FTAG); 1303 spa_load_spares(spa); 1304 spa_config_exit(spa, FTAG); 1305 spa->spa_sync_spares = B_TRUE; 1306 } 1307 1308 /* 1309 * Update the config cache to include the newly-imported pool. 1310 */ 1311 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1312 1313 mutex_exit(&spa_namespace_lock); 1314 1315 /* 1316 * Resilver anything that's out of date. 1317 */ 1318 if (spa_mode & FWRITE) 1319 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1320 1321 return (0); 1322} 1323 1324/* 1325 * This (illegal) pool name is used when temporarily importing a spa_t in order 1326 * to get the vdev stats associated with the imported devices. 1327 */ 1328#define TRYIMPORT_NAME "$import" 1329 1330nvlist_t * 1331spa_tryimport(nvlist_t *tryconfig) 1332{ 1333 nvlist_t *config = NULL; 1334 char *poolname; 1335 spa_t *spa; 1336 uint64_t state; 1337 1338 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1339 return (NULL); 1340 1341 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1342 return (NULL); 1343 1344 /* 1345 * Create and initialize the spa structure. 1346 */ 1347 mutex_enter(&spa_namespace_lock); 1348 spa = spa_add(TRYIMPORT_NAME, NULL); 1349 spa_activate(spa); 1350 1351 /* 1352 * Pass off the heavy lifting to spa_load(). 1353 * Pass TRUE for mosconfig because the user-supplied config 1354 * is actually the one to trust when doing an import. 1355 */ 1356 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1357 1358 /* 1359 * If 'tryconfig' was at least parsable, return the current config. 1360 */ 1361 if (spa->spa_root_vdev != NULL) { 1362 spa_config_enter(spa, RW_READER, FTAG); 1363 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1364 spa_config_exit(spa, FTAG); 1365 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1366 poolname) == 0); 1367 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1368 state) == 0); 1369 1370 /* 1371 * Add the list of hot spares. 1372 */ 1373 spa_add_spares(spa, config); 1374 } 1375 1376 spa_unload(spa); 1377 spa_deactivate(spa); 1378 spa_remove(spa); 1379 mutex_exit(&spa_namespace_lock); 1380 1381 return (config); 1382} 1383 1384/* 1385 * Pool export/destroy 1386 * 1387 * The act of destroying or exporting a pool is very simple. We make sure there 1388 * is no more pending I/O and any references to the pool are gone. 
Then, we 1389 * update the pool state and sync all the labels to disk, removing the 1390 * configuration from the cache afterwards. 1391 */ 1392static int 1393spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1394{ 1395 spa_t *spa; 1396 1397 if (oldconfig) 1398 *oldconfig = NULL; 1399 1400 if (!(spa_mode & FWRITE)) 1401 return (EROFS); 1402 1403 mutex_enter(&spa_namespace_lock); 1404 if ((spa = spa_lookup(pool)) == NULL) { 1405 mutex_exit(&spa_namespace_lock); 1406 return (ENOENT); 1407 } 1408 1409 /* 1410 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1411 * reacquire the namespace lock, and see if we can export. 1412 */ 1413 spa_open_ref(spa, FTAG); 1414 mutex_exit(&spa_namespace_lock); 1415 spa_async_suspend(spa); 1416 mutex_enter(&spa_namespace_lock); 1417 spa_close(spa, FTAG); 1418 1419 /* 1420 * The pool will be in core if it's openable, 1421 * in which case we can modify its state. 1422 */ 1423 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1424 /* 1425 * Objsets may be open only because they're dirty, so we 1426 * have to force it to sync before checking spa_refcnt. 1427 */ 1428 spa_scrub_suspend(spa); 1429 txg_wait_synced(spa->spa_dsl_pool, 0); 1430 1431 /* 1432 * A pool cannot be exported or destroyed if there are active 1433 * references. If we are resetting a pool, allow references by 1434 * fault injection handlers. 1435 */ 1436 if (!spa_refcount_zero(spa) || 1437 (spa->spa_inject_ref != 0 && 1438 new_state != POOL_STATE_UNINITIALIZED)) { 1439 spa_scrub_resume(spa); 1440 spa_async_resume(spa); 1441 mutex_exit(&spa_namespace_lock); 1442 return (EBUSY); 1443 } 1444 1445 spa_scrub_resume(spa); 1446 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1447 1448 /* 1449 * We want this to be reflected on every label, 1450 * so mark them all dirty. spa_unload() will do the 1451 * final sync that pushes these changes out. 1452 */ 1453 if (new_state != POOL_STATE_UNINITIALIZED) { 1454 spa_config_enter(spa, RW_WRITER, FTAG); 1455 spa->spa_state = new_state; 1456 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1457 vdev_config_dirty(spa->spa_root_vdev); 1458 spa_config_exit(spa, FTAG); 1459 } 1460 } 1461 1462 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1463 spa_unload(spa); 1464 spa_deactivate(spa); 1465 } 1466 1467 if (oldconfig && spa->spa_config) 1468 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1469 1470 if (new_state != POOL_STATE_UNINITIALIZED) { 1471 spa_remove(spa); 1472 spa_config_sync(); 1473 } 1474 mutex_exit(&spa_namespace_lock); 1475 1476 return (0); 1477} 1478 1479/* 1480 * Destroy a storage pool. 1481 */ 1482int 1483spa_destroy(char *pool) 1484{ 1485 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1486} 1487 1488/* 1489 * Export a storage pool. 1490 */ 1491int 1492spa_export(char *pool, nvlist_t **oldconfig) 1493{ 1494 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1495} 1496 1497/* 1498 * Similar to spa_export(), this unloads the spa_t without actually removing it 1499 * from the namespace in any way. 1500 */ 1501int 1502spa_reset(char *pool) 1503{ 1504 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1505} 1506 1507 1508/* 1509 * ========================================================================== 1510 * Device manipulation 1511 * ========================================================================== 1512 */ 1513 1514/* 1515 * Add capacity to a storage pool. 
1516 */ 1517int 1518spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1519{ 1520 uint64_t txg; 1521 int c, error; 1522 vdev_t *rvd = spa->spa_root_vdev; 1523 vdev_t *vd, *tvd; 1524 nvlist_t **spares; 1525 uint_t i, nspares; 1526 1527 txg = spa_vdev_enter(spa); 1528 1529 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1530 VDEV_ALLOC_ADD)) != 0) 1531 return (spa_vdev_exit(spa, NULL, txg, error)); 1532 1533 spa->spa_pending_vdev = vd; 1534 1535 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1536 &spares, &nspares) != 0) 1537 nspares = 0; 1538 1539 if (vd->vdev_children == 0 && nspares == 0) { 1540 spa->spa_pending_vdev = NULL; 1541 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1542 } 1543 1544 if (vd->vdev_children != 0) { 1545 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1546 spa->spa_pending_vdev = NULL; 1547 return (spa_vdev_exit(spa, vd, txg, error)); 1548 } 1549 } 1550 1551 /* 1552 * We must validate the spares after checking the children. Otherwise, 1553 * vdev_inuse() will blindly overwrite the spare. 1554 */ 1555 if ((error = spa_validate_spares(spa, nvroot, txg, 1556 VDEV_ALLOC_ADD)) != 0) { 1557 spa->spa_pending_vdev = NULL; 1558 return (spa_vdev_exit(spa, vd, txg, error)); 1559 } 1560 1561 spa->spa_pending_vdev = NULL; 1562 1563 /* 1564 * Transfer each new top-level vdev from vd to rvd. 1565 */ 1566 for (c = 0; c < vd->vdev_children; c++) { 1567 tvd = vd->vdev_child[c]; 1568 vdev_remove_child(vd, tvd); 1569 tvd->vdev_id = rvd->vdev_children; 1570 vdev_add_child(rvd, tvd); 1571 vdev_config_dirty(tvd); 1572 } 1573 1574 if (nspares != 0) { 1575 if (spa->spa_sparelist != NULL) { 1576 nvlist_t **oldspares; 1577 uint_t oldnspares; 1578 nvlist_t **newspares; 1579 1580 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1581 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1582 1583 newspares = kmem_alloc(sizeof (void *) * 1584 (nspares + oldnspares), KM_SLEEP); 1585 for (i = 0; i < oldnspares; i++) 1586 VERIFY(nvlist_dup(oldspares[i], 1587 &newspares[i], KM_SLEEP) == 0); 1588 for (i = 0; i < nspares; i++) 1589 VERIFY(nvlist_dup(spares[i], 1590 &newspares[i + oldnspares], 1591 KM_SLEEP) == 0); 1592 1593 VERIFY(nvlist_remove(spa->spa_sparelist, 1594 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1595 1596 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1597 ZPOOL_CONFIG_SPARES, newspares, 1598 nspares + oldnspares) == 0); 1599 for (i = 0; i < oldnspares + nspares; i++) 1600 nvlist_free(newspares[i]); 1601 kmem_free(newspares, (oldnspares + nspares) * 1602 sizeof (void *)); 1603 } else { 1604 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1605 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1606 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1607 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1608 } 1609 1610 spa_load_spares(spa); 1611 spa->spa_sync_spares = B_TRUE; 1612 } 1613 1614 /* 1615 * We have to be careful when adding new vdevs to an existing pool. 1616 * If other threads start allocating from these vdevs before we 1617 * sync the config cache, and we lose power, then upon reboot we may 1618 * fail to open the pool because there are DVAs that the config cache 1619 * can't translate. Therefore, we first add the vdevs without 1620 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1621 * and then let spa_config_update() initialize the new metaslabs. 
1622 * 1623 * spa_load() checks for added-but-not-initialized vdevs, so that 1624 * if we lose power at any point in this sequence, the remaining 1625 * steps will be completed the next time we load the pool. 1626 */ 1627 (void) spa_vdev_exit(spa, vd, txg, 0); 1628 1629 mutex_enter(&spa_namespace_lock); 1630 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1631 mutex_exit(&spa_namespace_lock); 1632 1633 return (0); 1634} 1635 1636/* 1637 * Attach a device to a mirror. The arguments are the path to any device 1638 * in the mirror, and the nvroot for the new device. If the path specifies 1639 * a device that is not mirrored, we automatically insert the mirror vdev. 1640 * 1641 * If 'replacing' is specified, the new device is intended to replace the 1642 * existing device; in this case the two devices are made into their own 1643 * mirror using the 'replacing' vdev, which is functionally idendical to 1644 * the mirror vdev (it actually reuses all the same ops) but has a few 1645 * extra rules: you can't attach to it after it's been created, and upon 1646 * completion of resilvering, the first disk (the one being replaced) 1647 * is automatically detached. 1648 */ 1649int 1650spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1651{ 1652 uint64_t txg, open_txg; 1653 int error; 1654 vdev_t *rvd = spa->spa_root_vdev; 1655 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1656 vdev_ops_t *pvops; 1657 1658 txg = spa_vdev_enter(spa); 1659 1660 oldvd = vdev_lookup_by_guid(rvd, guid); 1661 1662 if (oldvd == NULL) 1663 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1664 1665 if (!oldvd->vdev_ops->vdev_op_leaf) 1666 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1667 1668 pvd = oldvd->vdev_parent; 1669 1670 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1671 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1672 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1673 1674 newvd = newrootvd->vdev_child[0]; 1675 1676 if (!newvd->vdev_ops->vdev_op_leaf) 1677 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1678 1679 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1680 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1681 1682 if (!replacing) { 1683 /* 1684 * For attach, the only allowable parent is a mirror or the root 1685 * vdev. 1686 */ 1687 if (pvd->vdev_ops != &vdev_mirror_ops && 1688 pvd->vdev_ops != &vdev_root_ops) 1689 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1690 1691 pvops = &vdev_mirror_ops; 1692 } else { 1693 /* 1694 * Active hot spares can only be replaced by inactive hot 1695 * spares. 1696 */ 1697 if (pvd->vdev_ops == &vdev_spare_ops && 1698 pvd->vdev_child[1] == oldvd && 1699 !spa_has_spare(spa, newvd->vdev_guid)) 1700 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1701 1702 /* 1703 * If the source is a hot spare, and the parent isn't already a 1704 * spare, then we want to create a new hot spare. Otherwise, we 1705 * want to create a replacing vdev. The user is not allowed to 1706 * attach to a spared vdev child unless the 'isspare' state is 1707 * the same (spare replaces spare, non-spare replaces 1708 * non-spare). 
1709 */ 1710 if (pvd->vdev_ops == &vdev_replacing_ops) 1711 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1712 else if (pvd->vdev_ops == &vdev_spare_ops && 1713 newvd->vdev_isspare != oldvd->vdev_isspare) 1714 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1715 else if (pvd->vdev_ops != &vdev_spare_ops && 1716 newvd->vdev_isspare) 1717 pvops = &vdev_spare_ops; 1718 else 1719 pvops = &vdev_replacing_ops; 1720 } 1721 1722 /* 1723 * Compare the new device size with the replaceable/attachable 1724 * device size. 1725 */ 1726 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1727 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1728 1729 /* 1730 * The new device cannot have a higher alignment requirement 1731 * than the top-level vdev. 1732 */ 1733 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1734 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1735 1736 /* 1737 * If this is an in-place replacement, update oldvd's path and devid 1738 * to make it distinguishable from newvd, and unopenable from now on. 1739 */ 1740 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1741 spa_strfree(oldvd->vdev_path); 1742 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1743 KM_SLEEP); 1744 (void) sprintf(oldvd->vdev_path, "%s/%s", 1745 newvd->vdev_path, "old"); 1746 if (oldvd->vdev_devid != NULL) { 1747 spa_strfree(oldvd->vdev_devid); 1748 oldvd->vdev_devid = NULL; 1749 } 1750 } 1751 1752 /* 1753 * If the parent is not a mirror, or if we're replacing, insert the new 1754 * mirror/replacing/spare vdev above oldvd. 1755 */ 1756 if (pvd->vdev_ops != pvops) 1757 pvd = vdev_add_parent(oldvd, pvops); 1758 1759 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1760 ASSERT(pvd->vdev_ops == pvops); 1761 ASSERT(oldvd->vdev_parent == pvd); 1762 1763 /* 1764 * Extract the new device from its root and add it to pvd. 1765 */ 1766 vdev_remove_child(newrootvd, newvd); 1767 newvd->vdev_id = pvd->vdev_children; 1768 vdev_add_child(pvd, newvd); 1769 1770 /* 1771 * If newvd is smaller than oldvd, but larger than its rsize, 1772 * the addition of newvd may have decreased our parent's asize. 1773 */ 1774 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1775 1776 tvd = newvd->vdev_top; 1777 ASSERT(pvd->vdev_top == tvd); 1778 ASSERT(tvd->vdev_parent == rvd); 1779 1780 vdev_config_dirty(tvd); 1781 1782 /* 1783 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1784 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1785 */ 1786 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1787 1788 mutex_enter(&newvd->vdev_dtl_lock); 1789 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1790 open_txg - TXG_INITIAL + 1); 1791 mutex_exit(&newvd->vdev_dtl_lock); 1792 1793 if (newvd->vdev_isspare) 1794 spa_spare_activate(newvd); 1795 1796 /* 1797 * Mark newvd's DTL dirty in this txg. 1798 */ 1799 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1800 1801 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1802 1803 /* 1804 * Kick off a resilver to update newvd. 1805 */ 1806 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1807 1808 return (0); 1809} 1810 1811/* 1812 * Detach a device from a mirror or replacing vdev. 1813 * If 'replace_done' is specified, only detach if the parent 1814 * is a replacing vdev. 
1815 */ 1816int 1817spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1818{ 1819 uint64_t txg; 1820 int c, t, error; 1821 vdev_t *rvd = spa->spa_root_vdev; 1822 vdev_t *vd, *pvd, *cvd, *tvd; 1823 boolean_t unspare = B_FALSE; 1824 uint64_t unspare_guid; 1825 1826 txg = spa_vdev_enter(spa); 1827 1828 vd = vdev_lookup_by_guid(rvd, guid); 1829 1830 if (vd == NULL) 1831 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1832 1833 if (!vd->vdev_ops->vdev_op_leaf) 1834 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1835 1836 pvd = vd->vdev_parent; 1837 1838 /* 1839 * If replace_done is specified, only remove this device if it's 1840 * the first child of a replacing vdev. For the 'spare' vdev, either 1841 * disk can be removed. 1842 */ 1843 if (replace_done) { 1844 if (pvd->vdev_ops == &vdev_replacing_ops) { 1845 if (vd->vdev_id != 0) 1846 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1847 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1848 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1849 } 1850 } 1851 1852 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1853 spa_version(spa) >= ZFS_VERSION_SPARES); 1854 1855 /* 1856 * Only mirror, replacing, and spare vdevs support detach. 1857 */ 1858 if (pvd->vdev_ops != &vdev_replacing_ops && 1859 pvd->vdev_ops != &vdev_mirror_ops && 1860 pvd->vdev_ops != &vdev_spare_ops) 1861 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1862 1863 /* 1864 * If there's only one replica, you can't detach it. 1865 */ 1866 if (pvd->vdev_children <= 1) 1867 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1868 1869 /* 1870 * If all siblings have non-empty DTLs, this device may have the only 1871 * valid copy of the data, which means we cannot safely detach it. 1872 * 1873 * XXX -- as in the vdev_offline() case, we really want a more 1874 * precise DTL check. 1875 */ 1876 for (c = 0; c < pvd->vdev_children; c++) { 1877 uint64_t dirty; 1878 1879 cvd = pvd->vdev_child[c]; 1880 if (cvd == vd) 1881 continue; 1882 if (vdev_is_dead(cvd)) 1883 continue; 1884 mutex_enter(&cvd->vdev_dtl_lock); 1885 dirty = cvd->vdev_dtl_map.sm_space | 1886 cvd->vdev_dtl_scrub.sm_space; 1887 mutex_exit(&cvd->vdev_dtl_lock); 1888 if (!dirty) 1889 break; 1890 } 1891 1892 /* 1893 * If we are a replacing or spare vdev, then we can always detach the 1894 * latter child, as that is how one cancels the operation. 1895 */ 1896 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1897 c == pvd->vdev_children) 1898 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1899 1900 /* 1901 * If we are detaching the original disk from a spare, then it implies 1902 * that the spare should become a real disk, and be removed from the 1903 * active spare list for the pool. 1904 */ 1905 if (pvd->vdev_ops == &vdev_spare_ops && 1906 vd->vdev_id == 0) 1907 unspare = B_TRUE; 1908 1909 /* 1910 * Erase the disk labels so the disk can be used for other things. 1911 * This must be done after all other error cases are handled, 1912 * but before we disembowel vd (so we can still do I/O to it). 1913 * But if we can't do it, don't treat the error as fatal -- 1914 * it may be that the unwritability of the disk is the reason 1915 * it's being detached! 1916 */ 1917 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1918 1919 /* 1920 * Remove vd from its parent and compact the parent's children. 1921 */ 1922 vdev_remove_child(pvd, vd); 1923 vdev_compact_children(pvd); 1924 1925 /* 1926 * Remember one of the remaining children so we can get tvd below. 
1927 */ 1928 cvd = pvd->vdev_child[0]; 1929 1930 /* 1931 * If we need to remove the remaining child from the list of hot spares, 1932 * do it now, marking the vdev as no longer a spare in the process. We 1933 * must do this before vdev_remove_parent(), because that can change the 1934 * GUID if it creates a new toplevel GUID. 1935 */ 1936 if (unspare) { 1937 ASSERT(cvd->vdev_isspare); 1938 spa_spare_remove(cvd); 1939 unspare_guid = cvd->vdev_guid; 1940 } 1941 1942 /* 1943 * If the parent mirror/replacing vdev only has one child, 1944 * the parent is no longer needed. Remove it from the tree. 1945 */ 1946 if (pvd->vdev_children == 1) 1947 vdev_remove_parent(cvd); 1948 1949 /* 1950 * We don't set tvd until now because the parent we just removed 1951 * may have been the previous top-level vdev. 1952 */ 1953 tvd = cvd->vdev_top; 1954 ASSERT(tvd->vdev_parent == rvd); 1955 1956 /* 1957 * Reevaluate the parent vdev state. 1958 */ 1959 vdev_propagate_state(cvd->vdev_parent); 1960 1961 /* 1962 * If the device we just detached was smaller than the others, it may be 1963 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1964 * can't fail because the existing metaslabs are already in core, so 1965 * there's nothing to read from disk. 1966 */ 1967 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1968 1969 vdev_config_dirty(tvd); 1970 1971 /* 1972 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1973 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1974 * But first make sure we're not on any *other* txg's DTL list, to 1975 * prevent vd from being accessed after it's freed. 1976 */ 1977 for (t = 0; t < TXG_SIZE; t++) 1978 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1979 vd->vdev_detached = B_TRUE; 1980 vdev_dirty(tvd, VDD_DTL, vd, txg); 1981 1982 error = spa_vdev_exit(spa, vd, txg, 0); 1983 1984 /* 1985 * If this was the removal of the original device in a hot spare vdev, 1986 * then we want to go through and remove the device from the hot spare 1987 * list of every other pool. 1988 */ 1989 if (unspare) { 1990 spa = NULL; 1991 mutex_enter(&spa_namespace_lock); 1992 while ((spa = spa_next(spa)) != NULL) { 1993 if (spa->spa_state != POOL_STATE_ACTIVE) 1994 continue; 1995 1996 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1997 } 1998 mutex_exit(&spa_namespace_lock); 1999 } 2000 2001 return (error); 2002} 2003 2004/* 2005 * Remove a device from the pool. Currently, this supports removing only hot 2006 * spares. 2007 */ 2008int 2009spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2010{ 2011 vdev_t *vd; 2012 nvlist_t **spares, *nv, **newspares; 2013 uint_t i, j, nspares; 2014 int ret = 0; 2015 2016 spa_config_enter(spa, RW_WRITER, FTAG); 2017 2018 vd = spa_lookup_by_guid(spa, guid); 2019 2020 nv = NULL; 2021 if (spa->spa_spares != NULL && 2022 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2023 &spares, &nspares) == 0) { 2024 for (i = 0; i < nspares; i++) { 2025 uint64_t theguid; 2026 2027 VERIFY(nvlist_lookup_uint64(spares[i], 2028 ZPOOL_CONFIG_GUID, &theguid) == 0); 2029 if (theguid == guid) { 2030 nv = spares[i]; 2031 break; 2032 } 2033 } 2034 } 2035 2036 /* 2037 * We only support removing a hot spare, and only if it's not currently 2038 * in use in this pool. 
2039 */ 2040 if (nv == NULL && vd == NULL) { 2041 ret = ENOENT; 2042 goto out; 2043 } 2044 2045 if (nv == NULL && vd != NULL) { 2046 ret = ENOTSUP; 2047 goto out; 2048 } 2049 2050 if (!unspare && nv != NULL && vd != NULL) { 2051 ret = EBUSY; 2052 goto out; 2053 } 2054 2055 if (nspares == 1) { 2056 newspares = NULL; 2057 } else { 2058 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2059 KM_SLEEP); 2060 for (i = 0, j = 0; i < nspares; i++) { 2061 if (spares[i] != nv) 2062 VERIFY(nvlist_dup(spares[i], 2063 &newspares[j++], KM_SLEEP) == 0); 2064 } 2065 } 2066 2067 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2068 DATA_TYPE_NVLIST_ARRAY) == 0); 2069 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2070 newspares, nspares - 1) == 0); 2071 for (i = 0; i < nspares - 1; i++) 2072 nvlist_free(newspares[i]); 2073 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2074 spa_load_spares(spa); 2075 spa->spa_sync_spares = B_TRUE; 2076 2077out: 2078 spa_config_exit(spa, FTAG); 2079 2080 return (ret); 2081} 2082 2083/* 2084 * Find any device that's done replacing, so we can detach it. 2085 */ 2086static vdev_t * 2087spa_vdev_replace_done_hunt(vdev_t *vd) 2088{ 2089 vdev_t *newvd, *oldvd; 2090 int c; 2091 2092 for (c = 0; c < vd->vdev_children; c++) { 2093 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2094 if (oldvd != NULL) 2095 return (oldvd); 2096 } 2097 2098 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2099 oldvd = vd->vdev_child[0]; 2100 newvd = vd->vdev_child[1]; 2101 2102 mutex_enter(&newvd->vdev_dtl_lock); 2103 if (newvd->vdev_dtl_map.sm_space == 0 && 2104 newvd->vdev_dtl_scrub.sm_space == 0) { 2105 mutex_exit(&newvd->vdev_dtl_lock); 2106 return (oldvd); 2107 } 2108 mutex_exit(&newvd->vdev_dtl_lock); 2109 } 2110 2111 return (NULL); 2112} 2113 2114static void 2115spa_vdev_replace_done(spa_t *spa) 2116{ 2117 vdev_t *vd; 2118 vdev_t *pvd; 2119 uint64_t guid; 2120 uint64_t pguid = 0; 2121 2122 spa_config_enter(spa, RW_READER, FTAG); 2123 2124 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2125 guid = vd->vdev_guid; 2126 /* 2127 * If we have just finished replacing a hot spared device, then 2128 * we need to detach the parent's first child (the original hot 2129 * spare) as well. 2130 */ 2131 pvd = vd->vdev_parent; 2132 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2133 pvd->vdev_id == 0) { 2134 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2135 ASSERT(pvd->vdev_parent->vdev_children == 2); 2136 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2137 } 2138 spa_config_exit(spa, FTAG); 2139 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2140 return; 2141 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2142 return; 2143 spa_config_enter(spa, RW_READER, FTAG); 2144 } 2145 2146 spa_config_exit(spa, FTAG); 2147} 2148 2149/* 2150 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2151 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2152 */ 2153int 2154spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2155{ 2156 vdev_t *rvd, *vd; 2157 uint64_t txg; 2158 2159 rvd = spa->spa_root_vdev; 2160 2161 txg = spa_vdev_enter(spa); 2162 2163 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2164 /* 2165 * Determine if this is a reference to a hot spare. In that 2166 * case, update the path as stored in the spare list. 
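 *
 * A spare that isn't currently attached to anything has no vdev in the
 * root vdev tree, so the guid lookup above comes back empty; its entry
 * in spa_sparelist is the only place the new path can be recorded.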
2167 */ 2168 nvlist_t **spares; 2169 uint_t i, nspares; 2170 if (spa->spa_sparelist != NULL) { 2171 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2172 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2173 for (i = 0; i < nspares; i++) { 2174 uint64_t theguid; 2175 VERIFY(nvlist_lookup_uint64(spares[i], 2176 ZPOOL_CONFIG_GUID, &theguid) == 0); 2177 if (theguid == guid) 2178 break; 2179 } 2180 2181 if (i == nspares) 2182 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2183 2184 VERIFY(nvlist_add_string(spares[i], 2185 ZPOOL_CONFIG_PATH, newpath) == 0); 2186 spa_load_spares(spa); 2187 spa->spa_sync_spares = B_TRUE; 2188 return (spa_vdev_exit(spa, NULL, txg, 0)); 2189 } else { 2190 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2191 } 2192 } 2193 2194 if (!vd->vdev_ops->vdev_op_leaf) 2195 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2196 2197 spa_strfree(vd->vdev_path); 2198 vd->vdev_path = spa_strdup(newpath); 2199 2200 vdev_config_dirty(vd->vdev_top); 2201 2202 return (spa_vdev_exit(spa, NULL, txg, 0)); 2203} 2204 2205/* 2206 * ========================================================================== 2207 * SPA Scrubbing 2208 * ========================================================================== 2209 */ 2210 2211static void 2212spa_scrub_io_done(zio_t *zio) 2213{ 2214 spa_t *spa = zio->io_spa; 2215 2216 zio_data_buf_free(zio->io_data, zio->io_size); 2217 2218 mutex_enter(&spa->spa_scrub_lock); 2219 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2220 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2221 spa->spa_scrub_errors++; 2222 mutex_enter(&vd->vdev_stat_lock); 2223 vd->vdev_stat.vs_scrub_errors++; 2224 mutex_exit(&vd->vdev_stat_lock); 2225 } 2226 2227 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2228 cv_broadcast(&spa->spa_scrub_io_cv); 2229 2230 ASSERT(spa->spa_scrub_inflight >= 0); 2231 2232 mutex_exit(&spa->spa_scrub_lock); 2233} 2234 2235static void 2236spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2237 zbookmark_t *zb) 2238{ 2239 size_t size = BP_GET_LSIZE(bp); 2240 void *data; 2241 2242 mutex_enter(&spa->spa_scrub_lock); 2243 /* 2244 * Do not give too much work to vdev(s). 2245 */ 2246 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2247 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2248 } 2249 spa->spa_scrub_inflight++; 2250 mutex_exit(&spa->spa_scrub_lock); 2251 2252 data = zio_data_buf_alloc(size); 2253 2254 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2255 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2256 2257 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2258 2259 zio_nowait(zio_read(NULL, spa, bp, data, size, 2260 spa_scrub_io_done, NULL, priority, flags, zb)); 2261} 2262 2263/* ARGSUSED */ 2264static int 2265spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2266{ 2267 blkptr_t *bp = &bc->bc_blkptr; 2268 vdev_t *vd = spa->spa_root_vdev; 2269 dva_t *dva = bp->blk_dva; 2270 int needs_resilver = B_FALSE; 2271 int d; 2272 2273 if (bc->bc_errno) { 2274 /* 2275 * We can't scrub this block, but we can continue to scrub 2276 * the rest of the pool. Note the error and move along. 
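 *
 * Both the pool-wide error count and the root vdev's vs_scrub_errors are
 * bumped here, so the failure is reflected in the final scrub report and
 * in zpool(1M) status.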
2277 */ 2278 mutex_enter(&spa->spa_scrub_lock); 2279 spa->spa_scrub_errors++; 2280 mutex_exit(&spa->spa_scrub_lock); 2281 2282 mutex_enter(&vd->vdev_stat_lock); 2283 vd->vdev_stat.vs_scrub_errors++; 2284 mutex_exit(&vd->vdev_stat_lock); 2285 2286 return (ERESTART); 2287 } 2288 2289 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2290 2291 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2292 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2293 2294 ASSERT(vd != NULL); 2295 2296 /* 2297 * Keep track of how much data we've examined so that 2298 * zpool(1M) status can make useful progress reports. 2299 */ 2300 mutex_enter(&vd->vdev_stat_lock); 2301 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2302 mutex_exit(&vd->vdev_stat_lock); 2303 2304 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2305 if (DVA_GET_GANG(&dva[d])) { 2306 /* 2307 * Gang members may be spread across multiple 2308 * vdevs, so the best we can do is look at the 2309 * pool-wide DTL. 2310 * XXX -- it would be better to change our 2311 * allocation policy to ensure that this can't 2312 * happen. 2313 */ 2314 vd = spa->spa_root_vdev; 2315 } 2316 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2317 bp->blk_birth, 1)) 2318 needs_resilver = B_TRUE; 2319 } 2320 } 2321 2322 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2323 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2324 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2325 else if (needs_resilver) 2326 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2327 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2328 2329 return (0); 2330} 2331 2332static void 2333spa_scrub_thread(void *arg) 2334{ 2335 spa_t *spa = arg; 2336 callb_cpr_t cprinfo; 2337 traverse_handle_t *th = spa->spa_scrub_th; 2338 vdev_t *rvd = spa->spa_root_vdev; 2339 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2340 int error = 0; 2341 boolean_t complete; 2342 2343 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2344 2345 /* 2346 * If we're restarting due to a snapshot create/delete, 2347 * wait for that to complete. 2348 */ 2349 txg_wait_synced(spa_get_dsl(spa), 0); 2350 2351 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2352 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2353 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2354 2355 spa_config_enter(spa, RW_WRITER, FTAG); 2356 vdev_reopen(rvd); /* purge all vdev caches */ 2357 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2358 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2359 spa_config_exit(spa, FTAG); 2360 2361 mutex_enter(&spa->spa_scrub_lock); 2362 spa->spa_scrub_errors = 0; 2363 spa->spa_scrub_active = 1; 2364 ASSERT(spa->spa_scrub_inflight == 0); 2365 2366 while (!spa->spa_scrub_stop) { 2367 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2368 while (spa->spa_scrub_suspended) { 2369 spa->spa_scrub_active = 0; 2370 cv_broadcast(&spa->spa_scrub_cv); 2371 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2372 spa->spa_scrub_active = 1; 2373 } 2374 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2375 2376 if (spa->spa_scrub_restart_txg != 0) 2377 break; 2378 2379 mutex_exit(&spa->spa_scrub_lock); 2380 error = traverse_more(th); 2381 mutex_enter(&spa->spa_scrub_lock); 2382 if (error != EAGAIN) 2383 break; 2384 } 2385 2386 while (spa->spa_scrub_inflight) 2387 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2388 2389 spa->spa_scrub_active = 0; 2390 cv_broadcast(&spa->spa_scrub_cv); 2391 2392 mutex_exit(&spa->spa_scrub_lock); 2393 2394 spa_config_enter(spa, RW_WRITER, FTAG); 2395 2396 mutex_enter(&spa->spa_scrub_lock); 2397 2398 /* 2399 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2400 * AND the spa config lock to synchronize with any config changes 2401 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2402 */ 2403 if (spa->spa_scrub_restart_txg != 0) 2404 error = ERESTART; 2405 2406 if (spa->spa_scrub_stop) 2407 error = EINTR; 2408 2409 /* 2410 * Even if there were uncorrectable errors, we consider the scrub 2411 * completed. The downside is that if there is a transient error during 2412 * a resilver, we won't resilver the data properly to the target. But 2413 * if the damage is permanent (more likely) we will resilver forever, 2414 * which isn't really acceptable. Since there is enough information for 2415 * the user to know what has failed and why, this seems like a more 2416 * tractable approach. 2417 */ 2418 complete = (error == 0); 2419 2420 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2421 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2422 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2423 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2424 2425 mutex_exit(&spa->spa_scrub_lock); 2426 2427 /* 2428 * If the scrub/resilver completed, update all DTLs to reflect this. 2429 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2430 */ 2431 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2432 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2433 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2434 spa_errlog_rotate(spa); 2435 2436 spa_config_exit(spa, FTAG); 2437 2438 mutex_enter(&spa->spa_scrub_lock); 2439 2440 /* 2441 * We may have finished replacing a device. 2442 * Let the async thread assess this and handle the detach. 2443 */ 2444 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2445 2446 /* 2447 * If we were told to restart, our final act is to start a new scrub. 2448 */ 2449 if (error == ERESTART) 2450 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2451 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2452 2453 spa->spa_scrub_type = POOL_SCRUB_NONE; 2454 spa->spa_scrub_active = 0; 2455 spa->spa_scrub_thread = NULL; 2456 cv_broadcast(&spa->spa_scrub_cv); 2457 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2458 thread_exit(); 2459} 2460 2461void 2462spa_scrub_suspend(spa_t *spa) 2463{ 2464 mutex_enter(&spa->spa_scrub_lock); 2465 spa->spa_scrub_suspended++; 2466 while (spa->spa_scrub_active) { 2467 cv_broadcast(&spa->spa_scrub_cv); 2468 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2469 } 2470 while (spa->spa_scrub_inflight) 2471 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2472 mutex_exit(&spa->spa_scrub_lock); 2473} 2474 2475void 2476spa_scrub_resume(spa_t *spa) 2477{ 2478 mutex_enter(&spa->spa_scrub_lock); 2479 ASSERT(spa->spa_scrub_suspended != 0); 2480 if (--spa->spa_scrub_suspended == 0) 2481 cv_broadcast(&spa->spa_scrub_cv); 2482 mutex_exit(&spa->spa_scrub_lock); 2483} 2484 2485void 2486spa_scrub_restart(spa_t *spa, uint64_t txg) 2487{ 2488 /* 2489 * Something happened (e.g. snapshot create/delete) that means 2490 * we must restart any in-progress scrubs. The itinerary will 2491 * fix this properly. 2492 */ 2493 mutex_enter(&spa->spa_scrub_lock); 2494 spa->spa_scrub_restart_txg = txg; 2495 mutex_exit(&spa->spa_scrub_lock); 2496} 2497 2498int 2499spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2500{ 2501 space_seg_t *ss; 2502 uint64_t mintxg, maxtxg; 2503 vdev_t *rvd = spa->spa_root_vdev; 2504 2505 if ((uint_t)type >= POOL_SCRUB_TYPES) 2506 return (ENOTSUP); 2507 2508 mutex_enter(&spa->spa_scrub_lock); 2509 2510 /* 2511 * If there's a scrub or resilver already in progress, stop it. 2512 */ 2513 while (spa->spa_scrub_thread != NULL) { 2514 /* 2515 * Don't stop a resilver unless forced. 2516 */ 2517 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2518 mutex_exit(&spa->spa_scrub_lock); 2519 return (EBUSY); 2520 } 2521 spa->spa_scrub_stop = 1; 2522 cv_broadcast(&spa->spa_scrub_cv); 2523 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2524 } 2525 2526 /* 2527 * Terminate the previous traverse. 2528 */ 2529 if (spa->spa_scrub_th != NULL) { 2530 traverse_fini(spa->spa_scrub_th); 2531 spa->spa_scrub_th = NULL; 2532 } 2533 2534 if (rvd == NULL) { 2535 ASSERT(spa->spa_scrub_stop == 0); 2536 ASSERT(spa->spa_scrub_type == type); 2537 ASSERT(spa->spa_scrub_restart_txg == 0); 2538 mutex_exit(&spa->spa_scrub_lock); 2539 return (0); 2540 } 2541 2542 mintxg = TXG_INITIAL - 1; 2543 maxtxg = spa_last_synced_txg(spa) + 1; 2544 2545 mutex_enter(&rvd->vdev_dtl_lock); 2546 2547 if (rvd->vdev_dtl_map.sm_space == 0) { 2548 /* 2549 * The pool-wide DTL is empty. 2550 * If this is a resilver, there's nothing to do except 2551 * check whether any in-progress replacements have completed. 2552 */ 2553 if (type == POOL_SCRUB_RESILVER) { 2554 type = POOL_SCRUB_NONE; 2555 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2556 } 2557 } else { 2558 /* 2559 * The pool-wide DTL is non-empty. 2560 * If this is a normal scrub, upgrade to a resilver instead. 2561 */ 2562 if (type == POOL_SCRUB_EVERYTHING) 2563 type = POOL_SCRUB_RESILVER; 2564 } 2565 2566 if (type == POOL_SCRUB_RESILVER) { 2567 /* 2568 * Determine the resilvering boundaries. 2569 * 2570 * Note: (mintxg, maxtxg) is an open interval, 2571 * i.e. mintxg and maxtxg themselves are not included. 2572 * 2573 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2574 * so we don't claim to resilver a txg that's still changing. 
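 *
 * For example, if the root DTL's first segment starts at txg 100 and its
 * last segment ends at txg 201 (txgs 100-200 are dirty), we get
 * mintxg = 99 and maxtxg = MIN(201, spa_last_synced_txg(spa) + 1), so the
 * traverse visits txgs 100 through maxtxg - 1.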
2575 */ 2576 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2577 mintxg = ss->ss_start - 1; 2578 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2579 maxtxg = MIN(ss->ss_end, maxtxg); 2580 } 2581 2582 mutex_exit(&rvd->vdev_dtl_lock); 2583 2584 spa->spa_scrub_stop = 0; 2585 spa->spa_scrub_type = type; 2586 spa->spa_scrub_restart_txg = 0; 2587 2588 if (type != POOL_SCRUB_NONE) { 2589 spa->spa_scrub_mintxg = mintxg; 2590 spa->spa_scrub_maxtxg = maxtxg; 2591 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2592 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2593 ZIO_FLAG_CANFAIL); 2594 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2595 spa->spa_scrub_thread = thread_create(NULL, 0, 2596 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2597 } 2598 2599 mutex_exit(&spa->spa_scrub_lock); 2600 2601 return (0); 2602} 2603 2604/* 2605 * ========================================================================== 2606 * SPA async task processing 2607 * ========================================================================== 2608 */ 2609 2610static void 2611spa_async_reopen(spa_t *spa) 2612{ 2613 vdev_t *rvd = spa->spa_root_vdev; 2614 vdev_t *tvd; 2615 int c; 2616 2617 spa_config_enter(spa, RW_WRITER, FTAG); 2618 2619 for (c = 0; c < rvd->vdev_children; c++) { 2620 tvd = rvd->vdev_child[c]; 2621 if (tvd->vdev_reopen_wanted) { 2622 tvd->vdev_reopen_wanted = 0; 2623 vdev_reopen(tvd); 2624 } 2625 } 2626 2627 spa_config_exit(spa, FTAG); 2628} 2629 2630static void 2631spa_async_thread(void *arg) 2632{ 2633 spa_t *spa = arg; 2634 int tasks; 2635 2636 ASSERT(spa->spa_sync_on); 2637 2638 mutex_enter(&spa->spa_async_lock); 2639 tasks = spa->spa_async_tasks; 2640 spa->spa_async_tasks = 0; 2641 mutex_exit(&spa->spa_async_lock); 2642 2643 /* 2644 * See if the config needs to be updated. 2645 */ 2646 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2647 mutex_enter(&spa_namespace_lock); 2648 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2649 mutex_exit(&spa_namespace_lock); 2650 } 2651 2652 /* 2653 * See if any devices need to be reopened. 2654 */ 2655 if (tasks & SPA_ASYNC_REOPEN) 2656 spa_async_reopen(spa); 2657 2658 /* 2659 * If any devices are done replacing, detach them. 2660 */ 2661 if (tasks & SPA_ASYNC_REPLACE_DONE) 2662 spa_vdev_replace_done(spa); 2663 2664 /* 2665 * Kick off a scrub. 2666 */ 2667 if (tasks & SPA_ASYNC_SCRUB) 2668 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2669 2670 /* 2671 * Kick off a resilver. 2672 */ 2673 if (tasks & SPA_ASYNC_RESILVER) 2674 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2675 2676 /* 2677 * Let the world know that we're done. 
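 *
 * spa_async_suspend() sleeps on spa_async_cv until spa_async_thread goes
 * to NULL, so clear the pointer before broadcasting.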
2678 */ 2679 mutex_enter(&spa->spa_async_lock); 2680 spa->spa_async_thread = NULL; 2681 cv_broadcast(&spa->spa_async_cv); 2682 mutex_exit(&spa->spa_async_lock); 2683 thread_exit(); 2684} 2685 2686void 2687spa_async_suspend(spa_t *spa) 2688{ 2689 mutex_enter(&spa->spa_async_lock); 2690 spa->spa_async_suspended++; 2691 while (spa->spa_async_thread != NULL) 2692 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2693 mutex_exit(&spa->spa_async_lock); 2694} 2695 2696void 2697spa_async_resume(spa_t *spa) 2698{ 2699 mutex_enter(&spa->spa_async_lock); 2700 ASSERT(spa->spa_async_suspended != 0); 2701 spa->spa_async_suspended--; 2702 mutex_exit(&spa->spa_async_lock); 2703} 2704 2705static void 2706spa_async_dispatch(spa_t *spa) 2707{ 2708 mutex_enter(&spa->spa_async_lock); 2709 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2710 spa->spa_async_thread == NULL && 2711 rootdir != NULL && !vn_is_readonly(rootdir)) 2712 spa->spa_async_thread = thread_create(NULL, 0, 2713 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2714 mutex_exit(&spa->spa_async_lock); 2715} 2716 2717void 2718spa_async_request(spa_t *spa, int task) 2719{ 2720 mutex_enter(&spa->spa_async_lock); 2721 spa->spa_async_tasks |= task; 2722 mutex_exit(&spa->spa_async_lock); 2723} 2724 2725/* 2726 * ========================================================================== 2727 * SPA syncing routines 2728 * ========================================================================== 2729 */ 2730 2731static void 2732spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2733{ 2734 bplist_t *bpl = &spa->spa_sync_bplist; 2735 dmu_tx_t *tx; 2736 blkptr_t blk; 2737 uint64_t itor = 0; 2738 zio_t *zio; 2739 int error; 2740 uint8_t c = 1; 2741 2742 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2743 2744 while (bplist_iterate(bpl, &itor, &blk) == 0) 2745 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2746 2747 error = zio_wait(zio); 2748 ASSERT3U(error, ==, 0); 2749 2750 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2751 bplist_vacate(bpl, tx); 2752 2753 /* 2754 * Pre-dirty the first block so we sync to convergence faster. 2755 * (Usually only the first block is needed.) 2756 */ 2757 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2758 dmu_tx_commit(tx); 2759} 2760 2761static void 2762spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2763{ 2764 char *packed = NULL; 2765 size_t nvsize = 0; 2766 dmu_buf_t *db; 2767 2768 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2769 2770 packed = kmem_alloc(nvsize, KM_SLEEP); 2771 2772 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2773 KM_SLEEP) == 0); 2774 2775 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2776 2777 kmem_free(packed, nvsize); 2778 2779 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2780 dmu_buf_will_dirty(db, tx); 2781 *(uint64_t *)db->db_data = nvsize; 2782 dmu_buf_rele(db, FTAG); 2783} 2784 2785static void 2786spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2787{ 2788 nvlist_t *nvroot; 2789 nvlist_t **spares; 2790 int i; 2791 2792 if (!spa->spa_sync_spares) 2793 return; 2794 2795 /* 2796 * Update the MOS nvlist describing the list of available spares. 2797 * spa_validate_spares() will have already made sure this nvlist is 2798 * valid and the vdevs are labelled appropriately. 
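 *
 * The list is stored as a packed nvlist in a DMU_OT_PACKED_NVLIST object;
 * the first time through we create that object and record its number
 * under DMU_POOL_SPARES in the pool directory.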
2799 */ 2800 if (spa->spa_spares_object == 0) { 2801 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2802 DMU_OT_PACKED_NVLIST, 1 << 14, 2803 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2804 VERIFY(zap_update(spa->spa_meta_objset, 2805 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2806 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2807 } 2808 2809 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2810 if (spa->spa_nspares == 0) { 2811 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2812 NULL, 0) == 0); 2813 } else { 2814 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2815 KM_SLEEP); 2816 for (i = 0; i < spa->spa_nspares; i++) 2817 spares[i] = vdev_config_generate(spa, 2818 spa->spa_spares[i], B_FALSE, B_TRUE); 2819 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2820 spares, spa->spa_nspares) == 0); 2821 for (i = 0; i < spa->spa_nspares; i++) 2822 nvlist_free(spares[i]); 2823 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2824 } 2825 2826 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2827 nvlist_free(nvroot); 2828 2829 spa->spa_sync_spares = B_FALSE; 2830} 2831 2832static void 2833spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2834{ 2835 nvlist_t *config; 2836 2837 if (list_is_empty(&spa->spa_dirty_list)) 2838 return; 2839 2840 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2841 2842 if (spa->spa_config_syncing) 2843 nvlist_free(spa->spa_config_syncing); 2844 spa->spa_config_syncing = config; 2845 2846 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2847} 2848 2849static void 2850spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2851{ 2852 spa_t *spa = arg1; 2853 nvlist_t *nvp = arg2; 2854 nvpair_t *nvpair; 2855 objset_t *mos = spa->spa_meta_objset; 2856 uint64_t zapobj; 2857 2858 mutex_enter(&spa->spa_props_lock); 2859 if (spa->spa_pool_props_object == 0) { 2860 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2861 VERIFY(zapobj > 0); 2862 2863 spa->spa_pool_props_object = zapobj; 2864 2865 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2866 DMU_POOL_PROPS, 8, 1, 2867 &spa->spa_pool_props_object, tx) == 0); 2868 } 2869 mutex_exit(&spa->spa_props_lock); 2870 2871 nvpair = NULL; 2872 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2873 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2874 case ZFS_PROP_BOOTFS: 2875 VERIFY(nvlist_lookup_uint64(nvp, 2876 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2877 VERIFY(zap_update(mos, 2878 spa->spa_pool_props_object, 2879 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2880 &spa->spa_bootfs, tx) == 0); 2881 break; 2882 } 2883 } 2884} 2885 2886/* 2887 * Sync the specified transaction group. New blocks may be dirtied as 2888 * part of the process, so we iterate until it converges. 2889 */ 2890void 2891spa_sync(spa_t *spa, uint64_t txg) 2892{ 2893 dsl_pool_t *dp = spa->spa_dsl_pool; 2894 objset_t *mos = spa->spa_meta_objset; 2895 bplist_t *bpl = &spa->spa_sync_bplist; 2896 vdev_t *rvd = spa->spa_root_vdev; 2897 vdev_t *vd; 2898 dmu_tx_t *tx; 2899 int dirty_vdevs; 2900 2901 /* 2902 * Lock out configuration changes. 2903 */ 2904 spa_config_enter(spa, RW_READER, FTAG); 2905 2906 spa->spa_syncing_txg = txg; 2907 spa->spa_sync_pass = 0; 2908 2909 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2910 2911 tx = dmu_tx_create_assigned(dp, txg); 2912 2913 /* 2914 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2915 * set spa_deflate if we have no raid-z vdevs. 
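 *
 * We only flip spa_deflate if every top-level vdev reports a deflate
 * ratio of SPA_MINBLOCKSIZE; the decision is then recorded once under
 * DMU_POOL_DEFLATE in the pool directory.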
2916 */ 2917 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2918 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2919 int i; 2920 2921 for (i = 0; i < rvd->vdev_children; i++) { 2922 vd = rvd->vdev_child[i]; 2923 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2924 break; 2925 } 2926 if (i == rvd->vdev_children) { 2927 spa->spa_deflate = TRUE; 2928 VERIFY(0 == zap_add(spa->spa_meta_objset, 2929 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2930 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2931 } 2932 } 2933 2934 /* 2935 * If anything has changed in this txg, push the deferred frees 2936 * from the previous txg. If not, leave them alone so that we 2937 * don't generate work on an otherwise idle system. 2938 */ 2939 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2940 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2941 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2942 spa_sync_deferred_frees(spa, txg); 2943 2944 /* 2945 * Iterate to convergence. 2946 */ 2947 do { 2948 spa->spa_sync_pass++; 2949 2950 spa_sync_config_object(spa, tx); 2951 spa_sync_spares(spa, tx); 2952 spa_errlog_sync(spa, txg); 2953 dsl_pool_sync(dp, txg); 2954 2955 dirty_vdevs = 0; 2956 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2957 vdev_sync(vd, txg); 2958 dirty_vdevs++; 2959 } 2960 2961 bplist_sync(bpl, tx); 2962 } while (dirty_vdevs); 2963 2964 bplist_close(bpl); 2965 2966 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2967 2968 /* 2969 * Rewrite the vdev configuration (which includes the uberblock) 2970 * to commit the transaction group. 2971 * 2972 * If there are any dirty vdevs, sync the uberblock to all vdevs. 2973 * Otherwise, pick a random top-level vdev that's known to be 2974 * visible in the config cache (see spa_vdev_add() for details). 2975 * If the write fails, try the next vdev until we've tried them all. 2976 */ 2977 if (!list_is_empty(&spa->spa_dirty_list)) { 2978 VERIFY(vdev_config_sync(rvd, txg) == 0); 2979 } else { 2980 int children = rvd->vdev_children; 2981 int c0 = spa_get_random(children); 2982 int c; 2983 2984 for (c = 0; c < children; c++) { 2985 vd = rvd->vdev_child[(c0 + c) % children]; 2986 if (vd->vdev_ms_array == 0) 2987 continue; 2988 if (vdev_config_sync(vd, txg) == 0) 2989 break; 2990 } 2991 if (c == children) 2992 VERIFY(vdev_config_sync(rvd, txg) == 0); 2993 } 2994 2995 dmu_tx_commit(tx); 2996 2997 /* 2998 * Clear the dirty config list. 2999 */ 3000 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3001 vdev_config_clean(vd); 3002 3003 /* 3004 * Now that the new config has synced transactionally, 3005 * let it become visible to the config cache. 3006 */ 3007 if (spa->spa_config_syncing != NULL) { 3008 spa_config_set(spa, spa->spa_config_syncing); 3009 spa->spa_config_txg = txg; 3010 spa->spa_config_syncing = NULL; 3011 } 3012 3013 /* 3014 * Make a stable copy of the fully synced uberblock. 3015 * We use this as the root for pool traversals. 3016 */ 3017 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3018 3019 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3020 3021 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3022 spa->spa_traverse_wanted = 0; 3023 spa->spa_ubsync = spa->spa_uberblock; 3024 rw_exit(&spa->spa_traverse_lock); 3025 3026 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3027 3028 /* 3029 * Clean up the ZIL records for the synced txg. 3030 */ 3031 dsl_pool_zil_clean(dp); 3032 3033 /* 3034 * Update usable space statistics.
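 *
 * Each vdev on this txg's TXG_CLEAN list gets vdev_sync_done() now that
 * the new uberblock and config are safely on disk.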
3035 */ 3036 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3037 vdev_sync_done(vd, txg); 3038 3039 /* 3040 * It had better be the case that we didn't dirty anything 3041 * since vdev_config_sync(). 3042 */ 3043 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3044 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3045 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3046 ASSERT(bpl->bpl_queue == NULL); 3047 3048 spa_config_exit(spa, FTAG); 3049 3050 /* 3051 * If any async tasks have been requested, kick them off. 3052 */ 3053 spa_async_dispatch(spa); 3054} 3055 3056/* 3057 * Sync all pools. We don't want to hold the namespace lock across these 3058 * operations, so we take a reference on the spa_t and drop the lock during the 3059 * sync. 3060 */ 3061void 3062spa_sync_allpools(void) 3063{ 3064 spa_t *spa = NULL; 3065 mutex_enter(&spa_namespace_lock); 3066 while ((spa = spa_next(spa)) != NULL) { 3067 if (spa_state(spa) != POOL_STATE_ACTIVE) 3068 continue; 3069 spa_open_ref(spa, FTAG); 3070 mutex_exit(&spa_namespace_lock); 3071 txg_wait_synced(spa_get_dsl(spa), 0); 3072 mutex_enter(&spa_namespace_lock); 3073 spa_close(spa, FTAG); 3074 } 3075 mutex_exit(&spa_namespace_lock); 3076} 3077 3078/* 3079 * ========================================================================== 3080 * Miscellaneous routines 3081 * ========================================================================== 3082 */ 3083 3084/* 3085 * Remove all pools in the system. 3086 */ 3087void 3088spa_evict_all(void) 3089{ 3090 spa_t *spa; 3091 3092 /* 3093 * Remove all cached state. All pools should be closed now, 3094 * so every spa in the AVL tree should be unreferenced. 3095 */ 3096 mutex_enter(&spa_namespace_lock); 3097 while ((spa = spa_next(NULL)) != NULL) { 3098 /* 3099 * Stop async tasks. The async thread may need to detach 3100 * a device that's been replaced, which requires grabbing 3101 * spa_namespace_lock, so we must drop it here. 3102 */ 3103 spa_open_ref(spa, FTAG); 3104 mutex_exit(&spa_namespace_lock); 3105 spa_async_suspend(spa); 3106 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3107 mutex_enter(&spa_namespace_lock); 3108 spa_close(spa, FTAG); 3109 3110 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3111 spa_unload(spa); 3112 spa_deactivate(spa); 3113 } 3114 spa_remove(spa); 3115 } 3116 mutex_exit(&spa_namespace_lock); 3117} 3118 3119vdev_t * 3120spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3121{ 3122 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3123} 3124 3125void 3126spa_upgrade(spa_t *spa) 3127{ 3128 spa_config_enter(spa, RW_WRITER, FTAG); 3129 3130 /* 3131 * This should only be called for a non-faulted pool, and since a 3132 * future version would result in an unopenable pool, this shouldn't be 3133 * possible. 
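 *
 * Bumping ub_version and dirtying the root vdev config is all that's
 * required; the txg_wait_synced() below ensures the new version is on
 * disk before we return.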
3134 */ 3135 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3136 3137 spa->spa_uberblock.ub_version = ZFS_VERSION; 3138 vdev_config_dirty(spa->spa_root_vdev); 3139 3140 spa_config_exit(spa, FTAG); 3141 3142 txg_wait_synced(spa_get_dsl(spa), 0); 3143} 3144 3145boolean_t 3146spa_has_spare(spa_t *spa, uint64_t guid) 3147{ 3148 int i; 3149 uint64_t spareguid; 3150 3151 for (i = 0; i < spa->spa_nspares; i++) 3152 if (spa->spa_spares[i]->vdev_guid == guid) 3153 return (B_TRUE); 3154 3155 for (i = 0; i < spa->spa_pending_nspares; i++) { 3156 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3157 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3158 spareguid == guid) 3159 return (B_TRUE); 3160 } 3161 3162 return (B_FALSE); 3163} 3164 3165int 3166spa_set_props(spa_t *spa, nvlist_t *nvp) 3167{ 3168 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3169 spa, nvp, 3)); 3170} 3171 3172int 3173spa_get_props(spa_t *spa, nvlist_t **nvp) 3174{ 3175 zap_cursor_t zc; 3176 zap_attribute_t za; 3177 objset_t *mos = spa->spa_meta_objset; 3178 zfs_source_t src; 3179 zfs_prop_t prop; 3180 nvlist_t *propval; 3181 uint64_t value; 3182 int err; 3183 3184 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3185 3186 mutex_enter(&spa->spa_props_lock); 3187 /* If no props object, then just return empty nvlist */ 3188 if (spa->spa_pool_props_object == 0) { 3189 mutex_exit(&spa->spa_props_lock); 3190 return (0); 3191 } 3192 3193 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3194 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3195 zap_cursor_advance(&zc)) { 3196 3197 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3198 continue; 3199 3200 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3201 switch (za.za_integer_length) { 3202 case 8: 3203 if (zfs_prop_default_numeric(prop) == 3204 za.za_first_integer) 3205 src = ZFS_SRC_DEFAULT; 3206 else 3207 src = ZFS_SRC_LOCAL; 3208 value = za.za_first_integer; 3209 3210 if (prop == ZFS_PROP_BOOTFS) { 3211 dsl_pool_t *dp; 3212 dsl_dataset_t *ds = NULL; 3213 char strval[MAXPATHLEN]; 3214 3215 dp = spa_get_dsl(spa); 3216 rw_enter(&dp->dp_config_rwlock, RW_READER); 3217 if ((err = dsl_dataset_open_obj(dp, 3218 za.za_first_integer, NULL, DS_MODE_NONE, 3219 FTAG, &ds)) != 0) { 3220 rw_exit(&dp->dp_config_rwlock); 3221 break; 3222 } 3223 dsl_dataset_name(ds, strval); 3224 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3225 rw_exit(&dp->dp_config_rwlock); 3226 3227 VERIFY(nvlist_add_uint64(propval, 3228 ZFS_PROP_SOURCE, src) == 0); 3229 VERIFY(nvlist_add_string(propval, 3230 ZFS_PROP_VALUE, strval) == 0); 3231 } else { 3232 VERIFY(nvlist_add_uint64(propval, 3233 ZFS_PROP_SOURCE, src) == 0); 3234 VERIFY(nvlist_add_uint64(propval, 3235 ZFS_PROP_VALUE, value) == 0); 3236 } 3237 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3238 propval) == 0); 3239 break; 3240 } 3241 nvlist_free(propval); 3242 } 3243 zap_cursor_fini(&zc); 3244 mutex_exit(&spa->spa_props_lock); 3245 if (err && err != ENOENT) { 3246 nvlist_free(*nvp); 3247 return (err); 3248 } 3249 3250 return (0); 3251} 3252 3253/* 3254 * If the bootfs property value is dsobj, clear it. 3255 */ 3256void 3257spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3258{ 3259 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3260 VERIFY(zap_remove(spa->spa_meta_objset, 3261 spa->spa_pool_props_object, 3262 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3263 spa->spa_bootfs = 0; 3264 } 3265} 3266