spa.c revision 168821
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

int zio_taskq_threads = 0;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW,
    &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type");


/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
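/*
 * Usage sketch (editorial, not part of the original source): a caller
 * such as spa_errlog_sync() consumes the swapped-out trees roughly as
 * follows, taking spa_errlist_lock only around the swap itself:
 *
 *	avl_tree_t last, scrub;
 *
 *	mutex_enter(&spa->spa_errlist_lock);
 *	spa_get_errlists(spa, &last, &scrub);
 *	mutex_exit(&spa->spa_errlist_lock);
 *	(process both trees, then avl_destroy() each)
 */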
/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;
	int nthreads = zio_taskq_threads;
	char name[32];

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	if (nthreads == 0)
		nthreads = max_ncpus;
	for (t = 0; t < ZIO_TYPES; t++) {
		snprintf(name, sizeof(name), "spa_zio_issue %d", t);
		spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads,
		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
		snprintf(name, sizeof(name), "spa_zio_intr %d", t);
		spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads,
		    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	rw_destroy(&spa->spa_traverse_lock);
	mutex_destroy(&spa->spa_uberblock_lock);
	mutex_destroy(&spa->spa_errlog_lock);
	mutex_destroy(&spa->spa_errlist_lock);
	mutex_destroy(&spa->spa_config_lock.scl_lock);
	cv_destroy(&spa->spa_config_lock.scl_cv);
	mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
	mutex_destroy(&spa->spa_history_lock);
	mutex_destroy(&spa->spa_props_lock);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}
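/*
 * (Editorial note) spa_activate() sizes the per-ZIO-type taskqs from the
 * vfs.zfs.zio.taskq_threads tunable declared above, falling back to one
 * thread per CPU (max_ncpus) when it is left at 0.  On FreeBSD this could
 * presumably be set at boot via loader.conf:
 *
 *	vfs.zfs.zio.taskq_threads="8"
 *
 * spa_deactivate() must stay strictly symmetric with spa_activate():
 * every taskq, lock, list, and AVL tree created there is torn down again.
 */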
/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each
 * vdev in the CLOSED state. This will prep the pool before
 * open/creation/import. All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}
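/*
 * Illustration (editorial, abridged): the configs consumed by
 * spa_config_parse() above and spa_load_spares() below are nvlists of
 * roughly this shape, where each child is itself a complete vdev config:
 *
 *	vdev_tree = {
 *		type="root"
 *		children[] = [
 *			{ type="mirror", children[] = [
 *				{ type="disk", path=..., guid=... },
 *				{ type="disk", path=..., guid=... } ] }
 *		]
 *	}
 *	spares[] = [ { type="disk", path=..., guid=... }, ... ]
 */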
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;

	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
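/*
 * (Editorial note) load_nvlist() reads the standard on-disk packed-nvlist
 * representation: a DMU_OT_PACKED_NVLIST object whose data is the packed
 * nvlist and whose bonus buffer holds a single uint64_t giving the packed
 * size -- the same layout spa_create() sets up for the pool config object
 * below.
 */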
/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs. We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}
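	/*
	 * (Editorial note) vdev_guid_sum is the 64-bit arithmetic sum of the
	 * guids of every vdev in the tree, so a mismatch with ub_guid_sum
	 * usually means a device is missing -- e.g. importing a two-disk
	 * stripe with only one disk present.
	 */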
	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		/*
		 * hostid is set after the root file system is mounted, so
		 * ignore the check until it's done.
		 */
		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0 && root_mounted()) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if ((unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation). If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log. If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
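	/*
	 * (Editorial note) The pattern above -- tolerating ENOENT but
	 * treating any other error as corruption -- recurs for every pool
	 * object introduced after the initial on-disk version; older pools
	 * simply lack the ZAP entry, which is not corruption.
	 */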
	/*
	 * Load the history object. If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;
		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}
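/*
 * (Editorial summary) spa_load() returns 0 on success.  EBADF means
 * vdev_validate() found labels claiming the pool was exported or
 * destroyed -- spa_open_common() below reacts by dropping the stale
 * cache entry -- while other errors leave the vdev state set so the
 * failure can be reported to the user.
 */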
/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again. The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}
/*
 * Validate that the 'spares' array is well formed. We must have an array of
 * nvlists, each of which describes a valid leaf vdev. If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}
	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object. Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
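/*
 * (Editorial note) Contrast the mosconfig argument here with
 * spa_open_common(): an open passes B_FALSE, so spa_load() re-reads the
 * authoritative config from the MOS and restarts itself with it, whereas
 * an import passes B_TRUE because the label-derived config supplied by
 * userland is the one to trust.
 */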
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"
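/*
 * (Editorial note) The tryimport path below backs 'zpool import'
 * discovery: userland assembles a candidate config from device labels
 * and calls down here to have it validated and fleshed out with status,
 * without the pool ever appearing in the namespace under its real name.
 */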
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references. If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty. spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}
/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares;
	uint_t i, nspares;

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		nspares = 0;

	if (vd->vdev_children == 0 && nspares == 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, EINVAL));
	}

	if (vd->vdev_children != 0) {
		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
			spa->spa_pending_vdev = NULL;
			return (spa_vdev_exit(spa, vd, txg, error));
		}
	}

	/*
	 * We must validate the spares after checking the children. Otherwise,
	 * vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) != 0) {
		spa->spa_pending_vdev = NULL;
		return (spa_vdev_exit(spa, vd, txg, error));
	}

	spa->spa_pending_vdev = NULL;

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		if (spa->spa_sparelist != NULL) {
			nvlist_t **oldspares;
			uint_t oldnspares;
			nvlist_t **newspares;

			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

			newspares = kmem_alloc(sizeof (void *) *
			    (nspares + oldnspares), KM_SLEEP);
			for (i = 0; i < oldnspares; i++)
				VERIFY(nvlist_dup(oldspares[i],
				    &newspares[i], KM_SLEEP) == 0);
			for (i = 0; i < nspares; i++)
				VERIFY(nvlist_dup(spares[i],
				    &newspares[i + oldnspares],
				    KM_SLEEP) == 0);

			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, newspares,
			    nspares + oldnspares) == 0);
			for (i = 0; i < oldnspares + nspares; i++)
				nvlist_free(newspares[i]);
			kmem_free(newspares, (oldnspares + nspares) *
			    sizeof (void *));
		} else {
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		}

		spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
	}
	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate. Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror. The arguments are the path to any device
 * in the mirror, and the nvroot for the new device. If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
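/*
 * Illustration (editorial): attaching device B next to a plain disk A
 * inserts an implicit mirror, while a replacing attach inserts the
 * special 'replacing' vdev instead:
 *
 *	attach:   root -> A   becomes   root -> mirror(A, B)
 *	replace:  root -> A   becomes   root -> replacing(A, B)
 *
 * In the replacing case, A is detached automatically once B has
 * finished resilvering.
 */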
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the
		 * root vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    pvd->vdev_child[1] == oldvd &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare. Otherwise, we
		 * want to create a replacing vdev. The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		else if (pvd->vdev_ops != &vdev_spare_ops &&
		    newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	if (newvd->vdev_isspare)
		spa_spare_activate(newvd);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}
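/*
 * (Editorial note) Detach is also how an in-progress replace or spare-in
 * is cancelled: per the rules below, the second child of a
 * replacing/spare vdev may always be detached, which aborts the
 * operation, while detaching the first child completes it.
 */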
/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev. For the 'spare' vdev, either
	 * disk can be removed.
	 */
	if (replace_done) {
		if (pvd->vdev_ops == &vdev_replacing_ops) {
			if (vd->vdev_id != 0)
				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		} else if (pvd->vdev_ops != &vdev_spare_ops) {
			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
		}
	}

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= ZFS_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}

	/*
	 * If we are a replacing or spare vdev, then we can always detach the
	 * latter child, as that is how one cancels the operation.
	 */
	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
	    c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);
1965 */ 1966 cvd = pvd->vdev_child[0]; 1967 1968 /* 1969 * If we need to remove the remaining child from the list of hot spares, 1970 * do it now, marking the vdev as no longer a spare in the process. We 1971 * must do this before vdev_remove_parent(), because that can change the 1972 * GUID if it creates a new top-level vdev. 1973 */ 1974 if (unspare) { 1975 ASSERT(cvd->vdev_isspare); 1976 spa_spare_remove(cvd); 1977 unspare_guid = cvd->vdev_guid; 1978 } 1979 1980 /* 1981 * If the parent mirror/replacing vdev only has one child, 1982 * the parent is no longer needed. Remove it from the tree. 1983 */ 1984 if (pvd->vdev_children == 1) 1985 vdev_remove_parent(cvd); 1986 1987 /* 1988 * We don't set tvd until now because the parent we just removed 1989 * may have been the previous top-level vdev. 1990 */ 1991 tvd = cvd->vdev_top; 1992 ASSERT(tvd->vdev_parent == rvd); 1993 1994 /* 1995 * Reevaluate the parent vdev state. 1996 */ 1997 vdev_propagate_state(cvd->vdev_parent); 1998 1999 /* 2000 * If the device we just detached was smaller than the others, it may be 2001 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2002 * can't fail because the existing metaslabs are already in core, so 2003 * there's nothing to read from disk. 2004 */ 2005 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2006 2007 vdev_config_dirty(tvd); 2008 2009 /* 2010 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2011 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2012 * But first make sure we're not on any *other* txg's DTL list, to 2013 * prevent vd from being accessed after it's freed. 2014 */ 2015 for (t = 0; t < TXG_SIZE; t++) 2016 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2017 vd->vdev_detached = B_TRUE; 2018 vdev_dirty(tvd, VDD_DTL, vd, txg); 2019 2020 error = spa_vdev_exit(spa, vd, txg, 0); 2021 2022 /* 2023 * If this was the removal of the original device in a hot spare vdev, 2024 * then we want to go through and remove the device from the hot spare 2025 * list of every other pool. 2026 */ 2027 if (unspare) { 2028 spa = NULL; 2029 mutex_enter(&spa_namespace_lock); 2030 while ((spa = spa_next(spa)) != NULL) { 2031 if (spa->spa_state != POOL_STATE_ACTIVE) 2032 continue; 2033 2034 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2035 } 2036 mutex_exit(&spa_namespace_lock); 2037 } 2038 2039 return (error); 2040} 2041 2042/* 2043 * Remove a device from the pool. Currently, this supports removing only hot 2044 * spares. 2045 */ 2046int 2047spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2048{ 2049 vdev_t *vd; 2050 nvlist_t **spares, *nv, **newspares; 2051 uint_t i, j, nspares; 2052 int ret = 0; 2053 2054 spa_config_enter(spa, RW_WRITER, FTAG); 2055 2056 vd = spa_lookup_by_guid(spa, guid); 2057 2058 nv = NULL; 2059 if (spa->spa_spares != NULL && 2060 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2061 &spares, &nspares) == 0) { 2062 for (i = 0; i < nspares; i++) { 2063 uint64_t theguid; 2064 2065 VERIFY(nvlist_lookup_uint64(spares[i], 2066 ZPOOL_CONFIG_GUID, &theguid) == 0); 2067 if (theguid == guid) { 2068 nv = spares[i]; 2069 break; 2070 } 2071 } 2072 } 2073 2074 /* 2075 * We only support removing a hot spare, and only if it's not currently 2076 * in use in this pool.
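 * The guid may name nothing at all (ENOENT), an ordinary vdev rather
 * than a spare (ENOTSUP), or a spare that is still attached when we
 * were not asked to unspare it (EBUSY); the checks below handle each
 * case in turn.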
2077 */ 2078 if (nv == NULL && vd == NULL) { 2079 ret = ENOENT; 2080 goto out; 2081 } 2082 2083 if (nv == NULL && vd != NULL) { 2084 ret = ENOTSUP; 2085 goto out; 2086 } 2087 2088 if (!unspare && nv != NULL && vd != NULL) { 2089 ret = EBUSY; 2090 goto out; 2091 } 2092 2093 if (nspares == 1) { 2094 newspares = NULL; 2095 } else { 2096 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2097 KM_SLEEP); 2098 for (i = 0, j = 0; i < nspares; i++) { 2099 if (spares[i] != nv) 2100 VERIFY(nvlist_dup(spares[i], 2101 &newspares[j++], KM_SLEEP) == 0); 2102 } 2103 } 2104 2105 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2106 DATA_TYPE_NVLIST_ARRAY) == 0); 2107 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2108 newspares, nspares - 1) == 0); 2109 for (i = 0; i < nspares - 1; i++) 2110 nvlist_free(newspares[i]); 2111 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2112 spa_load_spares(spa); 2113 spa->spa_sync_spares = B_TRUE; 2114 2115out: 2116 spa_config_exit(spa, FTAG); 2117 2118 return (ret); 2119} 2120 2121/* 2122 * Find any device that's done replacing, so we can detach it. 2123 */ 2124static vdev_t * 2125spa_vdev_replace_done_hunt(vdev_t *vd) 2126{ 2127 vdev_t *newvd, *oldvd; 2128 int c; 2129 2130 for (c = 0; c < vd->vdev_children; c++) { 2131 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2132 if (oldvd != NULL) 2133 return (oldvd); 2134 } 2135 2136 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2137 oldvd = vd->vdev_child[0]; 2138 newvd = vd->vdev_child[1]; 2139 2140 mutex_enter(&newvd->vdev_dtl_lock); 2141 if (newvd->vdev_dtl_map.sm_space == 0 && 2142 newvd->vdev_dtl_scrub.sm_space == 0) { 2143 mutex_exit(&newvd->vdev_dtl_lock); 2144 return (oldvd); 2145 } 2146 mutex_exit(&newvd->vdev_dtl_lock); 2147 } 2148 2149 return (NULL); 2150} 2151 2152static void 2153spa_vdev_replace_done(spa_t *spa) 2154{ 2155 vdev_t *vd; 2156 vdev_t *pvd; 2157 uint64_t guid; 2158 uint64_t pguid = 0; 2159 2160 spa_config_enter(spa, RW_READER, FTAG); 2161 2162 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2163 guid = vd->vdev_guid; 2164 /* 2165 * If we have just finished replacing a hot spared device, then 2166 * we need to detach the parent's first child (the original hot 2167 * spare) as well. 2168 */ 2169 pvd = vd->vdev_parent; 2170 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2171 pvd->vdev_id == 0) { 2172 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2173 ASSERT(pvd->vdev_parent->vdev_children == 2); 2174 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2175 } 2176 spa_config_exit(spa, FTAG); 2177 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2178 return; 2179 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2180 return; 2181 spa_config_enter(spa, RW_READER, FTAG); 2182 } 2183 2184 spa_config_exit(spa, FTAG); 2185} 2186 2187/* 2188 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2189 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2190 */ 2191int 2192spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2193{ 2194 vdev_t *rvd, *vd; 2195 uint64_t txg; 2196 2197 rvd = spa->spa_root_vdev; 2198 2199 txg = spa_vdev_enter(spa); 2200 2201 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2202 /* 2203 * Determine if this is a reference to a hot spare. In that 2204 * case, update the path as stored in the spare list. 
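 * (An inactive spare has no vdev_t in this pool's tree, so
 * vdev_lookup_by_guid() cannot find it; its only record is the entry
 * in spa_sparelist.)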
2205 */ 2206 nvlist_t **spares; 2207 uint_t i, nspares; 2208 if (spa->spa_sparelist != NULL) { 2209 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2210 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2211 for (i = 0; i < nspares; i++) { 2212 uint64_t theguid; 2213 VERIFY(nvlist_lookup_uint64(spares[i], 2214 ZPOOL_CONFIG_GUID, &theguid) == 0); 2215 if (theguid == guid) 2216 break; 2217 } 2218 2219 if (i == nspares) 2220 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2221 2222 VERIFY(nvlist_add_string(spares[i], 2223 ZPOOL_CONFIG_PATH, newpath) == 0); 2224 spa_load_spares(spa); 2225 spa->spa_sync_spares = B_TRUE; 2226 return (spa_vdev_exit(spa, NULL, txg, 0)); 2227 } else { 2228 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2229 } 2230 } 2231 2232 if (!vd->vdev_ops->vdev_op_leaf) 2233 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2234 2235 spa_strfree(vd->vdev_path); 2236 vd->vdev_path = spa_strdup(newpath); 2237 2238 vdev_config_dirty(vd->vdev_top); 2239 2240 return (spa_vdev_exit(spa, NULL, txg, 0)); 2241} 2242 2243/* 2244 * ========================================================================== 2245 * SPA Scrubbing 2246 * ========================================================================== 2247 */ 2248 2249static void 2250spa_scrub_io_done(zio_t *zio) 2251{ 2252 spa_t *spa = zio->io_spa; 2253 2254 zio_data_buf_free(zio->io_data, zio->io_size); 2255 2256 mutex_enter(&spa->spa_scrub_lock); 2257 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2258 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2259 spa->spa_scrub_errors++; 2260 mutex_enter(&vd->vdev_stat_lock); 2261 vd->vdev_stat.vs_scrub_errors++; 2262 mutex_exit(&vd->vdev_stat_lock); 2263 } 2264 2265 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2266 cv_broadcast(&spa->spa_scrub_io_cv); 2267 2268 ASSERT(spa->spa_scrub_inflight >= 0); 2269 2270 mutex_exit(&spa->spa_scrub_lock); 2271} 2272 2273static void 2274spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2275 zbookmark_t *zb) 2276{ 2277 size_t size = BP_GET_LSIZE(bp); 2278 void *data; 2279 2280 mutex_enter(&spa->spa_scrub_lock); 2281 /* 2282 * Do not give too much work to vdev(s). 2283 */ 2284 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2285 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2286 } 2287 spa->spa_scrub_inflight++; 2288 mutex_exit(&spa->spa_scrub_lock); 2289 2290 data = zio_data_buf_alloc(size); 2291 2292 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2293 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2294 2295 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2296 2297 zio_nowait(zio_read(NULL, spa, bp, data, size, 2298 spa_scrub_io_done, NULL, priority, flags, zb)); 2299} 2300 2301/* ARGSUSED */ 2302static int 2303spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2304{ 2305 blkptr_t *bp = &bc->bc_blkptr; 2306 vdev_t *vd = spa->spa_root_vdev; 2307 dva_t *dva = bp->blk_dva; 2308 int needs_resilver = B_FALSE; 2309 int d; 2310 2311 if (bc->bc_errno) { 2312 /* 2313 * We can't scrub this block, but we can continue to scrub 2314 * the rest of the pool. Note the error and move along. 
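 * The error is charged both pool-wide and to the root vdev's scrub
 * stats below, much as spa_scrub_io_done() does for reads that fail
 * once they are in flight.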
2315 */ 2316 mutex_enter(&spa->spa_scrub_lock); 2317 spa->spa_scrub_errors++; 2318 mutex_exit(&spa->spa_scrub_lock); 2319 2320 mutex_enter(&vd->vdev_stat_lock); 2321 vd->vdev_stat.vs_scrub_errors++; 2322 mutex_exit(&vd->vdev_stat_lock); 2323 2324 return (ERESTART); 2325 } 2326 2327 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2328 2329 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2330 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2331 2332 ASSERT(vd != NULL); 2333 2334 /* 2335 * Keep track of how much data we've examined so that 2336 * zpool(1M) status can make useful progress reports. 2337 */ 2338 mutex_enter(&vd->vdev_stat_lock); 2339 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2340 mutex_exit(&vd->vdev_stat_lock); 2341 2342 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2343 if (DVA_GET_GANG(&dva[d])) { 2344 /* 2345 * Gang members may be spread across multiple 2346 * vdevs, so the best we can do is look at the 2347 * pool-wide DTL. 2348 * XXX -- it would be better to change our 2349 * allocation policy to ensure that this can't 2350 * happen. 2351 */ 2352 vd = spa->spa_root_vdev; 2353 } 2354 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2355 bp->blk_birth, 1)) 2356 needs_resilver = B_TRUE; 2357 } 2358 } 2359 2360 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2361 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2362 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2363 else if (needs_resilver) 2364 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2365 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2366 2367 return (0); 2368} 2369 2370static void 2371spa_scrub_thread(void *arg) 2372{ 2373 spa_t *spa = arg; 2374 callb_cpr_t cprinfo; 2375 traverse_handle_t *th = spa->spa_scrub_th; 2376 vdev_t *rvd = spa->spa_root_vdev; 2377 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2378 int error = 0; 2379 boolean_t complete; 2380 2381 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2382 2383 /* 2384 * If we're restarting due to a snapshot create/delete, 2385 * wait for that to complete. 2386 */ 2387 txg_wait_synced(spa_get_dsl(spa), 0); 2388 2389 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2390 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2391 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2392 2393 spa_config_enter(spa, RW_WRITER, FTAG); 2394 vdev_reopen(rvd); /* purge all vdev caches */ 2395 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2396 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2397 spa_config_exit(spa, FTAG); 2398 2399 mutex_enter(&spa->spa_scrub_lock); 2400 spa->spa_scrub_errors = 0; 2401 spa->spa_scrub_active = 1; 2402 ASSERT(spa->spa_scrub_inflight == 0); 2403 2404 while (!spa->spa_scrub_stop) { 2405 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2406 while (spa->spa_scrub_suspended) { 2407 spa->spa_scrub_active = 0; 2408 cv_broadcast(&spa->spa_scrub_cv); 2409 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2410 spa->spa_scrub_active = 1; 2411 } 2412 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2413 2414 if (spa->spa_scrub_restart_txg != 0) 2415 break; 2416 2417 mutex_exit(&spa->spa_scrub_lock); 2418 error = traverse_more(th); 2419 mutex_enter(&spa->spa_scrub_lock); 2420 if (error != EAGAIN) 2421 break; 2422 } 2423 2424 while (spa->spa_scrub_inflight) 2425 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2426 2427 spa->spa_scrub_active = 0; 2428 cv_broadcast(&spa->spa_scrub_cv); 2429 2430 mutex_exit(&spa->spa_scrub_lock); 2431 2432 spa_config_enter(spa, RW_WRITER, FTAG); 2433 2434 mutex_enter(&spa->spa_scrub_lock); 2435 2436 /* 2437 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2438 * AND the spa config lock to synchronize with any config changes 2439 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2440 */ 2441 if (spa->spa_scrub_restart_txg != 0) 2442 error = ERESTART; 2443 2444 if (spa->spa_scrub_stop) 2445 error = EINTR; 2446 2447 /* 2448 * Even if there were uncorrectable errors, we consider the scrub 2449 * completed. The downside is that if there is a transient error during 2450 * a resilver, we won't resilver the data properly to the target. But 2451 * if the damage is permanent (more likely) we will resilver forever, 2452 * which isn't really acceptable. Since there is enough information for 2453 * the user to know what has failed and why, this seems like a more 2454 * tractable approach. 2455 */ 2456 complete = (error == 0); 2457 2458 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2459 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2460 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2461 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2462 2463 mutex_exit(&spa->spa_scrub_lock); 2464 2465 /* 2466 * If the scrub/resilver completed, update all DTLs to reflect this. 2467 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2468 */ 2469 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2470 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2471 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2472 spa_errlog_rotate(spa); 2473 2474 spa_config_exit(spa, FTAG); 2475 2476 mutex_enter(&spa->spa_scrub_lock); 2477 2478 /* 2479 * We may have finished replacing a device. 2480 * Let the async thread assess this and handle the detach. 2481 */ 2482 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2483 2484 /* 2485 * If we were told to restart, our final act is to start a new scrub. 2486 */ 2487 if (error == ERESTART) 2488 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2489 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2490 2491 spa->spa_scrub_type = POOL_SCRUB_NONE; 2492 spa->spa_scrub_active = 0; 2493 spa->spa_scrub_thread = NULL; 2494 cv_broadcast(&spa->spa_scrub_cv); 2495 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2496 thread_exit(); 2497} 2498 2499void 2500spa_scrub_suspend(spa_t *spa) 2501{ 2502 mutex_enter(&spa->spa_scrub_lock); 2503 spa->spa_scrub_suspended++; 2504 while (spa->spa_scrub_active) { 2505 cv_broadcast(&spa->spa_scrub_cv); 2506 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2507 } 2508 while (spa->spa_scrub_inflight) 2509 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2510 mutex_exit(&spa->spa_scrub_lock); 2511} 2512 2513void 2514spa_scrub_resume(spa_t *spa) 2515{ 2516 mutex_enter(&spa->spa_scrub_lock); 2517 ASSERT(spa->spa_scrub_suspended != 0); 2518 if (--spa->spa_scrub_suspended == 0) 2519 cv_broadcast(&spa->spa_scrub_cv); 2520 mutex_exit(&spa->spa_scrub_lock); 2521} 2522 2523void 2524spa_scrub_restart(spa_t *spa, uint64_t txg) 2525{ 2526 /* 2527 * Something happened (e.g. snapshot create/delete) that means 2528 * we must restart any in-progress scrubs. The itinerary will 2529 * fix this properly. 2530 */ 2531 mutex_enter(&spa->spa_scrub_lock); 2532 spa->spa_scrub_restart_txg = txg; 2533 mutex_exit(&spa->spa_scrub_lock); 2534} 2535 2536int 2537spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2538{ 2539 space_seg_t *ss; 2540 uint64_t mintxg, maxtxg; 2541 vdev_t *rvd = spa->spa_root_vdev; 2542 2543 if ((uint_t)type >= POOL_SCRUB_TYPES) 2544 return (ENOTSUP); 2545 2546 mutex_enter(&spa->spa_scrub_lock); 2547 2548 /* 2549 * If there's a scrub or resilver already in progress, stop it. 2550 */ 2551 while (spa->spa_scrub_thread != NULL) { 2552 /* 2553 * Don't stop a resilver unless forced. 2554 */ 2555 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2556 mutex_exit(&spa->spa_scrub_lock); 2557 return (EBUSY); 2558 } 2559 spa->spa_scrub_stop = 1; 2560 cv_broadcast(&spa->spa_scrub_cv); 2561 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2562 } 2563 2564 /* 2565 * Terminate the previous traverse. 2566 */ 2567 if (spa->spa_scrub_th != NULL) { 2568 traverse_fini(spa->spa_scrub_th); 2569 spa->spa_scrub_th = NULL; 2570 } 2571 2572 if (rvd == NULL) { 2573 ASSERT(spa->spa_scrub_stop == 0); 2574 ASSERT(spa->spa_scrub_type == type); 2575 ASSERT(spa->spa_scrub_restart_txg == 0); 2576 mutex_exit(&spa->spa_scrub_lock); 2577 return (0); 2578 } 2579 2580 mintxg = TXG_INITIAL - 1; 2581 maxtxg = spa_last_synced_txg(spa) + 1; 2582 2583 mutex_enter(&rvd->vdev_dtl_lock); 2584 2585 if (rvd->vdev_dtl_map.sm_space == 0) { 2586 /* 2587 * The pool-wide DTL is empty. 2588 * If this is a resilver, there's nothing to do except 2589 * check whether any in-progress replacements have completed. 2590 */ 2591 if (type == POOL_SCRUB_RESILVER) { 2592 type = POOL_SCRUB_NONE; 2593 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2594 } 2595 } else { 2596 /* 2597 * The pool-wide DTL is non-empty. 2598 * If this is a normal scrub, upgrade to a resilver instead. 2599 */ 2600 if (type == POOL_SCRUB_EVERYTHING) 2601 type = POOL_SCRUB_RESILVER; 2602 } 2603 2604 if (type == POOL_SCRUB_RESILVER) { 2605 /* 2606 * Determine the resilvering boundaries. 2607 * 2608 * Note: (mintxg, maxtxg) is an open interval, 2609 * i.e. mintxg and maxtxg themselves are not included. 2610 * 2611 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2612 * so we don't claim to resilver a txg that's still changing. 
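 * For example, if the first DTL segment begins at txg 100, mintxg
 * becomes 99, so txg 100 itself lies inside the open interval and is
 * resilvered.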
2613 */ 2614 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2615 mintxg = ss->ss_start - 1; 2616 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2617 maxtxg = MIN(ss->ss_end, maxtxg); 2618 } 2619 2620 mutex_exit(&rvd->vdev_dtl_lock); 2621 2622 spa->spa_scrub_stop = 0; 2623 spa->spa_scrub_type = type; 2624 spa->spa_scrub_restart_txg = 0; 2625 2626 if (type != POOL_SCRUB_NONE) { 2627 spa->spa_scrub_mintxg = mintxg; 2628 spa->spa_scrub_maxtxg = maxtxg; 2629 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2630 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2631 ZIO_FLAG_CANFAIL); 2632 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2633 spa->spa_scrub_thread = thread_create(NULL, 0, 2634 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2635 } 2636 2637 mutex_exit(&spa->spa_scrub_lock); 2638 2639 return (0); 2640} 2641 2642/* 2643 * ========================================================================== 2644 * SPA async task processing 2645 * ========================================================================== 2646 */ 2647 2648static void 2649spa_async_reopen(spa_t *spa) 2650{ 2651 vdev_t *rvd = spa->spa_root_vdev; 2652 vdev_t *tvd; 2653 int c; 2654 2655 spa_config_enter(spa, RW_WRITER, FTAG); 2656 2657 for (c = 0; c < rvd->vdev_children; c++) { 2658 tvd = rvd->vdev_child[c]; 2659 if (tvd->vdev_reopen_wanted) { 2660 tvd->vdev_reopen_wanted = 0; 2661 vdev_reopen(tvd); 2662 } 2663 } 2664 2665 spa_config_exit(spa, FTAG); 2666} 2667 2668static void 2669spa_async_thread(void *arg) 2670{ 2671 spa_t *spa = arg; 2672 int tasks; 2673 2674 ASSERT(spa->spa_sync_on); 2675 2676 mutex_enter(&spa->spa_async_lock); 2677 tasks = spa->spa_async_tasks; 2678 spa->spa_async_tasks = 0; 2679 mutex_exit(&spa->spa_async_lock); 2680 2681 /* 2682 * See if the config needs to be updated. 2683 */ 2684 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2685 mutex_enter(&spa_namespace_lock); 2686 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2687 mutex_exit(&spa_namespace_lock); 2688 } 2689 2690 /* 2691 * See if any devices need to be reopened. 2692 */ 2693 if (tasks & SPA_ASYNC_REOPEN) 2694 spa_async_reopen(spa); 2695 2696 /* 2697 * If any devices are done replacing, detach them. 2698 */ 2699 if (tasks & SPA_ASYNC_REPLACE_DONE) 2700 spa_vdev_replace_done(spa); 2701 2702 /* 2703 * Kick off a scrub. 2704 */ 2705 if (tasks & SPA_ASYNC_SCRUB) 2706 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2707 2708 /* 2709 * Kick off a resilver. 2710 */ 2711 if (tasks & SPA_ASYNC_RESILVER) 2712 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2713 2714 /* 2715 * Let the world know that we're done. 
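 * Clearing spa_async_thread and broadcasting spa_async_cv below wakes
 * any caller blocked in spa_async_suspend() waiting for this thread to
 * finish.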
2716 */ 2717 mutex_enter(&spa->spa_async_lock); 2718 spa->spa_async_thread = NULL; 2719 cv_broadcast(&spa->spa_async_cv); 2720 mutex_exit(&spa->spa_async_lock); 2721 thread_exit(); 2722} 2723 2724void 2725spa_async_suspend(spa_t *spa) 2726{ 2727 mutex_enter(&spa->spa_async_lock); 2728 spa->spa_async_suspended++; 2729 while (spa->spa_async_thread != NULL) 2730 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2731 mutex_exit(&spa->spa_async_lock); 2732} 2733 2734void 2735spa_async_resume(spa_t *spa) 2736{ 2737 mutex_enter(&spa->spa_async_lock); 2738 ASSERT(spa->spa_async_suspended != 0); 2739 spa->spa_async_suspended--; 2740 mutex_exit(&spa->spa_async_lock); 2741} 2742 2743static void 2744spa_async_dispatch(spa_t *spa) 2745{ 2746 mutex_enter(&spa->spa_async_lock); 2747 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2748 spa->spa_async_thread == NULL && 2749 rootdir != NULL && !vn_is_readonly(rootdir)) 2750 spa->spa_async_thread = thread_create(NULL, 0, 2751 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2752 mutex_exit(&spa->spa_async_lock); 2753} 2754 2755void 2756spa_async_request(spa_t *spa, int task) 2757{ 2758 mutex_enter(&spa->spa_async_lock); 2759 spa->spa_async_tasks |= task; 2760 mutex_exit(&spa->spa_async_lock); 2761} 2762 2763/* 2764 * ========================================================================== 2765 * SPA syncing routines 2766 * ========================================================================== 2767 */ 2768 2769static void 2770spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2771{ 2772 bplist_t *bpl = &spa->spa_sync_bplist; 2773 dmu_tx_t *tx; 2774 blkptr_t blk; 2775 uint64_t itor = 0; 2776 zio_t *zio; 2777 int error; 2778 uint8_t c = 1; 2779 2780 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2781 2782 while (bplist_iterate(bpl, &itor, &blk) == 0) 2783 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2784 2785 error = zio_wait(zio); 2786 ASSERT3U(error, ==, 0); 2787 2788 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2789 bplist_vacate(bpl, tx); 2790 2791 /* 2792 * Pre-dirty the first block so we sync to convergence faster. 2793 * (Usually only the first block is needed.) 2794 */ 2795 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2796 dmu_tx_commit(tx); 2797} 2798 2799static void 2800spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2801{ 2802 char *packed = NULL; 2803 size_t nvsize = 0; 2804 dmu_buf_t *db; 2805 2806 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2807 2808 packed = kmem_alloc(nvsize, KM_SLEEP); 2809 2810 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2811 KM_SLEEP) == 0); 2812 2813 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2814 2815 kmem_free(packed, nvsize); 2816 2817 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2818 dmu_buf_will_dirty(db, tx); 2819 *(uint64_t *)db->db_data = nvsize; 2820 dmu_buf_rele(db, FTAG); 2821} 2822 2823static void 2824spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2825{ 2826 nvlist_t *nvroot; 2827 nvlist_t **spares; 2828 int i; 2829 2830 if (!spa->spa_sync_spares) 2831 return; 2832 2833 /* 2834 * Update the MOS nvlist describing the list of available spares. 2835 * spa_validate_spares() will have already made sure this nvlist is 2836 * valid and the vdevs are labelled appropriately. 
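 * The list is stored as a single packed nvlist object; on first use we
 * allocate that object and record its number in the MOS directory
 * under DMU_POOL_SPARES.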
2837 */ 2838 if (spa->spa_spares_object == 0) { 2839 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2840 DMU_OT_PACKED_NVLIST, 1 << 14, 2841 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2842 VERIFY(zap_update(spa->spa_meta_objset, 2843 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2844 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2845 } 2846 2847 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2848 if (spa->spa_nspares == 0) { 2849 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2850 NULL, 0) == 0); 2851 } else { 2852 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2853 KM_SLEEP); 2854 for (i = 0; i < spa->spa_nspares; i++) 2855 spares[i] = vdev_config_generate(spa, 2856 spa->spa_spares[i], B_FALSE, B_TRUE); 2857 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2858 spares, spa->spa_nspares) == 0); 2859 for (i = 0; i < spa->spa_nspares; i++) 2860 nvlist_free(spares[i]); 2861 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2862 } 2863 2864 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2865 nvlist_free(nvroot); 2866 2867 spa->spa_sync_spares = B_FALSE; 2868} 2869 2870static void 2871spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2872{ 2873 nvlist_t *config; 2874 2875 if (list_is_empty(&spa->spa_dirty_list)) 2876 return; 2877 2878 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2879 2880 if (spa->spa_config_syncing) 2881 nvlist_free(spa->spa_config_syncing); 2882 spa->spa_config_syncing = config; 2883 2884 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2885} 2886 2887static void 2888spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2889{ 2890 spa_t *spa = arg1; 2891 nvlist_t *nvp = arg2; 2892 nvpair_t *nvpair; 2893 objset_t *mos = spa->spa_meta_objset; 2894 uint64_t zapobj; 2895 2896 mutex_enter(&spa->spa_props_lock); 2897 if (spa->spa_pool_props_object == 0) { 2898 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2899 VERIFY(zapobj > 0); 2900 2901 spa->spa_pool_props_object = zapobj; 2902 2903 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2904 DMU_POOL_PROPS, 8, 1, 2905 &spa->spa_pool_props_object, tx) == 0); 2906 } 2907 mutex_exit(&spa->spa_props_lock); 2908 2909 nvpair = NULL; 2910 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2911 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2912 case ZFS_PROP_BOOTFS: 2913 VERIFY(nvlist_lookup_uint64(nvp, 2914 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2915 VERIFY(zap_update(mos, 2916 spa->spa_pool_props_object, 2917 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2918 &spa->spa_bootfs, tx) == 0); 2919 break; 2920 } 2921 } 2922} 2923 2924/* 2925 * Sync the specified transaction group. New blocks may be dirtied as 2926 * part of the process, so we iterate until it converges. 2927 */ 2928void 2929spa_sync(spa_t *spa, uint64_t txg) 2930{ 2931 dsl_pool_t *dp = spa->spa_dsl_pool; 2932 objset_t *mos = spa->spa_meta_objset; 2933 bplist_t *bpl = &spa->spa_sync_bplist; 2934 vdev_t *rvd = spa->spa_root_vdev; 2935 vdev_t *vd; 2936 dmu_tx_t *tx; 2937 int dirty_vdevs; 2938 2939 /* 2940 * Lock out configuration changes. 2941 */ 2942 spa_config_enter(spa, RW_READER, FTAG); 2943 2944 spa->spa_syncing_txg = txg; 2945 spa->spa_sync_pass = 0; 2946 2947 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2948 2949 tx = dmu_tx_create_assigned(dp, txg); 2950 2951 /* 2952 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2953 * set spa_deflate if we have no raid-z vdevs. 
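 * spa_deflate can only be set when every top-level vdev reports a
 * deflate ratio of SPA_MINBLOCKSIZE; a raid-z vdev reports a different
 * ratio, so the loop below gives up as soon as it finds one.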
2954 */ 2955 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2956 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2957 int i; 2958 2959 for (i = 0; i < rvd->vdev_children; i++) { 2960 vd = rvd->vdev_child[i]; 2961 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2962 break; 2963 } 2964 if (i == rvd->vdev_children) { 2965 spa->spa_deflate = TRUE; 2966 VERIFY(0 == zap_add(spa->spa_meta_objset, 2967 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2968 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2969 } 2970 } 2971 2972 /* 2973 * If anything has changed in this txg, push the deferred frees 2974 * from the previous txg. If not, leave them alone so that we 2975 * don't generate work on an otherwise idle system. 2976 */ 2977 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2978 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2979 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2980 spa_sync_deferred_frees(spa, txg); 2981 2982 /* 2983 * Iterate to convergence. 2984 */ 2985 do { 2986 spa->spa_sync_pass++; 2987 2988 spa_sync_config_object(spa, tx); 2989 spa_sync_spares(spa, tx); 2990 spa_errlog_sync(spa, txg); 2991 dsl_pool_sync(dp, txg); 2992 2993 dirty_vdevs = 0; 2994 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2995 vdev_sync(vd, txg); 2996 dirty_vdevs++; 2997 } 2998 2999 bplist_sync(bpl, tx); 3000 } while (dirty_vdevs); 3001 3002 bplist_close(bpl); 3003 3004 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3005 3006 /* 3007 * Rewrite the vdev configuration (which includes the uberblock) 3008 * to commit the transaction group. 3009 * 3010 * If there are any dirty vdevs, sync the uberblock to all vdevs. 3011 * Otherwise, pick a random top-level vdev that's known to be 3012 * visible in the config cache (see spa_vdev_add() for details). 3013 * If the write fails, try the next vdev until we've tried them all. 3014 */ 3015 if (!list_is_empty(&spa->spa_dirty_list)) { 3016 VERIFY(vdev_config_sync(rvd, txg) == 0); 3017 } else { 3018 int children = rvd->vdev_children; 3019 int c0 = spa_get_random(children); 3020 int c; 3021 3022 for (c = 0; c < children; c++) { 3023 vd = rvd->vdev_child[(c0 + c) % children]; 3024 if (vd->vdev_ms_array == 0) 3025 continue; 3026 if (vdev_config_sync(vd, txg) == 0) 3027 break; 3028 } 3029 if (c == children) 3030 VERIFY(vdev_config_sync(rvd, txg) == 0); 3031 } 3032 3033 dmu_tx_commit(tx); 3034 3035 /* 3036 * Clear the dirty config list. 3037 */ 3038 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3039 vdev_config_clean(vd); 3040 3041 /* 3042 * Now that the new config has synced transactionally, 3043 * let it become visible to the config cache. 3044 */ 3045 if (spa->spa_config_syncing != NULL) { 3046 spa_config_set(spa, spa->spa_config_syncing); 3047 spa->spa_config_txg = txg; 3048 spa->spa_config_syncing = NULL; 3049 } 3050 3051 /* 3052 * Make a stable copy of the fully synced uberblock. 3053 * We use this as the root for pool traversals. 3054 */ 3055 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3056 3057 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3058 3059 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3060 spa->spa_traverse_wanted = 0; 3061 spa->spa_ubsync = spa->spa_uberblock; 3062 rw_exit(&spa->spa_traverse_lock); 3063 3064 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3065 3066 /* 3067 * Clean up the ZIL records for the synced txg. 3068 */ 3069 dsl_pool_zil_clean(dp); 3070 3071 /* 3072 * Update usable space statistics.
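 * Every vdev dirtied in this txg is now on the TXG_CLEAN(txg) list;
 * vdev_sync_done() lets each one update its space accounting now that
 * the txg is safely on disk.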
3073 */ 3074 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3075 vdev_sync_done(vd, txg); 3076 3077 /* 3078 * It had better be the case that we didn't dirty anything 3079 * since vdev_config_sync(). 3080 */ 3081 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3082 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3083 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3084 ASSERT(bpl->bpl_queue == NULL); 3085 3086 spa_config_exit(spa, FTAG); 3087 3088 /* 3089 * If any async tasks have been requested, kick them off. 3090 */ 3091 spa_async_dispatch(spa); 3092} 3093 3094/* 3095 * Sync all pools. We don't want to hold the namespace lock across these 3096 * operations, so we take a reference on the spa_t and drop the lock during the 3097 * sync. 3098 */ 3099void 3100spa_sync_allpools(void) 3101{ 3102 spa_t *spa = NULL; 3103 mutex_enter(&spa_namespace_lock); 3104 while ((spa = spa_next(spa)) != NULL) { 3105 if (spa_state(spa) != POOL_STATE_ACTIVE) 3106 continue; 3107 spa_open_ref(spa, FTAG); 3108 mutex_exit(&spa_namespace_lock); 3109 txg_wait_synced(spa_get_dsl(spa), 0); 3110 mutex_enter(&spa_namespace_lock); 3111 spa_close(spa, FTAG); 3112 } 3113 mutex_exit(&spa_namespace_lock); 3114} 3115 3116/* 3117 * ========================================================================== 3118 * Miscellaneous routines 3119 * ========================================================================== 3120 */ 3121 3122/* 3123 * Remove all pools in the system. 3124 */ 3125void 3126spa_evict_all(void) 3127{ 3128 spa_t *spa; 3129 3130 /* 3131 * Remove all cached state. All pools should be closed now, 3132 * so every spa in the AVL tree should be unreferenced. 3133 */ 3134 mutex_enter(&spa_namespace_lock); 3135 while ((spa = spa_next(NULL)) != NULL) { 3136 /* 3137 * Stop async tasks. The async thread may need to detach 3138 * a device that's been replaced, which requires grabbing 3139 * spa_namespace_lock, so we must drop it here. 3140 */ 3141 spa_open_ref(spa, FTAG); 3142 mutex_exit(&spa_namespace_lock); 3143 spa_async_suspend(spa); 3144 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3145 mutex_enter(&spa_namespace_lock); 3146 spa_close(spa, FTAG); 3147 3148 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3149 spa_unload(spa); 3150 spa_deactivate(spa); 3151 } 3152 spa_remove(spa); 3153 } 3154 mutex_exit(&spa_namespace_lock); 3155} 3156 3157vdev_t * 3158spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3159{ 3160 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3161} 3162 3163void 3164spa_upgrade(spa_t *spa) 3165{ 3166 spa_config_enter(spa, RW_WRITER, FTAG); 3167 3168 /* 3169 * This should only be called for a non-faulted pool, and since a 3170 * future version would result in an unopenable pool, this shouldn't be 3171 * possible. 
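 * Put another way, any pool we managed to open must already be at or
 * below ZFS_VERSION, which is exactly what the ASSERT below checks.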
3172 */ 3173 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3174 3175 spa->spa_uberblock.ub_version = ZFS_VERSION; 3176 vdev_config_dirty(spa->spa_root_vdev); 3177 3178 spa_config_exit(spa, FTAG); 3179 3180 txg_wait_synced(spa_get_dsl(spa), 0); 3181} 3182 3183boolean_t 3184spa_has_spare(spa_t *spa, uint64_t guid) 3185{ 3186 int i; 3187 uint64_t spareguid; 3188 3189 for (i = 0; i < spa->spa_nspares; i++) 3190 if (spa->spa_spares[i]->vdev_guid == guid) 3191 return (B_TRUE); 3192 3193 for (i = 0; i < spa->spa_pending_nspares; i++) { 3194 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3195 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3196 spareguid == guid) 3197 return (B_TRUE); 3198 } 3199 3200 return (B_FALSE); 3201} 3202 3203int 3204spa_set_props(spa_t *spa, nvlist_t *nvp) 3205{ 3206 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3207 spa, nvp, 3)); 3208} 3209 3210int 3211spa_get_props(spa_t *spa, nvlist_t **nvp) 3212{ 3213 zap_cursor_t zc; 3214 zap_attribute_t za; 3215 objset_t *mos = spa->spa_meta_objset; 3216 zfs_source_t src; 3217 zfs_prop_t prop; 3218 nvlist_t *propval; 3219 uint64_t value; 3220 int err; 3221 3222 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3223 3224 mutex_enter(&spa->spa_props_lock); 3225 /* If no props object, then just return empty nvlist */ 3226 if (spa->spa_pool_props_object == 0) { 3227 mutex_exit(&spa->spa_props_lock); 3228 return (0); 3229 } 3230 3231 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3232 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3233 zap_cursor_advance(&zc)) { 3234 3235 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3236 continue; 3237 3238 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3239 switch (za.za_integer_length) { 3240 case 8: 3241 if (zfs_prop_default_numeric(prop) == 3242 za.za_first_integer) 3243 src = ZFS_SRC_DEFAULT; 3244 else 3245 src = ZFS_SRC_LOCAL; 3246 value = za.za_first_integer; 3247 3248 if (prop == ZFS_PROP_BOOTFS) { 3249 dsl_pool_t *dp; 3250 dsl_dataset_t *ds = NULL; 3251 char strval[MAXPATHLEN]; 3252 3253 dp = spa_get_dsl(spa); 3254 rw_enter(&dp->dp_config_rwlock, RW_READER); 3255 if ((err = dsl_dataset_open_obj(dp, 3256 za.za_first_integer, NULL, DS_MODE_NONE, 3257 FTAG, &ds)) != 0) { 3258 rw_exit(&dp->dp_config_rwlock); 3259 break; 3260 } 3261 dsl_dataset_name(ds, strval); 3262 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3263 rw_exit(&dp->dp_config_rwlock); 3264 3265 VERIFY(nvlist_add_uint64(propval, 3266 ZFS_PROP_SOURCE, src) == 0); 3267 VERIFY(nvlist_add_string(propval, 3268 ZFS_PROP_VALUE, strval) == 0); 3269 } else { 3270 VERIFY(nvlist_add_uint64(propval, 3271 ZFS_PROP_SOURCE, src) == 0); 3272 VERIFY(nvlist_add_uint64(propval, 3273 ZFS_PROP_VALUE, value) == 0); 3274 } 3275 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3276 propval) == 0); 3277 break; 3278 } 3279 nvlist_free(propval); 3280 } 3281 zap_cursor_fini(&zc); 3282 mutex_exit(&spa->spa_props_lock); 3283 if (err && err != ENOENT) { 3284 nvlist_free(*nvp); 3285 return (err); 3286 } 3287 3288 return (0); 3289} 3290 3291/* 3292 * If the bootfs property value is dsobj, clear it. 3293 */ 3294void 3295spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3296{ 3297 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3298 VERIFY(zap_remove(spa->spa_meta_objset, 3299 spa->spa_pool_props_object, 3300 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3301 spa->spa_bootfs = 0; 3302 } 3303} 3304