spa.c revision 168962
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29/* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35#include <sys/zfs_context.h> 36#include <sys/fm/fs/zfs.h> 37#include <sys/spa_impl.h> 38#include <sys/zio.h> 39#include <sys/zio_checksum.h> 40#include <sys/zio_compress.h> 41#include <sys/dmu.h> 42#include <sys/dmu_tx.h> 43#include <sys/zap.h> 44#include <sys/zil.h> 45#include <sys/vdev_impl.h> 46#include <sys/metaslab.h> 47#include <sys/uberblock_impl.h> 48#include <sys/txg.h> 49#include <sys/avl.h> 50#include <sys/dmu_traverse.h> 51#include <sys/dmu_objset.h> 52#include <sys/unique.h> 53#include <sys/dsl_pool.h> 54#include <sys/dsl_dataset.h> 55#include <sys/dsl_dir.h> 56#include <sys/dsl_prop.h> 57#include <sys/dsl_synctask.h> 58#include <sys/fs/zfs.h> 59#include <sys/callb.h> 60#include <sys/sunddi.h> 61 62int zio_taskq_threads = 0; 63SYSCTL_DECL(_vfs_zfs); 64SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); 65TUNABLE_INT("vfs.zfs.zio.taskq_threads", &zio_taskq_threads); 66SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, taskq_threads, CTLFLAG_RW, 67 &zio_taskq_threads, 0, "Number of ZIO threads per ZIO type"); 68 69 70/* 71 * ========================================================================== 72 * SPA state manipulation (open/create/destroy/import/export) 73 * ========================================================================== 74 */ 75 76static int 77spa_error_entry_compare(const void *a, const void *b) 78{ 79 spa_error_entry_t *sa = (spa_error_entry_t *)a; 80 spa_error_entry_t *sb = (spa_error_entry_t *)b; 81 int ret; 82 83 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 84 sizeof (zbookmark_t)); 85 86 if (ret < 0) 87 return (-1); 88 else if (ret > 0) 89 return (1); 90 else 91 return (0); 92} 93 94/* 95 * Utility function which retrieves copies of the current logs and 96 * re-initializes them in the process. 97 */ 98void 99spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 100{ 101 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 102 103 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 104 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 105 106 avl_create(&spa->spa_errlist_scrub, 107 spa_error_entry_compare, sizeof (spa_error_entry_t), 108 offsetof(spa_error_entry_t, se_avl)); 109 avl_create(&spa->spa_errlist_last, 110 spa_error_entry_compare, sizeof (spa_error_entry_t), 111 offsetof(spa_error_entry_t, se_avl)); 112} 113 114/* 115 * Activate an uninitialized pool. 116 */ 117static void 118spa_activate(spa_t *spa) 119{ 120 int t; 121 int nthreads = zio_taskq_threads; 122 char name[32]; 123 124 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 125 126 spa->spa_state = POOL_STATE_ACTIVE; 127 128 spa->spa_normal_class = metaslab_class_create(); 129 130 if (nthreads == 0) 131 nthreads = max_ncpus; 132 for (t = 0; t < ZIO_TYPES; t++) { 133 snprintf(name, sizeof(name), "spa_zio_issue %d", t); 134 spa->spa_zio_issue_taskq[t] = taskq_create(name, nthreads, 135 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 136 snprintf(name, sizeof(name), "spa_zio_intr %d", t); 137 spa->spa_zio_intr_taskq[t] = taskq_create(name, nthreads, 138 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 139 } 140 141 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 142 143 mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); 144 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 145 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 146 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 147 cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); 148 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 149 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 150 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 151 152 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 153 offsetof(vdev_t, vdev_dirty_node)); 154 155 txg_list_create(&spa->spa_vdev_txg_list, 156 offsetof(struct vdev, vdev_txg_node)); 157 158 avl_create(&spa->spa_errlist_scrub, 159 spa_error_entry_compare, sizeof (spa_error_entry_t), 160 offsetof(spa_error_entry_t, se_avl)); 161 avl_create(&spa->spa_errlist_last, 162 spa_error_entry_compare, sizeof (spa_error_entry_t), 163 offsetof(spa_error_entry_t, se_avl)); 164} 165 166/* 167 * Opposite of spa_activate(). 168 */ 169static void 170spa_deactivate(spa_t *spa) 171{ 172 int t; 173 174 ASSERT(spa->spa_sync_on == B_FALSE); 175 ASSERT(spa->spa_dsl_pool == NULL); 176 ASSERT(spa->spa_root_vdev == NULL); 177 178 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 179 180 txg_list_destroy(&spa->spa_vdev_txg_list); 181 182 list_destroy(&spa->spa_dirty_list); 183 184 for (t = 0; t < ZIO_TYPES; t++) { 185 taskq_destroy(spa->spa_zio_issue_taskq[t]); 186 taskq_destroy(spa->spa_zio_intr_taskq[t]); 187 spa->spa_zio_issue_taskq[t] = NULL; 188 spa->spa_zio_intr_taskq[t] = NULL; 189 } 190 191 metaslab_class_destroy(spa->spa_normal_class); 192 spa->spa_normal_class = NULL; 193 194 /* 195 * If this was part of an import or the open otherwise failed, we may 196 * still have errors left in the queues. Empty them just in case. 197 */ 198 spa_errlog_drain(spa); 199 200 avl_destroy(&spa->spa_errlist_scrub); 201 avl_destroy(&spa->spa_errlist_last); 202 203 rw_destroy(&spa->spa_traverse_lock); 204 mutex_destroy(&spa->spa_uberblock_lock); 205 mutex_destroy(&spa->spa_errlog_lock); 206 mutex_destroy(&spa->spa_errlist_lock); 207 mutex_destroy(&spa->spa_config_lock.scl_lock); 208 cv_destroy(&spa->spa_config_lock.scl_cv); 209 mutex_destroy(&spa->spa_sync_bplist.bpl_lock); 210 mutex_destroy(&spa->spa_history_lock); 211 mutex_destroy(&spa->spa_props_lock); 212 213 spa->spa_state = POOL_STATE_UNINITIALIZED; 214} 215 216/* 217 * Verify a pool configuration, and construct the vdev tree appropriately. This 218 * will create all the necessary vdevs in the appropriate layout, with each vdev 219 * in the CLOSED state. This will prep the pool before open/creation/import. 220 * All vdev validation is done by the vdev_alloc() routine. 221 */ 222static int 223spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 224 uint_t id, int atype) 225{ 226 nvlist_t **child; 227 uint_t c, children; 228 int error; 229 230 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 231 return (error); 232 233 if ((*vdp)->vdev_ops->vdev_op_leaf) 234 return (0); 235 236 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 237 &child, &children) != 0) { 238 vdev_free(*vdp); 239 *vdp = NULL; 240 return (EINVAL); 241 } 242 243 for (c = 0; c < children; c++) { 244 vdev_t *vd; 245 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 246 atype)) != 0) { 247 vdev_free(*vdp); 248 *vdp = NULL; 249 return (error); 250 } 251 } 252 253 ASSERT(*vdp != NULL); 254 255 return (0); 256} 257 258/* 259 * Opposite of spa_load(). 260 */ 261static void 262spa_unload(spa_t *spa) 263{ 264 int i; 265 266 /* 267 * Stop async tasks. 268 */ 269 spa_async_suspend(spa); 270 271 /* 272 * Stop syncing. 273 */ 274 if (spa->spa_sync_on) { 275 txg_sync_stop(spa->spa_dsl_pool); 276 spa->spa_sync_on = B_FALSE; 277 } 278 279 /* 280 * Wait for any outstanding prefetch I/O to complete. 281 */ 282 spa_config_enter(spa, RW_WRITER, FTAG); 283 spa_config_exit(spa, FTAG); 284 285 /* 286 * Close the dsl pool. 287 */ 288 if (spa->spa_dsl_pool) { 289 dsl_pool_close(spa->spa_dsl_pool); 290 spa->spa_dsl_pool = NULL; 291 } 292 293 /* 294 * Close all vdevs. 295 */ 296 if (spa->spa_root_vdev) 297 vdev_free(spa->spa_root_vdev); 298 ASSERT(spa->spa_root_vdev == NULL); 299 300 for (i = 0; i < spa->spa_nspares; i++) 301 vdev_free(spa->spa_spares[i]); 302 if (spa->spa_spares) { 303 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 304 spa->spa_spares = NULL; 305 } 306 if (spa->spa_sparelist) { 307 nvlist_free(spa->spa_sparelist); 308 spa->spa_sparelist = NULL; 309 } 310 311 spa->spa_async_suspended = 0; 312} 313 314/* 315 * Load (or re-load) the current list of vdevs describing the active spares for 316 * this pool. When this is called, we have some form of basic information in 317 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 318 * re-generate a more complete list including status information. 319 */ 320static void 321spa_load_spares(spa_t *spa) 322{ 323 nvlist_t **spares; 324 uint_t nspares; 325 int i; 326 vdev_t *vd, *tvd; 327 328 /* 329 * First, close and free any existing spare vdevs. 330 */ 331 for (i = 0; i < spa->spa_nspares; i++) { 332 vd = spa->spa_spares[i]; 333 334 /* Undo the call to spa_activate() below */ 335 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 336 tvd->vdev_isspare) 337 spa_spare_remove(tvd); 338 vdev_close(vd); 339 vdev_free(vd); 340 } 341 342 if (spa->spa_spares) 343 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 344 345 if (spa->spa_sparelist == NULL) 346 nspares = 0; 347 else 348 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 349 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 350 351 spa->spa_nspares = (int)nspares; 352 spa->spa_spares = NULL; 353 354 if (nspares == 0) 355 return; 356 357 /* 358 * Construct the array of vdevs, opening them to get status in the 359 * process. For each spare, there is potentially two different vdev_t 360 * structures associated with it: one in the list of spares (used only 361 * for basic validation purposes) and one in the active vdev 362 * configuration (if it's spared in). During this phase we open and 363 * validate each vdev on the spare list. If the vdev also exists in the 364 * active configuration, then we also mark this vdev as an active spare. 365 */ 366 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 367 for (i = 0; i < spa->spa_nspares; i++) { 368 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 369 VDEV_ALLOC_SPARE) == 0); 370 ASSERT(vd != NULL); 371 372 spa->spa_spares[i] = vd; 373 374 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 375 if (!tvd->vdev_isspare) 376 spa_spare_add(tvd); 377 378 /* 379 * We only mark the spare active if we were successfully 380 * able to load the vdev. Otherwise, importing a pool 381 * with a bad active spare would result in strange 382 * behavior, because multiple pool would think the spare 383 * is actively in use. 384 * 385 * There is a vulnerability here to an equally bizarre 386 * circumstance, where a dead active spare is later 387 * brought back to life (onlined or otherwise). Given 388 * the rarity of this scenario, and the extra complexity 389 * it adds, we ignore the possibility. 390 */ 391 if (!vdev_is_dead(tvd)) 392 spa_spare_activate(tvd); 393 } 394 395 if (vdev_open(vd) != 0) 396 continue; 397 398 vd->vdev_top = vd; 399 (void) vdev_validate_spare(vd); 400 } 401 402 /* 403 * Recompute the stashed list of spares, with status information 404 * this time. 405 */ 406 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 407 DATA_TYPE_NVLIST_ARRAY) == 0); 408 409 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 410 for (i = 0; i < spa->spa_nspares; i++) 411 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 412 B_TRUE, B_TRUE); 413 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 414 spares, spa->spa_nspares) == 0); 415 for (i = 0; i < spa->spa_nspares; i++) 416 nvlist_free(spares[i]); 417 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 418} 419 420static int 421load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 422{ 423 dmu_buf_t *db; 424 char *packed = NULL; 425 size_t nvsize = 0; 426 int error; 427 *value = NULL; 428 429 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 430 nvsize = *(uint64_t *)db->db_data; 431 dmu_buf_rele(db, FTAG); 432 433 packed = kmem_alloc(nvsize, KM_SLEEP); 434 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 435 if (error == 0) 436 error = nvlist_unpack(packed, nvsize, value, 0); 437 kmem_free(packed, nvsize); 438 439 return (error); 440} 441 442/* 443 * Load an existing storage pool, using the pool's builtin spa_config as a 444 * source of configuration information. 445 */ 446static int 447spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 448{ 449 int error = 0; 450 nvlist_t *nvroot = NULL; 451 vdev_t *rvd; 452 uberblock_t *ub = &spa->spa_uberblock; 453 uint64_t config_cache_txg = spa->spa_config_txg; 454 uint64_t pool_guid; 455 uint64_t version; 456 zio_t *zio; 457 458 spa->spa_load_state = state; 459 460 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 461 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 462 error = EINVAL; 463 goto out; 464 } 465 466 /* 467 * Versioning wasn't explicitly added to the label until later, so if 468 * it's not present treat it as the initial version. 469 */ 470 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 471 version = ZFS_VERSION_INITIAL; 472 473 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 474 &spa->spa_config_txg); 475 476 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 477 spa_guid_exists(pool_guid, 0)) { 478 error = EEXIST; 479 goto out; 480 } 481 482 spa->spa_load_guid = pool_guid; 483 484 /* 485 * Parse the configuration into a vdev tree. We explicitly set the 486 * value that will be returned by spa_version() since parsing the 487 * configuration requires knowing the version number. 488 */ 489 spa_config_enter(spa, RW_WRITER, FTAG); 490 spa->spa_ubsync.ub_version = version; 491 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 492 spa_config_exit(spa, FTAG); 493 494 if (error != 0) 495 goto out; 496 497 ASSERT(spa->spa_root_vdev == rvd); 498 ASSERT(spa_guid(spa) == pool_guid); 499 500 /* 501 * Try to open all vdevs, loading each label in the process. 502 */ 503 error = vdev_open(rvd); 504 if (error != 0) 505 goto out; 506 507 /* 508 * Validate the labels for all leaf vdevs. We need to grab the config 509 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 510 * flag. 511 */ 512 spa_config_enter(spa, RW_READER, FTAG); 513 error = vdev_validate(rvd); 514 spa_config_exit(spa, FTAG); 515 516 if (error != 0) 517 goto out; 518 519 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 520 error = ENXIO; 521 goto out; 522 } 523 524 /* 525 * Find the best uberblock. 526 */ 527 bzero(ub, sizeof (uberblock_t)); 528 529 zio = zio_root(spa, NULL, NULL, 530 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 531 vdev_uberblock_load(zio, rvd, ub); 532 error = zio_wait(zio); 533 534 /* 535 * If we weren't able to find a single valid uberblock, return failure. 536 */ 537 if (ub->ub_txg == 0) { 538 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 539 VDEV_AUX_CORRUPT_DATA); 540 error = ENXIO; 541 goto out; 542 } 543 544 /* 545 * If the pool is newer than the code, we can't open it. 546 */ 547 if (ub->ub_version > ZFS_VERSION) { 548 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 549 VDEV_AUX_VERSION_NEWER); 550 error = ENOTSUP; 551 goto out; 552 } 553 554 /* 555 * If the vdev guid sum doesn't match the uberblock, we have an 556 * incomplete configuration. 557 */ 558 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 559 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 560 VDEV_AUX_BAD_GUID_SUM); 561 error = ENXIO; 562 goto out; 563 } 564 565 /* 566 * Initialize internal SPA structures. 567 */ 568 spa->spa_state = POOL_STATE_ACTIVE; 569 spa->spa_ubsync = spa->spa_uberblock; 570 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 571 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 572 if (error) { 573 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 574 VDEV_AUX_CORRUPT_DATA); 575 goto out; 576 } 577 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 578 579 if (zap_lookup(spa->spa_meta_objset, 580 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 581 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 582 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 583 VDEV_AUX_CORRUPT_DATA); 584 error = EIO; 585 goto out; 586 } 587 588 if (!mosconfig) { 589 nvlist_t *newconfig; 590 uint64_t hostid; 591 592 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 593 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 594 VDEV_AUX_CORRUPT_DATA); 595 error = EIO; 596 goto out; 597 } 598 599 /* 600 * hostid is set after the root file system is mounted, so 601 * ignore the check until it's done. 602 */ 603 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 604 &hostid) == 0 && root_mounted()) { 605 char *hostname; 606 unsigned long myhostid = 0; 607 608 VERIFY(nvlist_lookup_string(newconfig, 609 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 610 611 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 612 if ((unsigned long)hostid != myhostid) { 613 cmn_err(CE_WARN, "pool '%s' could not be " 614 "loaded as it was last accessed by " 615 "another system (host: %s hostid: 0x%lx). " 616 "See: http://www.sun.com/msg/ZFS-8000-EY", 617 spa->spa_name, hostname, 618 (unsigned long)hostid); 619 error = EBADF; 620 goto out; 621 } 622 } 623 624 spa_config_set(spa, newconfig); 625 spa_unload(spa); 626 spa_deactivate(spa); 627 spa_activate(spa); 628 629 return (spa_load(spa, newconfig, state, B_TRUE)); 630 } 631 632 if (zap_lookup(spa->spa_meta_objset, 633 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 634 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 635 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 636 VDEV_AUX_CORRUPT_DATA); 637 error = EIO; 638 goto out; 639 } 640 641 /* 642 * Load the bit that tells us to use the new accounting function 643 * (raid-z deflation). If we have an older pool, this will not 644 * be present. 645 */ 646 error = zap_lookup(spa->spa_meta_objset, 647 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 648 sizeof (uint64_t), 1, &spa->spa_deflate); 649 if (error != 0 && error != ENOENT) { 650 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 651 VDEV_AUX_CORRUPT_DATA); 652 error = EIO; 653 goto out; 654 } 655 656 /* 657 * Load the persistent error log. If we have an older pool, this will 658 * not be present. 659 */ 660 error = zap_lookup(spa->spa_meta_objset, 661 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 662 sizeof (uint64_t), 1, &spa->spa_errlog_last); 663 if (error != 0 && error != ENOENT) { 664 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 665 VDEV_AUX_CORRUPT_DATA); 666 error = EIO; 667 goto out; 668 } 669 670 error = zap_lookup(spa->spa_meta_objset, 671 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 672 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 673 if (error != 0 && error != ENOENT) { 674 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 675 VDEV_AUX_CORRUPT_DATA); 676 error = EIO; 677 goto out; 678 } 679 680 /* 681 * Load the history object. If we have an older pool, this 682 * will not be present. 683 */ 684 error = zap_lookup(spa->spa_meta_objset, 685 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 686 sizeof (uint64_t), 1, &spa->spa_history); 687 if (error != 0 && error != ENOENT) { 688 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 689 VDEV_AUX_CORRUPT_DATA); 690 error = EIO; 691 goto out; 692 } 693 694 /* 695 * Load any hot spares for this pool. 696 */ 697 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 698 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 699 if (error != 0 && error != ENOENT) { 700 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 701 VDEV_AUX_CORRUPT_DATA); 702 error = EIO; 703 goto out; 704 } 705 if (error == 0) { 706 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 707 if (load_nvlist(spa, spa->spa_spares_object, 708 &spa->spa_sparelist) != 0) { 709 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 710 VDEV_AUX_CORRUPT_DATA); 711 error = EIO; 712 goto out; 713 } 714 715 spa_config_enter(spa, RW_WRITER, FTAG); 716 spa_load_spares(spa); 717 spa_config_exit(spa, FTAG); 718 } 719 720 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 721 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 722 723 if (error && error != ENOENT) { 724 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 725 VDEV_AUX_CORRUPT_DATA); 726 error = EIO; 727 goto out; 728 } 729 730 if (error == 0) { 731 (void) zap_lookup(spa->spa_meta_objset, 732 spa->spa_pool_props_object, 733 zpool_prop_to_name(ZFS_PROP_BOOTFS), 734 sizeof (uint64_t), 1, &spa->spa_bootfs); 735 } 736 737 /* 738 * Load the vdev state for all toplevel vdevs. 739 */ 740 vdev_load(rvd); 741 742 /* 743 * Propagate the leaf DTLs we just loaded all the way up the tree. 744 */ 745 spa_config_enter(spa, RW_WRITER, FTAG); 746 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 747 spa_config_exit(spa, FTAG); 748 749 /* 750 * Check the state of the root vdev. If it can't be opened, it 751 * indicates one or more toplevel vdevs are faulted. 752 */ 753 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 754 error = ENXIO; 755 goto out; 756 } 757 758 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 759 dmu_tx_t *tx; 760 int need_update = B_FALSE; 761 int c; 762 763 /* 764 * Claim log blocks that haven't been committed yet. 765 * This must all happen in a single txg. 766 */ 767 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 768 spa_first_txg(spa)); 769 (void) dmu_objset_find(spa->spa_name, 770 zil_claim, tx, DS_FIND_CHILDREN); 771 dmu_tx_commit(tx); 772 773 spa->spa_sync_on = B_TRUE; 774 txg_sync_start(spa->spa_dsl_pool); 775 776 /* 777 * Wait for all claims to sync. 778 */ 779 txg_wait_synced(spa->spa_dsl_pool, 0); 780 781 /* 782 * If the config cache is stale, or we have uninitialized 783 * metaslabs (see spa_vdev_add()), then update the config. 784 */ 785 if (config_cache_txg != spa->spa_config_txg || 786 state == SPA_LOAD_IMPORT) 787 need_update = B_TRUE; 788 789 for (c = 0; c < rvd->vdev_children; c++) 790 if (rvd->vdev_child[c]->vdev_ms_array == 0) 791 need_update = B_TRUE; 792 793 /* 794 * Update the config cache asychronously in case we're the 795 * root pool, in which case the config cache isn't writable yet. 796 */ 797 if (need_update) 798 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 799 } 800 801 error = 0; 802out: 803 if (error && error != EBADF) 804 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 805 spa->spa_load_state = SPA_LOAD_NONE; 806 spa->spa_ena = 0; 807 808 return (error); 809} 810 811/* 812 * Pool Open/Import 813 * 814 * The import case is identical to an open except that the configuration is sent 815 * down from userland, instead of grabbed from the configuration cache. For the 816 * case of an open, the pool configuration will exist in the 817 * POOL_STATE_UNITIALIZED state. 818 * 819 * The stats information (gen/count/ustats) is used to gather vdev statistics at 820 * the same time open the pool, without having to keep around the spa_t in some 821 * ambiguous state. 822 */ 823static int 824spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 825{ 826 spa_t *spa; 827 int error; 828 int loaded = B_FALSE; 829 int locked = B_FALSE; 830 831 *spapp = NULL; 832 833 /* 834 * As disgusting as this is, we need to support recursive calls to this 835 * function because dsl_dir_open() is called during spa_load(), and ends 836 * up calling spa_open() again. The real fix is to figure out how to 837 * avoid dsl_dir_open() calling this in the first place. 838 */ 839 if (mutex_owner(&spa_namespace_lock) != curthread) { 840 mutex_enter(&spa_namespace_lock); 841 locked = B_TRUE; 842 } 843 844 if ((spa = spa_lookup(pool)) == NULL) { 845 if (locked) 846 mutex_exit(&spa_namespace_lock); 847 return (ENOENT); 848 } 849 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 850 851 spa_activate(spa); 852 853 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 854 855 if (error == EBADF) { 856 /* 857 * If vdev_validate() returns failure (indicated by 858 * EBADF), it indicates that one of the vdevs indicates 859 * that the pool has been exported or destroyed. If 860 * this is the case, the config cache is out of sync and 861 * we should remove the pool from the namespace. 862 */ 863 zfs_post_ok(spa, NULL); 864 spa_unload(spa); 865 spa_deactivate(spa); 866 spa_remove(spa); 867 spa_config_sync(); 868 if (locked) 869 mutex_exit(&spa_namespace_lock); 870 return (ENOENT); 871 } 872 873 if (error) { 874 /* 875 * We can't open the pool, but we still have useful 876 * information: the state of each vdev after the 877 * attempted vdev_open(). Return this to the user. 878 */ 879 if (config != NULL && spa->spa_root_vdev != NULL) { 880 spa_config_enter(spa, RW_READER, FTAG); 881 *config = spa_config_generate(spa, NULL, -1ULL, 882 B_TRUE); 883 spa_config_exit(spa, FTAG); 884 } 885 spa_unload(spa); 886 spa_deactivate(spa); 887 spa->spa_last_open_failed = B_TRUE; 888 if (locked) 889 mutex_exit(&spa_namespace_lock); 890 *spapp = NULL; 891 return (error); 892 } else { 893 zfs_post_ok(spa, NULL); 894 spa->spa_last_open_failed = B_FALSE; 895 } 896 897 loaded = B_TRUE; 898 } 899 900 spa_open_ref(spa, tag); 901 if (locked) 902 mutex_exit(&spa_namespace_lock); 903 904 *spapp = spa; 905 906 if (config != NULL) { 907 spa_config_enter(spa, RW_READER, FTAG); 908 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 909 spa_config_exit(spa, FTAG); 910 } 911 912 /* 913 * If we just loaded the pool, resilver anything that's out of date. 914 */ 915 if (loaded && (spa_mode & FWRITE)) 916 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 917 918 return (0); 919} 920 921int 922spa_open(const char *name, spa_t **spapp, void *tag) 923{ 924 return (spa_open_common(name, spapp, tag, NULL)); 925} 926 927/* 928 * Lookup the given spa_t, incrementing the inject count in the process, 929 * preventing it from being exported or destroyed. 930 */ 931spa_t * 932spa_inject_addref(char *name) 933{ 934 spa_t *spa; 935 936 mutex_enter(&spa_namespace_lock); 937 if ((spa = spa_lookup(name)) == NULL) { 938 mutex_exit(&spa_namespace_lock); 939 return (NULL); 940 } 941 spa->spa_inject_ref++; 942 mutex_exit(&spa_namespace_lock); 943 944 return (spa); 945} 946 947void 948spa_inject_delref(spa_t *spa) 949{ 950 mutex_enter(&spa_namespace_lock); 951 spa->spa_inject_ref--; 952 mutex_exit(&spa_namespace_lock); 953} 954 955static void 956spa_add_spares(spa_t *spa, nvlist_t *config) 957{ 958 nvlist_t **spares; 959 uint_t i, nspares; 960 nvlist_t *nvroot; 961 uint64_t guid; 962 vdev_stat_t *vs; 963 uint_t vsc; 964 uint64_t pool; 965 966 if (spa->spa_nspares == 0) 967 return; 968 969 VERIFY(nvlist_lookup_nvlist(config, 970 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 971 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 972 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 973 if (nspares != 0) { 974 VERIFY(nvlist_add_nvlist_array(nvroot, 975 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 976 VERIFY(nvlist_lookup_nvlist_array(nvroot, 977 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 978 979 /* 980 * Go through and find any spares which have since been 981 * repurposed as an active spare. If this is the case, update 982 * their status appropriately. 983 */ 984 for (i = 0; i < nspares; i++) { 985 VERIFY(nvlist_lookup_uint64(spares[i], 986 ZPOOL_CONFIG_GUID, &guid) == 0); 987 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 988 VERIFY(nvlist_lookup_uint64_array( 989 spares[i], ZPOOL_CONFIG_STATS, 990 (uint64_t **)&vs, &vsc) == 0); 991 vs->vs_state = VDEV_STATE_CANT_OPEN; 992 vs->vs_aux = VDEV_AUX_SPARED; 993 } 994 } 995 } 996} 997 998int 999spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1000{ 1001 int error; 1002 spa_t *spa; 1003 1004 *config = NULL; 1005 error = spa_open_common(name, &spa, FTAG, config); 1006 1007 if (spa && *config != NULL) { 1008 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1009 spa_get_errlog_size(spa)) == 0); 1010 1011 spa_add_spares(spa, *config); 1012 } 1013 1014 /* 1015 * We want to get the alternate root even for faulted pools, so we cheat 1016 * and call spa_lookup() directly. 1017 */ 1018 if (altroot) { 1019 if (spa == NULL) { 1020 mutex_enter(&spa_namespace_lock); 1021 spa = spa_lookup(name); 1022 if (spa) 1023 spa_altroot(spa, altroot, buflen); 1024 else 1025 altroot[0] = '\0'; 1026 spa = NULL; 1027 mutex_exit(&spa_namespace_lock); 1028 } else { 1029 spa_altroot(spa, altroot, buflen); 1030 } 1031 } 1032 1033 if (spa != NULL) 1034 spa_close(spa, FTAG); 1035 1036 return (error); 1037} 1038 1039/* 1040 * Validate that the 'spares' array is well formed. We must have an array of 1041 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1042 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1043 * as they are well-formed. 1044 */ 1045static int 1046spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1047{ 1048 nvlist_t **spares; 1049 uint_t i, nspares; 1050 vdev_t *vd; 1051 int error; 1052 1053 /* 1054 * It's acceptable to have no spares specified. 1055 */ 1056 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1057 &spares, &nspares) != 0) 1058 return (0); 1059 1060 if (nspares == 0) 1061 return (EINVAL); 1062 1063 /* 1064 * Make sure the pool is formatted with a version that supports hot 1065 * spares. 1066 */ 1067 if (spa_version(spa) < ZFS_VERSION_SPARES) 1068 return (ENOTSUP); 1069 1070 /* 1071 * Set the pending spare list so we correctly handle device in-use 1072 * checking. 1073 */ 1074 spa->spa_pending_spares = spares; 1075 spa->spa_pending_nspares = nspares; 1076 1077 for (i = 0; i < nspares; i++) { 1078 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1079 mode)) != 0) 1080 goto out; 1081 1082 if (!vd->vdev_ops->vdev_op_leaf) { 1083 vdev_free(vd); 1084 error = EINVAL; 1085 goto out; 1086 } 1087 1088 vd->vdev_top = vd; 1089 1090 if ((error = vdev_open(vd)) == 0 && 1091 (error = vdev_label_init(vd, crtxg, 1092 VDEV_LABEL_SPARE)) == 0) { 1093 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1094 vd->vdev_guid) == 0); 1095 } 1096 1097 vdev_free(vd); 1098 1099 if (error && mode != VDEV_ALLOC_SPARE) 1100 goto out; 1101 else 1102 error = 0; 1103 } 1104 1105out: 1106 spa->spa_pending_spares = NULL; 1107 spa->spa_pending_nspares = 0; 1108 return (error); 1109} 1110 1111/* 1112 * Pool Creation 1113 */ 1114int 1115spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1116{ 1117 spa_t *spa; 1118 vdev_t *rvd; 1119 dsl_pool_t *dp; 1120 dmu_tx_t *tx; 1121 int c, error = 0; 1122 uint64_t txg = TXG_INITIAL; 1123 nvlist_t **spares; 1124 uint_t nspares; 1125 1126 /* 1127 * If this pool already exists, return failure. 1128 */ 1129 mutex_enter(&spa_namespace_lock); 1130 if (spa_lookup(pool) != NULL) { 1131 mutex_exit(&spa_namespace_lock); 1132 return (EEXIST); 1133 } 1134 1135 /* 1136 * Allocate a new spa_t structure. 1137 */ 1138 spa = spa_add(pool, altroot); 1139 spa_activate(spa); 1140 1141 spa->spa_uberblock.ub_txg = txg - 1; 1142 spa->spa_uberblock.ub_version = ZFS_VERSION; 1143 spa->spa_ubsync = spa->spa_uberblock; 1144 1145 /* 1146 * Create the root vdev. 1147 */ 1148 spa_config_enter(spa, RW_WRITER, FTAG); 1149 1150 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1151 1152 ASSERT(error != 0 || rvd != NULL); 1153 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1154 1155 if (error == 0 && rvd->vdev_children == 0) 1156 error = EINVAL; 1157 1158 if (error == 0 && 1159 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1160 (error = spa_validate_spares(spa, nvroot, txg, 1161 VDEV_ALLOC_ADD)) == 0) { 1162 for (c = 0; c < rvd->vdev_children; c++) 1163 vdev_init(rvd->vdev_child[c], txg); 1164 vdev_config_dirty(rvd); 1165 } 1166 1167 spa_config_exit(spa, FTAG); 1168 1169 if (error != 0) { 1170 spa_unload(spa); 1171 spa_deactivate(spa); 1172 spa_remove(spa); 1173 mutex_exit(&spa_namespace_lock); 1174 return (error); 1175 } 1176 1177 /* 1178 * Get the list of spares, if specified. 1179 */ 1180 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1181 &spares, &nspares) == 0) { 1182 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1183 KM_SLEEP) == 0); 1184 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1185 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1186 spa_config_enter(spa, RW_WRITER, FTAG); 1187 spa_load_spares(spa); 1188 spa_config_exit(spa, FTAG); 1189 spa->spa_sync_spares = B_TRUE; 1190 } 1191 1192 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1193 spa->spa_meta_objset = dp->dp_meta_objset; 1194 1195 tx = dmu_tx_create_assigned(dp, txg); 1196 1197 /* 1198 * Create the pool config object. 1199 */ 1200 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1201 DMU_OT_PACKED_NVLIST, 1 << 14, 1202 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1203 1204 if (zap_add(spa->spa_meta_objset, 1205 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1206 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1207 cmn_err(CE_PANIC, "failed to add pool config"); 1208 } 1209 1210 /* Newly created pools are always deflated. */ 1211 spa->spa_deflate = TRUE; 1212 if (zap_add(spa->spa_meta_objset, 1213 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1214 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1215 cmn_err(CE_PANIC, "failed to add deflate"); 1216 } 1217 1218 /* 1219 * Create the deferred-free bplist object. Turn off compression 1220 * because sync-to-convergence takes longer if the blocksize 1221 * keeps changing. 1222 */ 1223 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1224 1 << 14, tx); 1225 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1226 ZIO_COMPRESS_OFF, tx); 1227 1228 if (zap_add(spa->spa_meta_objset, 1229 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1230 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1231 cmn_err(CE_PANIC, "failed to add bplist"); 1232 } 1233 1234 /* 1235 * Create the pool's history object. 1236 */ 1237 spa_history_create_obj(spa, tx); 1238 1239 dmu_tx_commit(tx); 1240 1241 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1242 spa->spa_sync_on = B_TRUE; 1243 txg_sync_start(spa->spa_dsl_pool); 1244 1245 /* 1246 * We explicitly wait for the first transaction to complete so that our 1247 * bean counters are appropriately updated. 1248 */ 1249 txg_wait_synced(spa->spa_dsl_pool, txg); 1250 1251 spa_config_sync(); 1252 1253 mutex_exit(&spa_namespace_lock); 1254 1255 return (0); 1256} 1257 1258/* 1259 * Import the given pool into the system. We set up the necessary spa_t and 1260 * then call spa_load() to do the dirty work. 1261 */ 1262int 1263spa_import(const char *pool, nvlist_t *config, const char *altroot) 1264{ 1265 spa_t *spa; 1266 int error; 1267 nvlist_t *nvroot; 1268 nvlist_t **spares; 1269 uint_t nspares; 1270 1271 if (!(spa_mode & FWRITE)) 1272 return (EROFS); 1273 1274 /* 1275 * If a pool with this name exists, return failure. 1276 */ 1277 mutex_enter(&spa_namespace_lock); 1278 if (spa_lookup(pool) != NULL) { 1279 mutex_exit(&spa_namespace_lock); 1280 return (EEXIST); 1281 } 1282 1283 /* 1284 * Create and initialize the spa structure. 1285 */ 1286 spa = spa_add(pool, altroot); 1287 spa_activate(spa); 1288 1289 /* 1290 * Pass off the heavy lifting to spa_load(). 1291 * Pass TRUE for mosconfig because the user-supplied config 1292 * is actually the one to trust when doing an import. 1293 */ 1294 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1295 1296 spa_config_enter(spa, RW_WRITER, FTAG); 1297 /* 1298 * Toss any existing sparelist, as it doesn't have any validity anymore, 1299 * and conflicts with spa_has_spare(). 1300 */ 1301 if (spa->spa_sparelist) { 1302 nvlist_free(spa->spa_sparelist); 1303 spa->spa_sparelist = NULL; 1304 spa_load_spares(spa); 1305 } 1306 1307 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1308 &nvroot) == 0); 1309 if (error == 0) 1310 error = spa_validate_spares(spa, nvroot, -1ULL, 1311 VDEV_ALLOC_SPARE); 1312 spa_config_exit(spa, FTAG); 1313 1314 if (error != 0) { 1315 spa_unload(spa); 1316 spa_deactivate(spa); 1317 spa_remove(spa); 1318 mutex_exit(&spa_namespace_lock); 1319 return (error); 1320 } 1321 1322 /* 1323 * Override any spares as specified by the user, as these may have 1324 * correct device names/devids, etc. 1325 */ 1326 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1327 &spares, &nspares) == 0) { 1328 if (spa->spa_sparelist) 1329 VERIFY(nvlist_remove(spa->spa_sparelist, 1330 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1331 else 1332 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1333 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1334 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1335 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1336 spa_config_enter(spa, RW_WRITER, FTAG); 1337 spa_load_spares(spa); 1338 spa_config_exit(spa, FTAG); 1339 spa->spa_sync_spares = B_TRUE; 1340 } 1341 1342 /* 1343 * Update the config cache to include the newly-imported pool. 1344 */ 1345 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1346 1347 mutex_exit(&spa_namespace_lock); 1348 1349 /* 1350 * Resilver anything that's out of date. 1351 */ 1352 if (spa_mode & FWRITE) 1353 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1354 1355 return (0); 1356} 1357 1358/* 1359 * This (illegal) pool name is used when temporarily importing a spa_t in order 1360 * to get the vdev stats associated with the imported devices. 1361 */ 1362#define TRYIMPORT_NAME "$import" 1363 1364nvlist_t * 1365spa_tryimport(nvlist_t *tryconfig) 1366{ 1367 nvlist_t *config = NULL; 1368 char *poolname; 1369 spa_t *spa; 1370 uint64_t state; 1371 1372 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1373 return (NULL); 1374 1375 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1376 return (NULL); 1377 1378 /* 1379 * Create and initialize the spa structure. 1380 */ 1381 mutex_enter(&spa_namespace_lock); 1382 spa = spa_add(TRYIMPORT_NAME, NULL); 1383 spa_activate(spa); 1384 1385 /* 1386 * Pass off the heavy lifting to spa_load(). 1387 * Pass TRUE for mosconfig because the user-supplied config 1388 * is actually the one to trust when doing an import. 1389 */ 1390 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1391 1392 /* 1393 * If 'tryconfig' was at least parsable, return the current config. 1394 */ 1395 if (spa->spa_root_vdev != NULL) { 1396 spa_config_enter(spa, RW_READER, FTAG); 1397 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1398 spa_config_exit(spa, FTAG); 1399 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1400 poolname) == 0); 1401 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1402 state) == 0); 1403 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1404 spa->spa_uberblock.ub_timestamp) == 0); 1405 1406 /* 1407 * Add the list of hot spares. 1408 */ 1409 spa_add_spares(spa, config); 1410 } 1411 1412 spa_unload(spa); 1413 spa_deactivate(spa); 1414 spa_remove(spa); 1415 mutex_exit(&spa_namespace_lock); 1416 1417 return (config); 1418} 1419 1420/* 1421 * Pool export/destroy 1422 * 1423 * The act of destroying or exporting a pool is very simple. We make sure there 1424 * is no more pending I/O and any references to the pool are gone. Then, we 1425 * update the pool state and sync all the labels to disk, removing the 1426 * configuration from the cache afterwards. 1427 */ 1428static int 1429spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1430{ 1431 spa_t *spa; 1432 1433 if (oldconfig) 1434 *oldconfig = NULL; 1435 1436 if (!(spa_mode & FWRITE)) 1437 return (EROFS); 1438 1439 mutex_enter(&spa_namespace_lock); 1440 if ((spa = spa_lookup(pool)) == NULL) { 1441 mutex_exit(&spa_namespace_lock); 1442 return (ENOENT); 1443 } 1444 1445 /* 1446 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1447 * reacquire the namespace lock, and see if we can export. 1448 */ 1449 spa_open_ref(spa, FTAG); 1450 mutex_exit(&spa_namespace_lock); 1451 spa_async_suspend(spa); 1452 mutex_enter(&spa_namespace_lock); 1453 spa_close(spa, FTAG); 1454 1455 /* 1456 * The pool will be in core if it's openable, 1457 * in which case we can modify its state. 1458 */ 1459 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1460 /* 1461 * Objsets may be open only because they're dirty, so we 1462 * have to force it to sync before checking spa_refcnt. 1463 */ 1464 spa_scrub_suspend(spa); 1465 txg_wait_synced(spa->spa_dsl_pool, 0); 1466 1467 /* 1468 * A pool cannot be exported or destroyed if there are active 1469 * references. If we are resetting a pool, allow references by 1470 * fault injection handlers. 1471 */ 1472 if (!spa_refcount_zero(spa) || 1473 (spa->spa_inject_ref != 0 && 1474 new_state != POOL_STATE_UNINITIALIZED)) { 1475 spa_scrub_resume(spa); 1476 spa_async_resume(spa); 1477 mutex_exit(&spa_namespace_lock); 1478 return (EBUSY); 1479 } 1480 1481 spa_scrub_resume(spa); 1482 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1483 1484 /* 1485 * We want this to be reflected on every label, 1486 * so mark them all dirty. spa_unload() will do the 1487 * final sync that pushes these changes out. 1488 */ 1489 if (new_state != POOL_STATE_UNINITIALIZED) { 1490 spa_config_enter(spa, RW_WRITER, FTAG); 1491 spa->spa_state = new_state; 1492 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1493 vdev_config_dirty(spa->spa_root_vdev); 1494 spa_config_exit(spa, FTAG); 1495 } 1496 } 1497 1498 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1499 spa_unload(spa); 1500 spa_deactivate(spa); 1501 } 1502 1503 if (oldconfig && spa->spa_config) 1504 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1505 1506 if (new_state != POOL_STATE_UNINITIALIZED) { 1507 spa_remove(spa); 1508 spa_config_sync(); 1509 } 1510 mutex_exit(&spa_namespace_lock); 1511 1512 return (0); 1513} 1514 1515/* 1516 * Destroy a storage pool. 1517 */ 1518int 1519spa_destroy(char *pool) 1520{ 1521 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1522} 1523 1524/* 1525 * Export a storage pool. 1526 */ 1527int 1528spa_export(char *pool, nvlist_t **oldconfig) 1529{ 1530 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1531} 1532 1533/* 1534 * Similar to spa_export(), this unloads the spa_t without actually removing it 1535 * from the namespace in any way. 1536 */ 1537int 1538spa_reset(char *pool) 1539{ 1540 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1541} 1542 1543 1544/* 1545 * ========================================================================== 1546 * Device manipulation 1547 * ========================================================================== 1548 */ 1549 1550/* 1551 * Add capacity to a storage pool. 1552 */ 1553int 1554spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1555{ 1556 uint64_t txg; 1557 int c, error; 1558 vdev_t *rvd = spa->spa_root_vdev; 1559 vdev_t *vd, *tvd; 1560 nvlist_t **spares; 1561 uint_t i, nspares; 1562 1563 txg = spa_vdev_enter(spa); 1564 1565 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1566 VDEV_ALLOC_ADD)) != 0) 1567 return (spa_vdev_exit(spa, NULL, txg, error)); 1568 1569 spa->spa_pending_vdev = vd; 1570 1571 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1572 &spares, &nspares) != 0) 1573 nspares = 0; 1574 1575 if (vd->vdev_children == 0 && nspares == 0) { 1576 spa->spa_pending_vdev = NULL; 1577 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1578 } 1579 1580 if (vd->vdev_children != 0) { 1581 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1582 spa->spa_pending_vdev = NULL; 1583 return (spa_vdev_exit(spa, vd, txg, error)); 1584 } 1585 } 1586 1587 /* 1588 * We must validate the spares after checking the children. Otherwise, 1589 * vdev_inuse() will blindly overwrite the spare. 1590 */ 1591 if ((error = spa_validate_spares(spa, nvroot, txg, 1592 VDEV_ALLOC_ADD)) != 0) { 1593 spa->spa_pending_vdev = NULL; 1594 return (spa_vdev_exit(spa, vd, txg, error)); 1595 } 1596 1597 spa->spa_pending_vdev = NULL; 1598 1599 /* 1600 * Transfer each new top-level vdev from vd to rvd. 1601 */ 1602 for (c = 0; c < vd->vdev_children; c++) { 1603 tvd = vd->vdev_child[c]; 1604 vdev_remove_child(vd, tvd); 1605 tvd->vdev_id = rvd->vdev_children; 1606 vdev_add_child(rvd, tvd); 1607 vdev_config_dirty(tvd); 1608 } 1609 1610 if (nspares != 0) { 1611 if (spa->spa_sparelist != NULL) { 1612 nvlist_t **oldspares; 1613 uint_t oldnspares; 1614 nvlist_t **newspares; 1615 1616 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1617 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1618 1619 newspares = kmem_alloc(sizeof (void *) * 1620 (nspares + oldnspares), KM_SLEEP); 1621 for (i = 0; i < oldnspares; i++) 1622 VERIFY(nvlist_dup(oldspares[i], 1623 &newspares[i], KM_SLEEP) == 0); 1624 for (i = 0; i < nspares; i++) 1625 VERIFY(nvlist_dup(spares[i], 1626 &newspares[i + oldnspares], 1627 KM_SLEEP) == 0); 1628 1629 VERIFY(nvlist_remove(spa->spa_sparelist, 1630 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1631 1632 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1633 ZPOOL_CONFIG_SPARES, newspares, 1634 nspares + oldnspares) == 0); 1635 for (i = 0; i < oldnspares + nspares; i++) 1636 nvlist_free(newspares[i]); 1637 kmem_free(newspares, (oldnspares + nspares) * 1638 sizeof (void *)); 1639 } else { 1640 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1641 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1642 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1643 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1644 } 1645 1646 spa_load_spares(spa); 1647 spa->spa_sync_spares = B_TRUE; 1648 } 1649 1650 /* 1651 * We have to be careful when adding new vdevs to an existing pool. 1652 * If other threads start allocating from these vdevs before we 1653 * sync the config cache, and we lose power, then upon reboot we may 1654 * fail to open the pool because there are DVAs that the config cache 1655 * can't translate. Therefore, we first add the vdevs without 1656 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1657 * and then let spa_config_update() initialize the new metaslabs. 1658 * 1659 * spa_load() checks for added-but-not-initialized vdevs, so that 1660 * if we lose power at any point in this sequence, the remaining 1661 * steps will be completed the next time we load the pool. 1662 */ 1663 (void) spa_vdev_exit(spa, vd, txg, 0); 1664 1665 mutex_enter(&spa_namespace_lock); 1666 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1667 mutex_exit(&spa_namespace_lock); 1668 1669 return (0); 1670} 1671 1672/* 1673 * Attach a device to a mirror. The arguments are the path to any device 1674 * in the mirror, and the nvroot for the new device. If the path specifies 1675 * a device that is not mirrored, we automatically insert the mirror vdev. 1676 * 1677 * If 'replacing' is specified, the new device is intended to replace the 1678 * existing device; in this case the two devices are made into their own 1679 * mirror using the 'replacing' vdev, which is functionally idendical to 1680 * the mirror vdev (it actually reuses all the same ops) but has a few 1681 * extra rules: you can't attach to it after it's been created, and upon 1682 * completion of resilvering, the first disk (the one being replaced) 1683 * is automatically detached. 1684 */ 1685int 1686spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1687{ 1688 uint64_t txg, open_txg; 1689 int error; 1690 vdev_t *rvd = spa->spa_root_vdev; 1691 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1692 vdev_ops_t *pvops; 1693 1694 txg = spa_vdev_enter(spa); 1695 1696 oldvd = vdev_lookup_by_guid(rvd, guid); 1697 1698 if (oldvd == NULL) 1699 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1700 1701 if (!oldvd->vdev_ops->vdev_op_leaf) 1702 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1703 1704 pvd = oldvd->vdev_parent; 1705 1706 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1707 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1708 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1709 1710 newvd = newrootvd->vdev_child[0]; 1711 1712 if (!newvd->vdev_ops->vdev_op_leaf) 1713 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1714 1715 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1716 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1717 1718 if (!replacing) { 1719 /* 1720 * For attach, the only allowable parent is a mirror or the root 1721 * vdev. 1722 */ 1723 if (pvd->vdev_ops != &vdev_mirror_ops && 1724 pvd->vdev_ops != &vdev_root_ops) 1725 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1726 1727 pvops = &vdev_mirror_ops; 1728 } else { 1729 /* 1730 * Active hot spares can only be replaced by inactive hot 1731 * spares. 1732 */ 1733 if (pvd->vdev_ops == &vdev_spare_ops && 1734 pvd->vdev_child[1] == oldvd && 1735 !spa_has_spare(spa, newvd->vdev_guid)) 1736 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1737 1738 /* 1739 * If the source is a hot spare, and the parent isn't already a 1740 * spare, then we want to create a new hot spare. Otherwise, we 1741 * want to create a replacing vdev. The user is not allowed to 1742 * attach to a spared vdev child unless the 'isspare' state is 1743 * the same (spare replaces spare, non-spare replaces 1744 * non-spare). 1745 */ 1746 if (pvd->vdev_ops == &vdev_replacing_ops) 1747 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1748 else if (pvd->vdev_ops == &vdev_spare_ops && 1749 newvd->vdev_isspare != oldvd->vdev_isspare) 1750 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1751 else if (pvd->vdev_ops != &vdev_spare_ops && 1752 newvd->vdev_isspare) 1753 pvops = &vdev_spare_ops; 1754 else 1755 pvops = &vdev_replacing_ops; 1756 } 1757 1758 /* 1759 * Compare the new device size with the replaceable/attachable 1760 * device size. 1761 */ 1762 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1763 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1764 1765 /* 1766 * The new device cannot have a higher alignment requirement 1767 * than the top-level vdev. 1768 */ 1769 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1770 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1771 1772 /* 1773 * If this is an in-place replacement, update oldvd's path and devid 1774 * to make it distinguishable from newvd, and unopenable from now on. 1775 */ 1776 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1777 spa_strfree(oldvd->vdev_path); 1778 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1779 KM_SLEEP); 1780 (void) sprintf(oldvd->vdev_path, "%s/%s", 1781 newvd->vdev_path, "old"); 1782 if (oldvd->vdev_devid != NULL) { 1783 spa_strfree(oldvd->vdev_devid); 1784 oldvd->vdev_devid = NULL; 1785 } 1786 } 1787 1788 /* 1789 * If the parent is not a mirror, or if we're replacing, insert the new 1790 * mirror/replacing/spare vdev above oldvd. 1791 */ 1792 if (pvd->vdev_ops != pvops) 1793 pvd = vdev_add_parent(oldvd, pvops); 1794 1795 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1796 ASSERT(pvd->vdev_ops == pvops); 1797 ASSERT(oldvd->vdev_parent == pvd); 1798 1799 /* 1800 * Extract the new device from its root and add it to pvd. 1801 */ 1802 vdev_remove_child(newrootvd, newvd); 1803 newvd->vdev_id = pvd->vdev_children; 1804 vdev_add_child(pvd, newvd); 1805 1806 /* 1807 * If newvd is smaller than oldvd, but larger than its rsize, 1808 * the addition of newvd may have decreased our parent's asize. 1809 */ 1810 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1811 1812 tvd = newvd->vdev_top; 1813 ASSERT(pvd->vdev_top == tvd); 1814 ASSERT(tvd->vdev_parent == rvd); 1815 1816 vdev_config_dirty(tvd); 1817 1818 /* 1819 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1820 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1821 */ 1822 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1823 1824 mutex_enter(&newvd->vdev_dtl_lock); 1825 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1826 open_txg - TXG_INITIAL + 1); 1827 mutex_exit(&newvd->vdev_dtl_lock); 1828 1829 if (newvd->vdev_isspare) 1830 spa_spare_activate(newvd); 1831 1832 /* 1833 * Mark newvd's DTL dirty in this txg. 1834 */ 1835 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1836 1837 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1838 1839 /* 1840 * Kick off a resilver to update newvd. 1841 */ 1842 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1843 1844 return (0); 1845} 1846 1847/* 1848 * Detach a device from a mirror or replacing vdev. 1849 * If 'replace_done' is specified, only detach if the parent 1850 * is a replacing vdev. 1851 */ 1852int 1853spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1854{ 1855 uint64_t txg; 1856 int c, t, error; 1857 vdev_t *rvd = spa->spa_root_vdev; 1858 vdev_t *vd, *pvd, *cvd, *tvd; 1859 boolean_t unspare = B_FALSE; 1860 uint64_t unspare_guid; 1861 1862 txg = spa_vdev_enter(spa); 1863 1864 vd = vdev_lookup_by_guid(rvd, guid); 1865 1866 if (vd == NULL) 1867 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1868 1869 if (!vd->vdev_ops->vdev_op_leaf) 1870 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1871 1872 pvd = vd->vdev_parent; 1873 1874 /* 1875 * If replace_done is specified, only remove this device if it's 1876 * the first child of a replacing vdev. For the 'spare' vdev, either 1877 * disk can be removed. 1878 */ 1879 if (replace_done) { 1880 if (pvd->vdev_ops == &vdev_replacing_ops) { 1881 if (vd->vdev_id != 0) 1882 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1883 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1884 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1885 } 1886 } 1887 1888 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1889 spa_version(spa) >= ZFS_VERSION_SPARES); 1890 1891 /* 1892 * Only mirror, replacing, and spare vdevs support detach. 1893 */ 1894 if (pvd->vdev_ops != &vdev_replacing_ops && 1895 pvd->vdev_ops != &vdev_mirror_ops && 1896 pvd->vdev_ops != &vdev_spare_ops) 1897 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1898 1899 /* 1900 * If there's only one replica, you can't detach it. 1901 */ 1902 if (pvd->vdev_children <= 1) 1903 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1904 1905 /* 1906 * If all siblings have non-empty DTLs, this device may have the only 1907 * valid copy of the data, which means we cannot safely detach it. 1908 * 1909 * XXX -- as in the vdev_offline() case, we really want a more 1910 * precise DTL check. 1911 */ 1912 for (c = 0; c < pvd->vdev_children; c++) { 1913 uint64_t dirty; 1914 1915 cvd = pvd->vdev_child[c]; 1916 if (cvd == vd) 1917 continue; 1918 if (vdev_is_dead(cvd)) 1919 continue; 1920 mutex_enter(&cvd->vdev_dtl_lock); 1921 dirty = cvd->vdev_dtl_map.sm_space | 1922 cvd->vdev_dtl_scrub.sm_space; 1923 mutex_exit(&cvd->vdev_dtl_lock); 1924 if (!dirty) 1925 break; 1926 } 1927 1928 /* 1929 * If we are a replacing or spare vdev, then we can always detach the 1930 * latter child, as that is how one cancels the operation. 1931 */ 1932 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1933 c == pvd->vdev_children) 1934 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1935 1936 /* 1937 * If we are detaching the original disk from a spare, then it implies 1938 * that the spare should become a real disk, and be removed from the 1939 * active spare list for the pool. 1940 */ 1941 if (pvd->vdev_ops == &vdev_spare_ops && 1942 vd->vdev_id == 0) 1943 unspare = B_TRUE; 1944 1945 /* 1946 * Erase the disk labels so the disk can be used for other things. 1947 * This must be done after all other error cases are handled, 1948 * but before we disembowel vd (so we can still do I/O to it). 1949 * But if we can't do it, don't treat the error as fatal -- 1950 * it may be that the unwritability of the disk is the reason 1951 * it's being detached! 1952 */ 1953 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1954 1955 /* 1956 * Remove vd from its parent and compact the parent's children. 1957 */ 1958 vdev_remove_child(pvd, vd); 1959 vdev_compact_children(pvd); 1960 1961 /* 1962 * Remember one of the remaining children so we can get tvd below. 1963 */ 1964 cvd = pvd->vdev_child[0]; 1965 1966 /* 1967 * If we need to remove the remaining child from the list of hot spares, 1968 * do it now, marking the vdev as no longer a spare in the process. We 1969 * must do this before vdev_remove_parent(), because that can change the 1970 * GUID if it creates a new toplevel GUID. 1971 */ 1972 if (unspare) { 1973 ASSERT(cvd->vdev_isspare); 1974 spa_spare_remove(cvd); 1975 unspare_guid = cvd->vdev_guid; 1976 } 1977 1978 /* 1979 * If the parent mirror/replacing vdev only has one child, 1980 * the parent is no longer needed. Remove it from the tree. 1981 */ 1982 if (pvd->vdev_children == 1) 1983 vdev_remove_parent(cvd); 1984 1985 /* 1986 * We don't set tvd until now because the parent we just removed 1987 * may have been the previous top-level vdev. 1988 */ 1989 tvd = cvd->vdev_top; 1990 ASSERT(tvd->vdev_parent == rvd); 1991 1992 /* 1993 * Reevaluate the parent vdev state. 1994 */ 1995 vdev_propagate_state(cvd->vdev_parent); 1996 1997 /* 1998 * If the device we just detached was smaller than the others, it may be 1999 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 2000 * can't fail because the existing metaslabs are already in core, so 2001 * there's nothing to read from disk. 2002 */ 2003 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2004 2005 vdev_config_dirty(tvd); 2006 2007 /* 2008 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 2009 * vd->vdev_detached is set and free vd's DTL object in syncing context. 2010 * But first make sure we're not on any *other* txg's DTL list, to 2011 * prevent vd from being accessed after it's freed. 2012 */ 2013 for (t = 0; t < TXG_SIZE; t++) 2014 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2015 vd->vdev_detached = B_TRUE; 2016 vdev_dirty(tvd, VDD_DTL, vd, txg); 2017 2018 error = spa_vdev_exit(spa, vd, txg, 0); 2019 2020 /* 2021 * If this was the removal of the original device in a hot spare vdev, 2022 * then we want to go through and remove the device from the hot spare 2023 * list of every other pool. 2024 */ 2025 if (unspare) { 2026 spa = NULL; 2027 mutex_enter(&spa_namespace_lock); 2028 while ((spa = spa_next(spa)) != NULL) { 2029 if (spa->spa_state != POOL_STATE_ACTIVE) 2030 continue; 2031 2032 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2033 } 2034 mutex_exit(&spa_namespace_lock); 2035 } 2036 2037 return (error); 2038} 2039 2040/* 2041 * Remove a device from the pool. Currently, this supports removing only hot 2042 * spares. 2043 */ 2044int 2045spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2046{ 2047 vdev_t *vd; 2048 nvlist_t **spares, *nv, **newspares; 2049 uint_t i, j, nspares; 2050 int ret = 0; 2051 2052 spa_config_enter(spa, RW_WRITER, FTAG); 2053 2054 vd = spa_lookup_by_guid(spa, guid); 2055 2056 nv = NULL; 2057 if (spa->spa_spares != NULL && 2058 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2059 &spares, &nspares) == 0) { 2060 for (i = 0; i < nspares; i++) { 2061 uint64_t theguid; 2062 2063 VERIFY(nvlist_lookup_uint64(spares[i], 2064 ZPOOL_CONFIG_GUID, &theguid) == 0); 2065 if (theguid == guid) { 2066 nv = spares[i]; 2067 break; 2068 } 2069 } 2070 } 2071 2072 /* 2073 * We only support removing a hot spare, and only if it's not currently 2074 * in use in this pool. 2075 */ 2076 if (nv == NULL && vd == NULL) { 2077 ret = ENOENT; 2078 goto out; 2079 } 2080 2081 if (nv == NULL && vd != NULL) { 2082 ret = ENOTSUP; 2083 goto out; 2084 } 2085 2086 if (!unspare && nv != NULL && vd != NULL) { 2087 ret = EBUSY; 2088 goto out; 2089 } 2090 2091 if (nspares == 1) { 2092 newspares = NULL; 2093 } else { 2094 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2095 KM_SLEEP); 2096 for (i = 0, j = 0; i < nspares; i++) { 2097 if (spares[i] != nv) 2098 VERIFY(nvlist_dup(spares[i], 2099 &newspares[j++], KM_SLEEP) == 0); 2100 } 2101 } 2102 2103 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2104 DATA_TYPE_NVLIST_ARRAY) == 0); 2105 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2106 newspares, nspares - 1) == 0); 2107 for (i = 0; i < nspares - 1; i++) 2108 nvlist_free(newspares[i]); 2109 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2110 spa_load_spares(spa); 2111 spa->spa_sync_spares = B_TRUE; 2112 2113out: 2114 spa_config_exit(spa, FTAG); 2115 2116 return (ret); 2117} 2118 2119/* 2120 * Find any device that's done replacing, so we can detach it. 2121 */ 2122static vdev_t * 2123spa_vdev_replace_done_hunt(vdev_t *vd) 2124{ 2125 vdev_t *newvd, *oldvd; 2126 int c; 2127 2128 for (c = 0; c < vd->vdev_children; c++) { 2129 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2130 if (oldvd != NULL) 2131 return (oldvd); 2132 } 2133 2134 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2135 oldvd = vd->vdev_child[0]; 2136 newvd = vd->vdev_child[1]; 2137 2138 mutex_enter(&newvd->vdev_dtl_lock); 2139 if (newvd->vdev_dtl_map.sm_space == 0 && 2140 newvd->vdev_dtl_scrub.sm_space == 0) { 2141 mutex_exit(&newvd->vdev_dtl_lock); 2142 return (oldvd); 2143 } 2144 mutex_exit(&newvd->vdev_dtl_lock); 2145 } 2146 2147 return (NULL); 2148} 2149 2150static void 2151spa_vdev_replace_done(spa_t *spa) 2152{ 2153 vdev_t *vd; 2154 vdev_t *pvd; 2155 uint64_t guid; 2156 uint64_t pguid = 0; 2157 2158 spa_config_enter(spa, RW_READER, FTAG); 2159 2160 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2161 guid = vd->vdev_guid; 2162 /* 2163 * If we have just finished replacing a hot spared device, then 2164 * we need to detach the parent's first child (the original hot 2165 * spare) as well. 2166 */ 2167 pvd = vd->vdev_parent; 2168 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2169 pvd->vdev_id == 0) { 2170 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2171 ASSERT(pvd->vdev_parent->vdev_children == 2); 2172 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2173 } 2174 spa_config_exit(spa, FTAG); 2175 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2176 return; 2177 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2178 return; 2179 spa_config_enter(spa, RW_READER, FTAG); 2180 } 2181 2182 spa_config_exit(spa, FTAG); 2183} 2184 2185/* 2186 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2187 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2188 */ 2189int 2190spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2191{ 2192 vdev_t *rvd, *vd; 2193 uint64_t txg; 2194 2195 rvd = spa->spa_root_vdev; 2196 2197 txg = spa_vdev_enter(spa); 2198 2199 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2200 /* 2201 * Determine if this is a reference to a hot spare. In that 2202 * case, update the path as stored in the spare list. 2203 */ 2204 nvlist_t **spares; 2205 uint_t i, nspares; 2206 if (spa->spa_sparelist != NULL) { 2207 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2208 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2209 for (i = 0; i < nspares; i++) { 2210 uint64_t theguid; 2211 VERIFY(nvlist_lookup_uint64(spares[i], 2212 ZPOOL_CONFIG_GUID, &theguid) == 0); 2213 if (theguid == guid) 2214 break; 2215 } 2216 2217 if (i == nspares) 2218 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2219 2220 VERIFY(nvlist_add_string(spares[i], 2221 ZPOOL_CONFIG_PATH, newpath) == 0); 2222 spa_load_spares(spa); 2223 spa->spa_sync_spares = B_TRUE; 2224 return (spa_vdev_exit(spa, NULL, txg, 0)); 2225 } else { 2226 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2227 } 2228 } 2229 2230 if (!vd->vdev_ops->vdev_op_leaf) 2231 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2232 2233 spa_strfree(vd->vdev_path); 2234 vd->vdev_path = spa_strdup(newpath); 2235 2236 vdev_config_dirty(vd->vdev_top); 2237 2238 return (spa_vdev_exit(spa, NULL, txg, 0)); 2239} 2240 2241/* 2242 * ========================================================================== 2243 * SPA Scrubbing 2244 * ========================================================================== 2245 */ 2246 2247static void 2248spa_scrub_io_done(zio_t *zio) 2249{ 2250 spa_t *spa = zio->io_spa; 2251 2252 zio_data_buf_free(zio->io_data, zio->io_size); 2253 2254 mutex_enter(&spa->spa_scrub_lock); 2255 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2256 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2257 spa->spa_scrub_errors++; 2258 mutex_enter(&vd->vdev_stat_lock); 2259 vd->vdev_stat.vs_scrub_errors++; 2260 mutex_exit(&vd->vdev_stat_lock); 2261 } 2262 2263 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2264 cv_broadcast(&spa->spa_scrub_io_cv); 2265 2266 ASSERT(spa->spa_scrub_inflight >= 0); 2267 2268 mutex_exit(&spa->spa_scrub_lock); 2269} 2270 2271static void 2272spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2273 zbookmark_t *zb) 2274{ 2275 size_t size = BP_GET_LSIZE(bp); 2276 void *data; 2277 2278 mutex_enter(&spa->spa_scrub_lock); 2279 /* 2280 * Do not give too much work to vdev(s). 2281 */ 2282 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2283 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2284 } 2285 spa->spa_scrub_inflight++; 2286 mutex_exit(&spa->spa_scrub_lock); 2287 2288 data = zio_data_buf_alloc(size); 2289 2290 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2291 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2292 2293 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2294 2295 zio_nowait(zio_read(NULL, spa, bp, data, size, 2296 spa_scrub_io_done, NULL, priority, flags, zb)); 2297} 2298 2299/* ARGSUSED */ 2300static int 2301spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2302{ 2303 blkptr_t *bp = &bc->bc_blkptr; 2304 vdev_t *vd = spa->spa_root_vdev; 2305 dva_t *dva = bp->blk_dva; 2306 int needs_resilver = B_FALSE; 2307 int d; 2308 2309 if (bc->bc_errno) { 2310 /* 2311 * We can't scrub this block, but we can continue to scrub 2312 * the rest of the pool. Note the error and move along. 2313 */ 2314 mutex_enter(&spa->spa_scrub_lock); 2315 spa->spa_scrub_errors++; 2316 mutex_exit(&spa->spa_scrub_lock); 2317 2318 mutex_enter(&vd->vdev_stat_lock); 2319 vd->vdev_stat.vs_scrub_errors++; 2320 mutex_exit(&vd->vdev_stat_lock); 2321 2322 return (ERESTART); 2323 } 2324 2325 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2326 2327 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2328 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2329 2330 ASSERT(vd != NULL); 2331 2332 /* 2333 * Keep track of how much data we've examined so that 2334 * zpool(1M) status can make useful progress reports. 2335 */ 2336 mutex_enter(&vd->vdev_stat_lock); 2337 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2338 mutex_exit(&vd->vdev_stat_lock); 2339 2340 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2341 if (DVA_GET_GANG(&dva[d])) { 2342 /* 2343 * Gang members may be spread across multiple 2344 * vdevs, so the best we can do is look at the 2345 * pool-wide DTL. 2346 * XXX -- it would be better to change our 2347 * allocation policy to ensure that this can't 2348 * happen. 2349 */ 2350 vd = spa->spa_root_vdev; 2351 } 2352 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2353 bp->blk_birth, 1)) 2354 needs_resilver = B_TRUE; 2355 } 2356 } 2357 2358 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2359 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2360 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2361 else if (needs_resilver) 2362 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2363 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2364 2365 return (0); 2366} 2367 2368static void 2369spa_scrub_thread(void *arg) 2370{ 2371 spa_t *spa = arg; 2372 callb_cpr_t cprinfo; 2373 traverse_handle_t *th = spa->spa_scrub_th; 2374 vdev_t *rvd = spa->spa_root_vdev; 2375 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2376 int error = 0; 2377 boolean_t complete; 2378 2379 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2380 2381 /* 2382 * If we're restarting due to a snapshot create/delete, 2383 * wait for that to complete. 2384 */ 2385 txg_wait_synced(spa_get_dsl(spa), 0); 2386 2387 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2388 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2389 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2390 2391 spa_config_enter(spa, RW_WRITER, FTAG); 2392 vdev_reopen(rvd); /* purge all vdev caches */ 2393 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2394 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2395 spa_config_exit(spa, FTAG); 2396 2397 mutex_enter(&spa->spa_scrub_lock); 2398 spa->spa_scrub_errors = 0; 2399 spa->spa_scrub_active = 1; 2400 ASSERT(spa->spa_scrub_inflight == 0); 2401 2402 while (!spa->spa_scrub_stop) { 2403 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2404 while (spa->spa_scrub_suspended) { 2405 spa->spa_scrub_active = 0; 2406 cv_broadcast(&spa->spa_scrub_cv); 2407 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2408 spa->spa_scrub_active = 1; 2409 } 2410 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2411 2412 if (spa->spa_scrub_restart_txg != 0) 2413 break; 2414 2415 mutex_exit(&spa->spa_scrub_lock); 2416 error = traverse_more(th); 2417 mutex_enter(&spa->spa_scrub_lock); 2418 if (error != EAGAIN) 2419 break; 2420 } 2421 2422 while (spa->spa_scrub_inflight) 2423 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2424 2425 spa->spa_scrub_active = 0; 2426 cv_broadcast(&spa->spa_scrub_cv); 2427 2428 mutex_exit(&spa->spa_scrub_lock); 2429 2430 spa_config_enter(spa, RW_WRITER, FTAG); 2431 2432 mutex_enter(&spa->spa_scrub_lock); 2433 2434 /* 2435 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2436 * AND the spa config lock to synchronize with any config changes 2437 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2438 */ 2439 if (spa->spa_scrub_restart_txg != 0) 2440 error = ERESTART; 2441 2442 if (spa->spa_scrub_stop) 2443 error = EINTR; 2444 2445 /* 2446 * Even if there were uncorrectable errors, we consider the scrub 2447 * completed. The downside is that if there is a transient error during 2448 * a resilver, we won't resilver the data properly to the target. But 2449 * if the damage is permanent (more likely) we will resilver forever, 2450 * which isn't really acceptable. Since there is enough information for 2451 * the user to know what has failed and why, this seems like a more 2452 * tractable approach. 2453 */ 2454 complete = (error == 0); 2455 2456 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2457 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2458 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2459 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2460 2461 mutex_exit(&spa->spa_scrub_lock); 2462 2463 /* 2464 * If the scrub/resilver completed, update all DTLs to reflect this. 2465 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2466 */ 2467 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2468 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2469 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2470 spa_errlog_rotate(spa); 2471 2472 spa_config_exit(spa, FTAG); 2473 2474 mutex_enter(&spa->spa_scrub_lock); 2475 2476 /* 2477 * We may have finished replacing a device. 2478 * Let the async thread assess this and handle the detach. 2479 */ 2480 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2481 2482 /* 2483 * If we were told to restart, our final act is to start a new scrub. 2484 */ 2485 if (error == ERESTART) 2486 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 2487 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2488 2489 spa->spa_scrub_type = POOL_SCRUB_NONE; 2490 spa->spa_scrub_active = 0; 2491 spa->spa_scrub_thread = NULL; 2492 cv_broadcast(&spa->spa_scrub_cv); 2493 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2494 thread_exit(); 2495} 2496 2497void 2498spa_scrub_suspend(spa_t *spa) 2499{ 2500 mutex_enter(&spa->spa_scrub_lock); 2501 spa->spa_scrub_suspended++; 2502 while (spa->spa_scrub_active) { 2503 cv_broadcast(&spa->spa_scrub_cv); 2504 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2505 } 2506 while (spa->spa_scrub_inflight) 2507 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2508 mutex_exit(&spa->spa_scrub_lock); 2509} 2510 2511void 2512spa_scrub_resume(spa_t *spa) 2513{ 2514 mutex_enter(&spa->spa_scrub_lock); 2515 ASSERT(spa->spa_scrub_suspended != 0); 2516 if (--spa->spa_scrub_suspended == 0) 2517 cv_broadcast(&spa->spa_scrub_cv); 2518 mutex_exit(&spa->spa_scrub_lock); 2519} 2520 2521void 2522spa_scrub_restart(spa_t *spa, uint64_t txg) 2523{ 2524 /* 2525 * Something happened (e.g. snapshot create/delete) that means 2526 * we must restart any in-progress scrubs. The itinerary will 2527 * fix this properly. 2528 */ 2529 mutex_enter(&spa->spa_scrub_lock); 2530 spa->spa_scrub_restart_txg = txg; 2531 mutex_exit(&spa->spa_scrub_lock); 2532} 2533 2534int 2535spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2536{ 2537 space_seg_t *ss; 2538 uint64_t mintxg, maxtxg; 2539 vdev_t *rvd = spa->spa_root_vdev; 2540 2541 if ((uint_t)type >= POOL_SCRUB_TYPES) 2542 return (ENOTSUP); 2543 2544 mutex_enter(&spa->spa_scrub_lock); 2545 2546 /* 2547 * If there's a scrub or resilver already in progress, stop it. 2548 */ 2549 while (spa->spa_scrub_thread != NULL) { 2550 /* 2551 * Don't stop a resilver unless forced. 2552 */ 2553 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2554 mutex_exit(&spa->spa_scrub_lock); 2555 return (EBUSY); 2556 } 2557 spa->spa_scrub_stop = 1; 2558 cv_broadcast(&spa->spa_scrub_cv); 2559 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2560 } 2561 2562 /* 2563 * Terminate the previous traverse. 2564 */ 2565 if (spa->spa_scrub_th != NULL) { 2566 traverse_fini(spa->spa_scrub_th); 2567 spa->spa_scrub_th = NULL; 2568 } 2569 2570 if (rvd == NULL) { 2571 ASSERT(spa->spa_scrub_stop == 0); 2572 ASSERT(spa->spa_scrub_type == type); 2573 ASSERT(spa->spa_scrub_restart_txg == 0); 2574 mutex_exit(&spa->spa_scrub_lock); 2575 return (0); 2576 } 2577 2578 mintxg = TXG_INITIAL - 1; 2579 maxtxg = spa_last_synced_txg(spa) + 1; 2580 2581 mutex_enter(&rvd->vdev_dtl_lock); 2582 2583 if (rvd->vdev_dtl_map.sm_space == 0) { 2584 /* 2585 * The pool-wide DTL is empty. 2586 * If this is a resilver, there's nothing to do except 2587 * check whether any in-progress replacements have completed. 2588 */ 2589 if (type == POOL_SCRUB_RESILVER) { 2590 type = POOL_SCRUB_NONE; 2591 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2592 } 2593 } else { 2594 /* 2595 * The pool-wide DTL is non-empty. 2596 * If this is a normal scrub, upgrade to a resilver instead. 2597 */ 2598 if (type == POOL_SCRUB_EVERYTHING) 2599 type = POOL_SCRUB_RESILVER; 2600 } 2601 2602 if (type == POOL_SCRUB_RESILVER) { 2603 /* 2604 * Determine the resilvering boundaries. 2605 * 2606 * Note: (mintxg, maxtxg) is an open interval, 2607 * i.e. mintxg and maxtxg themselves are not included. 2608 * 2609 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2610 * so we don't claim to resilver a txg that's still changing. 2611 */ 2612 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2613 mintxg = ss->ss_start - 1; 2614 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2615 maxtxg = MIN(ss->ss_end, maxtxg); 2616 } 2617 2618 mutex_exit(&rvd->vdev_dtl_lock); 2619 2620 spa->spa_scrub_stop = 0; 2621 spa->spa_scrub_type = type; 2622 spa->spa_scrub_restart_txg = 0; 2623 2624 if (type != POOL_SCRUB_NONE) { 2625 spa->spa_scrub_mintxg = mintxg; 2626 spa->spa_scrub_maxtxg = maxtxg; 2627 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2628 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2629 ZIO_FLAG_CANFAIL); 2630 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2631 spa->spa_scrub_thread = thread_create(NULL, 0, 2632 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2633 } 2634 2635 mutex_exit(&spa->spa_scrub_lock); 2636 2637 return (0); 2638} 2639 2640/* 2641 * ========================================================================== 2642 * SPA async task processing 2643 * ========================================================================== 2644 */ 2645 2646static void 2647spa_async_reopen(spa_t *spa) 2648{ 2649 vdev_t *rvd = spa->spa_root_vdev; 2650 vdev_t *tvd; 2651 int c; 2652 2653 spa_config_enter(spa, RW_WRITER, FTAG); 2654 2655 for (c = 0; c < rvd->vdev_children; c++) { 2656 tvd = rvd->vdev_child[c]; 2657 if (tvd->vdev_reopen_wanted) { 2658 tvd->vdev_reopen_wanted = 0; 2659 vdev_reopen(tvd); 2660 } 2661 } 2662 2663 spa_config_exit(spa, FTAG); 2664} 2665 2666static void 2667spa_async_thread(void *arg) 2668{ 2669 spa_t *spa = arg; 2670 int tasks; 2671 2672 ASSERT(spa->spa_sync_on); 2673 2674 mutex_enter(&spa->spa_async_lock); 2675 tasks = spa->spa_async_tasks; 2676 spa->spa_async_tasks = 0; 2677 mutex_exit(&spa->spa_async_lock); 2678 2679 /* 2680 * See if the config needs to be updated. 2681 */ 2682 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2683 mutex_enter(&spa_namespace_lock); 2684 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2685 mutex_exit(&spa_namespace_lock); 2686 } 2687 2688 /* 2689 * See if any devices need to be reopened. 2690 */ 2691 if (tasks & SPA_ASYNC_REOPEN) 2692 spa_async_reopen(spa); 2693 2694 /* 2695 * If any devices are done replacing, detach them. 2696 */ 2697 if (tasks & SPA_ASYNC_REPLACE_DONE) 2698 spa_vdev_replace_done(spa); 2699 2700 /* 2701 * Kick off a scrub. 2702 */ 2703 if (tasks & SPA_ASYNC_SCRUB) 2704 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2705 2706 /* 2707 * Kick off a resilver. 2708 */ 2709 if (tasks & SPA_ASYNC_RESILVER) 2710 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2711 2712 /* 2713 * Let the world know that we're done. 2714 */ 2715 mutex_enter(&spa->spa_async_lock); 2716 spa->spa_async_thread = NULL; 2717 cv_broadcast(&spa->spa_async_cv); 2718 mutex_exit(&spa->spa_async_lock); 2719 thread_exit(); 2720} 2721 2722void 2723spa_async_suspend(spa_t *spa) 2724{ 2725 mutex_enter(&spa->spa_async_lock); 2726 spa->spa_async_suspended++; 2727 while (spa->spa_async_thread != NULL) 2728 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2729 mutex_exit(&spa->spa_async_lock); 2730} 2731 2732void 2733spa_async_resume(spa_t *spa) 2734{ 2735 mutex_enter(&spa->spa_async_lock); 2736 ASSERT(spa->spa_async_suspended != 0); 2737 spa->spa_async_suspended--; 2738 mutex_exit(&spa->spa_async_lock); 2739} 2740 2741static void 2742spa_async_dispatch(spa_t *spa) 2743{ 2744 mutex_enter(&spa->spa_async_lock); 2745 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2746 spa->spa_async_thread == NULL && 2747 rootdir != NULL && !vn_is_readonly(rootdir)) 2748 spa->spa_async_thread = thread_create(NULL, 0, 2749 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2750 mutex_exit(&spa->spa_async_lock); 2751} 2752 2753void 2754spa_async_request(spa_t *spa, int task) 2755{ 2756 mutex_enter(&spa->spa_async_lock); 2757 spa->spa_async_tasks |= task; 2758 mutex_exit(&spa->spa_async_lock); 2759} 2760 2761/* 2762 * ========================================================================== 2763 * SPA syncing routines 2764 * ========================================================================== 2765 */ 2766 2767static void 2768spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2769{ 2770 bplist_t *bpl = &spa->spa_sync_bplist; 2771 dmu_tx_t *tx; 2772 blkptr_t blk; 2773 uint64_t itor = 0; 2774 zio_t *zio; 2775 int error; 2776 uint8_t c = 1; 2777 2778 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2779 2780 while (bplist_iterate(bpl, &itor, &blk) == 0) 2781 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2782 2783 error = zio_wait(zio); 2784 ASSERT3U(error, ==, 0); 2785 2786 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2787 bplist_vacate(bpl, tx); 2788 2789 /* 2790 * Pre-dirty the first block so we sync to convergence faster. 2791 * (Usually only the first block is needed.) 2792 */ 2793 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2794 dmu_tx_commit(tx); 2795} 2796 2797static void 2798spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2799{ 2800 char *packed = NULL; 2801 size_t nvsize = 0; 2802 dmu_buf_t *db; 2803 2804 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2805 2806 packed = kmem_alloc(nvsize, KM_SLEEP); 2807 2808 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2809 KM_SLEEP) == 0); 2810 2811 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2812 2813 kmem_free(packed, nvsize); 2814 2815 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2816 dmu_buf_will_dirty(db, tx); 2817 *(uint64_t *)db->db_data = nvsize; 2818 dmu_buf_rele(db, FTAG); 2819} 2820 2821static void 2822spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2823{ 2824 nvlist_t *nvroot; 2825 nvlist_t **spares; 2826 int i; 2827 2828 if (!spa->spa_sync_spares) 2829 return; 2830 2831 /* 2832 * Update the MOS nvlist describing the list of available spares. 2833 * spa_validate_spares() will have already made sure this nvlist is 2834 * valid and the vdevs are labelled appropriately. 2835 */ 2836 if (spa->spa_spares_object == 0) { 2837 spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2838 DMU_OT_PACKED_NVLIST, 1 << 14, 2839 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2840 VERIFY(zap_update(spa->spa_meta_objset, 2841 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2842 sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2843 } 2844 2845 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2846 if (spa->spa_nspares == 0) { 2847 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2848 NULL, 0) == 0); 2849 } else { 2850 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2851 KM_SLEEP); 2852 for (i = 0; i < spa->spa_nspares; i++) 2853 spares[i] = vdev_config_generate(spa, 2854 spa->spa_spares[i], B_FALSE, B_TRUE); 2855 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2856 spares, spa->spa_nspares) == 0); 2857 for (i = 0; i < spa->spa_nspares; i++) 2858 nvlist_free(spares[i]); 2859 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2860 } 2861 2862 spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2863 nvlist_free(nvroot); 2864 2865 spa->spa_sync_spares = B_FALSE; 2866} 2867 2868static void 2869spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2870{ 2871 nvlist_t *config; 2872 2873 if (list_is_empty(&spa->spa_dirty_list)) 2874 return; 2875 2876 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2877 2878 if (spa->spa_config_syncing) 2879 nvlist_free(spa->spa_config_syncing); 2880 spa->spa_config_syncing = config; 2881 2882 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2883} 2884 2885static void 2886spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 2887{ 2888 spa_t *spa = arg1; 2889 nvlist_t *nvp = arg2; 2890 nvpair_t *nvpair; 2891 objset_t *mos = spa->spa_meta_objset; 2892 uint64_t zapobj; 2893 2894 mutex_enter(&spa->spa_props_lock); 2895 if (spa->spa_pool_props_object == 0) { 2896 zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 2897 VERIFY(zapobj > 0); 2898 2899 spa->spa_pool_props_object = zapobj; 2900 2901 VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 2902 DMU_POOL_PROPS, 8, 1, 2903 &spa->spa_pool_props_object, tx) == 0); 2904 } 2905 mutex_exit(&spa->spa_props_lock); 2906 2907 nvpair = NULL; 2908 while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 2909 switch (zpool_name_to_prop(nvpair_name(nvpair))) { 2910 case ZFS_PROP_BOOTFS: 2911 VERIFY(nvlist_lookup_uint64(nvp, 2912 nvpair_name(nvpair), &spa->spa_bootfs) == 0); 2913 VERIFY(zap_update(mos, 2914 spa->spa_pool_props_object, 2915 zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 2916 &spa->spa_bootfs, tx) == 0); 2917 break; 2918 } 2919 } 2920} 2921 2922/* 2923 * Sync the specified transaction group. New blocks may be dirtied as 2924 * part of the process, so we iterate until it converges. 2925 */ 2926void 2927spa_sync(spa_t *spa, uint64_t txg) 2928{ 2929 dsl_pool_t *dp = spa->spa_dsl_pool; 2930 objset_t *mos = spa->spa_meta_objset; 2931 bplist_t *bpl = &spa->spa_sync_bplist; 2932 vdev_t *rvd = spa->spa_root_vdev; 2933 vdev_t *vd; 2934 dmu_tx_t *tx; 2935 int dirty_vdevs; 2936 2937 /* 2938 * Lock out configuration changes. 2939 */ 2940 spa_config_enter(spa, RW_READER, FTAG); 2941 2942 spa->spa_syncing_txg = txg; 2943 spa->spa_sync_pass = 0; 2944 2945 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2946 2947 tx = dmu_tx_create_assigned(dp, txg); 2948 2949 /* 2950 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2951 * set spa_deflate if we have no raid-z vdevs. 2952 */ 2953 if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 2954 spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 2955 int i; 2956 2957 for (i = 0; i < rvd->vdev_children; i++) { 2958 vd = rvd->vdev_child[i]; 2959 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 2960 break; 2961 } 2962 if (i == rvd->vdev_children) { 2963 spa->spa_deflate = TRUE; 2964 VERIFY(0 == zap_add(spa->spa_meta_objset, 2965 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2966 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 2967 } 2968 } 2969 2970 /* 2971 * If anything has changed in this txg, push the deferred frees 2972 * from the previous txg. If not, leave them alone so that we 2973 * don't generate work on an otherwise idle system. 2974 */ 2975 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2976 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 2977 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2978 spa_sync_deferred_frees(spa, txg); 2979 2980 /* 2981 * Iterate to convergence. 2982 */ 2983 do { 2984 spa->spa_sync_pass++; 2985 2986 spa_sync_config_object(spa, tx); 2987 spa_sync_spares(spa, tx); 2988 spa_errlog_sync(spa, txg); 2989 dsl_pool_sync(dp, txg); 2990 2991 dirty_vdevs = 0; 2992 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2993 vdev_sync(vd, txg); 2994 dirty_vdevs++; 2995 } 2996 2997 bplist_sync(bpl, tx); 2998 } while (dirty_vdevs); 2999 3000 bplist_close(bpl); 3001 3002 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3003 3004 /* 3005 * Rewrite the vdev configuration (which includes the uberblock) 3006 * to commit the transaction group. 3007 * 3008 * If there are any dirty vdevs, sync the uberblock to all vdevs. 3009 * Otherwise, pick a random top-level vdev that's known to be 3010 * visible in the config cache (see spa_vdev_add() for details). 3011 * If the write fails, try the next vdev until we're tried them all. 3012 */ 3013 if (!list_is_empty(&spa->spa_dirty_list)) { 3014 VERIFY(vdev_config_sync(rvd, txg) == 0); 3015 } else { 3016 int children = rvd->vdev_children; 3017 int c0 = spa_get_random(children); 3018 int c; 3019 3020 for (c = 0; c < children; c++) { 3021 vd = rvd->vdev_child[(c0 + c) % children]; 3022 if (vd->vdev_ms_array == 0) 3023 continue; 3024 if (vdev_config_sync(vd, txg) == 0) 3025 break; 3026 } 3027 if (c == children) 3028 VERIFY(vdev_config_sync(rvd, txg) == 0); 3029 } 3030 3031 dmu_tx_commit(tx); 3032 3033 /* 3034 * Clear the dirty config list. 3035 */ 3036 while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 3037 vdev_config_clean(vd); 3038 3039 /* 3040 * Now that the new config has synced transactionally, 3041 * let it become visible to the config cache. 3042 */ 3043 if (spa->spa_config_syncing != NULL) { 3044 spa_config_set(spa, spa->spa_config_syncing); 3045 spa->spa_config_txg = txg; 3046 spa->spa_config_syncing = NULL; 3047 } 3048 3049 /* 3050 * Make a stable copy of the fully synced uberblock. 3051 * We use this as the root for pool traversals. 3052 */ 3053 spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3054 3055 spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3056 3057 rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3058 spa->spa_traverse_wanted = 0; 3059 spa->spa_ubsync = spa->spa_uberblock; 3060 rw_exit(&spa->spa_traverse_lock); 3061 3062 spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3063 3064 /* 3065 * Clean up the ZIL records for the synced txg. 3066 */ 3067 dsl_pool_zil_clean(dp); 3068 3069 /* 3070 * Update usable space statistics. 3071 */ 3072 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3073 vdev_sync_done(vd, txg); 3074 3075 /* 3076 * It had better be the case that we didn't dirty anything 3077 * since vdev_config_sync(). 3078 */ 3079 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3080 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3081 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3082 ASSERT(bpl->bpl_queue == NULL); 3083 3084 spa_config_exit(spa, FTAG); 3085 3086 /* 3087 * If any async tasks have been requested, kick them off. 3088 */ 3089 spa_async_dispatch(spa); 3090} 3091 3092/* 3093 * Sync all pools. We don't want to hold the namespace lock across these 3094 * operations, so we take a reference on the spa_t and drop the lock during the 3095 * sync. 3096 */ 3097void 3098spa_sync_allpools(void) 3099{ 3100 spa_t *spa = NULL; 3101 mutex_enter(&spa_namespace_lock); 3102 while ((spa = spa_next(spa)) != NULL) { 3103 if (spa_state(spa) != POOL_STATE_ACTIVE) 3104 continue; 3105 spa_open_ref(spa, FTAG); 3106 mutex_exit(&spa_namespace_lock); 3107 txg_wait_synced(spa_get_dsl(spa), 0); 3108 mutex_enter(&spa_namespace_lock); 3109 spa_close(spa, FTAG); 3110 } 3111 mutex_exit(&spa_namespace_lock); 3112} 3113 3114/* 3115 * ========================================================================== 3116 * Miscellaneous routines 3117 * ========================================================================== 3118 */ 3119 3120/* 3121 * Remove all pools in the system. 3122 */ 3123void 3124spa_evict_all(void) 3125{ 3126 spa_t *spa; 3127 3128 /* 3129 * Remove all cached state. All pools should be closed now, 3130 * so every spa in the AVL tree should be unreferenced. 3131 */ 3132 mutex_enter(&spa_namespace_lock); 3133 while ((spa = spa_next(NULL)) != NULL) { 3134 /* 3135 * Stop async tasks. The async thread may need to detach 3136 * a device that's been replaced, which requires grabbing 3137 * spa_namespace_lock, so we must drop it here. 3138 */ 3139 spa_open_ref(spa, FTAG); 3140 mutex_exit(&spa_namespace_lock); 3141 spa_async_suspend(spa); 3142 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3143 mutex_enter(&spa_namespace_lock); 3144 spa_close(spa, FTAG); 3145 3146 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3147 spa_unload(spa); 3148 spa_deactivate(spa); 3149 } 3150 spa_remove(spa); 3151 } 3152 mutex_exit(&spa_namespace_lock); 3153} 3154 3155vdev_t * 3156spa_lookup_by_guid(spa_t *spa, uint64_t guid) 3157{ 3158 return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 3159} 3160 3161void 3162spa_upgrade(spa_t *spa) 3163{ 3164 spa_config_enter(spa, RW_WRITER, FTAG); 3165 3166 /* 3167 * This should only be called for a non-faulted pool, and since a 3168 * future version would result in an unopenable pool, this shouldn't be 3169 * possible. 3170 */ 3171 ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 3172 3173 spa->spa_uberblock.ub_version = ZFS_VERSION; 3174 vdev_config_dirty(spa->spa_root_vdev); 3175 3176 spa_config_exit(spa, FTAG); 3177 3178 txg_wait_synced(spa_get_dsl(spa), 0); 3179} 3180 3181boolean_t 3182spa_has_spare(spa_t *spa, uint64_t guid) 3183{ 3184 int i; 3185 uint64_t spareguid; 3186 3187 for (i = 0; i < spa->spa_nspares; i++) 3188 if (spa->spa_spares[i]->vdev_guid == guid) 3189 return (B_TRUE); 3190 3191 for (i = 0; i < spa->spa_pending_nspares; i++) { 3192 if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 3193 ZPOOL_CONFIG_GUID, &spareguid) == 0 && 3194 spareguid == guid) 3195 return (B_TRUE); 3196 } 3197 3198 return (B_FALSE); 3199} 3200 3201int 3202spa_set_props(spa_t *spa, nvlist_t *nvp) 3203{ 3204 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 3205 spa, nvp, 3)); 3206} 3207 3208int 3209spa_get_props(spa_t *spa, nvlist_t **nvp) 3210{ 3211 zap_cursor_t zc; 3212 zap_attribute_t za; 3213 objset_t *mos = spa->spa_meta_objset; 3214 zfs_source_t src; 3215 zfs_prop_t prop; 3216 nvlist_t *propval; 3217 uint64_t value; 3218 int err; 3219 3220 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3221 3222 mutex_enter(&spa->spa_props_lock); 3223 /* If no props object, then just return empty nvlist */ 3224 if (spa->spa_pool_props_object == 0) { 3225 mutex_exit(&spa->spa_props_lock); 3226 return (0); 3227 } 3228 3229 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 3230 (err = zap_cursor_retrieve(&zc, &za)) == 0; 3231 zap_cursor_advance(&zc)) { 3232 3233 if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 3234 continue; 3235 3236 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3237 switch (za.za_integer_length) { 3238 case 8: 3239 if (zfs_prop_default_numeric(prop) == 3240 za.za_first_integer) 3241 src = ZFS_SRC_DEFAULT; 3242 else 3243 src = ZFS_SRC_LOCAL; 3244 value = za.za_first_integer; 3245 3246 if (prop == ZFS_PROP_BOOTFS) { 3247 dsl_pool_t *dp; 3248 dsl_dataset_t *ds = NULL; 3249 char strval[MAXPATHLEN]; 3250 3251 dp = spa_get_dsl(spa); 3252 rw_enter(&dp->dp_config_rwlock, RW_READER); 3253 if ((err = dsl_dataset_open_obj(dp, 3254 za.za_first_integer, NULL, DS_MODE_NONE, 3255 FTAG, &ds)) != 0) { 3256 rw_exit(&dp->dp_config_rwlock); 3257 break; 3258 } 3259 dsl_dataset_name(ds, strval); 3260 dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 3261 rw_exit(&dp->dp_config_rwlock); 3262 3263 VERIFY(nvlist_add_uint64(propval, 3264 ZFS_PROP_SOURCE, src) == 0); 3265 VERIFY(nvlist_add_string(propval, 3266 ZFS_PROP_VALUE, strval) == 0); 3267 } else { 3268 VERIFY(nvlist_add_uint64(propval, 3269 ZFS_PROP_SOURCE, src) == 0); 3270 VERIFY(nvlist_add_uint64(propval, 3271 ZFS_PROP_VALUE, value) == 0); 3272 } 3273 VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 3274 propval) == 0); 3275 break; 3276 } 3277 nvlist_free(propval); 3278 } 3279 zap_cursor_fini(&zc); 3280 mutex_exit(&spa->spa_props_lock); 3281 if (err && err != ENOENT) { 3282 nvlist_free(*nvp); 3283 return (err); 3284 } 3285 3286 return (0); 3287} 3288 3289/* 3290 * If the bootfs property value is dsobj, clear it. 3291 */ 3292void 3293spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 3294{ 3295 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 3296 VERIFY(zap_remove(spa->spa_meta_objset, 3297 spa->spa_pool_props_object, 3298 zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 3299 spa->spa_bootfs = 0; 3300 } 3301} 3302