spa.c revision 168498
1109998Smarkm/* 2296465Sdelphij * CDDL HEADER START 3296465Sdelphij * 4296465Sdelphij * The contents of this file are subject to the terms of the 5109998Smarkm * Common Development and Distribution License (the "License"). 6109998Smarkm * You may not use this file except in compliance with the License. 7109998Smarkm * 8109998Smarkm * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9109998Smarkm * or http://www.opensolaris.org/os/licensing. 10109998Smarkm * See the License for the specific language governing permissions 11109998Smarkm * and limitations under the License. 12109998Smarkm * 13109998Smarkm * When distributing Covered Code, include this CDDL HEADER in each 14296465Sdelphij * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15109998Smarkm * If applicable, add the following below this CDDL HEADER, with the 16109998Smarkm * fields enclosed by brackets "[]" replaced with your own identifying 17109998Smarkm * information: Portions Copyright [yyyy] [name of copyright owner] 18109998Smarkm * 19109998Smarkm * CDDL HEADER END 20109998Smarkm */ 21109998Smarkm 22109998Smarkm/* 23109998Smarkm * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24109998Smarkm * Use is subject to license terms. 25109998Smarkm */ 26109998Smarkm 27109998Smarkm#pragma ident "%Z%%M% %I% %E% SMI" 28109998Smarkm 29109998Smarkm/* 30109998Smarkm * This file contains all the routines used when modifying on-disk SPA state. 31109998Smarkm * This includes opening, importing, destroying, exporting a pool, and syncing a 32109998Smarkm * pool. 33109998Smarkm */ 34109998Smarkm 35109998Smarkm#include <sys/zfs_context.h> 36109998Smarkm#include <sys/fm/fs/zfs.h> 37109998Smarkm#include <sys/spa_impl.h> 38109998Smarkm#include <sys/zio.h> 39109998Smarkm#include <sys/zio_checksum.h> 40109998Smarkm#include <sys/zio_compress.h> 41109998Smarkm#include <sys/dmu.h> 42109998Smarkm#include <sys/dmu_tx.h> 43109998Smarkm#include <sys/zap.h> 44109998Smarkm#include <sys/zil.h> 45109998Smarkm#include <sys/vdev_impl.h> 46109998Smarkm#include <sys/metaslab.h> 47109998Smarkm#include <sys/uberblock_impl.h> 48109998Smarkm#include <sys/txg.h> 49109998Smarkm#include <sys/avl.h> 50109998Smarkm#include <sys/dmu_traverse.h> 51109998Smarkm#include <sys/dmu_objset.h> 52109998Smarkm#include <sys/unique.h> 53109998Smarkm#include <sys/dsl_pool.h> 54109998Smarkm#include <sys/dsl_dataset.h> 55109998Smarkm#include <sys/dsl_dir.h> 56109998Smarkm#include <sys/dsl_prop.h> 57109998Smarkm#include <sys/dsl_synctask.h> 58109998Smarkm#include <sys/fs/zfs.h> 59109998Smarkm#include <sys/callb.h> 60160817Ssimon 61109998Smarkmint zio_taskq_threads = 8; 62109998Smarkm 63109998Smarkm/* 64296465Sdelphij * ========================================================================== 65296465Sdelphij * SPA state manipulation (open/create/destroy/import/export) 66296465Sdelphij * ========================================================================== 67296465Sdelphij */ 68296465Sdelphij 69296465Sdelphijstatic int 70296465Sdelphijspa_error_entry_compare(const void *a, const void *b) 71109998Smarkm{ 72296465Sdelphij spa_error_entry_t *sa = (spa_error_entry_t *)a; 73109998Smarkm spa_error_entry_t *sb = (spa_error_entry_t *)b; 74167615Ssimon int ret; 75296465Sdelphij 76167615Ssimon ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 77296465Sdelphij sizeof (zbookmark_t)); 78160817Ssimon 79296465Sdelphij if (ret < 0) 80296465Sdelphij return (-1); 81296465Sdelphij else if (ret > 0) 82296465Sdelphij return (1); 83296465Sdelphij else 
84296465Sdelphij return (0); 85296465Sdelphij} 86296465Sdelphij 87296465Sdelphij/* 88296465Sdelphij * Utility function which retrieves copies of the current logs and 89296465Sdelphij * re-initializes them in the process. 90296465Sdelphij */ 91296465Sdelphijvoid 92296465Sdelphijspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 93296465Sdelphij{ 94296465Sdelphij ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 95296465Sdelphij 96296465Sdelphij bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 97296465Sdelphij bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 98296465Sdelphij 99296465Sdelphij avl_create(&spa->spa_errlist_scrub, 100296465Sdelphij spa_error_entry_compare, sizeof (spa_error_entry_t), 101296465Sdelphij offsetof(spa_error_entry_t, se_avl)); 102296465Sdelphij avl_create(&spa->spa_errlist_last, 103296465Sdelphij spa_error_entry_compare, sizeof (spa_error_entry_t), 104296465Sdelphij offsetof(spa_error_entry_t, se_avl)); 105296465Sdelphij} 106296465Sdelphij 107296465Sdelphij/* 108296465Sdelphij * Activate an uninitialized pool. 109296465Sdelphij */ 110296465Sdelphijstatic void 111160817Ssimonspa_activate(spa_t *spa) 112167615Ssimon{ 113296465Sdelphij int t; 114296465Sdelphij 115296465Sdelphij ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 116109998Smarkm 117296465Sdelphij spa->spa_state = POOL_STATE_ACTIVE; 118109998Smarkm 119110018Snectar spa->spa_normal_class = metaslab_class_create(); 120296465Sdelphij 121296465Sdelphij for (t = 0; t < ZIO_TYPES; t++) { 122296465Sdelphij spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 123296465Sdelphij zio_taskq_threads, maxclsyspri, 50, INT_MAX, 124296465Sdelphij TASKQ_PREPOPULATE); 125296465Sdelphij spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 126296465Sdelphij zio_taskq_threads, maxclsyspri, 50, INT_MAX, 127296465Sdelphij TASKQ_PREPOPULATE); 128109998Smarkm } 129109998Smarkm 130 rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 131 132 mutex_init(&spa->spa_uberblock_lock, NULL, MUTEX_DEFAULT, NULL); 133 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 134 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 135 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 136 cv_init(&spa->spa_config_lock.scl_cv, NULL, CV_DEFAULT, NULL); 137 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 138 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 139 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 140 141 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 142 offsetof(vdev_t, vdev_dirty_node)); 143 144 txg_list_create(&spa->spa_vdev_txg_list, 145 offsetof(struct vdev, vdev_txg_node)); 146 147 avl_create(&spa->spa_errlist_scrub, 148 spa_error_entry_compare, sizeof (spa_error_entry_t), 149 offsetof(spa_error_entry_t, se_avl)); 150 avl_create(&spa->spa_errlist_last, 151 spa_error_entry_compare, sizeof (spa_error_entry_t), 152 offsetof(spa_error_entry_t, se_avl)); 153} 154 155/* 156 * Opposite of spa_activate(). 
157 */ 158static void 159spa_deactivate(spa_t *spa) 160{ 161 int t; 162 163 ASSERT(spa->spa_sync_on == B_FALSE); 164 ASSERT(spa->spa_dsl_pool == NULL); 165 ASSERT(spa->spa_root_vdev == NULL); 166 167 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 168 169 txg_list_destroy(&spa->spa_vdev_txg_list); 170 171 list_destroy(&spa->spa_dirty_list); 172 173 for (t = 0; t < ZIO_TYPES; t++) { 174 taskq_destroy(spa->spa_zio_issue_taskq[t]); 175 taskq_destroy(spa->spa_zio_intr_taskq[t]); 176 spa->spa_zio_issue_taskq[t] = NULL; 177 spa->spa_zio_intr_taskq[t] = NULL; 178 } 179 180 metaslab_class_destroy(spa->spa_normal_class); 181 spa->spa_normal_class = NULL; 182 183 /* 184 * If this was part of an import or the open otherwise failed, we may 185 * still have errors left in the queues. Empty them just in case. 186 */ 187 spa_errlog_drain(spa); 188 189 avl_destroy(&spa->spa_errlist_scrub); 190 avl_destroy(&spa->spa_errlist_last); 191 192 rw_destroy(&spa->spa_traverse_lock); 193 mutex_destroy(&spa->spa_uberblock_lock); 194 mutex_destroy(&spa->spa_errlog_lock); 195 mutex_destroy(&spa->spa_errlist_lock); 196 mutex_destroy(&spa->spa_config_lock.scl_lock); 197 cv_destroy(&spa->spa_config_lock.scl_cv); 198 mutex_destroy(&spa->spa_sync_bplist.bpl_lock); 199 mutex_destroy(&spa->spa_history_lock); 200 mutex_destroy(&spa->spa_props_lock); 201 202 spa->spa_state = POOL_STATE_UNINITIALIZED; 203} 204 205/* 206 * Verify a pool configuration, and construct the vdev tree appropriately. This 207 * will create all the necessary vdevs in the appropriate layout, with each vdev 208 * in the CLOSED state. This will prep the pool before open/creation/import. 209 * All vdev validation is done by the vdev_alloc() routine. 210 */ 211static int 212spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 213 uint_t id, int atype) 214{ 215 nvlist_t **child; 216 uint_t c, children; 217 int error; 218 219 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 220 return (error); 221 222 if ((*vdp)->vdev_ops->vdev_op_leaf) 223 return (0); 224 225 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 226 &child, &children) != 0) { 227 vdev_free(*vdp); 228 *vdp = NULL; 229 return (EINVAL); 230 } 231 232 for (c = 0; c < children; c++) { 233 vdev_t *vd; 234 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 235 atype)) != 0) { 236 vdev_free(*vdp); 237 *vdp = NULL; 238 return (error); 239 } 240 } 241 242 ASSERT(*vdp != NULL); 243 244 return (0); 245} 246 247/* 248 * Opposite of spa_load(). 249 */ 250static void 251spa_unload(spa_t *spa) 252{ 253 int i; 254 255 /* 256 * Stop async tasks. 257 */ 258 spa_async_suspend(spa); 259 260 /* 261 * Stop syncing. 262 */ 263 if (spa->spa_sync_on) { 264 txg_sync_stop(spa->spa_dsl_pool); 265 spa->spa_sync_on = B_FALSE; 266 } 267 268 /* 269 * Wait for any outstanding prefetch I/O to complete. 270 */ 271 spa_config_enter(spa, RW_WRITER, FTAG); 272 spa_config_exit(spa, FTAG); 273 274 /* 275 * Close the dsl pool. 276 */ 277 if (spa->spa_dsl_pool) { 278 dsl_pool_close(spa->spa_dsl_pool); 279 spa->spa_dsl_pool = NULL; 280 } 281 282 /* 283 * Close all vdevs. 
284 */ 285 if (spa->spa_root_vdev) 286 vdev_free(spa->spa_root_vdev); 287 ASSERT(spa->spa_root_vdev == NULL); 288 289 for (i = 0; i < spa->spa_nspares; i++) 290 vdev_free(spa->spa_spares[i]); 291 if (spa->spa_spares) { 292 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 293 spa->spa_spares = NULL; 294 } 295 if (spa->spa_sparelist) { 296 nvlist_free(spa->spa_sparelist); 297 spa->spa_sparelist = NULL; 298 } 299 300 spa->spa_async_suspended = 0; 301} 302 303/* 304 * Load (or re-load) the current list of vdevs describing the active spares for 305 * this pool. When this is called, we have some form of basic information in 306 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 307 * re-generate a more complete list including status information. 308 */ 309static void 310spa_load_spares(spa_t *spa) 311{ 312 nvlist_t **spares; 313 uint_t nspares; 314 int i; 315 vdev_t *vd, *tvd; 316 317 /* 318 * First, close and free any existing spare vdevs. 319 */ 320 for (i = 0; i < spa->spa_nspares; i++) { 321 vd = spa->spa_spares[i]; 322 323 /* Undo the call to spa_activate() below */ 324 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 325 tvd->vdev_isspare) 326 spa_spare_remove(tvd); 327 vdev_close(vd); 328 vdev_free(vd); 329 } 330 331 if (spa->spa_spares) 332 kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 333 334 if (spa->spa_sparelist == NULL) 335 nspares = 0; 336 else 337 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 338 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 339 340 spa->spa_nspares = (int)nspares; 341 spa->spa_spares = NULL; 342 343 if (nspares == 0) 344 return; 345 346 /* 347 * Construct the array of vdevs, opening them to get status in the 348 * process. For each spare, there is potentially two different vdev_t 349 * structures associated with it: one in the list of spares (used only 350 * for basic validation purposes) and one in the active vdev 351 * configuration (if it's spared in). During this phase we open and 352 * validate each vdev on the spare list. If the vdev also exists in the 353 * active configuration, then we also mark this vdev as an active spare. 354 */ 355 spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 356 for (i = 0; i < spa->spa_nspares; i++) { 357 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 358 VDEV_ALLOC_SPARE) == 0); 359 ASSERT(vd != NULL); 360 361 spa->spa_spares[i] = vd; 362 363 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 364 if (!tvd->vdev_isspare) 365 spa_spare_add(tvd); 366 367 /* 368 * We only mark the spare active if we were successfully 369 * able to load the vdev. Otherwise, importing a pool 370 * with a bad active spare would result in strange 371 * behavior, because multiple pool would think the spare 372 * is actively in use. 373 * 374 * There is a vulnerability here to an equally bizarre 375 * circumstance, where a dead active spare is later 376 * brought back to life (onlined or otherwise). Given 377 * the rarity of this scenario, and the extra complexity 378 * it adds, we ignore the possibility. 379 */ 380 if (!vdev_is_dead(tvd)) 381 spa_spare_activate(tvd); 382 } 383 384 if (vdev_open(vd) != 0) 385 continue; 386 387 vd->vdev_top = vd; 388 (void) vdev_validate_spare(vd); 389 } 390 391 /* 392 * Recompute the stashed list of spares, with status information 393 * this time. 
394 */ 395 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 396 DATA_TYPE_NVLIST_ARRAY) == 0); 397 398 spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 399 for (i = 0; i < spa->spa_nspares; i++) 400 spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 401 B_TRUE, B_TRUE); 402 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 403 spares, spa->spa_nspares) == 0); 404 for (i = 0; i < spa->spa_nspares; i++) 405 nvlist_free(spares[i]); 406 kmem_free(spares, spa->spa_nspares * sizeof (void *)); 407} 408 409static int 410load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 411{ 412 dmu_buf_t *db; 413 char *packed = NULL; 414 size_t nvsize = 0; 415 int error; 416 *value = NULL; 417 418 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 419 nvsize = *(uint64_t *)db->db_data; 420 dmu_buf_rele(db, FTAG); 421 422 packed = kmem_alloc(nvsize, KM_SLEEP); 423 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 424 if (error == 0) 425 error = nvlist_unpack(packed, nvsize, value, 0); 426 kmem_free(packed, nvsize); 427 428 return (error); 429} 430 431/* 432 * Load an existing storage pool, using the pool's builtin spa_config as a 433 * source of configuration information. 434 */ 435static int 436spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 437{ 438 int error = 0; 439 nvlist_t *nvroot = NULL; 440 vdev_t *rvd; 441 uberblock_t *ub = &spa->spa_uberblock; 442 uint64_t config_cache_txg = spa->spa_config_txg; 443 uint64_t pool_guid; 444 uint64_t version; 445 zio_t *zio; 446 447 spa->spa_load_state = state; 448 449 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 450 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 451 error = EINVAL; 452 goto out; 453 } 454 455 /* 456 * Versioning wasn't explicitly added to the label until later, so if 457 * it's not present treat it as the initial version. 458 */ 459 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 460 version = ZFS_VERSION_INITIAL; 461 462 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 463 &spa->spa_config_txg); 464 465 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 466 spa_guid_exists(pool_guid, 0)) { 467 error = EEXIST; 468 goto out; 469 } 470 471 spa->spa_load_guid = pool_guid; 472 473 /* 474 * Parse the configuration into a vdev tree. We explicitly set the 475 * value that will be returned by spa_version() since parsing the 476 * configuration requires knowing the version number. 477 */ 478 spa_config_enter(spa, RW_WRITER, FTAG); 479 spa->spa_ubsync.ub_version = version; 480 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 481 spa_config_exit(spa, FTAG); 482 483 if (error != 0) 484 goto out; 485 486 ASSERT(spa->spa_root_vdev == rvd); 487 ASSERT(spa_guid(spa) == pool_guid); 488 489 /* 490 * Try to open all vdevs, loading each label in the process. 491 */ 492 if (vdev_open(rvd) != 0) { 493 error = ENXIO; 494 goto out; 495 } 496 497 /* 498 * Validate the labels for all leaf vdevs. We need to grab the config 499 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 500 * flag. 501 */ 502 spa_config_enter(spa, RW_READER, FTAG); 503 error = vdev_validate(rvd); 504 spa_config_exit(spa, FTAG); 505 506 if (error != 0) { 507 error = EBADF; 508 goto out; 509 } 510 511 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 512 error = ENXIO; 513 goto out; 514 } 515 516 /* 517 * Find the best uberblock. 
518 */ 519 bzero(ub, sizeof (uberblock_t)); 520 521 zio = zio_root(spa, NULL, NULL, 522 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 523 vdev_uberblock_load(zio, rvd, ub); 524 error = zio_wait(zio); 525 526 /* 527 * If we weren't able to find a single valid uberblock, return failure. 528 */ 529 if (ub->ub_txg == 0) { 530 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 531 VDEV_AUX_CORRUPT_DATA); 532 error = ENXIO; 533 goto out; 534 } 535 536 /* 537 * If the pool is newer than the code, we can't open it. 538 */ 539 if (ub->ub_version > ZFS_VERSION) { 540 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 541 VDEV_AUX_VERSION_NEWER); 542 error = ENOTSUP; 543 goto out; 544 } 545 546 /* 547 * If the vdev guid sum doesn't match the uberblock, we have an 548 * incomplete configuration. 549 */ 550 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 551 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 552 VDEV_AUX_BAD_GUID_SUM); 553 error = ENXIO; 554 goto out; 555 } 556 557 /* 558 * Initialize internal SPA structures. 559 */ 560 spa->spa_state = POOL_STATE_ACTIVE; 561 spa->spa_ubsync = spa->spa_uberblock; 562 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 563 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 564 if (error) { 565 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 566 VDEV_AUX_CORRUPT_DATA); 567 goto out; 568 } 569 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 570 571 if (zap_lookup(spa->spa_meta_objset, 572 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 573 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 574 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 575 VDEV_AUX_CORRUPT_DATA); 576 error = EIO; 577 goto out; 578 } 579 580 if (!mosconfig) { 581 nvlist_t *newconfig; 582 uint64_t hostid; 583 584 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 585 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 586 VDEV_AUX_CORRUPT_DATA); 587 error = EIO; 588 goto out; 589 } 590 591 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 592 &hostid) == 0) { 593 char *hostname; 594 unsigned long myhostid = 0; 595 596 VERIFY(nvlist_lookup_string(newconfig, 597 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 598 599 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 600 if ((unsigned long)hostid != myhostid) { 601 cmn_err(CE_WARN, "pool '%s' could not be " 602 "loaded as it was last accessed by " 603 "another system (host: %s hostid: 0x%lx). " 604 "See: http://www.sun.com/msg/ZFS-8000-EY", 605 spa->spa_name, hostname, 606 (unsigned long)hostid); 607 error = EBADF; 608 goto out; 609 } 610 } 611 612 spa_config_set(spa, newconfig); 613 spa_unload(spa); 614 spa_deactivate(spa); 615 spa_activate(spa); 616 617 return (spa_load(spa, newconfig, state, B_TRUE)); 618 } 619 620 if (zap_lookup(spa->spa_meta_objset, 621 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 622 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 623 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 624 VDEV_AUX_CORRUPT_DATA); 625 error = EIO; 626 goto out; 627 } 628 629 /* 630 * Load the bit that tells us to use the new accounting function 631 * (raid-z deflation). If we have an older pool, this will not 632 * be present. 633 */ 634 error = zap_lookup(spa->spa_meta_objset, 635 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 636 sizeof (uint64_t), 1, &spa->spa_deflate); 637 if (error != 0 && error != ENOENT) { 638 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 639 VDEV_AUX_CORRUPT_DATA); 640 error = EIO; 641 goto out; 642 } 643 644 /* 645 * Load the persistent error log. 
If we have an older pool, this will 646 * not be present. 647 */ 648 error = zap_lookup(spa->spa_meta_objset, 649 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 650 sizeof (uint64_t), 1, &spa->spa_errlog_last); 651 if (error != 0 && error != ENOENT) { 652 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 653 VDEV_AUX_CORRUPT_DATA); 654 error = EIO; 655 goto out; 656 } 657 658 error = zap_lookup(spa->spa_meta_objset, 659 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 660 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 661 if (error != 0 && error != ENOENT) { 662 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 663 VDEV_AUX_CORRUPT_DATA); 664 error = EIO; 665 goto out; 666 } 667 668 /* 669 * Load the history object. If we have an older pool, this 670 * will not be present. 671 */ 672 error = zap_lookup(spa->spa_meta_objset, 673 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 674 sizeof (uint64_t), 1, &spa->spa_history); 675 if (error != 0 && error != ENOENT) { 676 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 677 VDEV_AUX_CORRUPT_DATA); 678 error = EIO; 679 goto out; 680 } 681 682 /* 683 * Load any hot spares for this pool. 684 */ 685 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 686 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 687 if (error != 0 && error != ENOENT) { 688 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 689 VDEV_AUX_CORRUPT_DATA); 690 error = EIO; 691 goto out; 692 } 693 if (error == 0) { 694 ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 695 if (load_nvlist(spa, spa->spa_spares_object, 696 &spa->spa_sparelist) != 0) { 697 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 698 VDEV_AUX_CORRUPT_DATA); 699 error = EIO; 700 goto out; 701 } 702 703 spa_config_enter(spa, RW_WRITER, FTAG); 704 spa_load_spares(spa); 705 spa_config_exit(spa, FTAG); 706 } 707 708 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 709 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 710 711 if (error && error != ENOENT) { 712 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 713 VDEV_AUX_CORRUPT_DATA); 714 error = EIO; 715 goto out; 716 } 717 718 if (error == 0) { 719 (void) zap_lookup(spa->spa_meta_objset, 720 spa->spa_pool_props_object, 721 zpool_prop_to_name(ZFS_PROP_BOOTFS), 722 sizeof (uint64_t), 1, &spa->spa_bootfs); 723 } 724 725 /* 726 * Load the vdev state for all toplevel vdevs. 727 */ 728 vdev_load(rvd); 729 730 /* 731 * Propagate the leaf DTLs we just loaded all the way up the tree. 732 */ 733 spa_config_enter(spa, RW_WRITER, FTAG); 734 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 735 spa_config_exit(spa, FTAG); 736 737 /* 738 * Check the state of the root vdev. If it can't be opened, it 739 * indicates one or more toplevel vdevs are faulted. 740 */ 741 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 742 error = ENXIO; 743 goto out; 744 } 745 746 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 747 dmu_tx_t *tx; 748 int need_update = B_FALSE; 749 int c; 750 751 /* 752 * Claim log blocks that haven't been committed yet. 753 * This must all happen in a single txg. 754 */ 755 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 756 spa_first_txg(spa)); 757 (void) dmu_objset_find(spa->spa_name, 758 zil_claim, tx, DS_FIND_CHILDREN); 759 dmu_tx_commit(tx); 760 761 spa->spa_sync_on = B_TRUE; 762 txg_sync_start(spa->spa_dsl_pool); 763 764 /* 765 * Wait for all claims to sync. 
766 */ 767 txg_wait_synced(spa->spa_dsl_pool, 0); 768 769 /* 770 * If the config cache is stale, or we have uninitialized 771 * metaslabs (see spa_vdev_add()), then update the config. 772 */ 773 if (config_cache_txg != spa->spa_config_txg || 774 state == SPA_LOAD_IMPORT) 775 need_update = B_TRUE; 776 777 for (c = 0; c < rvd->vdev_children; c++) 778 if (rvd->vdev_child[c]->vdev_ms_array == 0) 779 need_update = B_TRUE; 780 781 /* 782 * Update the config cache asychronously in case we're the 783 * root pool, in which case the config cache isn't writable yet. 784 */ 785 if (need_update) 786 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 787 } 788 789 error = 0; 790out: 791 if (error && error != EBADF) 792 zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 793 spa->spa_load_state = SPA_LOAD_NONE; 794 spa->spa_ena = 0; 795 796 return (error); 797} 798 799/* 800 * Pool Open/Import 801 * 802 * The import case is identical to an open except that the configuration is sent 803 * down from userland, instead of grabbed from the configuration cache. For the 804 * case of an open, the pool configuration will exist in the 805 * POOL_STATE_UNITIALIZED state. 806 * 807 * The stats information (gen/count/ustats) is used to gather vdev statistics at 808 * the same time open the pool, without having to keep around the spa_t in some 809 * ambiguous state. 810 */ 811static int 812spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 813{ 814 spa_t *spa; 815 int error; 816 int loaded = B_FALSE; 817 int locked = B_FALSE; 818 819 *spapp = NULL; 820 821 /* 822 * As disgusting as this is, we need to support recursive calls to this 823 * function because dsl_dir_open() is called during spa_load(), and ends 824 * up calling spa_open() again. The real fix is to figure out how to 825 * avoid dsl_dir_open() calling this in the first place. 826 */ 827 if (mutex_owner(&spa_namespace_lock) != curthread) { 828 mutex_enter(&spa_namespace_lock); 829 locked = B_TRUE; 830 } 831 832 if ((spa = spa_lookup(pool)) == NULL) { 833 if (locked) 834 mutex_exit(&spa_namespace_lock); 835 return (ENOENT); 836 } 837 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 838 839 spa_activate(spa); 840 841 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 842 843 if (error == EBADF) { 844 /* 845 * If vdev_validate() returns failure (indicated by 846 * EBADF), it indicates that one of the vdevs indicates 847 * that the pool has been exported or destroyed. If 848 * this is the case, the config cache is out of sync and 849 * we should remove the pool from the namespace. 850 */ 851 zfs_post_ok(spa, NULL); 852 spa_unload(spa); 853 spa_deactivate(spa); 854 spa_remove(spa); 855 spa_config_sync(); 856 if (locked) 857 mutex_exit(&spa_namespace_lock); 858 return (ENOENT); 859 } 860 861 if (error) { 862 /* 863 * We can't open the pool, but we still have useful 864 * information: the state of each vdev after the 865 * attempted vdev_open(). Return this to the user. 
866 */ 867 if (config != NULL && spa->spa_root_vdev != NULL) { 868 spa_config_enter(spa, RW_READER, FTAG); 869 *config = spa_config_generate(spa, NULL, -1ULL, 870 B_TRUE); 871 spa_config_exit(spa, FTAG); 872 } 873 spa_unload(spa); 874 spa_deactivate(spa); 875 spa->spa_last_open_failed = B_TRUE; 876 if (locked) 877 mutex_exit(&spa_namespace_lock); 878 *spapp = NULL; 879 return (error); 880 } else { 881 zfs_post_ok(spa, NULL); 882 spa->spa_last_open_failed = B_FALSE; 883 } 884 885 loaded = B_TRUE; 886 } 887 888 spa_open_ref(spa, tag); 889 if (locked) 890 mutex_exit(&spa_namespace_lock); 891 892 *spapp = spa; 893 894 if (config != NULL) { 895 spa_config_enter(spa, RW_READER, FTAG); 896 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 897 spa_config_exit(spa, FTAG); 898 } 899 900 /* 901 * If we just loaded the pool, resilver anything that's out of date. 902 */ 903 if (loaded && (spa_mode & FWRITE)) 904 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 905 906 return (0); 907} 908 909int 910spa_open(const char *name, spa_t **spapp, void *tag) 911{ 912 return (spa_open_common(name, spapp, tag, NULL)); 913} 914 915/* 916 * Lookup the given spa_t, incrementing the inject count in the process, 917 * preventing it from being exported or destroyed. 918 */ 919spa_t * 920spa_inject_addref(char *name) 921{ 922 spa_t *spa; 923 924 mutex_enter(&spa_namespace_lock); 925 if ((spa = spa_lookup(name)) == NULL) { 926 mutex_exit(&spa_namespace_lock); 927 return (NULL); 928 } 929 spa->spa_inject_ref++; 930 mutex_exit(&spa_namespace_lock); 931 932 return (spa); 933} 934 935void 936spa_inject_delref(spa_t *spa) 937{ 938 mutex_enter(&spa_namespace_lock); 939 spa->spa_inject_ref--; 940 mutex_exit(&spa_namespace_lock); 941} 942 943static void 944spa_add_spares(spa_t *spa, nvlist_t *config) 945{ 946 nvlist_t **spares; 947 uint_t i, nspares; 948 nvlist_t *nvroot; 949 uint64_t guid; 950 vdev_stat_t *vs; 951 uint_t vsc; 952 uint64_t pool; 953 954 if (spa->spa_nspares == 0) 955 return; 956 957 VERIFY(nvlist_lookup_nvlist(config, 958 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 959 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 960 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 961 if (nspares != 0) { 962 VERIFY(nvlist_add_nvlist_array(nvroot, 963 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 964 VERIFY(nvlist_lookup_nvlist_array(nvroot, 965 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 966 967 /* 968 * Go through and find any spares which have since been 969 * repurposed as an active spare. If this is the case, update 970 * their status appropriately. 971 */ 972 for (i = 0; i < nspares; i++) { 973 VERIFY(nvlist_lookup_uint64(spares[i], 974 ZPOOL_CONFIG_GUID, &guid) == 0); 975 if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 976 VERIFY(nvlist_lookup_uint64_array( 977 spares[i], ZPOOL_CONFIG_STATS, 978 (uint64_t **)&vs, &vsc) == 0); 979 vs->vs_state = VDEV_STATE_CANT_OPEN; 980 vs->vs_aux = VDEV_AUX_SPARED; 981 } 982 } 983 } 984} 985 986int 987spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 988{ 989 int error; 990 spa_t *spa; 991 992 *config = NULL; 993 error = spa_open_common(name, &spa, FTAG, config); 994 995 if (spa && *config != NULL) { 996 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 997 spa_get_errlog_size(spa)) == 0); 998 999 spa_add_spares(spa, *config); 1000 } 1001 1002 /* 1003 * We want to get the alternate root even for faulted pools, so we cheat 1004 * and call spa_lookup() directly. 
1005 */ 1006 if (altroot) { 1007 if (spa == NULL) { 1008 mutex_enter(&spa_namespace_lock); 1009 spa = spa_lookup(name); 1010 if (spa) 1011 spa_altroot(spa, altroot, buflen); 1012 else 1013 altroot[0] = '\0'; 1014 spa = NULL; 1015 mutex_exit(&spa_namespace_lock); 1016 } else { 1017 spa_altroot(spa, altroot, buflen); 1018 } 1019 } 1020 1021 if (spa != NULL) 1022 spa_close(spa, FTAG); 1023 1024 return (error); 1025} 1026 1027/* 1028 * Validate that the 'spares' array is well formed. We must have an array of 1029 * nvlists, each which describes a valid leaf vdev. If this is an import (mode 1030 * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 1031 * as they are well-formed. 1032 */ 1033static int 1034spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1035{ 1036 nvlist_t **spares; 1037 uint_t i, nspares; 1038 vdev_t *vd; 1039 int error; 1040 1041 /* 1042 * It's acceptable to have no spares specified. 1043 */ 1044 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1045 &spares, &nspares) != 0) 1046 return (0); 1047 1048 if (nspares == 0) 1049 return (EINVAL); 1050 1051 /* 1052 * Make sure the pool is formatted with a version that supports hot 1053 * spares. 1054 */ 1055 if (spa_version(spa) < ZFS_VERSION_SPARES) 1056 return (ENOTSUP); 1057 1058 /* 1059 * Set the pending spare list so we correctly handle device in-use 1060 * checking. 1061 */ 1062 spa->spa_pending_spares = spares; 1063 spa->spa_pending_nspares = nspares; 1064 1065 for (i = 0; i < nspares; i++) { 1066 if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 1067 mode)) != 0) 1068 goto out; 1069 1070 if (!vd->vdev_ops->vdev_op_leaf) { 1071 vdev_free(vd); 1072 error = EINVAL; 1073 goto out; 1074 } 1075 1076 vd->vdev_top = vd; 1077 1078 if ((error = vdev_open(vd)) == 0 && 1079 (error = vdev_label_init(vd, crtxg, 1080 VDEV_LABEL_SPARE)) == 0) { 1081 VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 1082 vd->vdev_guid) == 0); 1083 } 1084 1085 vdev_free(vd); 1086 1087 if (error && mode != VDEV_ALLOC_SPARE) 1088 goto out; 1089 else 1090 error = 0; 1091 } 1092 1093out: 1094 spa->spa_pending_spares = NULL; 1095 spa->spa_pending_nspares = 0; 1096 return (error); 1097} 1098 1099/* 1100 * Pool Creation 1101 */ 1102int 1103spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1104{ 1105 spa_t *spa; 1106 vdev_t *rvd; 1107 dsl_pool_t *dp; 1108 dmu_tx_t *tx; 1109 int c, error = 0; 1110 uint64_t txg = TXG_INITIAL; 1111 nvlist_t **spares; 1112 uint_t nspares; 1113 1114 /* 1115 * If this pool already exists, return failure. 1116 */ 1117 mutex_enter(&spa_namespace_lock); 1118 if (spa_lookup(pool) != NULL) { 1119 mutex_exit(&spa_namespace_lock); 1120 return (EEXIST); 1121 } 1122 1123 /* 1124 * Allocate a new spa_t structure. 1125 */ 1126 spa = spa_add(pool, altroot); 1127 spa_activate(spa); 1128 1129 spa->spa_uberblock.ub_txg = txg - 1; 1130 spa->spa_uberblock.ub_version = ZFS_VERSION; 1131 spa->spa_ubsync = spa->spa_uberblock; 1132 1133 /* 1134 * Create the root vdev. 
1135 */ 1136 spa_config_enter(spa, RW_WRITER, FTAG); 1137 1138 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1139 1140 ASSERT(error != 0 || rvd != NULL); 1141 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1142 1143 if (error == 0 && rvd->vdev_children == 0) 1144 error = EINVAL; 1145 1146 if (error == 0 && 1147 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1148 (error = spa_validate_spares(spa, nvroot, txg, 1149 VDEV_ALLOC_ADD)) == 0) { 1150 for (c = 0; c < rvd->vdev_children; c++) 1151 vdev_init(rvd->vdev_child[c], txg); 1152 vdev_config_dirty(rvd); 1153 } 1154 1155 spa_config_exit(spa, FTAG); 1156 1157 if (error != 0) { 1158 spa_unload(spa); 1159 spa_deactivate(spa); 1160 spa_remove(spa); 1161 mutex_exit(&spa_namespace_lock); 1162 return (error); 1163 } 1164 1165 /* 1166 * Get the list of spares, if specified. 1167 */ 1168 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1169 &spares, &nspares) == 0) { 1170 VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 1171 KM_SLEEP) == 0); 1172 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1173 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1174 spa_config_enter(spa, RW_WRITER, FTAG); 1175 spa_load_spares(spa); 1176 spa_config_exit(spa, FTAG); 1177 spa->spa_sync_spares = B_TRUE; 1178 } 1179 1180 spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1181 spa->spa_meta_objset = dp->dp_meta_objset; 1182 1183 tx = dmu_tx_create_assigned(dp, txg); 1184 1185 /* 1186 * Create the pool config object. 1187 */ 1188 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1189 DMU_OT_PACKED_NVLIST, 1 << 14, 1190 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1191 1192 if (zap_add(spa->spa_meta_objset, 1193 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1194 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1195 cmn_err(CE_PANIC, "failed to add pool config"); 1196 } 1197 1198 /* Newly created pools are always deflated. */ 1199 spa->spa_deflate = TRUE; 1200 if (zap_add(spa->spa_meta_objset, 1201 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1202 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 1203 cmn_err(CE_PANIC, "failed to add deflate"); 1204 } 1205 1206 /* 1207 * Create the deferred-free bplist object. Turn off compression 1208 * because sync-to-convergence takes longer if the blocksize 1209 * keeps changing. 1210 */ 1211 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1212 1 << 14, tx); 1213 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1214 ZIO_COMPRESS_OFF, tx); 1215 1216 if (zap_add(spa->spa_meta_objset, 1217 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1218 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 1219 cmn_err(CE_PANIC, "failed to add bplist"); 1220 } 1221 1222 /* 1223 * Create the pool's history object. 1224 */ 1225 spa_history_create_obj(spa, tx); 1226 1227 dmu_tx_commit(tx); 1228 1229 spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1230 spa->spa_sync_on = B_TRUE; 1231 txg_sync_start(spa->spa_dsl_pool); 1232 1233 /* 1234 * We explicitly wait for the first transaction to complete so that our 1235 * bean counters are appropriately updated. 1236 */ 1237 txg_wait_synced(spa->spa_dsl_pool, txg); 1238 1239 spa_config_sync(); 1240 1241 mutex_exit(&spa_namespace_lock); 1242 1243 return (0); 1244} 1245 1246/* 1247 * Import the given pool into the system. We set up the necessary spa_t and 1248 * then call spa_load() to do the dirty work. 
1249 */ 1250int 1251spa_import(const char *pool, nvlist_t *config, const char *altroot) 1252{ 1253 spa_t *spa; 1254 int error; 1255 nvlist_t *nvroot; 1256 nvlist_t **spares; 1257 uint_t nspares; 1258 1259 if (!(spa_mode & FWRITE)) 1260 return (EROFS); 1261 1262 /* 1263 * If a pool with this name exists, return failure. 1264 */ 1265 mutex_enter(&spa_namespace_lock); 1266 if (spa_lookup(pool) != NULL) { 1267 mutex_exit(&spa_namespace_lock); 1268 return (EEXIST); 1269 } 1270 1271 /* 1272 * Create and initialize the spa structure. 1273 */ 1274 spa = spa_add(pool, altroot); 1275 spa_activate(spa); 1276 1277 /* 1278 * Pass off the heavy lifting to spa_load(). 1279 * Pass TRUE for mosconfig because the user-supplied config 1280 * is actually the one to trust when doing an import. 1281 */ 1282 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1283 1284 spa_config_enter(spa, RW_WRITER, FTAG); 1285 /* 1286 * Toss any existing sparelist, as it doesn't have any validity anymore, 1287 * and conflicts with spa_has_spare(). 1288 */ 1289 if (spa->spa_sparelist) { 1290 nvlist_free(spa->spa_sparelist); 1291 spa->spa_sparelist = NULL; 1292 spa_load_spares(spa); 1293 } 1294 1295 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1296 &nvroot) == 0); 1297 if (error == 0) 1298 error = spa_validate_spares(spa, nvroot, -1ULL, 1299 VDEV_ALLOC_SPARE); 1300 spa_config_exit(spa, FTAG); 1301 1302 if (error != 0) { 1303 spa_unload(spa); 1304 spa_deactivate(spa); 1305 spa_remove(spa); 1306 mutex_exit(&spa_namespace_lock); 1307 return (error); 1308 } 1309 1310 /* 1311 * Override any spares as specified by the user, as these may have 1312 * correct device names/devids, etc. 1313 */ 1314 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1315 &spares, &nspares) == 0) { 1316 if (spa->spa_sparelist) 1317 VERIFY(nvlist_remove(spa->spa_sparelist, 1318 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1319 else 1320 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1321 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1322 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1323 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1324 spa_config_enter(spa, RW_WRITER, FTAG); 1325 spa_load_spares(spa); 1326 spa_config_exit(spa, FTAG); 1327 spa->spa_sync_spares = B_TRUE; 1328 } 1329 1330 /* 1331 * Update the config cache to include the newly-imported pool. 1332 */ 1333 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1334 1335 mutex_exit(&spa_namespace_lock); 1336 1337 /* 1338 * Resilver anything that's out of date. 1339 */ 1340 if (spa_mode & FWRITE) 1341 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1342 1343 return (0); 1344} 1345 1346/* 1347 * This (illegal) pool name is used when temporarily importing a spa_t in order 1348 * to get the vdev stats associated with the imported devices. 1349 */ 1350#define TRYIMPORT_NAME "$import" 1351 1352nvlist_t * 1353spa_tryimport(nvlist_t *tryconfig) 1354{ 1355 nvlist_t *config = NULL; 1356 char *poolname; 1357 spa_t *spa; 1358 uint64_t state; 1359 1360 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1361 return (NULL); 1362 1363 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1364 return (NULL); 1365 1366 /* 1367 * Create and initialize the spa structure. 1368 */ 1369 mutex_enter(&spa_namespace_lock); 1370 spa = spa_add(TRYIMPORT_NAME, NULL); 1371 spa_activate(spa); 1372 1373 /* 1374 * Pass off the heavy lifting to spa_load(). 
1375 * Pass TRUE for mosconfig because the user-supplied config 1376 * is actually the one to trust when doing an import. 1377 */ 1378 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1379 1380 /* 1381 * If 'tryconfig' was at least parsable, return the current config. 1382 */ 1383 if (spa->spa_root_vdev != NULL) { 1384 spa_config_enter(spa, RW_READER, FTAG); 1385 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1386 spa_config_exit(spa, FTAG); 1387 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1388 poolname) == 0); 1389 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1390 state) == 0); 1391 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1392 spa->spa_uberblock.ub_timestamp) == 0); 1393 1394 /* 1395 * Add the list of hot spares. 1396 */ 1397 spa_add_spares(spa, config); 1398 } 1399 1400 spa_unload(spa); 1401 spa_deactivate(spa); 1402 spa_remove(spa); 1403 mutex_exit(&spa_namespace_lock); 1404 1405 return (config); 1406} 1407 1408/* 1409 * Pool export/destroy 1410 * 1411 * The act of destroying or exporting a pool is very simple. We make sure there 1412 * is no more pending I/O and any references to the pool are gone. Then, we 1413 * update the pool state and sync all the labels to disk, removing the 1414 * configuration from the cache afterwards. 1415 */ 1416static int 1417spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1418{ 1419 spa_t *spa; 1420 1421 if (oldconfig) 1422 *oldconfig = NULL; 1423 1424 if (!(spa_mode & FWRITE)) 1425 return (EROFS); 1426 1427 mutex_enter(&spa_namespace_lock); 1428 if ((spa = spa_lookup(pool)) == NULL) { 1429 mutex_exit(&spa_namespace_lock); 1430 return (ENOENT); 1431 } 1432 1433 /* 1434 * Put a hold on the pool, drop the namespace lock, stop async tasks, 1435 * reacquire the namespace lock, and see if we can export. 1436 */ 1437 spa_open_ref(spa, FTAG); 1438 mutex_exit(&spa_namespace_lock); 1439 spa_async_suspend(spa); 1440 mutex_enter(&spa_namespace_lock); 1441 spa_close(spa, FTAG); 1442 1443 /* 1444 * The pool will be in core if it's openable, 1445 * in which case we can modify its state. 1446 */ 1447 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1448 /* 1449 * Objsets may be open only because they're dirty, so we 1450 * have to force it to sync before checking spa_refcnt. 1451 */ 1452 spa_scrub_suspend(spa); 1453 txg_wait_synced(spa->spa_dsl_pool, 0); 1454 1455 /* 1456 * A pool cannot be exported or destroyed if there are active 1457 * references. If we are resetting a pool, allow references by 1458 * fault injection handlers. 1459 */ 1460 if (!spa_refcount_zero(spa) || 1461 (spa->spa_inject_ref != 0 && 1462 new_state != POOL_STATE_UNINITIALIZED)) { 1463 spa_scrub_resume(spa); 1464 spa_async_resume(spa); 1465 mutex_exit(&spa_namespace_lock); 1466 return (EBUSY); 1467 } 1468 1469 spa_scrub_resume(spa); 1470 VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1471 1472 /* 1473 * We want this to be reflected on every label, 1474 * so mark them all dirty. spa_unload() will do the 1475 * final sync that pushes these changes out. 
1476 */ 1477 if (new_state != POOL_STATE_UNINITIALIZED) { 1478 spa_config_enter(spa, RW_WRITER, FTAG); 1479 spa->spa_state = new_state; 1480 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 1481 vdev_config_dirty(spa->spa_root_vdev); 1482 spa_config_exit(spa, FTAG); 1483 } 1484 } 1485 1486 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1487 spa_unload(spa); 1488 spa_deactivate(spa); 1489 } 1490 1491 if (oldconfig && spa->spa_config) 1492 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 1493 1494 if (new_state != POOL_STATE_UNINITIALIZED) { 1495 spa_remove(spa); 1496 spa_config_sync(); 1497 } 1498 mutex_exit(&spa_namespace_lock); 1499 1500 return (0); 1501} 1502 1503/* 1504 * Destroy a storage pool. 1505 */ 1506int 1507spa_destroy(char *pool) 1508{ 1509 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1510} 1511 1512/* 1513 * Export a storage pool. 1514 */ 1515int 1516spa_export(char *pool, nvlist_t **oldconfig) 1517{ 1518 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1519} 1520 1521/* 1522 * Similar to spa_export(), this unloads the spa_t without actually removing it 1523 * from the namespace in any way. 1524 */ 1525int 1526spa_reset(char *pool) 1527{ 1528 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 1529} 1530 1531 1532/* 1533 * ========================================================================== 1534 * Device manipulation 1535 * ========================================================================== 1536 */ 1537 1538/* 1539 * Add capacity to a storage pool. 1540 */ 1541int 1542spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1543{ 1544 uint64_t txg; 1545 int c, error; 1546 vdev_t *rvd = spa->spa_root_vdev; 1547 vdev_t *vd, *tvd; 1548 nvlist_t **spares; 1549 uint_t i, nspares; 1550 1551 txg = spa_vdev_enter(spa); 1552 1553 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 1554 VDEV_ALLOC_ADD)) != 0) 1555 return (spa_vdev_exit(spa, NULL, txg, error)); 1556 1557 spa->spa_pending_vdev = vd; 1558 1559 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1560 &spares, &nspares) != 0) 1561 nspares = 0; 1562 1563 if (vd->vdev_children == 0 && nspares == 0) { 1564 spa->spa_pending_vdev = NULL; 1565 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1566 } 1567 1568 if (vd->vdev_children != 0) { 1569 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 1570 spa->spa_pending_vdev = NULL; 1571 return (spa_vdev_exit(spa, vd, txg, error)); 1572 } 1573 } 1574 1575 /* 1576 * We must validate the spares after checking the children. Otherwise, 1577 * vdev_inuse() will blindly overwrite the spare. 1578 */ 1579 if ((error = spa_validate_spares(spa, nvroot, txg, 1580 VDEV_ALLOC_ADD)) != 0) { 1581 spa->spa_pending_vdev = NULL; 1582 return (spa_vdev_exit(spa, vd, txg, error)); 1583 } 1584 1585 spa->spa_pending_vdev = NULL; 1586 1587 /* 1588 * Transfer each new top-level vdev from vd to rvd. 
1589 */ 1590 for (c = 0; c < vd->vdev_children; c++) { 1591 tvd = vd->vdev_child[c]; 1592 vdev_remove_child(vd, tvd); 1593 tvd->vdev_id = rvd->vdev_children; 1594 vdev_add_child(rvd, tvd); 1595 vdev_config_dirty(tvd); 1596 } 1597 1598 if (nspares != 0) { 1599 if (spa->spa_sparelist != NULL) { 1600 nvlist_t **oldspares; 1601 uint_t oldnspares; 1602 nvlist_t **newspares; 1603 1604 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 1605 ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 1606 1607 newspares = kmem_alloc(sizeof (void *) * 1608 (nspares + oldnspares), KM_SLEEP); 1609 for (i = 0; i < oldnspares; i++) 1610 VERIFY(nvlist_dup(oldspares[i], 1611 &newspares[i], KM_SLEEP) == 0); 1612 for (i = 0; i < nspares; i++) 1613 VERIFY(nvlist_dup(spares[i], 1614 &newspares[i + oldnspares], 1615 KM_SLEEP) == 0); 1616 1617 VERIFY(nvlist_remove(spa->spa_sparelist, 1618 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 1619 1620 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1621 ZPOOL_CONFIG_SPARES, newspares, 1622 nspares + oldnspares) == 0); 1623 for (i = 0; i < oldnspares + nspares; i++) 1624 nvlist_free(newspares[i]); 1625 kmem_free(newspares, (oldnspares + nspares) * 1626 sizeof (void *)); 1627 } else { 1628 VERIFY(nvlist_alloc(&spa->spa_sparelist, 1629 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1630 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 1631 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1632 } 1633 1634 spa_load_spares(spa); 1635 spa->spa_sync_spares = B_TRUE; 1636 } 1637 1638 /* 1639 * We have to be careful when adding new vdevs to an existing pool. 1640 * If other threads start allocating from these vdevs before we 1641 * sync the config cache, and we lose power, then upon reboot we may 1642 * fail to open the pool because there are DVAs that the config cache 1643 * can't translate. Therefore, we first add the vdevs without 1644 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1645 * and then let spa_config_update() initialize the new metaslabs. 1646 * 1647 * spa_load() checks for added-but-not-initialized vdevs, so that 1648 * if we lose power at any point in this sequence, the remaining 1649 * steps will be completed the next time we load the pool. 1650 */ 1651 (void) spa_vdev_exit(spa, vd, txg, 0); 1652 1653 mutex_enter(&spa_namespace_lock); 1654 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1655 mutex_exit(&spa_namespace_lock); 1656 1657 return (0); 1658} 1659 1660/* 1661 * Attach a device to a mirror. The arguments are the path to any device 1662 * in the mirror, and the nvroot for the new device. If the path specifies 1663 * a device that is not mirrored, we automatically insert the mirror vdev. 1664 * 1665 * If 'replacing' is specified, the new device is intended to replace the 1666 * existing device; in this case the two devices are made into their own 1667 * mirror using the 'replacing' vdev, which is functionally idendical to 1668 * the mirror vdev (it actually reuses all the same ops) but has a few 1669 * extra rules: you can't attach to it after it's been created, and upon 1670 * completion of resilvering, the first disk (the one being replaced) 1671 * is automatically detached. 
1672 */ 1673int 1674spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1675{ 1676 uint64_t txg, open_txg; 1677 int error; 1678 vdev_t *rvd = spa->spa_root_vdev; 1679 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1680 vdev_ops_t *pvops; 1681 1682 txg = spa_vdev_enter(spa); 1683 1684 oldvd = vdev_lookup_by_guid(rvd, guid); 1685 1686 if (oldvd == NULL) 1687 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1688 1689 if (!oldvd->vdev_ops->vdev_op_leaf) 1690 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1691 1692 pvd = oldvd->vdev_parent; 1693 1694 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1695 VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1696 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1697 1698 newvd = newrootvd->vdev_child[0]; 1699 1700 if (!newvd->vdev_ops->vdev_op_leaf) 1701 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1702 1703 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1704 return (spa_vdev_exit(spa, newrootvd, txg, error)); 1705 1706 if (!replacing) { 1707 /* 1708 * For attach, the only allowable parent is a mirror or the root 1709 * vdev. 1710 */ 1711 if (pvd->vdev_ops != &vdev_mirror_ops && 1712 pvd->vdev_ops != &vdev_root_ops) 1713 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1714 1715 pvops = &vdev_mirror_ops; 1716 } else { 1717 /* 1718 * Active hot spares can only be replaced by inactive hot 1719 * spares. 1720 */ 1721 if (pvd->vdev_ops == &vdev_spare_ops && 1722 pvd->vdev_child[1] == oldvd && 1723 !spa_has_spare(spa, newvd->vdev_guid)) 1724 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1725 1726 /* 1727 * If the source is a hot spare, and the parent isn't already a 1728 * spare, then we want to create a new hot spare. Otherwise, we 1729 * want to create a replacing vdev. The user is not allowed to 1730 * attach to a spared vdev child unless the 'isspare' state is 1731 * the same (spare replaces spare, non-spare replaces 1732 * non-spare). 1733 */ 1734 if (pvd->vdev_ops == &vdev_replacing_ops) 1735 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1736 else if (pvd->vdev_ops == &vdev_spare_ops && 1737 newvd->vdev_isspare != oldvd->vdev_isspare) 1738 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1739 else if (pvd->vdev_ops != &vdev_spare_ops && 1740 newvd->vdev_isspare) 1741 pvops = &vdev_spare_ops; 1742 else 1743 pvops = &vdev_replacing_ops; 1744 } 1745 1746 /* 1747 * Compare the new device size with the replaceable/attachable 1748 * device size. 1749 */ 1750 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1751 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1752 1753 /* 1754 * The new device cannot have a higher alignment requirement 1755 * than the top-level vdev. 1756 */ 1757 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1758 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1759 1760 /* 1761 * If this is an in-place replacement, update oldvd's path and devid 1762 * to make it distinguishable from newvd, and unopenable from now on. 1763 */ 1764 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1765 spa_strfree(oldvd->vdev_path); 1766 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1767 KM_SLEEP); 1768 (void) sprintf(oldvd->vdev_path, "%s/%s", 1769 newvd->vdev_path, "old"); 1770 if (oldvd->vdev_devid != NULL) { 1771 spa_strfree(oldvd->vdev_devid); 1772 oldvd->vdev_devid = NULL; 1773 } 1774 } 1775 1776 /* 1777 * If the parent is not a mirror, or if we're replacing, insert the new 1778 * mirror/replacing/spare vdev above oldvd. 
1779 */ 1780 if (pvd->vdev_ops != pvops) 1781 pvd = vdev_add_parent(oldvd, pvops); 1782 1783 ASSERT(pvd->vdev_top->vdev_parent == rvd); 1784 ASSERT(pvd->vdev_ops == pvops); 1785 ASSERT(oldvd->vdev_parent == pvd); 1786 1787 /* 1788 * Extract the new device from its root and add it to pvd. 1789 */ 1790 vdev_remove_child(newrootvd, newvd); 1791 newvd->vdev_id = pvd->vdev_children; 1792 vdev_add_child(pvd, newvd); 1793 1794 /* 1795 * If newvd is smaller than oldvd, but larger than its rsize, 1796 * the addition of newvd may have decreased our parent's asize. 1797 */ 1798 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1799 1800 tvd = newvd->vdev_top; 1801 ASSERT(pvd->vdev_top == tvd); 1802 ASSERT(tvd->vdev_parent == rvd); 1803 1804 vdev_config_dirty(tvd); 1805 1806 /* 1807 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1808 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1809 */ 1810 open_txg = txg + TXG_CONCURRENT_STATES - 1; 1811 1812 mutex_enter(&newvd->vdev_dtl_lock); 1813 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1814 open_txg - TXG_INITIAL + 1); 1815 mutex_exit(&newvd->vdev_dtl_lock); 1816 1817 if (newvd->vdev_isspare) 1818 spa_spare_activate(newvd); 1819 1820 /* 1821 * Mark newvd's DTL dirty in this txg. 1822 */ 1823 vdev_dirty(tvd, VDD_DTL, newvd, txg); 1824 1825 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1826 1827 /* 1828 * Kick off a resilver to update newvd. 1829 */ 1830 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1831 1832 return (0); 1833} 1834 1835/* 1836 * Detach a device from a mirror or replacing vdev. 1837 * If 'replace_done' is specified, only detach if the parent 1838 * is a replacing vdev. 1839 */ 1840int 1841spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1842{ 1843 uint64_t txg; 1844 int c, t, error; 1845 vdev_t *rvd = spa->spa_root_vdev; 1846 vdev_t *vd, *pvd, *cvd, *tvd; 1847 boolean_t unspare = B_FALSE; 1848 uint64_t unspare_guid; 1849 1850 txg = spa_vdev_enter(spa); 1851 1852 vd = vdev_lookup_by_guid(rvd, guid); 1853 1854 if (vd == NULL) 1855 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1856 1857 if (!vd->vdev_ops->vdev_op_leaf) 1858 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1859 1860 pvd = vd->vdev_parent; 1861 1862 /* 1863 * If replace_done is specified, only remove this device if it's 1864 * the first child of a replacing vdev. For the 'spare' vdev, either 1865 * disk can be removed. 1866 */ 1867 if (replace_done) { 1868 if (pvd->vdev_ops == &vdev_replacing_ops) { 1869 if (vd->vdev_id != 0) 1870 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1871 } else if (pvd->vdev_ops != &vdev_spare_ops) { 1872 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1873 } 1874 } 1875 1876 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1877 spa_version(spa) >= ZFS_VERSION_SPARES); 1878 1879 /* 1880 * Only mirror, replacing, and spare vdevs support detach. 1881 */ 1882 if (pvd->vdev_ops != &vdev_replacing_ops && 1883 pvd->vdev_ops != &vdev_mirror_ops && 1884 pvd->vdev_ops != &vdev_spare_ops) 1885 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1886 1887 /* 1888 * If there's only one replica, you can't detach it. 1889 */ 1890 if (pvd->vdev_children <= 1) 1891 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1892 1893 /* 1894 * If all siblings have non-empty DTLs, this device may have the only 1895 * valid copy of the data, which means we cannot safely detach it. 1896 * 1897 * XXX -- as in the vdev_offline() case, we really want a more 1898 * precise DTL check. 
1899 */ 1900 for (c = 0; c < pvd->vdev_children; c++) { 1901 uint64_t dirty; 1902 1903 cvd = pvd->vdev_child[c]; 1904 if (cvd == vd) 1905 continue; 1906 if (vdev_is_dead(cvd)) 1907 continue; 1908 mutex_enter(&cvd->vdev_dtl_lock); 1909 dirty = cvd->vdev_dtl_map.sm_space | 1910 cvd->vdev_dtl_scrub.sm_space; 1911 mutex_exit(&cvd->vdev_dtl_lock); 1912 if (!dirty) 1913 break; 1914 } 1915 1916 /* 1917 * If we are a replacing or spare vdev, then we can always detach the 1918 * latter child, as that is how one cancels the operation. 1919 */ 1920 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1921 c == pvd->vdev_children) 1922 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1923 1924 /* 1925 * If we are detaching the original disk from a spare, then it implies 1926 * that the spare should become a real disk, and be removed from the 1927 * active spare list for the pool. 1928 */ 1929 if (pvd->vdev_ops == &vdev_spare_ops && 1930 vd->vdev_id == 0) 1931 unspare = B_TRUE; 1932 1933 /* 1934 * Erase the disk labels so the disk can be used for other things. 1935 * This must be done after all other error cases are handled, 1936 * but before we disembowel vd (so we can still do I/O to it). 1937 * But if we can't do it, don't treat the error as fatal -- 1938 * it may be that the unwritability of the disk is the reason 1939 * it's being detached! 1940 */ 1941 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1942 1943 /* 1944 * Remove vd from its parent and compact the parent's children. 1945 */ 1946 vdev_remove_child(pvd, vd); 1947 vdev_compact_children(pvd); 1948 1949 /* 1950 * Remember one of the remaining children so we can get tvd below. 1951 */ 1952 cvd = pvd->vdev_child[0]; 1953 1954 /* 1955 * If we need to remove the remaining child from the list of hot spares, 1956 * do it now, marking the vdev as no longer a spare in the process. We 1957 * must do this before vdev_remove_parent(), because that can change the 1958 * GUID if it creates a new toplevel GUID. 1959 */ 1960 if (unspare) { 1961 ASSERT(cvd->vdev_isspare); 1962 spa_spare_remove(cvd); 1963 unspare_guid = cvd->vdev_guid; 1964 } 1965 1966 /* 1967 * If the parent mirror/replacing vdev only has one child, 1968 * the parent is no longer needed. Remove it from the tree. 1969 */ 1970 if (pvd->vdev_children == 1) 1971 vdev_remove_parent(cvd); 1972 1973 /* 1974 * We don't set tvd until now because the parent we just removed 1975 * may have been the previous top-level vdev. 1976 */ 1977 tvd = cvd->vdev_top; 1978 ASSERT(tvd->vdev_parent == rvd); 1979 1980 /* 1981 * Reevaluate the parent vdev state. 1982 */ 1983 vdev_propagate_state(cvd->vdev_parent); 1984 1985 /* 1986 * If the device we just detached was smaller than the others, it may be 1987 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 1988 * can't fail because the existing metaslabs are already in core, so 1989 * there's nothing to read from disk. 1990 */ 1991 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1992 1993 vdev_config_dirty(tvd); 1994 1995 /* 1996 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 1997 * vd->vdev_detached is set and free vd's DTL object in syncing context. 1998 * But first make sure we're not on any *other* txg's DTL list, to 1999 * prevent vd from being accessed after it's freed. 
2000 */ 2001 for (t = 0; t < TXG_SIZE; t++) 2002 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 2003 vd->vdev_detached = B_TRUE; 2004 vdev_dirty(tvd, VDD_DTL, vd, txg); 2005 2006 error = spa_vdev_exit(spa, vd, txg, 0); 2007 2008 /* 2009 * If this was the removal of the original device in a hot spare vdev, 2010 * then we want to go through and remove the device from the hot spare 2011 * list of every other pool. 2012 */ 2013 if (unspare) { 2014 spa = NULL; 2015 mutex_enter(&spa_namespace_lock); 2016 while ((spa = spa_next(spa)) != NULL) { 2017 if (spa->spa_state != POOL_STATE_ACTIVE) 2018 continue; 2019 2020 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 2021 } 2022 mutex_exit(&spa_namespace_lock); 2023 } 2024 2025 return (error); 2026} 2027 2028/* 2029 * Remove a device from the pool. Currently, this supports removing only hot 2030 * spares. 2031 */ 2032int 2033spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 2034{ 2035 vdev_t *vd; 2036 nvlist_t **spares, *nv, **newspares; 2037 uint_t i, j, nspares; 2038 int ret = 0; 2039 2040 spa_config_enter(spa, RW_WRITER, FTAG); 2041 2042 vd = spa_lookup_by_guid(spa, guid); 2043 2044 nv = NULL; 2045 if (spa->spa_spares != NULL && 2046 nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2047 &spares, &nspares) == 0) { 2048 for (i = 0; i < nspares; i++) { 2049 uint64_t theguid; 2050 2051 VERIFY(nvlist_lookup_uint64(spares[i], 2052 ZPOOL_CONFIG_GUID, &theguid) == 0); 2053 if (theguid == guid) { 2054 nv = spares[i]; 2055 break; 2056 } 2057 } 2058 } 2059 2060 /* 2061 * We only support removing a hot spare, and only if it's not currently 2062 * in use in this pool. 2063 */ 2064 if (nv == NULL && vd == NULL) { 2065 ret = ENOENT; 2066 goto out; 2067 } 2068 2069 if (nv == NULL && vd != NULL) { 2070 ret = ENOTSUP; 2071 goto out; 2072 } 2073 2074 if (!unspare && nv != NULL && vd != NULL) { 2075 ret = EBUSY; 2076 goto out; 2077 } 2078 2079 if (nspares == 1) { 2080 newspares = NULL; 2081 } else { 2082 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 2083 KM_SLEEP); 2084 for (i = 0, j = 0; i < nspares; i++) { 2085 if (spares[i] != nv) 2086 VERIFY(nvlist_dup(spares[i], 2087 &newspares[j++], KM_SLEEP) == 0); 2088 } 2089 } 2090 2091 VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2092 DATA_TYPE_NVLIST_ARRAY) == 0); 2093 VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 2094 newspares, nspares - 1) == 0); 2095 for (i = 0; i < nspares - 1; i++) 2096 nvlist_free(newspares[i]); 2097 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 2098 spa_load_spares(spa); 2099 spa->spa_sync_spares = B_TRUE; 2100 2101out: 2102 spa_config_exit(spa, FTAG); 2103 2104 return (ret); 2105} 2106 2107/* 2108 * Find any device that's done replacing, so we can detach it. 
2109 */ 2110static vdev_t * 2111spa_vdev_replace_done_hunt(vdev_t *vd) 2112{ 2113 vdev_t *newvd, *oldvd; 2114 int c; 2115 2116 for (c = 0; c < vd->vdev_children; c++) { 2117 oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 2118 if (oldvd != NULL) 2119 return (oldvd); 2120 } 2121 2122 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 2123 oldvd = vd->vdev_child[0]; 2124 newvd = vd->vdev_child[1]; 2125 2126 mutex_enter(&newvd->vdev_dtl_lock); 2127 if (newvd->vdev_dtl_map.sm_space == 0 && 2128 newvd->vdev_dtl_scrub.sm_space == 0) { 2129 mutex_exit(&newvd->vdev_dtl_lock); 2130 return (oldvd); 2131 } 2132 mutex_exit(&newvd->vdev_dtl_lock); 2133 } 2134 2135 return (NULL); 2136} 2137 2138static void 2139spa_vdev_replace_done(spa_t *spa) 2140{ 2141 vdev_t *vd; 2142 vdev_t *pvd; 2143 uint64_t guid; 2144 uint64_t pguid = 0; 2145 2146 spa_config_enter(spa, RW_READER, FTAG); 2147 2148 while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 2149 guid = vd->vdev_guid; 2150 /* 2151 * If we have just finished replacing a hot spared device, then 2152 * we need to detach the parent's first child (the original hot 2153 * spare) as well. 2154 */ 2155 pvd = vd->vdev_parent; 2156 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2157 pvd->vdev_id == 0) { 2158 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2159 ASSERT(pvd->vdev_parent->vdev_children == 2); 2160 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2161 } 2162 spa_config_exit(spa, FTAG); 2163 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 2164 return; 2165 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2166 return; 2167 spa_config_enter(spa, RW_READER, FTAG); 2168 } 2169 2170 spa_config_exit(spa, FTAG); 2171} 2172 2173/* 2174 * Update the stored path for this vdev. Dirty the vdev configuration, relying 2175 * on spa_vdev_enter/exit() to synchronize the labels and cache. 2176 */ 2177int 2178spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 2179{ 2180 vdev_t *rvd, *vd; 2181 uint64_t txg; 2182 2183 rvd = spa->spa_root_vdev; 2184 2185 txg = spa_vdev_enter(spa); 2186 2187 if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2188 /* 2189 * Determine if this is a reference to a hot spare. In that 2190 * case, update the path as stored in the spare list. 
2191 */ 2192 nvlist_t **spares; 2193 uint_t i, nspares; 2194 if (spa->spa_sparelist != NULL) { 2195 VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2196 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2197 for (i = 0; i < nspares; i++) { 2198 uint64_t theguid; 2199 VERIFY(nvlist_lookup_uint64(spares[i], 2200 ZPOOL_CONFIG_GUID, &theguid) == 0); 2201 if (theguid == guid) 2202 break; 2203 } 2204 2205 if (i == nspares) 2206 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2207 2208 VERIFY(nvlist_add_string(spares[i], 2209 ZPOOL_CONFIG_PATH, newpath) == 0); 2210 spa_load_spares(spa); 2211 spa->spa_sync_spares = B_TRUE; 2212 return (spa_vdev_exit(spa, NULL, txg, 0)); 2213 } else { 2214 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2215 } 2216 } 2217 2218 if (!vd->vdev_ops->vdev_op_leaf) 2219 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2220 2221 spa_strfree(vd->vdev_path); 2222 vd->vdev_path = spa_strdup(newpath); 2223 2224 vdev_config_dirty(vd->vdev_top); 2225 2226 return (spa_vdev_exit(spa, NULL, txg, 0)); 2227} 2228 2229/* 2230 * ========================================================================== 2231 * SPA Scrubbing 2232 * ========================================================================== 2233 */ 2234 2235static void 2236spa_scrub_io_done(zio_t *zio) 2237{ 2238 spa_t *spa = zio->io_spa; 2239 2240 zio_data_buf_free(zio->io_data, zio->io_size); 2241 2242 mutex_enter(&spa->spa_scrub_lock); 2243 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2244 vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2245 spa->spa_scrub_errors++; 2246 mutex_enter(&vd->vdev_stat_lock); 2247 vd->vdev_stat.vs_scrub_errors++; 2248 mutex_exit(&vd->vdev_stat_lock); 2249 } 2250 2251 if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 2252 cv_broadcast(&spa->spa_scrub_io_cv); 2253 2254 ASSERT(spa->spa_scrub_inflight >= 0); 2255 2256 mutex_exit(&spa->spa_scrub_lock); 2257} 2258 2259static void 2260spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 2261 zbookmark_t *zb) 2262{ 2263 size_t size = BP_GET_LSIZE(bp); 2264 void *data; 2265 2266 mutex_enter(&spa->spa_scrub_lock); 2267 /* 2268 * Do not give too much work to vdev(s). 2269 */ 2270 while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2271 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2272 } 2273 spa->spa_scrub_inflight++; 2274 mutex_exit(&spa->spa_scrub_lock); 2275 2276 data = zio_data_buf_alloc(size); 2277 2278 if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 2279 flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 2280 2281 flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 2282 2283 zio_nowait(zio_read(NULL, spa, bp, data, size, 2284 spa_scrub_io_done, NULL, priority, flags, zb)); 2285} 2286 2287/* ARGSUSED */ 2288static int 2289spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2290{ 2291 blkptr_t *bp = &bc->bc_blkptr; 2292 vdev_t *vd = spa->spa_root_vdev; 2293 dva_t *dva = bp->blk_dva; 2294 int needs_resilver = B_FALSE; 2295 int d; 2296 2297 if (bc->bc_errno) { 2298 /* 2299 * We can't scrub this block, but we can continue to scrub 2300 * the rest of the pool. Note the error and move along. 
2301 */ 2302 mutex_enter(&spa->spa_scrub_lock); 2303 spa->spa_scrub_errors++; 2304 mutex_exit(&spa->spa_scrub_lock); 2305 2306 mutex_enter(&vd->vdev_stat_lock); 2307 vd->vdev_stat.vs_scrub_errors++; 2308 mutex_exit(&vd->vdev_stat_lock); 2309 2310 return (ERESTART); 2311 } 2312 2313 ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2314 2315 for (d = 0; d < BP_GET_NDVAS(bp); d++) { 2316 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 2317 2318 ASSERT(vd != NULL); 2319 2320 /* 2321 * Keep track of how much data we've examined so that 2322 * zpool(1M) status can make useful progress reports. 2323 */ 2324 mutex_enter(&vd->vdev_stat_lock); 2325 vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 2326 mutex_exit(&vd->vdev_stat_lock); 2327 2328 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 2329 if (DVA_GET_GANG(&dva[d])) { 2330 /* 2331 * Gang members may be spread across multiple 2332 * vdevs, so the best we can do is look at the 2333 * pool-wide DTL. 2334 * XXX -- it would be better to change our 2335 * allocation policy to ensure that this can't 2336 * happen. 2337 */ 2338 vd = spa->spa_root_vdev; 2339 } 2340 if (vdev_dtl_contains(&vd->vdev_dtl_map, 2341 bp->blk_birth, 1)) 2342 needs_resilver = B_TRUE; 2343 } 2344 } 2345 2346 if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2347 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 2348 ZIO_FLAG_SCRUB, &bc->bc_bookmark); 2349 else if (needs_resilver) 2350 spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 2351 ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2352 2353 return (0); 2354} 2355 2356static void 2357spa_scrub_thread(void *arg) 2358{ 2359 spa_t *spa = arg; 2360 callb_cpr_t cprinfo; 2361 traverse_handle_t *th = spa->spa_scrub_th; 2362 vdev_t *rvd = spa->spa_root_vdev; 2363 pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2364 int error = 0; 2365 boolean_t complete; 2366 2367 CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2368 2369 /* 2370 * If we're restarting due to a snapshot create/delete, 2371 * wait for that to complete. 2372 */ 2373 txg_wait_synced(spa_get_dsl(spa), 0); 2374 2375 dprintf("start %s mintxg=%llu maxtxg=%llu\n", 2376 scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 2377 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 2378 2379 spa_config_enter(spa, RW_WRITER, FTAG); 2380 vdev_reopen(rvd); /* purge all vdev caches */ 2381 vdev_config_dirty(rvd); /* rewrite all disk labels */ 2382 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 2383 spa_config_exit(spa, FTAG); 2384 2385 mutex_enter(&spa->spa_scrub_lock); 2386 spa->spa_scrub_errors = 0; 2387 spa->spa_scrub_active = 1; 2388 ASSERT(spa->spa_scrub_inflight == 0); 2389 2390 while (!spa->spa_scrub_stop) { 2391 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2392 while (spa->spa_scrub_suspended) { 2393 spa->spa_scrub_active = 0; 2394 cv_broadcast(&spa->spa_scrub_cv); 2395 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2396 spa->spa_scrub_active = 1; 2397 } 2398 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2399 2400 if (spa->spa_scrub_restart_txg != 0) 2401 break; 2402 2403 mutex_exit(&spa->spa_scrub_lock); 2404 error = traverse_more(th); 2405 mutex_enter(&spa->spa_scrub_lock); 2406 if (error != EAGAIN) 2407 break; 2408 } 2409 2410 while (spa->spa_scrub_inflight) 2411 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2412 2413 spa->spa_scrub_active = 0; 2414 cv_broadcast(&spa->spa_scrub_cv); 2415 2416 mutex_exit(&spa->spa_scrub_lock); 2417 2418 spa_config_enter(spa, RW_WRITER, FTAG); 2419 2420 mutex_enter(&spa->spa_scrub_lock); 2421 2422 /* 2423 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 2424 * AND the spa config lock to synchronize with any config changes 2425 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 2426 */ 2427 if (spa->spa_scrub_restart_txg != 0) 2428 error = ERESTART; 2429 2430 if (spa->spa_scrub_stop) 2431 error = EINTR; 2432 2433 /* 2434 * Even if there were uncorrectable errors, we consider the scrub 2435 * completed. The downside is that if there is a transient error during 2436 * a resilver, we won't resilver the data properly to the target. But 2437 * if the damage is permanent (more likely) we will resilver forever, 2438 * which isn't really acceptable. Since there is enough information for 2439 * the user to know what has failed and why, this seems like a more 2440 * tractable approach. 2441 */ 2442 complete = (error == 0); 2443 2444 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 2445 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2446 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2447 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2448 2449 mutex_exit(&spa->spa_scrub_lock); 2450 2451 /* 2452 * If the scrub/resilver completed, update all DTLs to reflect this. 2453 * Whether it succeeded or not, vacate all temporary scrub DTLs. 2454 */ 2455 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2456 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2457 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 2458 spa_errlog_rotate(spa); 2459 2460 spa_config_exit(spa, FTAG); 2461 2462 mutex_enter(&spa->spa_scrub_lock); 2463 2464 /* 2465 * We may have finished replacing a device. 2466 * Let the async thread assess this and handle the detach. 2467 */ 2468 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2469 2470 /* 2471 * If we were told to restart, our final act is to start a new scrub. 2472 */ 2473 if (error == ERESTART) 2474 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
2475 SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2476 2477 spa->spa_scrub_type = POOL_SCRUB_NONE; 2478 spa->spa_scrub_active = 0; 2479 spa->spa_scrub_thread = NULL; 2480 cv_broadcast(&spa->spa_scrub_cv); 2481 CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2482 thread_exit(); 2483} 2484 2485void 2486spa_scrub_suspend(spa_t *spa) 2487{ 2488 mutex_enter(&spa->spa_scrub_lock); 2489 spa->spa_scrub_suspended++; 2490 while (spa->spa_scrub_active) { 2491 cv_broadcast(&spa->spa_scrub_cv); 2492 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2493 } 2494 while (spa->spa_scrub_inflight) 2495 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2496 mutex_exit(&spa->spa_scrub_lock); 2497} 2498 2499void 2500spa_scrub_resume(spa_t *spa) 2501{ 2502 mutex_enter(&spa->spa_scrub_lock); 2503 ASSERT(spa->spa_scrub_suspended != 0); 2504 if (--spa->spa_scrub_suspended == 0) 2505 cv_broadcast(&spa->spa_scrub_cv); 2506 mutex_exit(&spa->spa_scrub_lock); 2507} 2508 2509void 2510spa_scrub_restart(spa_t *spa, uint64_t txg) 2511{ 2512 /* 2513 * Something happened (e.g. snapshot create/delete) that means 2514 * we must restart any in-progress scrubs. The itinerary will 2515 * fix this properly. 2516 */ 2517 mutex_enter(&spa->spa_scrub_lock); 2518 spa->spa_scrub_restart_txg = txg; 2519 mutex_exit(&spa->spa_scrub_lock); 2520} 2521 2522int 2523spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2524{ 2525 space_seg_t *ss; 2526 uint64_t mintxg, maxtxg; 2527 vdev_t *rvd = spa->spa_root_vdev; 2528 2529 if ((uint_t)type >= POOL_SCRUB_TYPES) 2530 return (ENOTSUP); 2531 2532 mutex_enter(&spa->spa_scrub_lock); 2533 2534 /* 2535 * If there's a scrub or resilver already in progress, stop it. 2536 */ 2537 while (spa->spa_scrub_thread != NULL) { 2538 /* 2539 * Don't stop a resilver unless forced. 2540 */ 2541 if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 2542 mutex_exit(&spa->spa_scrub_lock); 2543 return (EBUSY); 2544 } 2545 spa->spa_scrub_stop = 1; 2546 cv_broadcast(&spa->spa_scrub_cv); 2547 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2548 } 2549 2550 /* 2551 * Terminate the previous traverse. 2552 */ 2553 if (spa->spa_scrub_th != NULL) { 2554 traverse_fini(spa->spa_scrub_th); 2555 spa->spa_scrub_th = NULL; 2556 } 2557 2558 if (rvd == NULL) { 2559 ASSERT(spa->spa_scrub_stop == 0); 2560 ASSERT(spa->spa_scrub_type == type); 2561 ASSERT(spa->spa_scrub_restart_txg == 0); 2562 mutex_exit(&spa->spa_scrub_lock); 2563 return (0); 2564 } 2565 2566 mintxg = TXG_INITIAL - 1; 2567 maxtxg = spa_last_synced_txg(spa) + 1; 2568 2569 mutex_enter(&rvd->vdev_dtl_lock); 2570 2571 if (rvd->vdev_dtl_map.sm_space == 0) { 2572 /* 2573 * The pool-wide DTL is empty. 2574 * If this is a resilver, there's nothing to do except 2575 * check whether any in-progress replacements have completed. 2576 */ 2577 if (type == POOL_SCRUB_RESILVER) { 2578 type = POOL_SCRUB_NONE; 2579 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2580 } 2581 } else { 2582 /* 2583 * The pool-wide DTL is non-empty. 2584 * If this is a normal scrub, upgrade to a resilver instead. 2585 */ 2586 if (type == POOL_SCRUB_EVERYTHING) 2587 type = POOL_SCRUB_RESILVER; 2588 } 2589 2590 if (type == POOL_SCRUB_RESILVER) { 2591 /* 2592 * Determine the resilvering boundaries. 2593 * 2594 * Note: (mintxg, maxtxg) is an open interval, 2595 * i.e. mintxg and maxtxg themselves are not included. 2596 * 2597 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2598 * so we don't claim to resilver a txg that's still changing. 
2599 */ 2600 ss = avl_first(&rvd->vdev_dtl_map.sm_root); 2601 mintxg = ss->ss_start - 1; 2602 ss = avl_last(&rvd->vdev_dtl_map.sm_root); 2603 maxtxg = MIN(ss->ss_end, maxtxg); 2604 } 2605 2606 mutex_exit(&rvd->vdev_dtl_lock); 2607 2608 spa->spa_scrub_stop = 0; 2609 spa->spa_scrub_type = type; 2610 spa->spa_scrub_restart_txg = 0; 2611 2612 if (type != POOL_SCRUB_NONE) { 2613 spa->spa_scrub_mintxg = mintxg; 2614 spa->spa_scrub_maxtxg = maxtxg; 2615 spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 2616 ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 2617 ZIO_FLAG_CANFAIL); 2618 traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2619 spa->spa_scrub_thread = thread_create(NULL, 0, 2620 spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2621 } 2622 2623 mutex_exit(&spa->spa_scrub_lock); 2624 2625 return (0); 2626} 2627 2628/* 2629 * ========================================================================== 2630 * SPA async task processing 2631 * ========================================================================== 2632 */ 2633 2634static void 2635spa_async_reopen(spa_t *spa) 2636{ 2637 vdev_t *rvd = spa->spa_root_vdev; 2638 vdev_t *tvd; 2639 int c; 2640 2641 spa_config_enter(spa, RW_WRITER, FTAG); 2642 2643 for (c = 0; c < rvd->vdev_children; c++) { 2644 tvd = rvd->vdev_child[c]; 2645 if (tvd->vdev_reopen_wanted) { 2646 tvd->vdev_reopen_wanted = 0; 2647 vdev_reopen(tvd); 2648 } 2649 } 2650 2651 spa_config_exit(spa, FTAG); 2652} 2653 2654static void 2655spa_async_thread(void *arg) 2656{ 2657 spa_t *spa = arg; 2658 int tasks; 2659 2660 ASSERT(spa->spa_sync_on); 2661 2662 mutex_enter(&spa->spa_async_lock); 2663 tasks = spa->spa_async_tasks; 2664 spa->spa_async_tasks = 0; 2665 mutex_exit(&spa->spa_async_lock); 2666 2667 /* 2668 * See if the config needs to be updated. 2669 */ 2670 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 2671 mutex_enter(&spa_namespace_lock); 2672 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2673 mutex_exit(&spa_namespace_lock); 2674 } 2675 2676 /* 2677 * See if any devices need to be reopened. 2678 */ 2679 if (tasks & SPA_ASYNC_REOPEN) 2680 spa_async_reopen(spa); 2681 2682 /* 2683 * If any devices are done replacing, detach them. 2684 */ 2685 if (tasks & SPA_ASYNC_REPLACE_DONE) 2686 spa_vdev_replace_done(spa); 2687 2688 /* 2689 * Kick off a scrub. 2690 */ 2691 if (tasks & SPA_ASYNC_SCRUB) 2692 VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 2693 2694 /* 2695 * Kick off a resilver. 2696 */ 2697 if (tasks & SPA_ASYNC_RESILVER) 2698 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 2699 2700 /* 2701 * Let the world know that we're done. 
2702 */ 2703 mutex_enter(&spa->spa_async_lock); 2704 spa->spa_async_thread = NULL; 2705 cv_broadcast(&spa->spa_async_cv); 2706 mutex_exit(&spa->spa_async_lock); 2707 thread_exit(); 2708} 2709 2710void 2711spa_async_suspend(spa_t *spa) 2712{ 2713 mutex_enter(&spa->spa_async_lock); 2714 spa->spa_async_suspended++; 2715 while (spa->spa_async_thread != NULL) 2716 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 2717 mutex_exit(&spa->spa_async_lock); 2718} 2719 2720void 2721spa_async_resume(spa_t *spa) 2722{ 2723 mutex_enter(&spa->spa_async_lock); 2724 ASSERT(spa->spa_async_suspended != 0); 2725 spa->spa_async_suspended--; 2726 mutex_exit(&spa->spa_async_lock); 2727} 2728 2729static void 2730spa_async_dispatch(spa_t *spa) 2731{ 2732 mutex_enter(&spa->spa_async_lock); 2733 if (spa->spa_async_tasks && !spa->spa_async_suspended && 2734 spa->spa_async_thread == NULL && 2735 rootdir != NULL && !vn_is_readonly(rootdir)) 2736 spa->spa_async_thread = thread_create(NULL, 0, 2737 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 2738 mutex_exit(&spa->spa_async_lock); 2739} 2740 2741void 2742spa_async_request(spa_t *spa, int task) 2743{ 2744 mutex_enter(&spa->spa_async_lock); 2745 spa->spa_async_tasks |= task; 2746 mutex_exit(&spa->spa_async_lock); 2747} 2748 2749/* 2750 * ========================================================================== 2751 * SPA syncing routines 2752 * ========================================================================== 2753 */ 2754 2755static void 2756spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2757{ 2758 bplist_t *bpl = &spa->spa_sync_bplist; 2759 dmu_tx_t *tx; 2760 blkptr_t blk; 2761 uint64_t itor = 0; 2762 zio_t *zio; 2763 int error; 2764 uint8_t c = 1; 2765 2766 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2767 2768 while (bplist_iterate(bpl, &itor, &blk) == 0) 2769 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2770 2771 error = zio_wait(zio); 2772 ASSERT3U(error, ==, 0); 2773 2774 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2775 bplist_vacate(bpl, tx); 2776 2777 /* 2778 * Pre-dirty the first block so we sync to convergence faster. 2779 * (Usually only the first block is needed.) 2780 */ 2781 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2782 dmu_tx_commit(tx); 2783} 2784 2785static void 2786spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2787{ 2788 char *packed = NULL; 2789 size_t nvsize = 0; 2790 dmu_buf_t *db; 2791 2792 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2793 2794 packed = kmem_alloc(nvsize, KM_SLEEP); 2795 2796 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2797 KM_SLEEP) == 0); 2798 2799 dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2800 2801 kmem_free(packed, nvsize); 2802 2803 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2804 dmu_buf_will_dirty(db, tx); 2805 *(uint64_t *)db->db_data = nvsize; 2806 dmu_buf_rele(db, FTAG); 2807} 2808 2809static void 2810spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2811{ 2812 nvlist_t *nvroot; 2813 nvlist_t **spares; 2814 int i; 2815 2816 if (!spa->spa_sync_spares) 2817 return; 2818 2819 /* 2820 * Update the MOS nvlist describing the list of available spares. 2821 * spa_validate_spares() will have already made sure this nvlist is 2822 * valid and the vdevs are labelled appropriately. 
 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	nvlist_t *nvp = arg2;
	nvpair_t *nvpair;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zapobj;

	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_pool_props_object == 0) {
		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
		VERIFY(zapobj > 0);

		spa->spa_pool_props_object = zapobj;

		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_PROPS, 8, 1,
		    &spa->spa_pool_props_object, tx) == 0);
	}
	mutex_exit(&spa->spa_props_lock);

	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
		case ZFS_PROP_BOOTFS:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
			    &spa->spa_bootfs, tx) == 0);
			break;
		}
	}
}
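
/*
 * For reference, spa_sync_props() above expects its arg2 nvlist to carry one
 * nvpair per pool property, keyed by the property name.  A minimal sketch of
 * how a caller might build such a list for the only property handled here
 * (bootfs, read as a uint64 dataset object number per the
 * nvlist_lookup_uint64() call above); the object number 21 is purely
 * illustrative:
 *
 *	nvlist_t *nvp;
 *
 *	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvp,
 *	    zpool_prop_to_name(ZFS_PROP_BOOTFS), 21) == 0);
 *	(void) spa_set_props(spa, nvp);
 *	nvlist_free(nvp);
 */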

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
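
/*
 * spa_sync() is driven once per transaction group from the pool's sync
 * context rather than being called directly by administrative code.  A rough
 * sketch of that relationship, with the caller's housekeeping omitted (the
 * actual loop lives in the txg machinery, e.g. txg_sync_thread() in txg.c):
 *
 *	for (;;) {
 *		txg = next quiesced transaction group;
 *		spa_sync(spa, txg);
 *		wake up anyone blocked in txg_wait_synced(dp, txg);
 *	}
 */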

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}
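
/*
 * spa_upgrade() below bumps the in-core uberblock version and then blocks in
 * txg_wait_synced() until the new version has gone out to disk, which is what
 * makes a pool upgrade durable.  A simplified sketch of how a consumer such
 * as the pool-upgrade ioctl path might issue it; the pool name and the
 * surrounding plumbing are illustrative only:
 *
 *	spa_t *spa;
 *
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		if (spa_version(spa) < ZFS_VERSION)
 *			spa_upgrade(spa);
 *		spa_close(spa, FTAG);
 *	}
 */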

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
spa_set_props(spa_t *spa, nvlist_t *nvp)
{
	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

int
spa_get_props(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	zfs_source_t src;
	zfs_prop_t prop;
	nvlist_t *propval;
	uint64_t value;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);
	/* If no props object, then just return empty nvlist */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {

		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
			continue;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		switch (za.za_integer_length) {
		case 8:
			if (zfs_prop_default_numeric(prop) ==
			    za.za_first_integer)
				src = ZFS_SRC_DEFAULT;
			else
				src = ZFS_SRC_LOCAL;
			value = za.za_first_integer;

			if (prop == ZFS_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;
				char strval[MAXPATHLEN];

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if ((err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) != 0) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);

				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_string(propval,
				    ZFS_PROP_VALUE, strval) == 0);
			} else {
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_VALUE, value) == 0);
			}
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			break;
		}
		nvlist_free(propval);
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}
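
/*
 * For callers of spa_get_props(), the returned nvlist pairs each pool
 * property name with a nested nvlist holding the value and its source, as
 * built in the loop above.  A sketch of the shape for a pool whose bootfs
 * has been set locally (names and values here are only an example):
 *
 *	"bootfs" -> {
 *		ZFS_PROP_SOURCE = ZFS_SRC_LOCAL,
 *		ZFS_PROP_VALUE  = "tank/ROOT"
 *	}
 *
 * Numeric properties carry a uint64 ZFS_PROP_VALUE instead of a string.
 */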