spa.c revision 211931
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/spa_boot.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
};

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_TUNE	{ zti_mode_tune, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
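
/*
 * Illustrative example of the resulting layout: adding the pool size with
 * no local source yields an entry of the form
 *
 *	"size" -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = <size> }
 *
 * i.e. each property name maps to a nested nvlist holding its source and
 * its value.
 */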

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t used;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		size = spa_get_space(spa);
		used = spa_get_alloc(spa);
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
		    size - used, src);

		cap = (size == 0) ? 0 : (used * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
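	/*
	 * zap_cursor_retrieve() returns ENOENT once the cursor is exhausted,
	 * so ENOENT here means the loop above terminated normally rather
	 * than failed.
	 */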
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_USER | DS_MODE_READONLY, &os))
					break;

				/* We don't support gzip bootable datasets */
				if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_close(os);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
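
/*
 * Set zpool properties.  Note that ZPOOL_PROP_CACHEFILE and
 * ZPOOL_PROP_ALTROOT only modify in-core state, which is why they are
 * skipped below when deciding whether an on-disk sync task is required.
 */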
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

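			/*
			 * Each (I/O type, queue type) pair gets its own
			 * taskq, named "<zio_type_name[t]>_<taskq type>"
			 * per the snprintf() below (e.g. the issue queue
			 * for writes).
			 */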
			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			if (mode == zti_mode_tune) {
				mode = zio_taskq_tune_mode;
				value = zio_taskq_tune_value;
				if (mode == zti_mode_tune)
					mode = zti_mode_online_percent;
			}

			switch (mode) {
			case zti_mode_fixed:
				ASSERT3U(value, >=, 1);
				value = MAX(value, 1);

				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE);
				break;

			case zti_mode_online_percent:
				spa->spa_zio_taskq[t][q] = taskq_create(name,
				    value, maxclsyspri, 50, INT_MAX,
				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
				break;

			case zti_mode_null:
				spa->spa_zio_taskq[t][q] = NULL;
				break;

			case zti_mode_tune:
			default:
				panic("unrecognized mode for "
				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
				    "in spa_activate()",
				    t, q, mode, value);
				break;
			}
		}
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
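/*
 * The config nvlist mirrors the vdev tree: interior vdevs carry a
 * ZPOOL_CONFIG_CHILDREN array of child nvlists, while leaf vdevs (e.g.
 * disks and files) have no children array, which is why ENOENT from
 * nvlist_lookup_nvlist_array() below is treated as success.
 */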
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid, size;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				size = vdev_get_rsize(vd);
				l2arc_add_vdev(spa, vd,
				    VDEV_LABEL_START_SIZE,
				    size - VDEV_LABEL_START_SIZE);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
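
/*
 * Read a packed nvlist out of the MOS: the object's bonus buffer holds the
 * packed size as a uint64_t, and the object data holds the packed bytes,
 * which nvlist_unpack() decodes into an nvlist.
 */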
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Check for missing log devices
 */
int
spa_check_logs(spa_t *spa)
{
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa->spa_log_state = SPA_LOG_MISSING;
			return (1);
		}
		break;

	case SPA_LOG_CLEAR:
		(void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
		    DS_FIND_CHILDREN);
		break;
	}
	spa->spa_log_state = SPA_LOG_GOOD;
	return (0);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	uint64_t autoreplace = 0;
	int orig_mode = spa->spa_mode;
	char *ereport = FM_EREPORT_ZFS_POOL;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
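	/*
	 * spa_unload() later zio_wait()s on this root zio, which is how
	 * outstanding async I/O gets drained before the pool is torn down.
	 */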

	/*
	 * Parse the configuration into a vdev tree. We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	/*
	 * We need to validate the vdev labels against the configuration that
	 * we have in hand, which is dependent on the setting of mosconfig. If
	 * mosconfig is true then we're validating the vdev labels based on
	 * that config. Otherwise, we're validating against the cached config
	 * (zpool.cache) that was read when we loaded the zfs module, and then
	 * later we will recursively call spa_load() and validate against
	 * the vdev config.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_validate(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	vdev_uberblock_load(NULL, rvd, ub);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
" 1326 "See: http://www.sun.com/msg/ZFS-8000-EY", 1327 spa_name(spa), hostname, 1328 (unsigned long)hostid); 1329 error = EBADF; 1330 goto out; 1331 } 1332 } 1333 1334 spa_config_set(spa, newconfig); 1335 spa_unload(spa); 1336 spa_deactivate(spa); 1337 spa_activate(spa, orig_mode); 1338 1339 return (spa_load(spa, newconfig, state, B_TRUE)); 1340 } 1341 1342 if (zap_lookup(spa->spa_meta_objset, 1343 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1344 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1345 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1346 VDEV_AUX_CORRUPT_DATA); 1347 error = EIO; 1348 goto out; 1349 } 1350 1351 /* 1352 * Load the bit that tells us to use the new accounting function 1353 * (raid-z deflation). If we have an older pool, this will not 1354 * be present. 1355 */ 1356 error = zap_lookup(spa->spa_meta_objset, 1357 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1358 sizeof (uint64_t), 1, &spa->spa_deflate); 1359 if (error != 0 && error != ENOENT) { 1360 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1361 VDEV_AUX_CORRUPT_DATA); 1362 error = EIO; 1363 goto out; 1364 } 1365 1366 /* 1367 * Load the persistent error log. If we have an older pool, this will 1368 * not be present. 1369 */ 1370 error = zap_lookup(spa->spa_meta_objset, 1371 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1372 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1373 if (error != 0 && error != ENOENT) { 1374 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1375 VDEV_AUX_CORRUPT_DATA); 1376 error = EIO; 1377 goto out; 1378 } 1379 1380 error = zap_lookup(spa->spa_meta_objset, 1381 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1382 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1383 if (error != 0 && error != ENOENT) { 1384 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1385 VDEV_AUX_CORRUPT_DATA); 1386 error = EIO; 1387 goto out; 1388 } 1389 1390 /* 1391 * Load the history object. If we have an older pool, this 1392 * will not be present. 1393 */ 1394 error = zap_lookup(spa->spa_meta_objset, 1395 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1396 sizeof (uint64_t), 1, &spa->spa_history); 1397 if (error != 0 && error != ENOENT) { 1398 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1399 VDEV_AUX_CORRUPT_DATA); 1400 error = EIO; 1401 goto out; 1402 } 1403 1404 /* 1405 * Load any hot spares for this pool. 1406 */ 1407 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1408 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1409 if (error != 0 && error != ENOENT) { 1410 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1411 VDEV_AUX_CORRUPT_DATA); 1412 error = EIO; 1413 goto out; 1414 } 1415 if (error == 0) { 1416 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1417 if (load_nvlist(spa, spa->spa_spares.sav_object, 1418 &spa->spa_spares.sav_config) != 0) { 1419 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1420 VDEV_AUX_CORRUPT_DATA); 1421 error = EIO; 1422 goto out; 1423 } 1424 1425 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1426 spa_load_spares(spa); 1427 spa_config_exit(spa, SCL_ALL, FTAG); 1428 } 1429 1430 /* 1431 * Load any level 2 ARC devices for this pool. 
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
	    &spa->spa_l2cache.sav_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
		    &spa->spa_l2cache.sav_config) != 0) {
			vdev_set_state(rvd, B_TRUE,
			    VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices. We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Check the state of the root vdev. If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if (spa_writeable(spa)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
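		/*
		 * zil_claim() runs over every dataset (DS_FIND_CHILDREN)
		 * inside this one assigned txg; the txg_wait_synced() below
		 * ensures the claims are on disk before we proceed.
		 */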
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);
	}

	error = 0;
out:
	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again. The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa, spa_mode_global);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed. If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open(). Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			spa->spa_last_open_failed = B_FALSE;
		}
	}

	spa_open_ref(spa, tag);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}
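
/*
 * Illustrative usage: a caller opens a pool by name, holds a reference
 * while using it, and drops the reference with spa_close():
 *
 *	spa_t *spa;
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use spa ...
 *		spa_close(spa, FTAG);
 *	}
 */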

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

/*
 * Add spares device information to the nvlist.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare. If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

/*
 * Add l2cache device information to the nvlist, including vdev stats.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}

/*
 * Validate that the auxiliary device array is well formed. We must have an
 * array of nvlists, each of which describes a valid leaf vdev. If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context. For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}

static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
2025 */ 2026 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2027 KM_SLEEP) == 0); 2028 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2029 devs, ndevs) == 0); 2030 } 2031} 2032 2033/* 2034 * Stop and drop level 2 ARC devices 2035 */ 2036void 2037spa_l2cache_drop(spa_t *spa) 2038{ 2039 vdev_t *vd; 2040 int i; 2041 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2042 2043 for (i = 0; i < sav->sav_count; i++) { 2044 uint64_t pool; 2045 2046 vd = sav->sav_vdevs[i]; 2047 ASSERT(vd != NULL); 2048 2049 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2050 pool != 0ULL && l2arc_vdev_present(vd)) 2051 l2arc_remove_vdev(vd); 2052 if (vd->vdev_isl2cache) 2053 spa_l2cache_remove(vd); 2054 vdev_clear_stats(vd); 2055 (void) vdev_close(vd); 2056 } 2057} 2058 2059/* 2060 * Pool Creation 2061 */ 2062int 2063spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2064 const char *history_str, nvlist_t *zplprops) 2065{ 2066 spa_t *spa; 2067 char *altroot = NULL; 2068 vdev_t *rvd; 2069 dsl_pool_t *dp; 2070 dmu_tx_t *tx; 2071 int c, error = 0; 2072 uint64_t txg = TXG_INITIAL; 2073 nvlist_t **spares, **l2cache; 2074 uint_t nspares, nl2cache; 2075 uint64_t version; 2076 2077 /* 2078 * If this pool already exists, return failure. 2079 */ 2080 mutex_enter(&spa_namespace_lock); 2081 if (spa_lookup(pool) != NULL) { 2082 mutex_exit(&spa_namespace_lock); 2083 return (EEXIST); 2084 } 2085 2086 /* 2087 * Allocate a new spa_t structure. 2088 */ 2089 (void) nvlist_lookup_string(props, 2090 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2091 spa = spa_add(pool, altroot); 2092 spa_activate(spa, spa_mode_global); 2093 2094 spa->spa_uberblock.ub_txg = txg - 1; 2095 2096 if (props && (error = spa_prop_validate(spa, props))) { 2097 spa_deactivate(spa); 2098 spa_remove(spa); 2099 mutex_exit(&spa_namespace_lock); 2100 return (error); 2101 } 2102 2103 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2104 &version) != 0) 2105 version = SPA_VERSION; 2106 ASSERT(version <= SPA_VERSION); 2107 spa->spa_uberblock.ub_version = version; 2108 spa->spa_ubsync = spa->spa_uberblock; 2109 2110 /* 2111 * Create "The Godfather" zio to hold all async IOs 2112 */ 2113 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2114 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2115 2116 /* 2117 * Create the root vdev. 2118 */ 2119 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2120 2121 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2122 2123 ASSERT(error != 0 || rvd != NULL); 2124 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2125 2126 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2127 error = EINVAL; 2128 2129 if (error == 0 && 2130 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2131 (error = spa_validate_aux(spa, nvroot, txg, 2132 VDEV_ALLOC_ADD)) == 0) { 2133 for (c = 0; c < rvd->vdev_children; c++) 2134 vdev_init(rvd->vdev_child[c], txg); 2135 vdev_config_dirty(rvd); 2136 } 2137 2138 spa_config_exit(spa, SCL_ALL, FTAG); 2139 2140 if (error != 0) { 2141 spa_unload(spa); 2142 spa_deactivate(spa); 2143 spa_remove(spa); 2144 mutex_exit(&spa_namespace_lock); 2145 return (error); 2146 } 2147 2148 /* 2149 * Get the list of spares, if specified. 
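	 * (Like the l2cache devices below, the spares arrive as an nvlist
	 * array under the name ZPOOL_CONFIG_SPARES in the caller-supplied
	 * nvroot; each element describes one leaf vdev and has already been
	 * checked by the spa_validate_aux() call above.)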
2150 */ 2151 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2152 &spares, &nspares) == 0) { 2153 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2154 KM_SLEEP) == 0); 2155 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2156 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2157 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2158 spa_load_spares(spa); 2159 spa_config_exit(spa, SCL_ALL, FTAG); 2160 spa->spa_spares.sav_sync = B_TRUE; 2161 } 2162 2163 /* 2164 * Get the list of level 2 cache devices, if specified. 2165 */ 2166 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2167 &l2cache, &nl2cache) == 0) { 2168 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2169 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2170 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2171 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2172 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2173 spa_load_l2cache(spa); 2174 spa_config_exit(spa, SCL_ALL, FTAG); 2175 spa->spa_l2cache.sav_sync = B_TRUE; 2176 } 2177 2178 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2179 spa->spa_meta_objset = dp->dp_meta_objset; 2180 2181 tx = dmu_tx_create_assigned(dp, txg); 2182 2183 /* 2184 * Create the pool config object. 2185 */ 2186 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2187 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2188 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2189 2190 if (zap_add(spa->spa_meta_objset, 2191 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2192 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2193 cmn_err(CE_PANIC, "failed to add pool config"); 2194 } 2195 2196 /* Newly created pools with the right version are always deflated. */ 2197 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2198 spa->spa_deflate = TRUE; 2199 if (zap_add(spa->spa_meta_objset, 2200 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2201 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2202 cmn_err(CE_PANIC, "failed to add deflate"); 2203 } 2204 } 2205 2206 /* 2207 * Create the deferred-free bplist object. Turn off compression 2208 * because sync-to-convergence takes longer if the blocksize 2209 * keeps changing. 2210 */ 2211 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2212 1 << 14, tx); 2213 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2214 ZIO_COMPRESS_OFF, tx); 2215 2216 if (zap_add(spa->spa_meta_objset, 2217 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2218 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2219 cmn_err(CE_PANIC, "failed to add bplist"); 2220 } 2221 2222 /* 2223 * Create the pool's history object. 2224 */ 2225 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2226 spa_history_create_obj(spa, tx); 2227 2228 /* 2229 * Set pool properties. 2230 */ 2231 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2232 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2233 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2234 if (props != NULL) { 2235 spa_configfile_set(spa, props, B_FALSE); 2236 spa_sync_props(spa, props, CRED(), tx); 2237 } 2238 2239 dmu_tx_commit(tx); 2240 2241 spa->spa_sync_on = B_TRUE; 2242 txg_sync_start(spa->spa_dsl_pool); 2243 2244 /* 2245 * We explicitly wait for the first transaction to complete so that our 2246 * bean counters are appropriately updated. 
2247	 */ 2248	txg_wait_synced(spa->spa_dsl_pool, txg); 2249 2250	spa_config_sync(spa, B_FALSE, B_TRUE); 2251 2252	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2253		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2254 2255	spa->spa_minref = refcount_count(&spa->spa_refcount); 2256 2257	mutex_exit(&spa_namespace_lock); 2258 2259	return (0); 2260} 2261 2262#ifdef sun 2263#ifdef _KERNEL 2264/* 2265 * Build a "root" vdev for a top level vdev read in from a rootpool 2266 * device label. 2267 */ 2268static void 2269spa_build_rootpool_config(nvlist_t *config) 2270{ 2271	nvlist_t *nvtop, *nvroot; 2272	uint64_t pgid; 2273 2274	/* 2275	 * Add this top-level vdev to the child array. 2276	 */ 2277	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2278	    == 0); 2279	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2280	    == 0); 2281 2282	/* 2283	 * Put this pool's top-level vdevs into a root vdev. 2284	 */ 2285	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2286	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2287	    == 0); 2288	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2289	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2290	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2291	    &nvtop, 1) == 0); 2292 2293	/* 2294	 * Replace the existing vdev_tree with the new root vdev in 2295	 * this pool's configuration (remove the old, add the new). 2296	 */ 2297	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2298	nvlist_free(nvroot); 2299} 2300 2301/* 2302 * Get the root pool information from the root disk, then import the root pool 2303 * at system boot time. 2304 */ 2305extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2306 2307int 2308spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2309    uint64_t *besttxg) 2310{ 2311	nvlist_t *config; 2312	uint64_t txg; 2313	int error; 2314 2315	if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) 2316		return (error); 2317 2318	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2319 2320	if (bestconf != NULL) 2321		*bestconf = config; 2322	else 2323		nvlist_free(config); 2324	*besttxg = txg; 2325	return (0); 2326} 2327 2328boolean_t 2329spa_rootdev_validate(nvlist_t *nv) 2330{ 2331	uint64_t ival; 2332 2333	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2334	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2335	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2336		return (B_FALSE); 2337 2338	return (B_TRUE); 2339} 2340 2341 2342/* 2343 * Given the boot device's physical path or devid, check if the device 2344 * is in a valid state. If so, return the configuration from the vdev 2345 * label. 
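	 * (devpath is a physical device path such as
	 * "/pci@1f,0/ide@d/disk@0,0:a", and devid is an identifier of the
	 * form "id1,sd@SSEAGATE..."; see the comment above
	 * spa_import_rootpool() below.)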
2346	 */ 2347int 2348spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2349{ 2350	nvlist_t *conf = NULL; 2351	uint64_t txg = 0; 2352	nvlist_t *nvtop, **child; 2353	char *type; 2354	char *bootpath = NULL; 2355	uint_t children, c; 2356	char *tmp; 2357	int error; 2358 2359	if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2360		*tmp = '\0'; 2361	if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { 2362		cmn_err(CE_NOTE, "error reading device label"); 2363		return (error); 2364	} 2365	if (txg == 0) { 2366		cmn_err(CE_NOTE, "this device is detached"); 2367		nvlist_free(conf); 2368		return (EINVAL); 2369	} 2370 2371	VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2372	    &nvtop) == 0); 2373	VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2374 2375	if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2376		if (spa_rootdev_validate(nvtop)) { 2377			goto out; 2378		} else { 2379			nvlist_free(conf); 2380			return (EINVAL); 2381		} 2382	} 2383 2384	ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2385 2386	VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2387	    &child, &children) == 0); 2388 2389	/* 2390	 * Go through the vdevs in the mirror to see if the given device 2391	 * has the most recent txg. Only the device with the most 2392	 * recent txg has valid information and should be booted. 2393	 */ 2394	for (c = 0; c < children; c++) { 2395		char *cdevid, *cpath; 2396		uint64_t tmptxg; 2397 2398		cpath = NULL; 2399		cdevid = NULL; 2400		if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2401		    &cpath) != 0 && nvlist_lookup_string(child[c], 2402		    ZPOOL_CONFIG_DEVID, &cdevid) != 0) 2403			return (EINVAL); 2404		if ((spa_check_rootconf(cpath, cdevid, NULL, 2405		    &tmptxg) == 0) && (tmptxg > txg)) { 2406			txg = tmptxg; 2407			VERIFY(nvlist_lookup_string(child[c], 2408			    ZPOOL_CONFIG_PATH, &bootpath) == 0); 2409		} 2410	} 2411 2412	/* If a sibling has a more recent txg, tell the user to boot from it. */ 2413	if (bootpath) { 2414		cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2415		return (EINVAL); 2416	} 2417out: 2418	*bestconf = conf; 2419	return (0); 2420} 2421 2422/* 2423 * Import a root pool. 2424 * 2425 * For x86, devpath_list will consist of devid and/or physpath name of 2426 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2427 * The GRUB "findroot" command will return the vdev we should boot. 2428 * 2429 * For SPARC, devpath_list consists of the physpath name of the booting 2430 * device, whether the root pool is a single-device pool or a mirrored pool, 2431 * e.g. 2432 *	"/pci@1f,0/ide@d/disk@0,0:a" 2433 */ 2434int 2435spa_import_rootpool(char *devpath, char *devid) 2436{ 2437	nvlist_t *conf = NULL; 2438	char *pname; 2439	int error; 2440	spa_t *spa; 2441 2442	/* 2443	 * Get the vdev pathname and configuration from the most 2444	 * recently updated vdev (highest txg). 2445	 */ 2446	if (error = spa_get_rootconf(devpath, devid, &conf)) 2447		goto msg_out; 2448 2449	/* 2450	 * Add type "root" vdev to the config. 2451	 */ 2452	spa_build_rootpool_config(conf); 2453 2454	VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2455 2456	mutex_enter(&spa_namespace_lock); 2457	if ((spa = spa_lookup(pname)) != NULL) { 2458		/* 2459		 * Remove the existing root pool from the namespace so that we 2460		 * can replace it with the correct config we just read in. 
2461 */ 2462 spa_remove(spa); 2463 } 2464 2465 spa = spa_add(pname, NULL); 2466 spa->spa_is_root = B_TRUE; 2467 spa->spa_load_verbatim = B_TRUE; 2468 2469 VERIFY(nvlist_dup(conf, &spa->spa_config, 0) == 0); 2470 mutex_exit(&spa_namespace_lock); 2471 2472 nvlist_free(conf); 2473 return (0); 2474 2475msg_out: 2476 cmn_err(CE_NOTE, "\n" 2477 " *************************************************** \n" 2478 " * This device is not bootable! * \n" 2479 " * It is either offlined or detached or faulted. * \n" 2480 " * Please try to boot from a different device. * \n" 2481 " *************************************************** "); 2482 2483 return (error); 2484} 2485#endif 2486#endif /* sun */ 2487 2488/* 2489 * Take a pool and insert it into the namespace as if it had been loaded at 2490 * boot. 2491 */ 2492int 2493spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2494{ 2495 spa_t *spa; 2496 char *altroot = NULL; 2497 2498 mutex_enter(&spa_namespace_lock); 2499 if (spa_lookup(pool) != NULL) { 2500 mutex_exit(&spa_namespace_lock); 2501 return (EEXIST); 2502 } 2503 2504 (void) nvlist_lookup_string(props, 2505 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2506 spa = spa_add(pool, altroot); 2507 2508 spa->spa_load_verbatim = B_TRUE; 2509 2510 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 2511 2512 if (props != NULL) 2513 spa_configfile_set(spa, props, B_FALSE); 2514 2515 spa_config_sync(spa, B_FALSE, B_TRUE); 2516 2517 mutex_exit(&spa_namespace_lock); 2518 2519 return (0); 2520} 2521 2522/* 2523 * Import a non-root pool into the system. 2524 */ 2525int 2526spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2527{ 2528 spa_t *spa; 2529 char *altroot = NULL; 2530 int error; 2531 nvlist_t *nvroot; 2532 nvlist_t **spares, **l2cache; 2533 uint_t nspares, nl2cache; 2534 2535 /* 2536 * If a pool with this name exists, return failure. 2537 */ 2538 mutex_enter(&spa_namespace_lock); 2539 if ((spa = spa_lookup(pool)) != NULL) { 2540 mutex_exit(&spa_namespace_lock); 2541 return (EEXIST); 2542 } 2543 2544 /* 2545 * Create and initialize the spa structure. 2546 */ 2547 (void) nvlist_lookup_string(props, 2548 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2549 spa = spa_add(pool, altroot); 2550 spa_activate(spa, spa_mode_global); 2551 2552 /* 2553 * Don't start async tasks until we know everything is healthy. 2554 */ 2555 spa_async_suspend(spa); 2556 2557 /* 2558 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2559 * because the user-supplied config is actually the one to trust when 2560 * doing an import. 2561 */ 2562 error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 2563 2564 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2565 /* 2566 * Toss any existing sparelist, as it doesn't have any validity 2567 * anymore, and conflicts with spa_has_spare(). 
2568 */ 2569 if (spa->spa_spares.sav_config) { 2570 nvlist_free(spa->spa_spares.sav_config); 2571 spa->spa_spares.sav_config = NULL; 2572 spa_load_spares(spa); 2573 } 2574 if (spa->spa_l2cache.sav_config) { 2575 nvlist_free(spa->spa_l2cache.sav_config); 2576 spa->spa_l2cache.sav_config = NULL; 2577 spa_load_l2cache(spa); 2578 } 2579 2580 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2581 &nvroot) == 0); 2582 if (error == 0) 2583 error = spa_validate_aux(spa, nvroot, -1ULL, 2584 VDEV_ALLOC_SPARE); 2585 if (error == 0) 2586 error = spa_validate_aux(spa, nvroot, -1ULL, 2587 VDEV_ALLOC_L2CACHE); 2588 spa_config_exit(spa, SCL_ALL, FTAG); 2589 2590 if (props != NULL) 2591 spa_configfile_set(spa, props, B_FALSE); 2592 2593 if (error != 0 || (props && spa_writeable(spa) && 2594 (error = spa_prop_set(spa, props)))) { 2595 spa_unload(spa); 2596 spa_deactivate(spa); 2597 spa_remove(spa); 2598 mutex_exit(&spa_namespace_lock); 2599 return (error); 2600 } 2601 2602 spa_async_resume(spa); 2603 2604 /* 2605 * Override any spares and level 2 cache devices as specified by 2606 * the user, as these may have correct device names/devids, etc. 2607 */ 2608 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2609 &spares, &nspares) == 0) { 2610 if (spa->spa_spares.sav_config) 2611 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2612 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2613 else 2614 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2615 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2616 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2617 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2618 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2619 spa_load_spares(spa); 2620 spa_config_exit(spa, SCL_ALL, FTAG); 2621 spa->spa_spares.sav_sync = B_TRUE; 2622 } 2623 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2624 &l2cache, &nl2cache) == 0) { 2625 if (spa->spa_l2cache.sav_config) 2626 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2627 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2628 else 2629 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2630 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2631 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2632 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2633 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2634 spa_load_l2cache(spa); 2635 spa_config_exit(spa, SCL_ALL, FTAG); 2636 spa->spa_l2cache.sav_sync = B_TRUE; 2637 } 2638 2639 if (spa_writeable(spa)) { 2640 /* 2641 * Update the config cache to include the newly-imported pool. 2642 */ 2643 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2644 } 2645 2646 mutex_exit(&spa_namespace_lock); 2647 2648 return (0); 2649} 2650 2651/* 2652 * This (illegal) pool name is used when temporarily importing a spa_t in order 2653 * to get the vdev stats associated with the imported devices. 2654 */ 2655#define TRYIMPORT_NAME "$import" 2656 2657nvlist_t * 2658spa_tryimport(nvlist_t *tryconfig) 2659{ 2660 nvlist_t *config = NULL; 2661 char *poolname; 2662 spa_t *spa; 2663 uint64_t state; 2664 int error; 2665 2666 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2667 return (NULL); 2668 2669 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2670 return (NULL); 2671 2672 /* 2673 * Create and initialize the spa structure. 2674 */ 2675 mutex_enter(&spa_namespace_lock); 2676 spa = spa_add(TRYIMPORT_NAME, NULL); 2677 spa_activate(spa, FREAD); 2678 2679 /* 2680 * Pass off the heavy lifting to spa_load(). 
2681 * Pass TRUE for mosconfig because the user-supplied config 2682 * is actually the one to trust when doing an import. 2683 */ 2684 error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2685 2686 /* 2687 * If 'tryconfig' was at least parsable, return the current config. 2688 */ 2689 if (spa->spa_root_vdev != NULL) { 2690 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2691 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2692 poolname) == 0); 2693 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2694 state) == 0); 2695 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2696 spa->spa_uberblock.ub_timestamp) == 0); 2697 2698 /* 2699 * If the bootfs property exists on this pool then we 2700 * copy it out so that external consumers can tell which 2701 * pools are bootable. 2702 */ 2703 if ((!error || error == EEXIST) && spa->spa_bootfs) { 2704 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2705 2706 /* 2707 * We have to play games with the name since the 2708 * pool was opened as TRYIMPORT_NAME. 2709 */ 2710 if (dsl_dsobj_to_dsname(spa_name(spa), 2711 spa->spa_bootfs, tmpname) == 0) { 2712 char *cp; 2713 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2714 2715 cp = strchr(tmpname, '/'); 2716 if (cp == NULL) { 2717 (void) strlcpy(dsname, tmpname, 2718 MAXPATHLEN); 2719 } else { 2720 (void) snprintf(dsname, MAXPATHLEN, 2721 "%s/%s", poolname, ++cp); 2722 } 2723 VERIFY(nvlist_add_string(config, 2724 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2725 kmem_free(dsname, MAXPATHLEN); 2726 } 2727 kmem_free(tmpname, MAXPATHLEN); 2728 } 2729 2730 /* 2731 * Add the list of hot spares and level 2 cache devices. 2732 */ 2733 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2734 spa_add_spares(spa, config); 2735 spa_add_l2cache(spa, config); 2736 spa_config_exit(spa, SCL_CONFIG, FTAG); 2737 } 2738 2739 spa_unload(spa); 2740 spa_deactivate(spa); 2741 spa_remove(spa); 2742 mutex_exit(&spa_namespace_lock); 2743 2744 return (config); 2745} 2746 2747/* 2748 * Pool export/destroy 2749 * 2750 * The act of destroying or exporting a pool is very simple. We make sure there 2751 * is no more pending I/O and any references to the pool are gone. Then, we 2752 * update the pool state and sync all the labels to disk, removing the 2753 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 2754 * we don't sync the labels or remove the configuration cache. 2755 */ 2756static int 2757spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2758 boolean_t force, boolean_t hardforce) 2759{ 2760 spa_t *spa; 2761 2762 if (oldconfig) 2763 *oldconfig = NULL; 2764 2765 if (!(spa_mode_global & FWRITE)) 2766 return (EROFS); 2767 2768 mutex_enter(&spa_namespace_lock); 2769 if ((spa = spa_lookup(pool)) == NULL) { 2770 mutex_exit(&spa_namespace_lock); 2771 return (ENOENT); 2772 } 2773 2774 /* 2775 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2776 * reacquire the namespace lock, and see if we can export. 2777 */ 2778 spa_open_ref(spa, FTAG); 2779 mutex_exit(&spa_namespace_lock); 2780 spa_async_suspend(spa); 2781 mutex_enter(&spa_namespace_lock); 2782 spa_close(spa, FTAG); 2783 2784 /* 2785 * The pool will be in core if it's openable, 2786 * in which case we can modify its state. 2787 */ 2788 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2789 /* 2790 * Objsets may be open only because they're dirty, so we 2791 * have to force it to sync before checking spa_refcnt. 
2792	 */ 2793		txg_wait_synced(spa->spa_dsl_pool, 0); 2794 2795		/* 2796		 * A pool cannot be exported or destroyed if there are active 2797		 * references. If we are resetting a pool, allow references by 2798		 * fault injection handlers. 2799		 */ 2800		if (!spa_refcount_zero(spa) || 2801		    (spa->spa_inject_ref != 0 && 2802		    new_state != POOL_STATE_UNINITIALIZED)) { 2803			spa_async_resume(spa); 2804			mutex_exit(&spa_namespace_lock); 2805			return (EBUSY); 2806		} 2807 2808		/* 2809		 * A pool cannot be exported if it has an active shared spare. 2810		 * This is to prevent other pools from stealing the active 2811		 * spare from an exported pool. If the user insists, such a 2812		 * pool can still be forcibly exported. 2813		 */ 2814		if (!force && new_state == POOL_STATE_EXPORTED && 2815		    spa_has_active_shared_spare(spa)) { 2816			spa_async_resume(spa); 2817			mutex_exit(&spa_namespace_lock); 2818			return (EXDEV); 2819		} 2820 2821		/* 2822		 * We want this to be reflected on every label, 2823		 * so mark them all dirty. spa_unload() will do the 2824		 * final sync that pushes these changes out. 2825		 */ 2826		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 2827			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2828			spa->spa_state = new_state; 2829			spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2830			vdev_config_dirty(spa->spa_root_vdev); 2831			spa_config_exit(spa, SCL_ALL, FTAG); 2832		} 2833	} 2834 2835	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2836 2837	if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2838		spa_unload(spa); 2839		spa_deactivate(spa); 2840	} 2841 2842	if (oldconfig && spa->spa_config) 2843		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2844 2845	if (new_state != POOL_STATE_UNINITIALIZED) { 2846		if (!hardforce) 2847			spa_config_sync(spa, B_TRUE, B_TRUE); 2848		spa_remove(spa); 2849	} 2850	mutex_exit(&spa_namespace_lock); 2851 2852	return (0); 2853} 2854 2855/* 2856 * Destroy a storage pool. 2857 */ 2858int 2859spa_destroy(char *pool) 2860{ 2861	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 2862	    B_FALSE, B_FALSE)); 2863} 2864 2865/* 2866 * Export a storage pool. 2867 */ 2868int 2869spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 2870    boolean_t hardforce) 2871{ 2872	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 2873	    force, hardforce)); 2874} 2875 2876/* 2877 * Similar to spa_export(), this unloads the spa_t without actually removing it 2878 * from the namespace in any way. 2879 */ 2880int 2881spa_reset(char *pool) 2882{ 2883	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2884	    B_FALSE, B_FALSE)); 2885} 2886 2887/* 2888 * ========================================================================== 2889 * Device manipulation 2890 * ========================================================================== 2891 */ 2892 2893/* 2894 * Add a device to a storage pool. 
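 * The caller supplies an nvroot describing the new top-level vdevs and/or
 * arrays of spare and l2cache devices to graft onto the existing root vdev.
 * As an illustration (the device path here is hypothetical), an nvroot
 * consisting of a "root" nvlist with a single "disk" child whose path is
 * "/dev/dsk/c1t0d0s0" makes that disk a new independent top-level vdev of
 * the pool.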
2895 */ 2896int 2897spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2898{ 2899 uint64_t txg; 2900 int error; 2901 vdev_t *rvd = spa->spa_root_vdev; 2902 vdev_t *vd, *tvd; 2903 nvlist_t **spares, **l2cache; 2904 uint_t nspares, nl2cache; 2905 2906 txg = spa_vdev_enter(spa); 2907 2908 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2909 VDEV_ALLOC_ADD)) != 0) 2910 return (spa_vdev_exit(spa, NULL, txg, error)); 2911 2912 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 2913 2914 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2915 &nspares) != 0) 2916 nspares = 0; 2917 2918 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2919 &nl2cache) != 0) 2920 nl2cache = 0; 2921 2922 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 2923 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2924 2925 if (vd->vdev_children != 0 && 2926 (error = vdev_create(vd, txg, B_FALSE)) != 0) 2927 return (spa_vdev_exit(spa, vd, txg, error)); 2928 2929 /* 2930 * We must validate the spares and l2cache devices after checking the 2931 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2932 */ 2933 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 2934 return (spa_vdev_exit(spa, vd, txg, error)); 2935 2936 /* 2937 * Transfer each new top-level vdev from vd to rvd. 2938 */ 2939 for (int c = 0; c < vd->vdev_children; c++) { 2940 tvd = vd->vdev_child[c]; 2941 vdev_remove_child(vd, tvd); 2942 tvd->vdev_id = rvd->vdev_children; 2943 vdev_add_child(rvd, tvd); 2944 vdev_config_dirty(tvd); 2945 } 2946 2947 if (nspares != 0) { 2948 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2949 ZPOOL_CONFIG_SPARES); 2950 spa_load_spares(spa); 2951 spa->spa_spares.sav_sync = B_TRUE; 2952 } 2953 2954 if (nl2cache != 0) { 2955 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2956 ZPOOL_CONFIG_L2CACHE); 2957 spa_load_l2cache(spa); 2958 spa->spa_l2cache.sav_sync = B_TRUE; 2959 } 2960 2961 /* 2962 * We have to be careful when adding new vdevs to an existing pool. 2963 * If other threads start allocating from these vdevs before we 2964 * sync the config cache, and we lose power, then upon reboot we may 2965 * fail to open the pool because there are DVAs that the config cache 2966 * can't translate. Therefore, we first add the vdevs without 2967 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2968 * and then let spa_config_update() initialize the new metaslabs. 2969 * 2970 * spa_load() checks for added-but-not-initialized vdevs, so that 2971 * if we lose power at any point in this sequence, the remaining 2972 * steps will be completed the next time we load the pool. 2973 */ 2974 (void) spa_vdev_exit(spa, vd, txg, 0); 2975 2976 mutex_enter(&spa_namespace_lock); 2977 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2978 mutex_exit(&spa_namespace_lock); 2979 2980 return (0); 2981} 2982 2983/* 2984 * Attach a device to a mirror. The arguments are the path to any device 2985 * in the mirror, and the nvroot for the new device. If the path specifies 2986 * a device that is not mirrored, we automatically insert the mirror vdev. 
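 * For example, attaching a new disk B to a plain (unreplicated) disk A
 * turns A into the two-way mirror M(A,B); attaching C to M(A,B) yields the
 * three-way mirror M(A,B,C).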
2987 * 2988 * If 'replacing' is specified, the new device is intended to replace the 2989 * existing device; in this case the two devices are made into their own 2990 * mirror using the 'replacing' vdev, which is functionally identical to 2991 * the mirror vdev (it actually reuses all the same ops) but has a few 2992 * extra rules: you can't attach to it after it's been created, and upon 2993 * completion of resilvering, the first disk (the one being replaced) 2994 * is automatically detached. 2995 */ 2996int 2997spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2998{ 2999 uint64_t txg, open_txg; 3000 vdev_t *rvd = spa->spa_root_vdev; 3001 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3002 vdev_ops_t *pvops; 3003 dmu_tx_t *tx; 3004 char *oldvdpath, *newvdpath; 3005 int newvd_isspare; 3006 int error; 3007 3008 txg = spa_vdev_enter(spa); 3009 3010 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3011 3012 if (oldvd == NULL) 3013 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3014 3015 if (!oldvd->vdev_ops->vdev_op_leaf) 3016 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3017 3018 pvd = oldvd->vdev_parent; 3019 3020 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3021 VDEV_ALLOC_ADD)) != 0) 3022 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3023 3024 if (newrootvd->vdev_children != 1) 3025 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3026 3027 newvd = newrootvd->vdev_child[0]; 3028 3029 if (!newvd->vdev_ops->vdev_op_leaf) 3030 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3031 3032 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3033 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3034 3035 /* 3036 * Spares can't replace logs 3037 */ 3038 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3039 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3040 3041 if (!replacing) { 3042 /* 3043 * For attach, the only allowable parent is a mirror or the root 3044 * vdev. 3045 */ 3046 if (pvd->vdev_ops != &vdev_mirror_ops && 3047 pvd->vdev_ops != &vdev_root_ops) 3048 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3049 3050 pvops = &vdev_mirror_ops; 3051 } else { 3052 /* 3053 * Active hot spares can only be replaced by inactive hot 3054 * spares. 3055 */ 3056 if (pvd->vdev_ops == &vdev_spare_ops && 3057 pvd->vdev_child[1] == oldvd && 3058 !spa_has_spare(spa, newvd->vdev_guid)) 3059 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3060 3061 /* 3062 * If the source is a hot spare, and the parent isn't already a 3063 * spare, then we want to create a new hot spare. Otherwise, we 3064 * want to create a replacing vdev. The user is not allowed to 3065 * attach to a spared vdev child unless the 'isspare' state is 3066 * the same (spare replaces spare, non-spare replaces 3067 * non-spare). 3068 */ 3069 if (pvd->vdev_ops == &vdev_replacing_ops) 3070 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3071 else if (pvd->vdev_ops == &vdev_spare_ops && 3072 newvd->vdev_isspare != oldvd->vdev_isspare) 3073 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3074 else if (pvd->vdev_ops != &vdev_spare_ops && 3075 newvd->vdev_isspare) 3076 pvops = &vdev_spare_ops; 3077 else 3078 pvops = &vdev_replacing_ops; 3079 } 3080 3081 /* 3082 * Compare the new device size with the replaceable/attachable 3083 * device size. 
3084 */ 3085 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 3086 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3087 3088 /* 3089 * The new device cannot have a higher alignment requirement 3090 * than the top-level vdev. 3091 */ 3092 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3093 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3094 3095 /* 3096 * If this is an in-place replacement, update oldvd's path and devid 3097 * to make it distinguishable from newvd, and unopenable from now on. 3098 */ 3099 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3100 spa_strfree(oldvd->vdev_path); 3101 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3102 KM_SLEEP); 3103 (void) sprintf(oldvd->vdev_path, "%s/%s", 3104 newvd->vdev_path, "old"); 3105 if (oldvd->vdev_devid != NULL) { 3106 spa_strfree(oldvd->vdev_devid); 3107 oldvd->vdev_devid = NULL; 3108 } 3109 } 3110 3111 /* 3112 * If the parent is not a mirror, or if we're replacing, insert the new 3113 * mirror/replacing/spare vdev above oldvd. 3114 */ 3115 if (pvd->vdev_ops != pvops) 3116 pvd = vdev_add_parent(oldvd, pvops); 3117 3118 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3119 ASSERT(pvd->vdev_ops == pvops); 3120 ASSERT(oldvd->vdev_parent == pvd); 3121 3122 /* 3123 * Extract the new device from its root and add it to pvd. 3124 */ 3125 vdev_remove_child(newrootvd, newvd); 3126 newvd->vdev_id = pvd->vdev_children; 3127 vdev_add_child(pvd, newvd); 3128 3129 /* 3130 * If newvd is smaller than oldvd, but larger than its rsize, 3131 * the addition of newvd may have decreased our parent's asize. 3132 */ 3133 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 3134 3135 tvd = newvd->vdev_top; 3136 ASSERT(pvd->vdev_top == tvd); 3137 ASSERT(tvd->vdev_parent == rvd); 3138 3139 vdev_config_dirty(tvd); 3140 3141 /* 3142 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3143 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3144 */ 3145 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3146 3147 vdev_dtl_dirty(newvd, DTL_MISSING, 3148 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3149 3150 if (newvd->vdev_isspare) { 3151 spa_spare_activate(newvd); 3152 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3153 } 3154 3155 oldvdpath = spa_strdup(oldvd->vdev_path); 3156 newvdpath = spa_strdup(newvd->vdev_path); 3157 newvd_isspare = newvd->vdev_isspare; 3158 3159 /* 3160 * Mark newvd's DTL dirty in this txg. 3161 */ 3162 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3163 3164 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3165 3166 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3167 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 3168 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 3169 CRED(), "%s vdev=%s %s vdev=%s", 3170 replacing && newvd_isspare ? "spare in" : 3171 replacing ? "replace" : "attach", newvdpath, 3172 replacing ? "for" : "to", oldvdpath); 3173 dmu_tx_commit(tx); 3174 } else { 3175 dmu_tx_abort(tx); 3176 } 3177 3178 spa_strfree(oldvdpath); 3179 spa_strfree(newvdpath); 3180 3181 /* 3182 * Kick off a resilver to update newvd. 3183 */ 3184 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3185 3186 return (0); 3187} 3188 3189/* 3190 * Detach a device from a mirror or replacing vdev. 3191 * If 'replace_done' is specified, only detach if the parent 3192 * is a replacing vdev. 
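 * (Detach is the inverse of the attach example above: detaching B from the
 * two-way mirror M(A,B) removes the now-superfluous mirror vdev and leaves
 * the plain disk A.)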
3193 */ 3194int 3195spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3196{ 3197 uint64_t txg; 3198 int error; 3199 vdev_t *rvd = spa->spa_root_vdev; 3200 vdev_t *vd, *pvd, *cvd, *tvd; 3201 boolean_t unspare = B_FALSE; 3202 uint64_t unspare_guid; 3203 size_t len; 3204 3205 txg = spa_vdev_enter(spa); 3206 3207 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3208 3209 if (vd == NULL) 3210 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3211 3212 if (!vd->vdev_ops->vdev_op_leaf) 3213 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3214 3215 pvd = vd->vdev_parent; 3216 3217 /* 3218 * If the parent/child relationship is not as expected, don't do it. 3219 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3220 * vdev that's replacing B with C. The user's intent in replacing 3221 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3222 * the replace by detaching C, the expected behavior is to end up 3223 * M(A,B). But suppose that right after deciding to detach C, 3224 * the replacement of B completes. We would have M(A,C), and then 3225 * ask to detach C, which would leave us with just A -- not what 3226 * the user wanted. To prevent this, we make sure that the 3227 * parent/child relationship hasn't changed -- in this example, 3228 * that C's parent is still the replacing vdev R. 3229 */ 3230 if (pvd->vdev_guid != pguid && pguid != 0) 3231 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3232 3233 /* 3234 * If replace_done is specified, only remove this device if it's 3235 * the first child of a replacing vdev. For the 'spare' vdev, either 3236 * disk can be removed. 3237 */ 3238 if (replace_done) { 3239 if (pvd->vdev_ops == &vdev_replacing_ops) { 3240 if (vd->vdev_id != 0) 3241 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3242 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3243 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3244 } 3245 } 3246 3247 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3248 spa_version(spa) >= SPA_VERSION_SPARES); 3249 3250 /* 3251 * Only mirror, replacing, and spare vdevs support detach. 3252 */ 3253 if (pvd->vdev_ops != &vdev_replacing_ops && 3254 pvd->vdev_ops != &vdev_mirror_ops && 3255 pvd->vdev_ops != &vdev_spare_ops) 3256 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3257 3258 /* 3259 * If this device has the only valid copy of some data, 3260 * we cannot safely detach it. 3261 */ 3262 if (vdev_dtl_required(vd)) 3263 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3264 3265 ASSERT(pvd->vdev_children >= 2); 3266 3267 /* 3268 * If we are detaching the second disk from a replacing vdev, then 3269 * check to see if we changed the original vdev's path to have "/old" 3270 * at the end in spa_vdev_attach(). If so, undo that change now. 3271 */ 3272 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3273 pvd->vdev_child[0]->vdev_path != NULL && 3274 pvd->vdev_child[1]->vdev_path != NULL) { 3275 ASSERT(pvd->vdev_child[1] == vd); 3276 cvd = pvd->vdev_child[0]; 3277 len = strlen(vd->vdev_path); 3278 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3279 strcmp(cvd->vdev_path + len, "/old") == 0) { 3280 spa_strfree(cvd->vdev_path); 3281 cvd->vdev_path = spa_strdup(vd->vdev_path); 3282 } 3283 } 3284 3285 /* 3286 * If we are detaching the original disk from a spare, then it implies 3287 * that the spare should become a real disk, and be removed from the 3288 * active spare list for the pool. 
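	 * For example, if hot spare S took over for a failing disk D, giving
	 * spare(D, S), then detaching D promotes S to a permanent member of
	 * the pool, so S must no longer be advertised as an available spare.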
3289 */ 3290 if (pvd->vdev_ops == &vdev_spare_ops && 3291 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3292 unspare = B_TRUE; 3293 3294 /* 3295 * Erase the disk labels so the disk can be used for other things. 3296 * This must be done after all other error cases are handled, 3297 * but before we disembowel vd (so we can still do I/O to it). 3298 * But if we can't do it, don't treat the error as fatal -- 3299 * it may be that the unwritability of the disk is the reason 3300 * it's being detached! 3301 */ 3302 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3303 3304 /* 3305 * Remove vd from its parent and compact the parent's children. 3306 */ 3307 vdev_remove_child(pvd, vd); 3308 vdev_compact_children(pvd); 3309 3310 /* 3311 * Remember one of the remaining children so we can get tvd below. 3312 */ 3313 cvd = pvd->vdev_child[0]; 3314 3315 /* 3316 * If we need to remove the remaining child from the list of hot spares, 3317 * do it now, marking the vdev as no longer a spare in the process. 3318 * We must do this before vdev_remove_parent(), because that can 3319 * change the GUID if it creates a new toplevel GUID. For a similar 3320 * reason, we must remove the spare now, in the same txg as the detach; 3321 * otherwise someone could attach a new sibling, change the GUID, and 3322 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3323 */ 3324 if (unspare) { 3325 ASSERT(cvd->vdev_isspare); 3326 spa_spare_remove(cvd); 3327 unspare_guid = cvd->vdev_guid; 3328 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3329 } 3330 3331 /* 3332 * If the parent mirror/replacing vdev only has one child, 3333 * the parent is no longer needed. Remove it from the tree. 3334 */ 3335 if (pvd->vdev_children == 1) 3336 vdev_remove_parent(cvd); 3337 3338 /* 3339 * We don't set tvd until now because the parent we just removed 3340 * may have been the previous top-level vdev. 3341 */ 3342 tvd = cvd->vdev_top; 3343 ASSERT(tvd->vdev_parent == rvd); 3344 3345 /* 3346 * Reevaluate the parent vdev state. 3347 */ 3348 vdev_propagate_state(cvd); 3349 3350 /* 3351 * If the device we just detached was smaller than the others, it may be 3352 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3353 * can't fail because the existing metaslabs are already in core, so 3354 * there's nothing to read from disk. 3355 */ 3356 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3357 3358 vdev_config_dirty(tvd); 3359 3360 /* 3361 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3362 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3363 * But first make sure we're not on any *other* txg's DTL list, to 3364 * prevent vd from being accessed after it's freed. 3365 */ 3366 for (int t = 0; t < TXG_SIZE; t++) 3367 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3368 vd->vdev_detached = B_TRUE; 3369 vdev_dirty(tvd, VDD_DTL, vd, txg); 3370 3371 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3372 3373 error = spa_vdev_exit(spa, vd, txg, 0); 3374 3375 /* 3376 * If this was the removal of the original device in a hot spare vdev, 3377 * then we want to go through and remove the device from the hot spare 3378 * list of every other pool. 
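	 * (The same physical spare may be listed, by guid, in the configs of
	 * several pools at once; the loop below therefore walks the entire
	 * namespace and removes unspare_guid from each pool it finds.)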
3379 */ 3380 if (unspare) { 3381 spa_t *myspa = spa; 3382 spa = NULL; 3383 mutex_enter(&spa_namespace_lock); 3384 while ((spa = spa_next(spa)) != NULL) { 3385 if (spa->spa_state != POOL_STATE_ACTIVE) 3386 continue; 3387 if (spa == myspa) 3388 continue; 3389 spa_open_ref(spa, FTAG); 3390 mutex_exit(&spa_namespace_lock); 3391 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3392 mutex_enter(&spa_namespace_lock); 3393 spa_close(spa, FTAG); 3394 } 3395 mutex_exit(&spa_namespace_lock); 3396 } 3397 3398 return (error); 3399} 3400 3401static nvlist_t * 3402spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3403{ 3404 for (int i = 0; i < count; i++) { 3405 uint64_t guid; 3406 3407 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3408 &guid) == 0); 3409 3410 if (guid == target_guid) 3411 return (nvpp[i]); 3412 } 3413 3414 return (NULL); 3415} 3416 3417static void 3418spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3419 nvlist_t *dev_to_remove) 3420{ 3421 nvlist_t **newdev = NULL; 3422 3423 if (count > 1) 3424 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3425 3426 for (int i = 0, j = 0; i < count; i++) { 3427 if (dev[i] == dev_to_remove) 3428 continue; 3429 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3430 } 3431 3432 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3433 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3434 3435 for (int i = 0; i < count - 1; i++) 3436 nvlist_free(newdev[i]); 3437 3438 if (count > 1) 3439 kmem_free(newdev, (count - 1) * sizeof (void *)); 3440} 3441 3442/* 3443 * Remove a device from the pool. Currently, this supports removing only hot 3444 * spares and level 2 ARC devices. 3445 */ 3446int 3447spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3448{ 3449 vdev_t *vd; 3450 nvlist_t **spares, **l2cache, *nv; 3451 uint_t nspares, nl2cache; 3452 uint64_t txg = 0; 3453 int error = 0; 3454 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3455 3456 if (!locked) 3457 txg = spa_vdev_enter(spa); 3458 3459 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3460 3461 if (spa->spa_spares.sav_vdevs != NULL && 3462 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3463 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3464 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3465 /* 3466 * Only remove the hot spare if it's not currently in use 3467 * in this pool. 3468 */ 3469 if (vd == NULL || unspare) { 3470 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3471 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3472 spa_load_spares(spa); 3473 spa->spa_spares.sav_sync = B_TRUE; 3474 } else { 3475 error = EBUSY; 3476 } 3477 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3478 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3479 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3480 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3481 /* 3482 * Cache devices can always be removed. 3483 */ 3484 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3485 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3486 spa_load_l2cache(spa); 3487 spa->spa_l2cache.sav_sync = B_TRUE; 3488 } else if (vd != NULL) { 3489 /* 3490 * Normal vdevs cannot be removed (yet). 3491 */ 3492 error = ENOTSUP; 3493 } else { 3494 /* 3495 * There is no vdev of any kind with the specified guid. 
3496		 */ 3497		error = ENOENT; 3498	} 3499 3500	if (!locked) 3501		return (spa_vdev_exit(spa, NULL, txg, error)); 3502 3503	return (error); 3504} 3505 3506/* 3507 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3508 * currently spared, so we can detach it. 3509 */ 3510static vdev_t * 3511spa_vdev_resilver_done_hunt(vdev_t *vd) 3512{ 3513	vdev_t *newvd, *oldvd; 3514	int c; 3515 3516	for (c = 0; c < vd->vdev_children; c++) { 3517		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3518		if (oldvd != NULL) 3519			return (oldvd); 3520	} 3521 3522	/* 3523	 * Check for a completed replacement. 3524	 */ 3525	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3526		oldvd = vd->vdev_child[0]; 3527		newvd = vd->vdev_child[1]; 3528 3529		if (vdev_dtl_empty(newvd, DTL_MISSING) && 3530		    !vdev_dtl_required(oldvd)) 3531			return (oldvd); 3532	} 3533 3534	/* 3535	 * Check for a completed resilver with the 'unspare' flag set. 3536	 */ 3537	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3538		newvd = vd->vdev_child[0]; 3539		oldvd = vd->vdev_child[1]; 3540 3541		if (newvd->vdev_unspare && 3542		    vdev_dtl_empty(newvd, DTL_MISSING) && 3543		    !vdev_dtl_required(oldvd)) { 3544			newvd->vdev_unspare = 0; 3545			return (oldvd); 3546		} 3547	} 3548 3549	return (NULL); 3550} 3551 3552static void 3553spa_vdev_resilver_done(spa_t *spa) 3554{ 3555	vdev_t *vd, *pvd, *ppvd; 3556	uint64_t guid, sguid, pguid, ppguid; 3557 3558	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3559 3560	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3561		pvd = vd->vdev_parent; 3562		ppvd = pvd->vdev_parent; 3563		guid = vd->vdev_guid; 3564		pguid = pvd->vdev_guid; 3565		ppguid = ppvd->vdev_guid; 3566		sguid = 0; 3567		/* 3568		 * If we have just finished replacing a hot spared device, then 3569		 * we need to detach the spare parent's other child (the 3570		 * original hot spare) as well. 3571		 */ 3572		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 3573			ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3574			ASSERT(ppvd->vdev_children == 2); 3575			sguid = ppvd->vdev_child[1]->vdev_guid; 3576		} 3577		spa_config_exit(spa, SCL_ALL, FTAG); 3578		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 3579			return; 3580		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 3581			return; 3582		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3583	} 3584 3585	spa_config_exit(spa, SCL_ALL, FTAG); 3586} 3587 3588/* 3589 * Update the stored path or FRU for this vdev. Dirty the vdev configuration, 3590 * relying on spa_vdev_enter/exit() to synchronize the labels and cache. 
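 * Illustrative usage (the guid and the path/FRU strings here are
 * hypothetical):
 *
 *	(void) spa_vdev_setpath(spa, guid, "/dev/dsk/c3t0d0s0");
 *	(void) spa_vdev_setfru(spa, guid, "hc://:chassis=0:bay=7");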
3591 */ 3592int 3593spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 3594 boolean_t ispath) 3595{ 3596 vdev_t *vd; 3597 uint64_t txg; 3598 3599 txg = spa_vdev_enter(spa); 3600 3601 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3602 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3603 3604 if (!vd->vdev_ops->vdev_op_leaf) 3605 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3606 3607 if (ispath) { 3608 spa_strfree(vd->vdev_path); 3609 vd->vdev_path = spa_strdup(value); 3610 } else { 3611 if (vd->vdev_fru != NULL) 3612 spa_strfree(vd->vdev_fru); 3613 vd->vdev_fru = spa_strdup(value); 3614 } 3615 3616 vdev_config_dirty(vd->vdev_top); 3617 3618 return (spa_vdev_exit(spa, NULL, txg, 0)); 3619} 3620 3621int 3622spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3623{ 3624 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 3625} 3626 3627int 3628spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 3629{ 3630 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 3631} 3632 3633/* 3634 * ========================================================================== 3635 * SPA Scrubbing 3636 * ========================================================================== 3637 */ 3638 3639int 3640spa_scrub(spa_t *spa, pool_scrub_type_t type) 3641{ 3642 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3643 3644 if ((uint_t)type >= POOL_SCRUB_TYPES) 3645 return (ENOTSUP); 3646 3647 /* 3648 * If a resilver was requested, but there is no DTL on a 3649 * writeable leaf device, we have nothing to do. 3650 */ 3651 if (type == POOL_SCRUB_RESILVER && 3652 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3653 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3654 return (0); 3655 } 3656 3657 if (type == POOL_SCRUB_EVERYTHING && 3658 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3659 spa->spa_dsl_pool->dp_scrub_isresilver) 3660 return (EBUSY); 3661 3662 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3663 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3664 } else if (type == POOL_SCRUB_NONE) { 3665 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3666 } else { 3667 return (EINVAL); 3668 } 3669} 3670 3671/* 3672 * ========================================================================== 3673 * SPA async task processing 3674 * ========================================================================== 3675 */ 3676 3677static void 3678spa_async_remove(spa_t *spa, vdev_t *vd) 3679{ 3680 if (vd->vdev_remove_wanted) { 3681 vd->vdev_remove_wanted = 0; 3682 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3683 3684 /* 3685 * We want to clear the stats, but we don't want to do a full 3686 * vdev_clear() as that will cause us to throw away 3687 * degraded/faulted state as well as attempt to reopen the 3688 * device, all of which is a waste. 
3689 */ 3690 vd->vdev_stat.vs_read_errors = 0; 3691 vd->vdev_stat.vs_write_errors = 0; 3692 vd->vdev_stat.vs_checksum_errors = 0; 3693 3694 vdev_state_dirty(vd->vdev_top); 3695 } 3696 3697 for (int c = 0; c < vd->vdev_children; c++) 3698 spa_async_remove(spa, vd->vdev_child[c]); 3699} 3700 3701static void 3702spa_async_probe(spa_t *spa, vdev_t *vd) 3703{ 3704 if (vd->vdev_probe_wanted) { 3705 vd->vdev_probe_wanted = 0; 3706 vdev_reopen(vd); /* vdev_open() does the actual probe */ 3707 } 3708 3709 for (int c = 0; c < vd->vdev_children; c++) 3710 spa_async_probe(spa, vd->vdev_child[c]); 3711} 3712 3713static void 3714spa_async_thread(void *arg) 3715{ 3716 spa_t *spa = arg; 3717 int tasks; 3718 3719 ASSERT(spa->spa_sync_on); 3720 3721 mutex_enter(&spa->spa_async_lock); 3722 tasks = spa->spa_async_tasks; 3723 spa->spa_async_tasks = 0; 3724 mutex_exit(&spa->spa_async_lock); 3725 3726 /* 3727 * See if the config needs to be updated. 3728 */ 3729 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3730 mutex_enter(&spa_namespace_lock); 3731 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3732 mutex_exit(&spa_namespace_lock); 3733 } 3734 3735 /* 3736 * See if any devices need to be marked REMOVED. 3737 */ 3738 if (tasks & SPA_ASYNC_REMOVE) { 3739 spa_vdev_state_enter(spa); 3740 spa_async_remove(spa, spa->spa_root_vdev); 3741 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 3742 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3743 for (int i = 0; i < spa->spa_spares.sav_count; i++) 3744 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3745 (void) spa_vdev_state_exit(spa, NULL, 0); 3746 } 3747 3748 /* 3749 * See if any devices need to be probed. 3750 */ 3751 if (tasks & SPA_ASYNC_PROBE) { 3752 spa_vdev_state_enter(spa); 3753 spa_async_probe(spa, spa->spa_root_vdev); 3754 (void) spa_vdev_state_exit(spa, NULL, 0); 3755 } 3756 3757 /* 3758 * If any devices are done replacing, detach them. 3759 */ 3760 if (tasks & SPA_ASYNC_RESILVER_DONE) 3761 spa_vdev_resilver_done(spa); 3762 3763 /* 3764 * Kick off a resilver. 3765 */ 3766 if (tasks & SPA_ASYNC_RESILVER) 3767 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3768 3769 /* 3770 * Let the world know that we're done. 
3771 */ 3772 mutex_enter(&spa->spa_async_lock); 3773 spa->spa_async_thread = NULL; 3774 cv_broadcast(&spa->spa_async_cv); 3775 mutex_exit(&spa->spa_async_lock); 3776 thread_exit(); 3777} 3778 3779void 3780spa_async_suspend(spa_t *spa) 3781{ 3782 mutex_enter(&spa->spa_async_lock); 3783 spa->spa_async_suspended++; 3784 while (spa->spa_async_thread != NULL) 3785 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3786 mutex_exit(&spa->spa_async_lock); 3787} 3788 3789void 3790spa_async_resume(spa_t *spa) 3791{ 3792 mutex_enter(&spa->spa_async_lock); 3793 ASSERT(spa->spa_async_suspended != 0); 3794 spa->spa_async_suspended--; 3795 mutex_exit(&spa->spa_async_lock); 3796} 3797 3798static void 3799spa_async_dispatch(spa_t *spa) 3800{ 3801 mutex_enter(&spa->spa_async_lock); 3802 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3803 spa->spa_async_thread == NULL && 3804 rootdir != NULL && !vn_is_readonly(rootdir)) 3805 spa->spa_async_thread = thread_create(NULL, 0, 3806 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3807 mutex_exit(&spa->spa_async_lock); 3808} 3809 3810void 3811spa_async_request(spa_t *spa, int task) 3812{ 3813 mutex_enter(&spa->spa_async_lock); 3814 spa->spa_async_tasks |= task; 3815 mutex_exit(&spa->spa_async_lock); 3816} 3817 3818/* 3819 * ========================================================================== 3820 * SPA syncing routines 3821 * ========================================================================== 3822 */ 3823 3824static void 3825spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3826{ 3827 bplist_t *bpl = &spa->spa_sync_bplist; 3828 dmu_tx_t *tx; 3829 blkptr_t blk; 3830 uint64_t itor = 0; 3831 zio_t *zio; 3832 int error; 3833 uint8_t c = 1; 3834 3835 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 3836 3837 while (bplist_iterate(bpl, &itor, &blk) == 0) { 3838 ASSERT(blk.blk_birth < txg); 3839 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL, 3840 ZIO_FLAG_MUSTSUCCEED)); 3841 } 3842 3843 error = zio_wait(zio); 3844 ASSERT3U(error, ==, 0); 3845 3846 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3847 bplist_vacate(bpl, tx); 3848 3849 /* 3850 * Pre-dirty the first block so we sync to convergence faster. 3851 * (Usually only the first block is needed.) 3852 */ 3853 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3854 dmu_tx_commit(tx); 3855} 3856 3857static void 3858spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3859{ 3860 char *packed = NULL; 3861 size_t bufsize; 3862 size_t nvsize = 0; 3863 dmu_buf_t *db; 3864 3865 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3866 3867 /* 3868 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3869 * information. This avoids the dbuf_will_dirty() path and 3870 * saves us a pre-read to get data we don't actually care about. 
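	 * (For example, assuming SPA_CONFIG_BLOCKSIZE is (1 << 14): an
	 * nvlist that packs to 20000 bytes is written as bufsize =
	 * P2ROUNDUP(20000, 16384) = 32768 bytes, with the tail zeroed
	 * out below.)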
3871 */ 3872 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3873 packed = kmem_alloc(bufsize, KM_SLEEP); 3874 3875 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3876 KM_SLEEP) == 0); 3877 bzero(packed + nvsize, bufsize - nvsize); 3878 3879 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3880 3881 kmem_free(packed, bufsize); 3882 3883 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3884 dmu_buf_will_dirty(db, tx); 3885 *(uint64_t *)db->db_data = nvsize; 3886 dmu_buf_rele(db, FTAG); 3887} 3888 3889static void 3890spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3891 const char *config, const char *entry) 3892{ 3893 nvlist_t *nvroot; 3894 nvlist_t **list; 3895 int i; 3896 3897 if (!sav->sav_sync) 3898 return; 3899 3900 /* 3901 * Update the MOS nvlist describing the list of available devices. 3902 * spa_validate_aux() will have already made sure this nvlist is 3903 * valid and the vdevs are labeled appropriately. 3904 */ 3905 if (sav->sav_object == 0) { 3906 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3907 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3908 sizeof (uint64_t), tx); 3909 VERIFY(zap_update(spa->spa_meta_objset, 3910 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3911 &sav->sav_object, tx) == 0); 3912 } 3913 3914 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3915 if (sav->sav_count == 0) { 3916 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3917 } else { 3918 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3919 for (i = 0; i < sav->sav_count; i++) 3920 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3921 B_FALSE, B_FALSE, B_TRUE); 3922 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3923 sav->sav_count) == 0); 3924 for (i = 0; i < sav->sav_count; i++) 3925 nvlist_free(list[i]); 3926 kmem_free(list, sav->sav_count * sizeof (void *)); 3927 } 3928 3929 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3930 nvlist_free(nvroot); 3931 3932 sav->sav_sync = B_FALSE; 3933} 3934 3935static void 3936spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3937{ 3938 nvlist_t *config; 3939 3940 if (list_is_empty(&spa->spa_config_dirty_list)) 3941 return; 3942 3943 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 3944 3945 config = spa_config_generate(spa, spa->spa_root_vdev, 3946 dmu_tx_get_txg(tx), B_FALSE); 3947 3948 spa_config_exit(spa, SCL_STATE, FTAG); 3949 3950 if (spa->spa_config_syncing) 3951 nvlist_free(spa->spa_config_syncing); 3952 spa->spa_config_syncing = config; 3953 3954 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3955} 3956 3957/* 3958 * Set zpool properties. 3959 */ 3960static void 3961spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3962{ 3963 spa_t *spa = arg1; 3964 objset_t *mos = spa->spa_meta_objset; 3965 nvlist_t *nvp = arg2; 3966 nvpair_t *elem; 3967 uint64_t intval; 3968 char *strval; 3969 zpool_prop_t prop; 3970 const char *propname; 3971 zprop_type_t proptype; 3972 3973 mutex_enter(&spa->spa_props_lock); 3974 3975 elem = NULL; 3976 while ((elem = nvlist_next_nvpair(nvp, elem))) { 3977 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 3978 case ZPOOL_PROP_VERSION: 3979 /* 3980 * Only set version for non-zpool-creation cases 3981 * (set/import). spa_create() needs special care 3982 * for version setting. 
/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	mutex_enter(&spa->spa_props_lock);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import).  spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'cachefile' is also a non-persistent property.
			 */
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				objset_t *mos = spa->spa_meta_objset;

				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}
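/*
 * spa_sync_props() executes in syncing context: userland reaches it via
 * spa_prop_set() (earlier in this file), which validates the incoming
 * nvlist and then schedules this routine as a sync task, roughly:
 *
 *	if ((error = spa_prop_validate(spa, nvp)) != 0)
 *		return (error);
 *
 *	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 *	    spa, nvp, 3));
 *
 * (Sketch -- see spa_prop_set() for the authoritative version.)  Running
 * as a sync task is what makes it safe to update both the MOS and the
 * in-core spa fields under a single assigned tx.
 */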
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;
	int error;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
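	/*
	 * Convergence note: each pass of the loop above can dirty new
	 * metadata (space maps, MOS objects, the config), which must
	 * itself be written, possibly dirtying still more metadata.
	 * In practice the set of dirty state shrinks rapidly, so only
	 * a few passes are needed; spa_sync_pass records the count, and
	 * the pre-dirtying in spa_sync_deferred_frees() exists to keep
	 * it low.
	 */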
	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);
			int c;

			for (c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
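/*
 * spa_sync_allpools() is how ZFS honors sync(1M): unlike most file
 * systems, it waits for every active pool to commit its dirty data.
 * The expected caller is zfs_sync() in zfs_vfsops.c, which refuses to
 * write anything from panic context and otherwise falls through to a
 * pool-wide sync, roughly:
 *
 *	if (panicstr)
 *		return (0);
 *	...
 *	spa_sync_allpools();
 *
 * (Sketch; see zfs_vfsops.c for the real logic.)
 */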
/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, once as a spare and
 * once as a replacing vdev; a count greater than 2 therefore means the
 * spare is also referenced by another pool.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}
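/*
 * The notification hook below is invoked with event names from
 * sys/sysevent/eventdefs.h, for example (illustrative calls only; the
 * actual call sites are elsewhere in the ZFS code):
 *
 *	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * so that userland sysevent consumers can react to pool and vdev state
 * changes.
 */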
/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#if 0
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
#endif
}