1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 by Delphix. All rights reserved. 25 */ 26 27/* 28 * This file contains all the routines used when modifying on-disk SPA state. 29 * This includes opening, importing, destroying, exporting a pool, and syncing a 30 * pool. 
31 */ 32 33#include <sys/zfs_context.h> 34#include <sys/fm/fs/zfs.h> 35#include <sys/spa_impl.h> 36#include <sys/zio.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu.h> 39#include <sys/dmu_tx.h> 40#include <sys/zap.h> 41#include <sys/zil.h> 42#include <sys/ddt.h> 43#include <sys/vdev_impl.h> 44#include <sys/metaslab.h> 45#include <sys/metaslab_impl.h> 46#include <sys/uberblock_impl.h> 47#include <sys/txg.h> 48#include <sys/avl.h> 49#include <sys/dmu_traverse.h> 50#include <sys/dmu_objset.h> 51#include <sys/unique.h> 52#include <sys/dsl_pool.h> 53#include <sys/dsl_dataset.h> 54#include <sys/dsl_dir.h> 55#include <sys/dsl_prop.h> 56#include <sys/dsl_synctask.h> 57#include <sys/fs/zfs.h> 58#include <sys/arc.h> 59#include <sys/callb.h> 60#include <sys/spa_boot.h> 61#include <sys/zfs_ioctl.h> 62#include <sys/dsl_scan.h> 63#include <sys/zvol.h> 64 65#ifdef _KERNEL 66#include <sys/callb.h> 67#include <sys/cpupart.h> 68#include <sys/zone.h> 69#endif /* _KERNEL */ 70 71#include "zfs_prop.h" 72#include "zfs_comutil.h" 73 74/* Check hostid on import? 
*/ 75static int check_hostid = 1; 76 77SYSCTL_DECL(_vfs_zfs); 78TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 79SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 80 "Check hostid on import?"); 81 82typedef enum zti_modes { 83 zti_mode_fixed, /* value is # of threads (min 1) */ 84 zti_mode_online_percent, /* value is % of online CPUs */ 85 zti_mode_batch, /* cpu-intensive; value is ignored */ 86 zti_mode_null, /* don't create a taskq */ 87 zti_nmodes 88} zti_modes_t; 89 90#define ZTI_FIX(n) { zti_mode_fixed, (n) } 91#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 92#define ZTI_BATCH { zti_mode_batch, 0 } 93#define ZTI_NULL { zti_mode_null, 0 } 94 95#define ZTI_ONE ZTI_FIX(1) 96 97typedef struct zio_taskq_info { 98 enum zti_modes zti_mode; 99 uint_t zti_value; 100} zio_taskq_info_t; 101 102static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 103 "issue", "issue_high", "intr", "intr_high" 104}; 105 106/* 107 * Define the taskq threads for the following I/O types: 108 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 109 */ 110const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 111 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 112 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 113 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 114 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 115 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 116 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 117 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 118}; 119 120static dsl_syncfunc_t spa_sync_props; 121static boolean_t spa_has_active_shared_spare(spa_t *spa); 122static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 123 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 124 char **ereport); 125static void spa_vdev_resilver_done(spa_t *spa); 126 127uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 128#ifdef PSRSET_BIND 129id_t zio_taskq_psrset_bind = PS_NONE; 130#endif 131#ifdef SYSDC 132boolean_t zio_taskq_sysdc = 
B_TRUE; /* use SDC scheduling class */ 133#endif 134uint_t zio_taskq_basedc = 80; /* base duty cycle */ 135 136boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 137 138/* 139 * This (illegal) pool name is used when temporarily importing a spa_t in order 140 * to get the vdev stats associated with the imported devices. 141 */ 142#define TRYIMPORT_NAME "$import" 143 144/* 145 * ========================================================================== 146 * SPA properties routines 147 * ========================================================================== 148 */ 149 150/* 151 * Add a (source=src, propname=propval) list to an nvlist. 152 */ 153static void 154spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 155 uint64_t intval, zprop_source_t src) 156{ 157 const char *propname = zpool_prop_to_name(prop); 158 nvlist_t *propval; 159 160 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 161 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 162 163 if (strval != NULL) 164 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 165 else 166 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 167 168 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 169 nvlist_free(propval); 170} 171 172/* 173 * Get property values from the spa configuration. 
174 */ 175static void 176spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 177{ 178 uint64_t size; 179 uint64_t alloc; 180 uint64_t cap, version; 181 zprop_source_t src = ZPROP_SRC_NONE; 182 spa_config_dirent_t *dp; 183 184 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 185 186 if (spa->spa_root_vdev != NULL) { 187 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 188 size = metaslab_class_get_space(spa_normal_class(spa)); 189 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 190 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 191 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 192 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 193 size - alloc, src); 194 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 195 (spa_mode(spa) == FREAD), src); 196 197 cap = (size == 0) ? 0 : (alloc * 100 / size); 198 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 199 200 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 201 ddt_get_pool_dedup_ratio(spa), src); 202 203 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 204 spa->spa_root_vdev->vdev_state, src); 205 206 version = spa_version(spa); 207 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 208 src = ZPROP_SRC_DEFAULT; 209 else 210 src = ZPROP_SRC_LOCAL; 211 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 212 } 213 214 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 215 216 if (spa->spa_comment != NULL) { 217 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 218 0, ZPROP_SRC_LOCAL); 219 } 220 221 if (spa->spa_root != NULL) 222 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 223 0, ZPROP_SRC_LOCAL); 224 225 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 226 if (dp->scd_path == NULL) { 227 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 228 "none", 0, ZPROP_SRC_LOCAL); 229 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 230 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 231 
dp->scd_path, 0, ZPROP_SRC_LOCAL); 232 } 233 } 234} 235 236/* 237 * Get zpool property values. 238 */ 239int 240spa_prop_get(spa_t *spa, nvlist_t **nvp) 241{ 242 objset_t *mos = spa->spa_meta_objset; 243 zap_cursor_t zc; 244 zap_attribute_t za; 245 int err; 246 247 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 248 249 mutex_enter(&spa->spa_props_lock); 250 251 /* 252 * Get properties from the spa config. 253 */ 254 spa_prop_get_config(spa, nvp); 255 256 /* If no pool property object, no more prop to get. */ 257 if (mos == NULL || spa->spa_pool_props_object == 0) { 258 mutex_exit(&spa->spa_props_lock); 259 return (0); 260 } 261 262 /* 263 * Get properties from the MOS pool property object. 264 */ 265 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 266 (err = zap_cursor_retrieve(&zc, &za)) == 0; 267 zap_cursor_advance(&zc)) { 268 uint64_t intval = 0; 269 char *strval = NULL; 270 zprop_source_t src = ZPROP_SRC_DEFAULT; 271 zpool_prop_t prop; 272 273 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 274 continue; 275 276 switch (za.za_integer_length) { 277 case 8: 278 /* integer property */ 279 if (za.za_first_integer != 280 zpool_prop_default_numeric(prop)) 281 src = ZPROP_SRC_LOCAL; 282 283 if (prop == ZPOOL_PROP_BOOTFS) { 284 dsl_pool_t *dp; 285 dsl_dataset_t *ds = NULL; 286 287 dp = spa_get_dsl(spa); 288 rw_enter(&dp->dp_config_rwlock, RW_READER); 289 if (err = dsl_dataset_hold_obj(dp, 290 za.za_first_integer, FTAG, &ds)) { 291 rw_exit(&dp->dp_config_rwlock); 292 break; 293 } 294 295 strval = kmem_alloc( 296 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 297 KM_SLEEP); 298 dsl_dataset_name(ds, strval); 299 dsl_dataset_rele(ds, FTAG); 300 rw_exit(&dp->dp_config_rwlock); 301 } else { 302 strval = NULL; 303 intval = za.za_first_integer; 304 } 305 306 spa_prop_add_list(*nvp, prop, strval, intval, src); 307 308 if (strval != NULL) 309 kmem_free(strval, 310 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 311 312 break; 313 314 case 1: 315 /* string 
property */ 316 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 317 err = zap_lookup(mos, spa->spa_pool_props_object, 318 za.za_name, 1, za.za_num_integers, strval); 319 if (err) { 320 kmem_free(strval, za.za_num_integers); 321 break; 322 } 323 spa_prop_add_list(*nvp, prop, strval, 0, src); 324 kmem_free(strval, za.za_num_integers); 325 break; 326 327 default: 328 break; 329 } 330 } 331 zap_cursor_fini(&zc); 332 mutex_exit(&spa->spa_props_lock); 333out: 334 if (err && err != ENOENT) { 335 nvlist_free(*nvp); 336 *nvp = NULL; 337 return (err); 338 } 339 340 return (0); 341} 342 343/* 344 * Validate the given pool properties nvlist and modify the list 345 * for the property values to be set. 346 */ 347static int 348spa_prop_validate(spa_t *spa, nvlist_t *props) 349{ 350 nvpair_t *elem; 351 int error = 0, reset_bootfs = 0; 352 uint64_t objnum; 353 354 elem = NULL; 355 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 356 zpool_prop_t prop; 357 char *propname, *strval; 358 uint64_t intval; 359 objset_t *os; 360 char *slash, *check; 361 362 propname = nvpair_name(elem); 363 364 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 365 return (EINVAL); 366 367 switch (prop) { 368 case ZPOOL_PROP_VERSION: 369 error = nvpair_value_uint64(elem, &intval); 370 if (!error && 371 (intval < spa_version(spa) || intval > SPA_VERSION)) 372 error = EINVAL; 373 break; 374 375 case ZPOOL_PROP_DELEGATION: 376 case ZPOOL_PROP_AUTOREPLACE: 377 case ZPOOL_PROP_LISTSNAPS: 378 case ZPOOL_PROP_AUTOEXPAND: 379 error = nvpair_value_uint64(elem, &intval); 380 if (!error && intval > 1) 381 error = EINVAL; 382 break; 383 384 case ZPOOL_PROP_BOOTFS: 385 /* 386 * If the pool version is less than SPA_VERSION_BOOTFS, 387 * or the pool is still being created (version == 0), 388 * the bootfs property cannot be set. 
389 */ 390 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 391 error = ENOTSUP; 392 break; 393 } 394 395 /* 396 * Make sure the vdev config is bootable 397 */ 398 if (!vdev_is_bootable(spa->spa_root_vdev)) { 399 error = ENOTSUP; 400 break; 401 } 402 403 reset_bootfs = 1; 404 405 error = nvpair_value_string(elem, &strval); 406 407 if (!error) { 408 uint64_t compress; 409 410 if (strval == NULL || strval[0] == '\0') { 411 objnum = zpool_prop_default_numeric( 412 ZPOOL_PROP_BOOTFS); 413 break; 414 } 415 416 if (error = dmu_objset_hold(strval, FTAG, &os)) 417 break; 418 419 /* Must be ZPL and not gzip compressed. */ 420 421 if (dmu_objset_type(os) != DMU_OST_ZFS) { 422 error = ENOTSUP; 423 } else if ((error = dsl_prop_get_integer(strval, 424 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 425 &compress, NULL)) == 0 && 426 !BOOTFS_COMPRESS_VALID(compress)) { 427 error = ENOTSUP; 428 } else { 429 objnum = dmu_objset_id(os); 430 } 431 dmu_objset_rele(os, FTAG); 432 } 433 break; 434 435 case ZPOOL_PROP_FAILUREMODE: 436 error = nvpair_value_uint64(elem, &intval); 437 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 438 intval > ZIO_FAILURE_MODE_PANIC)) 439 error = EINVAL; 440 441 /* 442 * This is a special case which only occurs when 443 * the pool has completely failed. This allows 444 * the user to change the in-core failmode property 445 * without syncing it out to disk (I/Os might 446 * currently be blocked). We do this by returning 447 * EIO to the caller (spa_prop_set) to trick it 448 * into thinking we encountered a property validation 449 * error. 
450 */ 451 if (!error && spa_suspended(spa)) { 452 spa->spa_failmode = intval; 453 error = EIO; 454 } 455 break; 456 457 case ZPOOL_PROP_CACHEFILE: 458 if ((error = nvpair_value_string(elem, &strval)) != 0) 459 break; 460 461 if (strval[0] == '\0') 462 break; 463 464 if (strcmp(strval, "none") == 0) 465 break; 466 467 if (strval[0] != '/') { 468 error = EINVAL; 469 break; 470 } 471 472 slash = strrchr(strval, '/'); 473 ASSERT(slash != NULL); 474 475 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 476 strcmp(slash, "/..") == 0) 477 error = EINVAL; 478 break; 479 480 case ZPOOL_PROP_COMMENT: 481 if ((error = nvpair_value_string(elem, &strval)) != 0) 482 break; 483 for (check = strval; *check != '\0'; check++) { 484 /* 485 * The kernel doesn't have an easy isprint() 486 * check. For this kernel check, we merely 487 * check ASCII apart from DEL. Fix this if 488 * there is an easy-to-use kernel isprint(). 489 */ 490 if (*check >= 0x7f) { 491 error = EINVAL; 492 break; 493 } 494 check++; 495 } 496 if (strlen(strval) > ZPROP_MAX_COMMENT) 497 error = E2BIG; 498 break; 499 500 case ZPOOL_PROP_DEDUPDITTO: 501 if (spa_version(spa) < SPA_VERSION_DEDUP) 502 error = ENOTSUP; 503 else 504 error = nvpair_value_uint64(elem, &intval); 505 if (error == 0 && 506 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 507 error = EINVAL; 508 break; 509 } 510 511 if (error) 512 break; 513 } 514 515 if (!error && reset_bootfs) { 516 error = nvlist_remove(props, 517 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 518 519 if (!error) { 520 error = nvlist_add_uint64(props, 521 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 522 } 523 } 524 525 return (error); 526} 527 528void 529spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 530{ 531 char *cachefile; 532 spa_config_dirent_t *dp; 533 534 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 535 &cachefile) != 0) 536 return; 537 538 dp = kmem_alloc(sizeof (spa_config_dirent_t), 539 KM_SLEEP); 540 541 if 
(cachefile[0] == '\0') 542 dp->scd_path = spa_strdup(spa_config_path); 543 else if (strcmp(cachefile, "none") == 0) 544 dp->scd_path = NULL; 545 else 546 dp->scd_path = spa_strdup(cachefile); 547 548 list_insert_head(&spa->spa_config_list, dp); 549 if (need_sync) 550 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 551} 552 553int 554spa_prop_set(spa_t *spa, nvlist_t *nvp) 555{ 556 int error; 557 nvpair_t *elem; 558 boolean_t need_sync = B_FALSE; 559 zpool_prop_t prop; 560 561 if ((error = spa_prop_validate(spa, nvp)) != 0) 562 return (error); 563 564 elem = NULL; 565 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 566 if ((prop = zpool_name_to_prop( 567 nvpair_name(elem))) == ZPROP_INVAL) 568 return (EINVAL); 569 570 if (prop == ZPOOL_PROP_CACHEFILE || 571 prop == ZPOOL_PROP_ALTROOT || 572 prop == ZPOOL_PROP_READONLY) 573 continue; 574 575 need_sync = B_TRUE; 576 break; 577 } 578 579 if (need_sync) 580 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 581 spa, nvp, 3)); 582 else 583 return (0); 584} 585 586/* 587 * If the bootfs property value is dsobj, clear it. 588 */ 589void 590spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 591{ 592 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 593 VERIFY(zap_remove(spa->spa_meta_objset, 594 spa->spa_pool_props_object, 595 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 596 spa->spa_bootfs = 0; 597 } 598} 599 600/* 601 * Change the GUID for the pool. This is done so that we can later 602 * re-import a pool built from a clone of our own vdevs. We will modify 603 * the root vdev's guid, our own pool guid, and then mark all of our 604 * vdevs dirty. Note that we must make sure that all our vdevs are 605 * online when we do this, or else any vdevs that weren't present 606 * would be orphaned from our pool. We are also going to issue a 607 * sysevent to update any watchers. 
608 */ 609int 610spa_change_guid(spa_t *spa) 611{ 612 uint64_t oldguid, newguid; 613 uint64_t txg; 614 615 if (!(spa_mode_global & FWRITE)) 616 return (EROFS); 617 618 txg = spa_vdev_enter(spa); 619 620 if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) 621 return (spa_vdev_exit(spa, NULL, txg, ENXIO)); 622 623 oldguid = spa_guid(spa); 624 newguid = spa_generate_guid(NULL); 625 ASSERT3U(oldguid, !=, newguid); 626 627 spa->spa_root_vdev->vdev_guid = newguid; 628 spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); 629 630 vdev_config_dirty(spa->spa_root_vdev); 631 632 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 633 634 return (spa_vdev_exit(spa, NULL, txg, 0)); 635} 636 637/* 638 * ========================================================================== 639 * SPA state manipulation (open/create/destroy/import/export) 640 * ========================================================================== 641 */ 642 643static int 644spa_error_entry_compare(const void *a, const void *b) 645{ 646 spa_error_entry_t *sa = (spa_error_entry_t *)a; 647 spa_error_entry_t *sb = (spa_error_entry_t *)b; 648 int ret; 649 650 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 651 sizeof (zbookmark_t)); 652 653 if (ret < 0) 654 return (-1); 655 else if (ret > 0) 656 return (1); 657 else 658 return (0); 659} 660 661/* 662 * Utility function which retrieves copies of the current logs and 663 * re-initializes them in the process. 
/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 *
 * The caller must hold spa_errlist_lock.  'last' and 'scrub' receive
 * byte copies of the two error trees; the pool's own trees are then
 * re-created empty.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create one zio taskq named 'name' according to (mode, value).
 *
 * Returns NULL for zti_mode_null (no taskq wanted); panics on an
 * unrecognized mode.  The taskq is created on behalf of spa->spa_proc.
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		/* value is a thread count; clamp to at least one. */
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		/* The table value is ignored; use the global percentage. */
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		/* value is passed through as a CPU percentage. */
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	/* Only used when the pool has its own process (not p0). */
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

/*
 * Populate spa->spa_zio_taskq[][] from the zio_taskqs[][] table.  Each
 * taskq is named "<zio type name>_<taskq type>"; entries whose mode is
 * zti_mode_null are left NULL by spa_taskq_create().
 */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
/*
 * Body of the per-pool process started by spa_activate().
 *
 * It names itself "zpool-<pool>", optionally binds to a processor set and
 * enters the SDC scheduling class, publishes itself in spa_proc/spa_did,
 * creates the zio taskqs and then sleeps until spa_deactivate() moves
 * spa_proc_state away from SPA_PROC_ACTIVE.
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		/* A failed bind is only warned about, not fatal. */
		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		/* Released in the reverse order of acquisition. */
		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	/* Publish our identity before creating the taskqs that use it. */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Wake spa_activate(), which waits for the state to leave CREATED. */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* Sleep until spa_deactivate() asks us to go away. */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	/* Hand the pool back to p0 and wake the waiter in spa_deactivate(). */
	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 *
 * Marks the pool active in the given mode, creates its metaslab classes,
 * (optionally) its covering process, the zio taskqs, the dirty lists, the
 * vdev txg list and the two error-log AVL trees.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* Wait for spa_thread() to report itself ACTIVE. */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	/*
	 * NOTE(review): this ASSERT contradicts the spa_proc != &p0 assertion
	 * in the SPA_PROCESS branch above; presumably SPA_PROCESS is never
	 * defined in this port -- confirm.
	 */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 *
 * The pool must already be quiesced (no syncing, no dsl pool, no root vdev,
 * no async zio root).  Tears down what spa_activate() built, returns the
 * pool to POOL_STATE_UNINITIALIZED and, if a covering process exists,
 * asks it to exit and waits for it.
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	/* Slots left NULL by zti_mode_null have nothing to destroy. */
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * Handshake with spa_thread(): request DEACTIVATE, then wait until
	 * the thread has moved the state on to GONE.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}
963 */ 964static int 965spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 966 uint_t id, int atype) 967{ 968 nvlist_t **child; 969 uint_t children; 970 int error; 971 972 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 973 return (error); 974 975 if ((*vdp)->vdev_ops->vdev_op_leaf) 976 return (0); 977 978 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 979 &child, &children); 980 981 if (error == ENOENT) 982 return (0); 983 984 if (error) { 985 vdev_free(*vdp); 986 *vdp = NULL; 987 return (EINVAL); 988 } 989 990 for (int c = 0; c < children; c++) { 991 vdev_t *vd; 992 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 993 atype)) != 0) { 994 vdev_free(*vdp); 995 *vdp = NULL; 996 return (error); 997 } 998 } 999 1000 ASSERT(*vdp != NULL); 1001 1002 return (0); 1003} 1004 1005/* 1006 * Opposite of spa_load(). 1007 */ 1008static void 1009spa_unload(spa_t *spa) 1010{ 1011 int i; 1012 1013 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1014 1015 /* 1016 * Stop async tasks. 1017 */ 1018 spa_async_suspend(spa); 1019 1020 /* 1021 * Stop syncing. 1022 */ 1023 if (spa->spa_sync_on) { 1024 txg_sync_stop(spa->spa_dsl_pool); 1025 spa->spa_sync_on = B_FALSE; 1026 } 1027 1028 /* 1029 * Wait for any outstanding async I/O to complete. 1030 */ 1031 if (spa->spa_async_zio_root != NULL) { 1032 (void) zio_wait(spa->spa_async_zio_root); 1033 spa->spa_async_zio_root = NULL; 1034 } 1035 1036 bpobj_close(&spa->spa_deferred_bpobj); 1037 1038 /* 1039 * Close the dsl pool. 1040 */ 1041 if (spa->spa_dsl_pool) { 1042 dsl_pool_close(spa->spa_dsl_pool); 1043 spa->spa_dsl_pool = NULL; 1044 spa->spa_meta_objset = NULL; 1045 } 1046 1047 ddt_unload(spa); 1048 1049 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1050 1051 /* 1052 * Drop and purge level 2 cache 1053 */ 1054 spa_l2cache_drop(spa); 1055 1056 /* 1057 * Close all vdevs. 
1058 */ 1059 if (spa->spa_root_vdev) 1060 vdev_free(spa->spa_root_vdev); 1061 ASSERT(spa->spa_root_vdev == NULL); 1062 1063 for (i = 0; i < spa->spa_spares.sav_count; i++) 1064 vdev_free(spa->spa_spares.sav_vdevs[i]); 1065 if (spa->spa_spares.sav_vdevs) { 1066 kmem_free(spa->spa_spares.sav_vdevs, 1067 spa->spa_spares.sav_count * sizeof (void *)); 1068 spa->spa_spares.sav_vdevs = NULL; 1069 } 1070 if (spa->spa_spares.sav_config) { 1071 nvlist_free(spa->spa_spares.sav_config); 1072 spa->spa_spares.sav_config = NULL; 1073 } 1074 spa->spa_spares.sav_count = 0; 1075
| 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 by Delphix. All rights reserved. 25 */ 26 27/* 28 * This file contains all the routines used when modifying on-disk SPA state. 29 * This includes opening, importing, destroying, exporting a pool, and syncing a 30 * pool. 
31 */ 32 33#include <sys/zfs_context.h> 34#include <sys/fm/fs/zfs.h> 35#include <sys/spa_impl.h> 36#include <sys/zio.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu.h> 39#include <sys/dmu_tx.h> 40#include <sys/zap.h> 41#include <sys/zil.h> 42#include <sys/ddt.h> 43#include <sys/vdev_impl.h> 44#include <sys/metaslab.h> 45#include <sys/metaslab_impl.h> 46#include <sys/uberblock_impl.h> 47#include <sys/txg.h> 48#include <sys/avl.h> 49#include <sys/dmu_traverse.h> 50#include <sys/dmu_objset.h> 51#include <sys/unique.h> 52#include <sys/dsl_pool.h> 53#include <sys/dsl_dataset.h> 54#include <sys/dsl_dir.h> 55#include <sys/dsl_prop.h> 56#include <sys/dsl_synctask.h> 57#include <sys/fs/zfs.h> 58#include <sys/arc.h> 59#include <sys/callb.h> 60#include <sys/spa_boot.h> 61#include <sys/zfs_ioctl.h> 62#include <sys/dsl_scan.h> 63#include <sys/zvol.h> 64 65#ifdef _KERNEL 66#include <sys/callb.h> 67#include <sys/cpupart.h> 68#include <sys/zone.h> 69#endif /* _KERNEL */ 70 71#include "zfs_prop.h" 72#include "zfs_comutil.h" 73 74/* Check hostid on import? 
*/
/* Registered below as the vfs.zfs.check_hostid loader tunable and sysctl. */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * How the zti_value of a zio_taskq_info_t is interpreted when the
 * corresponding taskq is created (see spa_taskq_create()).
 */
typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

/* Initializers for zio_taskq_info_t entries: { mode, value }. */
#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

/* One taskq description: a sizing mode plus the value that mode consumes. */
typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

/*
 * Per-queue suffixes; spa_create_zio_taskqs() joins them to the zio type
 * name as "<type>_<suffix>" to name each taskq.
 */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 *	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 *
 * Rows are indexed by zio type and columns by taskq type; the rows
 * presumably follow the order listed above -- confirm against the
 * zio type enumeration.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

/* Forward declarations for routines defined later in this file. */
static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 *
 * The property is stored under its canonical name as a nested nvlist
 * holding ZPROP_SOURCE and ZPROP_VALUE.  If strval is non-NULL the value is
 * stored as a string and intval is ignored; otherwise intval is stored as a
 * uint64.  Any nvlist failure trips VERIFY.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	/* The temporary is released here; nvl presumably keeps its own copy. */
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
174 */ 175static void 176spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 177{ 178 uint64_t size; 179 uint64_t alloc; 180 uint64_t cap, version; 181 zprop_source_t src = ZPROP_SRC_NONE; 182 spa_config_dirent_t *dp; 183 184 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 185 186 if (spa->spa_root_vdev != NULL) { 187 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 188 size = metaslab_class_get_space(spa_normal_class(spa)); 189 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 190 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 191 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 192 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 193 size - alloc, src); 194 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 195 (spa_mode(spa) == FREAD), src); 196 197 cap = (size == 0) ? 0 : (alloc * 100 / size); 198 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 199 200 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 201 ddt_get_pool_dedup_ratio(spa), src); 202 203 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 204 spa->spa_root_vdev->vdev_state, src); 205 206 version = spa_version(spa); 207 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 208 src = ZPROP_SRC_DEFAULT; 209 else 210 src = ZPROP_SRC_LOCAL; 211 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 212 } 213 214 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 215 216 if (spa->spa_comment != NULL) { 217 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 218 0, ZPROP_SRC_LOCAL); 219 } 220 221 if (spa->spa_root != NULL) 222 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 223 0, ZPROP_SRC_LOCAL); 224 225 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 226 if (dp->scd_path == NULL) { 227 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 228 "none", 0, ZPROP_SRC_LOCAL); 229 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 230 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 231 
dp->scd_path, 0, ZPROP_SRC_LOCAL); 232 } 233 } 234} 235 236/* 237 * Get zpool property values. 238 */ 239int 240spa_prop_get(spa_t *spa, nvlist_t **nvp) 241{ 242 objset_t *mos = spa->spa_meta_objset; 243 zap_cursor_t zc; 244 zap_attribute_t za; 245 int err; 246 247 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 248 249 mutex_enter(&spa->spa_props_lock); 250 251 /* 252 * Get properties from the spa config. 253 */ 254 spa_prop_get_config(spa, nvp); 255 256 /* If no pool property object, no more prop to get. */ 257 if (mos == NULL || spa->spa_pool_props_object == 0) { 258 mutex_exit(&spa->spa_props_lock); 259 return (0); 260 } 261 262 /* 263 * Get properties from the MOS pool property object. 264 */ 265 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 266 (err = zap_cursor_retrieve(&zc, &za)) == 0; 267 zap_cursor_advance(&zc)) { 268 uint64_t intval = 0; 269 char *strval = NULL; 270 zprop_source_t src = ZPROP_SRC_DEFAULT; 271 zpool_prop_t prop; 272 273 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 274 continue; 275 276 switch (za.za_integer_length) { 277 case 8: 278 /* integer property */ 279 if (za.za_first_integer != 280 zpool_prop_default_numeric(prop)) 281 src = ZPROP_SRC_LOCAL; 282 283 if (prop == ZPOOL_PROP_BOOTFS) { 284 dsl_pool_t *dp; 285 dsl_dataset_t *ds = NULL; 286 287 dp = spa_get_dsl(spa); 288 rw_enter(&dp->dp_config_rwlock, RW_READER); 289 if (err = dsl_dataset_hold_obj(dp, 290 za.za_first_integer, FTAG, &ds)) { 291 rw_exit(&dp->dp_config_rwlock); 292 break; 293 } 294 295 strval = kmem_alloc( 296 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 297 KM_SLEEP); 298 dsl_dataset_name(ds, strval); 299 dsl_dataset_rele(ds, FTAG); 300 rw_exit(&dp->dp_config_rwlock); 301 } else { 302 strval = NULL; 303 intval = za.za_first_integer; 304 } 305 306 spa_prop_add_list(*nvp, prop, strval, intval, src); 307 308 if (strval != NULL) 309 kmem_free(strval, 310 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 311 312 break; 313 314 case 1: 315 /* string 
property */ 316 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 317 err = zap_lookup(mos, spa->spa_pool_props_object, 318 za.za_name, 1, za.za_num_integers, strval); 319 if (err) { 320 kmem_free(strval, za.za_num_integers); 321 break; 322 } 323 spa_prop_add_list(*nvp, prop, strval, 0, src); 324 kmem_free(strval, za.za_num_integers); 325 break; 326 327 default: 328 break; 329 } 330 } 331 zap_cursor_fini(&zc); 332 mutex_exit(&spa->spa_props_lock); 333out: 334 if (err && err != ENOENT) { 335 nvlist_free(*nvp); 336 *nvp = NULL; 337 return (err); 338 } 339 340 return (0); 341} 342 343/* 344 * Validate the given pool properties nvlist and modify the list 345 * for the property values to be set. 346 */ 347static int 348spa_prop_validate(spa_t *spa, nvlist_t *props) 349{ 350 nvpair_t *elem; 351 int error = 0, reset_bootfs = 0; 352 uint64_t objnum; 353 354 elem = NULL; 355 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 356 zpool_prop_t prop; 357 char *propname, *strval; 358 uint64_t intval; 359 objset_t *os; 360 char *slash, *check; 361 362 propname = nvpair_name(elem); 363 364 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 365 return (EINVAL); 366 367 switch (prop) { 368 case ZPOOL_PROP_VERSION: 369 error = nvpair_value_uint64(elem, &intval); 370 if (!error && 371 (intval < spa_version(spa) || intval > SPA_VERSION)) 372 error = EINVAL; 373 break; 374 375 case ZPOOL_PROP_DELEGATION: 376 case ZPOOL_PROP_AUTOREPLACE: 377 case ZPOOL_PROP_LISTSNAPS: 378 case ZPOOL_PROP_AUTOEXPAND: 379 error = nvpair_value_uint64(elem, &intval); 380 if (!error && intval > 1) 381 error = EINVAL; 382 break; 383 384 case ZPOOL_PROP_BOOTFS: 385 /* 386 * If the pool version is less than SPA_VERSION_BOOTFS, 387 * or the pool is still being created (version == 0), 388 * the bootfs property cannot be set. 
389 */ 390 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 391 error = ENOTSUP; 392 break; 393 } 394 395 /* 396 * Make sure the vdev config is bootable 397 */ 398 if (!vdev_is_bootable(spa->spa_root_vdev)) { 399 error = ENOTSUP; 400 break; 401 } 402 403 reset_bootfs = 1; 404 405 error = nvpair_value_string(elem, &strval); 406 407 if (!error) { 408 uint64_t compress; 409 410 if (strval == NULL || strval[0] == '\0') { 411 objnum = zpool_prop_default_numeric( 412 ZPOOL_PROP_BOOTFS); 413 break; 414 } 415 416 if (error = dmu_objset_hold(strval, FTAG, &os)) 417 break; 418 419 /* Must be ZPL and not gzip compressed. */ 420 421 if (dmu_objset_type(os) != DMU_OST_ZFS) { 422 error = ENOTSUP; 423 } else if ((error = dsl_prop_get_integer(strval, 424 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 425 &compress, NULL)) == 0 && 426 !BOOTFS_COMPRESS_VALID(compress)) { 427 error = ENOTSUP; 428 } else { 429 objnum = dmu_objset_id(os); 430 } 431 dmu_objset_rele(os, FTAG); 432 } 433 break; 434 435 case ZPOOL_PROP_FAILUREMODE: 436 error = nvpair_value_uint64(elem, &intval); 437 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 438 intval > ZIO_FAILURE_MODE_PANIC)) 439 error = EINVAL; 440 441 /* 442 * This is a special case which only occurs when 443 * the pool has completely failed. This allows 444 * the user to change the in-core failmode property 445 * without syncing it out to disk (I/Os might 446 * currently be blocked). We do this by returning 447 * EIO to the caller (spa_prop_set) to trick it 448 * into thinking we encountered a property validation 449 * error. 
450 */ 451 if (!error && spa_suspended(spa)) { 452 spa->spa_failmode = intval; 453 error = EIO; 454 } 455 break; 456 457 case ZPOOL_PROP_CACHEFILE: 458 if ((error = nvpair_value_string(elem, &strval)) != 0) 459 break; 460 461 if (strval[0] == '\0') 462 break; 463 464 if (strcmp(strval, "none") == 0) 465 break; 466 467 if (strval[0] != '/') { 468 error = EINVAL; 469 break; 470 } 471 472 slash = strrchr(strval, '/'); 473 ASSERT(slash != NULL); 474 475 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 476 strcmp(slash, "/..") == 0) 477 error = EINVAL; 478 break; 479 480 case ZPOOL_PROP_COMMENT: 481 if ((error = nvpair_value_string(elem, &strval)) != 0) 482 break; 483 for (check = strval; *check != '\0'; check++) { 484 /* 485 * The kernel doesn't have an easy isprint() 486 * check. For this kernel check, we merely 487 * check ASCII apart from DEL. Fix this if 488 * there is an easy-to-use kernel isprint(). 489 */ 490 if (*check >= 0x7f) { 491 error = EINVAL; 492 break; 493 } 494 check++; 495 } 496 if (strlen(strval) > ZPROP_MAX_COMMENT) 497 error = E2BIG; 498 break; 499 500 case ZPOOL_PROP_DEDUPDITTO: 501 if (spa_version(spa) < SPA_VERSION_DEDUP) 502 error = ENOTSUP; 503 else 504 error = nvpair_value_uint64(elem, &intval); 505 if (error == 0 && 506 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 507 error = EINVAL; 508 break; 509 } 510 511 if (error) 512 break; 513 } 514 515 if (!error && reset_bootfs) { 516 error = nvlist_remove(props, 517 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 518 519 if (!error) { 520 error = nvlist_add_uint64(props, 521 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 522 } 523 } 524 525 return (error); 526} 527 528void 529spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 530{ 531 char *cachefile; 532 spa_config_dirent_t *dp; 533 534 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 535 &cachefile) != 0) 536 return; 537 538 dp = kmem_alloc(sizeof (spa_config_dirent_t), 539 KM_SLEEP); 540 541 if 
(cachefile[0] == '\0') 542 dp->scd_path = spa_strdup(spa_config_path); 543 else if (strcmp(cachefile, "none") == 0) 544 dp->scd_path = NULL; 545 else 546 dp->scd_path = spa_strdup(cachefile); 547 548 list_insert_head(&spa->spa_config_list, dp); 549 if (need_sync) 550 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 551} 552 553int 554spa_prop_set(spa_t *spa, nvlist_t *nvp) 555{ 556 int error; 557 nvpair_t *elem; 558 boolean_t need_sync = B_FALSE; 559 zpool_prop_t prop; 560 561 if ((error = spa_prop_validate(spa, nvp)) != 0) 562 return (error); 563 564 elem = NULL; 565 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 566 if ((prop = zpool_name_to_prop( 567 nvpair_name(elem))) == ZPROP_INVAL) 568 return (EINVAL); 569 570 if (prop == ZPOOL_PROP_CACHEFILE || 571 prop == ZPOOL_PROP_ALTROOT || 572 prop == ZPOOL_PROP_READONLY) 573 continue; 574 575 need_sync = B_TRUE; 576 break; 577 } 578 579 if (need_sync) 580 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 581 spa, nvp, 3)); 582 else 583 return (0); 584} 585 586/* 587 * If the bootfs property value is dsobj, clear it. 588 */ 589void 590spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 591{ 592 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 593 VERIFY(zap_remove(spa->spa_meta_objset, 594 spa->spa_pool_props_object, 595 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 596 spa->spa_bootfs = 0; 597 } 598} 599 600/* 601 * Change the GUID for the pool. This is done so that we can later 602 * re-import a pool built from a clone of our own vdevs. We will modify 603 * the root vdev's guid, our own pool guid, and then mark all of our 604 * vdevs dirty. Note that we must make sure that all our vdevs are 605 * online when we do this, or else any vdevs that weren't present 606 * would be orphaned from our pool. We are also going to issue a 607 * sysevent to update any watchers. 
608 */ 609int 610spa_change_guid(spa_t *spa) 611{ 612 uint64_t oldguid, newguid; 613 uint64_t txg; 614 615 if (!(spa_mode_global & FWRITE)) 616 return (EROFS); 617 618 txg = spa_vdev_enter(spa); 619 620 if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) 621 return (spa_vdev_exit(spa, NULL, txg, ENXIO)); 622 623 oldguid = spa_guid(spa); 624 newguid = spa_generate_guid(NULL); 625 ASSERT3U(oldguid, !=, newguid); 626 627 spa->spa_root_vdev->vdev_guid = newguid; 628 spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); 629 630 vdev_config_dirty(spa->spa_root_vdev); 631 632 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 633 634 return (spa_vdev_exit(spa, NULL, txg, 0)); 635} 636 637/* 638 * ========================================================================== 639 * SPA state manipulation (open/create/destroy/import/export) 640 * ========================================================================== 641 */ 642 643static int 644spa_error_entry_compare(const void *a, const void *b) 645{ 646 spa_error_entry_t *sa = (spa_error_entry_t *)a; 647 spa_error_entry_t *sb = (spa_error_entry_t *)b; 648 int ret; 649 650 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 651 sizeof (zbookmark_t)); 652 653 if (ret < 0) 654 return (-1); 655 else if (ret > 0) 656 return (1); 657 else 658 return (0); 659} 660 661/* 662 * Utility function which retrieves copies of the current logs and 663 * re-initializes them in the process. 
*/
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	/* Hand the current trees to the caller by structure copy ... */
	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	/* ... and start over with two empty trees. */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create one zio taskq named 'name' for this pool.
 *
 * 'mode' selects how 'value' is interpreted (see zti_modes).  Returns NULL
 * for zti_mode_null, otherwise the new taskq.  An unknown mode panics.
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		/* Clamp to one thread where the assertion is compiled out. */
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		/* The table value is ignored; the global percentage is used. */
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	/* SDC taskqs are used only when the pool has its own process. */
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

/*
 * Populate spa_zio_taskq[][] from the zio_taskqs table, naming each taskq
 * "<zio type name>_<taskq type name>".  Entries whose mode is
 * zti_mode_null are stored as NULL.
 */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
/*
 * Body of the per-pool process started by spa_activate().  It names the
 * process "zpool-<pool>", creates the zio taskqs, moves spa_proc_state from
 * CREATED to ACTIVE, and then sleeps until spa_deactivate() requests
 * DEACTIVATE, at which point it marks itself GONE and exits.
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	/* Publish our identity before the taskqs are created against it. */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Wake spa_activate(), which waits for the state to leave CREATED. */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 *
 * Marks the pool active in the given open mode, creates the normal and log
 * metaslab classes, the zio taskqs (directly, or through spa_thread() when
 * SPA_PROCESS is built in), the dirty-vdev lists, the vdev txg list and the
 * two error-list AVL trees.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			/* Wait for spa_thread() to report itself ACTIVE. */
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	/*
	 * NOTE(review): this assertion would fire in a build that defines
	 * SPA_PROCESS and successfully creates the process above -- confirm
	 * that configuration is never used with assertions enabled.
	 */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 *
 * The pool must already be unloaded: syncing stopped and no dsl pool, root
 * vdev or async zio root remaining (all asserted below).
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	/* Slots that were created with zti_mode_null are NULL; skip them. */
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * If a covering process exists, ask it to exit and wait until
	 * spa_thread() reports SPA_PROC_GONE.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
963 */ 964static int 965spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 966 uint_t id, int atype) 967{ 968 nvlist_t **child; 969 uint_t children; 970 int error; 971 972 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 973 return (error); 974 975 if ((*vdp)->vdev_ops->vdev_op_leaf) 976 return (0); 977 978 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 979 &child, &children); 980 981 if (error == ENOENT) 982 return (0); 983 984 if (error) { 985 vdev_free(*vdp); 986 *vdp = NULL; 987 return (EINVAL); 988 } 989 990 for (int c = 0; c < children; c++) { 991 vdev_t *vd; 992 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 993 atype)) != 0) { 994 vdev_free(*vdp); 995 *vdp = NULL; 996 return (error); 997 } 998 } 999 1000 ASSERT(*vdp != NULL); 1001 1002 return (0); 1003} 1004 1005/* 1006 * Opposite of spa_load(). 1007 */ 1008static void 1009spa_unload(spa_t *spa) 1010{ 1011 int i; 1012 1013 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1014 1015 /* 1016 * Stop async tasks. 1017 */ 1018 spa_async_suspend(spa); 1019 1020 /* 1021 * Stop syncing. 1022 */ 1023 if (spa->spa_sync_on) { 1024 txg_sync_stop(spa->spa_dsl_pool); 1025 spa->spa_sync_on = B_FALSE; 1026 } 1027 1028 /* 1029 * Wait for any outstanding async I/O to complete. 1030 */ 1031 if (spa->spa_async_zio_root != NULL) { 1032 (void) zio_wait(spa->spa_async_zio_root); 1033 spa->spa_async_zio_root = NULL; 1034 } 1035 1036 bpobj_close(&spa->spa_deferred_bpobj); 1037 1038 /* 1039 * Close the dsl pool. 1040 */ 1041 if (spa->spa_dsl_pool) { 1042 dsl_pool_close(spa->spa_dsl_pool); 1043 spa->spa_dsl_pool = NULL; 1044 spa->spa_meta_objset = NULL; 1045 } 1046 1047 ddt_unload(spa); 1048 1049 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1050 1051 /* 1052 * Drop and purge level 2 cache 1053 */ 1054 spa_l2cache_drop(spa); 1055 1056 /* 1057 * Close all vdevs. 
1058 */ 1059 if (spa->spa_root_vdev) 1060 vdev_free(spa->spa_root_vdev); 1061 ASSERT(spa->spa_root_vdev == NULL); 1062 1063 for (i = 0; i < spa->spa_spares.sav_count; i++) 1064 vdev_free(spa->spa_spares.sav_vdevs[i]); 1065 if (spa->spa_spares.sav_vdevs) { 1066 kmem_free(spa->spa_spares.sav_vdevs, 1067 spa->spa_spares.sav_count * sizeof (void *)); 1068 spa->spa_spares.sav_vdevs = NULL; 1069 } 1070 if (spa->spa_spares.sav_config) { 1071 nvlist_free(spa->spa_spares.sav_config); 1072 spa->spa_spares.sav_config = NULL; 1073 } 1074 spa->spa_spares.sav_count = 0; 1075
|
1076 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
| 1076 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1077 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
|
1077 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
| 1078 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
|
| 1079 }
|
1078 if (spa->spa_l2cache.sav_vdevs) { 1079 kmem_free(spa->spa_l2cache.sav_vdevs, 1080 spa->spa_l2cache.sav_count * sizeof (void *)); 1081 spa->spa_l2cache.sav_vdevs = NULL; 1082 } 1083 if (spa->spa_l2cache.sav_config) { 1084 nvlist_free(spa->spa_l2cache.sav_config); 1085 spa->spa_l2cache.sav_config = NULL; 1086 } 1087 spa->spa_l2cache.sav_count = 0; 1088 1089 spa->spa_async_suspended = 0; 1090 1091 if (spa->spa_comment != NULL) { 1092 spa_strfree(spa->spa_comment); 1093 spa->spa_comment = NULL; 1094 } 1095 1096 spa_config_exit(spa, SCL_ALL, FTAG); 1097} 1098 1099/* 1100 * Load (or re-load) the current list of vdevs describing the active spares for 1101 * this pool. When this is called, we have some form of basic information in 1102 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1103 * then re-generate a more complete list including status information. 1104 */ 1105static void 1106spa_load_spares(spa_t *spa) 1107{ 1108 nvlist_t **spares; 1109 uint_t nspares; 1110 int i; 1111 vdev_t *vd, *tvd; 1112 1113 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1114 1115 /* 1116 * First, close and free any existing spare vdevs. 
1117 */ 1118 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1119 vd = spa->spa_spares.sav_vdevs[i]; 1120 1121 /* Undo the call to spa_activate() below */ 1122 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1123 B_FALSE)) != NULL && tvd->vdev_isspare) 1124 spa_spare_remove(tvd); 1125 vdev_close(vd); 1126 vdev_free(vd); 1127 } 1128 1129 if (spa->spa_spares.sav_vdevs) 1130 kmem_free(spa->spa_spares.sav_vdevs, 1131 spa->spa_spares.sav_count * sizeof (void *)); 1132 1133 if (spa->spa_spares.sav_config == NULL) 1134 nspares = 0; 1135 else 1136 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1137 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1138 1139 spa->spa_spares.sav_count = (int)nspares; 1140 spa->spa_spares.sav_vdevs = NULL; 1141 1142 if (nspares == 0) 1143 return; 1144 1145 /* 1146 * Construct the array of vdevs, opening them to get status in the 1147 * process. For each spare, there is potentially two different vdev_t 1148 * structures associated with it: one in the list of spares (used only 1149 * for basic validation purposes) and one in the active vdev 1150 * configuration (if it's spared in). During this phase we open and 1151 * validate each vdev on the spare list. If the vdev also exists in the 1152 * active configuration, then we also mark this vdev as an active spare. 1153 */ 1154 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1155 KM_SLEEP); 1156 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1157 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1158 VDEV_ALLOC_SPARE) == 0); 1159 ASSERT(vd != NULL); 1160 1161 spa->spa_spares.sav_vdevs[i] = vd; 1162 1163 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1164 B_FALSE)) != NULL) { 1165 if (!tvd->vdev_isspare) 1166 spa_spare_add(tvd); 1167 1168 /* 1169 * We only mark the spare active if we were successfully 1170 * able to load the vdev. 
Otherwise, importing a pool 1171 * with a bad active spare would result in strange 1172 * behavior, because multiple pool would think the spare 1173 * is actively in use. 1174 * 1175 * There is a vulnerability here to an equally bizarre 1176 * circumstance, where a dead active spare is later 1177 * brought back to life (onlined or otherwise). Given 1178 * the rarity of this scenario, and the extra complexity 1179 * it adds, we ignore the possibility. 1180 */ 1181 if (!vdev_is_dead(tvd)) 1182 spa_spare_activate(tvd); 1183 } 1184 1185 vd->vdev_top = vd; 1186 vd->vdev_aux = &spa->spa_spares; 1187 1188 if (vdev_open(vd) != 0) 1189 continue; 1190 1191 if (vdev_validate_aux(vd) == 0) 1192 spa_spare_add(vd); 1193 } 1194 1195 /* 1196 * Recompute the stashed list of spares, with status information 1197 * this time. 1198 */ 1199 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1200 DATA_TYPE_NVLIST_ARRAY) == 0); 1201 1202 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1203 KM_SLEEP); 1204 for (i = 0; i < spa->spa_spares.sav_count; i++) 1205 spares[i] = vdev_config_generate(spa, 1206 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1207 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1208 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1209 for (i = 0; i < spa->spa_spares.sav_count; i++) 1210 nvlist_free(spares[i]); 1211 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1212} 1213 1214/* 1215 * Load (or re-load) the current list of vdevs describing the active l2cache for 1216 * this pool. When this is called, we have some form of basic information in 1217 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1218 * then re-generate a more complete list including status information. 1219 * Devices which are already active have their details maintained, and are 1220 * not re-opened. 
1221 */ 1222static void 1223spa_load_l2cache(spa_t *spa) 1224{ 1225 nvlist_t **l2cache; 1226 uint_t nl2cache; 1227 int i, j, oldnvdevs; 1228 uint64_t guid; 1229 vdev_t *vd, **oldvdevs, **newvdevs; 1230 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1231 1232 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1233 1234 if (sav->sav_config != NULL) { 1235 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1236 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1237 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1238 } else { 1239 nl2cache = 0; 1240 } 1241 1242 oldvdevs = sav->sav_vdevs; 1243 oldnvdevs = sav->sav_count; 1244 sav->sav_vdevs = NULL; 1245 sav->sav_count = 0; 1246 1247 /* 1248 * Process new nvlist of vdevs. 1249 */ 1250 for (i = 0; i < nl2cache; i++) { 1251 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1252 &guid) == 0); 1253 1254 newvdevs[i] = NULL; 1255 for (j = 0; j < oldnvdevs; j++) { 1256 vd = oldvdevs[j]; 1257 if (vd != NULL && guid == vd->vdev_guid) { 1258 /* 1259 * Retain previous vdev for add/remove ops. 1260 */ 1261 newvdevs[i] = vd; 1262 oldvdevs[j] = NULL; 1263 break; 1264 } 1265 } 1266 1267 if (newvdevs[i] == NULL) { 1268 /* 1269 * Create new vdev 1270 */ 1271 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1272 VDEV_ALLOC_L2CACHE) == 0); 1273 ASSERT(vd != NULL); 1274 newvdevs[i] = vd; 1275 1276 /* 1277 * Commit this vdev as an l2cache device, 1278 * even if it fails to open. 1279 */ 1280 spa_l2cache_add(vd); 1281 1282 vd->vdev_top = vd; 1283 vd->vdev_aux = sav; 1284 1285 spa_l2cache_activate(vd); 1286 1287 if (vdev_open(vd) != 0) 1288 continue; 1289 1290 (void) vdev_validate_aux(vd); 1291 1292 if (!vdev_is_dead(vd)) 1293 l2arc_add_vdev(spa, vd); 1294 } 1295 } 1296 1297 /* 1298 * Purge vdevs that were dropped 1299 */ 1300 for (i = 0; i < oldnvdevs; i++) { 1301 uint64_t pool; 1302 1303 vd = oldvdevs[i]; 1304 if (vd != NULL) {
| 1080 if (spa->spa_l2cache.sav_vdevs) { 1081 kmem_free(spa->spa_l2cache.sav_vdevs, 1082 spa->spa_l2cache.sav_count * sizeof (void *)); 1083 spa->spa_l2cache.sav_vdevs = NULL; 1084 } 1085 if (spa->spa_l2cache.sav_config) { 1086 nvlist_free(spa->spa_l2cache.sav_config); 1087 spa->spa_l2cache.sav_config = NULL; 1088 } 1089 spa->spa_l2cache.sav_count = 0; 1090 1091 spa->spa_async_suspended = 0; 1092 1093 if (spa->spa_comment != NULL) { 1094 spa_strfree(spa->spa_comment); 1095 spa->spa_comment = NULL; 1096 } 1097 1098 spa_config_exit(spa, SCL_ALL, FTAG); 1099} 1100 1101/* 1102 * Load (or re-load) the current list of vdevs describing the active spares for 1103 * this pool. When this is called, we have some form of basic information in 1104 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1105 * then re-generate a more complete list including status information. 1106 */ 1107static void 1108spa_load_spares(spa_t *spa) 1109{ 1110 nvlist_t **spares; 1111 uint_t nspares; 1112 int i; 1113 vdev_t *vd, *tvd; 1114 1115 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1116 1117 /* 1118 * First, close and free any existing spare vdevs. 
1119 */ 1120 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1121 vd = spa->spa_spares.sav_vdevs[i]; 1122 1123 /* Undo the call to spa_activate() below */ 1124 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1125 B_FALSE)) != NULL && tvd->vdev_isspare) 1126 spa_spare_remove(tvd); 1127 vdev_close(vd); 1128 vdev_free(vd); 1129 } 1130 1131 if (spa->spa_spares.sav_vdevs) 1132 kmem_free(spa->spa_spares.sav_vdevs, 1133 spa->spa_spares.sav_count * sizeof (void *)); 1134 1135 if (spa->spa_spares.sav_config == NULL) 1136 nspares = 0; 1137 else 1138 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1139 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1140 1141 spa->spa_spares.sav_count = (int)nspares; 1142 spa->spa_spares.sav_vdevs = NULL; 1143 1144 if (nspares == 0) 1145 return; 1146 1147 /* 1148 * Construct the array of vdevs, opening them to get status in the 1149 * process. For each spare, there is potentially two different vdev_t 1150 * structures associated with it: one in the list of spares (used only 1151 * for basic validation purposes) and one in the active vdev 1152 * configuration (if it's spared in). During this phase we open and 1153 * validate each vdev on the spare list. If the vdev also exists in the 1154 * active configuration, then we also mark this vdev as an active spare. 1155 */ 1156 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1157 KM_SLEEP); 1158 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1159 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1160 VDEV_ALLOC_SPARE) == 0); 1161 ASSERT(vd != NULL); 1162 1163 spa->spa_spares.sav_vdevs[i] = vd; 1164 1165 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1166 B_FALSE)) != NULL) { 1167 if (!tvd->vdev_isspare) 1168 spa_spare_add(tvd); 1169 1170 /* 1171 * We only mark the spare active if we were successfully 1172 * able to load the vdev. 
Otherwise, importing a pool 1173 * with a bad active spare would result in strange 1174 * behavior, because multiple pool would think the spare 1175 * is actively in use. 1176 * 1177 * There is a vulnerability here to an equally bizarre 1178 * circumstance, where a dead active spare is later 1179 * brought back to life (onlined or otherwise). Given 1180 * the rarity of this scenario, and the extra complexity 1181 * it adds, we ignore the possibility. 1182 */ 1183 if (!vdev_is_dead(tvd)) 1184 spa_spare_activate(tvd); 1185 } 1186 1187 vd->vdev_top = vd; 1188 vd->vdev_aux = &spa->spa_spares; 1189 1190 if (vdev_open(vd) != 0) 1191 continue; 1192 1193 if (vdev_validate_aux(vd) == 0) 1194 spa_spare_add(vd); 1195 } 1196 1197 /* 1198 * Recompute the stashed list of spares, with status information 1199 * this time. 1200 */ 1201 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1202 DATA_TYPE_NVLIST_ARRAY) == 0); 1203 1204 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1205 KM_SLEEP); 1206 for (i = 0; i < spa->spa_spares.sav_count; i++) 1207 spares[i] = vdev_config_generate(spa, 1208 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1209 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1210 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1211 for (i = 0; i < spa->spa_spares.sav_count; i++) 1212 nvlist_free(spares[i]); 1213 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1214} 1215 1216/* 1217 * Load (or re-load) the current list of vdevs describing the active l2cache for 1218 * this pool. When this is called, we have some form of basic information in 1219 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1220 * then re-generate a more complete list including status information. 1221 * Devices which are already active have their details maintained, and are 1222 * not re-opened. 
1223 */ 1224static void 1225spa_load_l2cache(spa_t *spa) 1226{ 1227 nvlist_t **l2cache; 1228 uint_t nl2cache; 1229 int i, j, oldnvdevs; 1230 uint64_t guid; 1231 vdev_t *vd, **oldvdevs, **newvdevs; 1232 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1233 1234 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1235 1236 if (sav->sav_config != NULL) { 1237 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1238 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1239 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1240 } else { 1241 nl2cache = 0; 1242 } 1243 1244 oldvdevs = sav->sav_vdevs; 1245 oldnvdevs = sav->sav_count; 1246 sav->sav_vdevs = NULL; 1247 sav->sav_count = 0; 1248 1249 /* 1250 * Process new nvlist of vdevs. 1251 */ 1252 for (i = 0; i < nl2cache; i++) { 1253 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1254 &guid) == 0); 1255 1256 newvdevs[i] = NULL; 1257 for (j = 0; j < oldnvdevs; j++) { 1258 vd = oldvdevs[j]; 1259 if (vd != NULL && guid == vd->vdev_guid) { 1260 /* 1261 * Retain previous vdev for add/remove ops. 1262 */ 1263 newvdevs[i] = vd; 1264 oldvdevs[j] = NULL; 1265 break; 1266 } 1267 } 1268 1269 if (newvdevs[i] == NULL) { 1270 /* 1271 * Create new vdev 1272 */ 1273 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1274 VDEV_ALLOC_L2CACHE) == 0); 1275 ASSERT(vd != NULL); 1276 newvdevs[i] = vd; 1277 1278 /* 1279 * Commit this vdev as an l2cache device, 1280 * even if it fails to open. 1281 */ 1282 spa_l2cache_add(vd); 1283 1284 vd->vdev_top = vd; 1285 vd->vdev_aux = sav; 1286 1287 spa_l2cache_activate(vd); 1288 1289 if (vdev_open(vd) != 0) 1290 continue; 1291 1292 (void) vdev_validate_aux(vd); 1293 1294 if (!vdev_is_dead(vd)) 1295 l2arc_add_vdev(spa, vd); 1296 } 1297 } 1298 1299 /* 1300 * Purge vdevs that were dropped 1301 */ 1302 for (i = 0; i < oldnvdevs; i++) { 1303 uint64_t pool; 1304 1305 vd = oldvdevs[i]; 1306 if (vd != NULL) {
|
| 1307 ASSERT(vd->vdev_isl2cache); 1308
|
1305 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1306 pool != 0ULL && l2arc_vdev_present(vd)) 1307 l2arc_remove_vdev(vd);
| 1309 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1310 pool != 0ULL && l2arc_vdev_present(vd)) 1311 l2arc_remove_vdev(vd);
|
1308 (void) vdev_close(vd); 1309 spa_l2cache_remove(vd);
| 1312 vdev_clear_stats(vd); 1313 vdev_free(vd);
|
1310 } 1311 } 1312 1313 if (oldvdevs) 1314 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1315 1316 if (sav->sav_config == NULL) 1317 goto out; 1318 1319 sav->sav_vdevs = newvdevs; 1320 sav->sav_count = (int)nl2cache; 1321 1322 /* 1323 * Recompute the stashed list of l2cache devices, with status 1324 * information this time. 1325 */ 1326 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1327 DATA_TYPE_NVLIST_ARRAY) == 0); 1328 1329 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1330 for (i = 0; i < sav->sav_count; i++) 1331 l2cache[i] = vdev_config_generate(spa, 1332 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1333 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1334 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1335out: 1336 for (i = 0; i < sav->sav_count; i++) 1337 nvlist_free(l2cache[i]); 1338 if (sav->sav_count) 1339 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1340} 1341 1342static int 1343load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1344{ 1345 dmu_buf_t *db; 1346 char *packed = NULL; 1347 size_t nvsize = 0; 1348 int error; 1349 *value = NULL; 1350 1351 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1352 nvsize = *(uint64_t *)db->db_data; 1353 dmu_buf_rele(db, FTAG); 1354 1355 packed = kmem_alloc(nvsize, KM_SLEEP); 1356 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1357 DMU_READ_PREFETCH); 1358 if (error == 0) 1359 error = nvlist_unpack(packed, nvsize, value, 0); 1360 kmem_free(packed, nvsize); 1361 1362 return (error); 1363} 1364 1365/* 1366 * Checks to see if the given vdev could not be opened, in which case we post a 1367 * sysevent to notify the autoreplace code that the device has been removed. 
1368 */ 1369static void 1370spa_check_removed(vdev_t *vd) 1371{ 1372 for (int c = 0; c < vd->vdev_children; c++) 1373 spa_check_removed(vd->vdev_child[c]); 1374 1375 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1376 zfs_post_autoreplace(vd->vdev_spa, vd); 1377 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1378 } 1379} 1380 1381/* 1382 * Validate the current config against the MOS config 1383 */ 1384static boolean_t 1385spa_config_valid(spa_t *spa, nvlist_t *config) 1386{ 1387 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1388 nvlist_t *nv; 1389 1390 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1391 1392 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1393 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1394 1395 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1396 1397 /* 1398 * If we're doing a normal import, then build up any additional 1399 * diagnostic information about missing devices in this config. 1400 * We'll pass this up to the user for further processing. 
1401 */ 1402 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1403 nvlist_t **child, *nv; 1404 uint64_t idx = 0; 1405 1406 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1407 KM_SLEEP); 1408 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1409 1410 for (int c = 0; c < rvd->vdev_children; c++) { 1411 vdev_t *tvd = rvd->vdev_child[c]; 1412 vdev_t *mtvd = mrvd->vdev_child[c]; 1413 1414 if (tvd->vdev_ops == &vdev_missing_ops && 1415 mtvd->vdev_ops != &vdev_missing_ops && 1416 mtvd->vdev_islog) 1417 child[idx++] = vdev_config_generate(spa, mtvd, 1418 B_FALSE, 0); 1419 } 1420 1421 if (idx) { 1422 VERIFY(nvlist_add_nvlist_array(nv, 1423 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1424 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1425 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1426 1427 for (int i = 0; i < idx; i++) 1428 nvlist_free(child[i]); 1429 } 1430 nvlist_free(nv); 1431 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1432 } 1433 1434 /* 1435 * Compare the root vdev tree with the information we have 1436 * from the MOS config (mrvd). Check each top-level vdev 1437 * with the corresponding MOS config top-level (mtvd). 1438 */ 1439 for (int c = 0; c < rvd->vdev_children; c++) { 1440 vdev_t *tvd = rvd->vdev_child[c]; 1441 vdev_t *mtvd = mrvd->vdev_child[c]; 1442 1443 /* 1444 * Resolve any "missing" vdevs in the current configuration. 1445 * If we find that the MOS config has more accurate information 1446 * about the top-level vdev then use that vdev instead. 1447 */ 1448 if (tvd->vdev_ops == &vdev_missing_ops && 1449 mtvd->vdev_ops != &vdev_missing_ops) { 1450 1451 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1452 continue; 1453 1454 /* 1455 * Device specific actions. 
1456 */ 1457 if (mtvd->vdev_islog) { 1458 spa_set_log_state(spa, SPA_LOG_CLEAR); 1459 } else { 1460 /* 1461 * XXX - once we have 'readonly' pool 1462 * support we should be able to handle 1463 * missing data devices by transitioning 1464 * the pool to readonly. 1465 */ 1466 continue; 1467 } 1468 1469 /* 1470 * Swap the missing vdev with the data we were 1471 * able to obtain from the MOS config. 1472 */ 1473 vdev_remove_child(rvd, tvd); 1474 vdev_remove_child(mrvd, mtvd); 1475 1476 vdev_add_child(rvd, mtvd); 1477 vdev_add_child(mrvd, tvd); 1478 1479 spa_config_exit(spa, SCL_ALL, FTAG); 1480 vdev_load(mtvd); 1481 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1482 1483 vdev_reopen(rvd); 1484 } else if (mtvd->vdev_islog) { 1485 /* 1486 * Load the slog device's state from the MOS config 1487 * since it's possible that the label does not 1488 * contain the most up-to-date information. 1489 */ 1490 vdev_load_log_state(tvd, mtvd); 1491 vdev_reopen(tvd); 1492 } 1493 } 1494 vdev_free(mrvd); 1495 spa_config_exit(spa, SCL_ALL, FTAG); 1496 1497 /* 1498 * Ensure we were able to validate the config. 
1499 */ 1500 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1501} 1502 1503/* 1504 * Check for missing log devices 1505 */ 1506static int 1507spa_check_logs(spa_t *spa) 1508{ 1509 switch (spa->spa_log_state) { 1510 case SPA_LOG_MISSING: 1511 /* need to recheck in case slog has been restored */ 1512 case SPA_LOG_UNKNOWN: 1513 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1514 DS_FIND_CHILDREN)) { 1515 spa_set_log_state(spa, SPA_LOG_MISSING); 1516 return (1); 1517 } 1518 break; 1519 } 1520 return (0); 1521} 1522 1523static boolean_t 1524spa_passivate_log(spa_t *spa) 1525{ 1526 vdev_t *rvd = spa->spa_root_vdev; 1527 boolean_t slog_found = B_FALSE; 1528 1529 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1530 1531 if (!spa_has_slogs(spa)) 1532 return (B_FALSE); 1533 1534 for (int c = 0; c < rvd->vdev_children; c++) { 1535 vdev_t *tvd = rvd->vdev_child[c]; 1536 metaslab_group_t *mg = tvd->vdev_mg; 1537 1538 if (tvd->vdev_islog) { 1539 metaslab_group_passivate(mg); 1540 slog_found = B_TRUE; 1541 } 1542 } 1543 1544 return (slog_found); 1545} 1546 1547static void 1548spa_activate_log(spa_t *spa) 1549{ 1550 vdev_t *rvd = spa->spa_root_vdev; 1551 1552 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1553 1554 for (int c = 0; c < rvd->vdev_children; c++) { 1555 vdev_t *tvd = rvd->vdev_child[c]; 1556 metaslab_group_t *mg = tvd->vdev_mg; 1557 1558 if (tvd->vdev_islog) 1559 metaslab_group_activate(mg); 1560 } 1561} 1562 1563int 1564spa_offline_log(spa_t *spa) 1565{ 1566 int error = 0; 1567 1568 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1569 NULL, DS_FIND_CHILDREN)) == 0) { 1570 1571 /* 1572 * We successfully offlined the log device, sync out the 1573 * current txg so that the "stubby" block can be removed 1574 * by zil_sync(). 
1575 */ 1576 txg_wait_synced(spa->spa_dsl_pool, 0); 1577 } 1578 return (error); 1579} 1580 1581static void 1582spa_aux_check_removed(spa_aux_vdev_t *sav) 1583{ 1584 int i; 1585 1586 for (i = 0; i < sav->sav_count; i++) 1587 spa_check_removed(sav->sav_vdevs[i]); 1588} 1589 1590void 1591spa_claim_notify(zio_t *zio) 1592{ 1593 spa_t *spa = zio->io_spa; 1594 1595 if (zio->io_error) 1596 return; 1597 1598 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1599 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1600 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1601 mutex_exit(&spa->spa_props_lock); 1602} 1603 1604typedef struct spa_load_error { 1605 uint64_t sle_meta_count; 1606 uint64_t sle_data_count; 1607} spa_load_error_t; 1608 1609static void 1610spa_load_verify_done(zio_t *zio) 1611{ 1612 blkptr_t *bp = zio->io_bp; 1613 spa_load_error_t *sle = zio->io_private; 1614 dmu_object_type_t type = BP_GET_TYPE(bp); 1615 int error = zio->io_error; 1616 1617 if (error) { 1618 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1619 type != DMU_OT_INTENT_LOG) 1620 atomic_add_64(&sle->sle_meta_count, 1); 1621 else 1622 atomic_add_64(&sle->sle_data_count, 1); 1623 } 1624 zio_data_buf_free(zio->io_data, zio->io_size); 1625} 1626 1627/*ARGSUSED*/ 1628static int 1629spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1630 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1631{ 1632 if (bp != NULL) { 1633 zio_t *rio = arg; 1634 size_t size = BP_GET_PSIZE(bp); 1635 void *data = zio_data_buf_alloc(size); 1636 1637 zio_nowait(zio_read(rio, spa, bp, data, size, 1638 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1639 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1640 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1641 } 1642 return (0); 1643} 1644 1645static int 1646spa_load_verify(spa_t *spa) 1647{ 1648 zio_t *rio; 1649 spa_load_error_t sle = { 0 }; 1650 zpool_rewind_policy_t policy; 1651 boolean_t verify_ok = B_FALSE; 1652 int 
error; 1653 1654 zpool_get_rewind_policy(spa->spa_config, &policy); 1655 1656 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1657 return (0); 1658 1659 rio = zio_root(spa, NULL, &sle, 1660 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1661 1662 error = traverse_pool(spa, spa->spa_verify_min_txg, 1663 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1664 1665 (void) zio_wait(rio); 1666 1667 spa->spa_load_meta_errors = sle.sle_meta_count; 1668 spa->spa_load_data_errors = sle.sle_data_count; 1669 1670 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1671 sle.sle_data_count <= policy.zrp_maxdata) { 1672 int64_t loss = 0; 1673 1674 verify_ok = B_TRUE; 1675 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1676 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1677 1678 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1679 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1680 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1681 VERIFY(nvlist_add_int64(spa->spa_load_info, 1682 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1683 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1684 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1685 } else { 1686 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1687 } 1688 1689 if (error) { 1690 if (error != ENXIO && error != EIO) 1691 error = EIO; 1692 return (error); 1693 } 1694 1695 return (verify_ok ? 0 : EIO); 1696} 1697 1698/* 1699 * Find a value in the pool props object. 1700 */ 1701static void 1702spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1703{ 1704 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1705 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1706} 1707 1708/* 1709 * Find a value in the pool directory object. 
1710 */ 1711static int 1712spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1713{ 1714 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1715 name, sizeof (uint64_t), 1, val)); 1716} 1717 1718static int 1719spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1720{ 1721 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1722 return (err); 1723} 1724 1725/* 1726 * Fix up config after a partly-completed split. This is done with the 1727 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1728 * pool have that entry in their config, but only the splitting one contains 1729 * a list of all the guids of the vdevs that are being split off. 1730 * 1731 * This function determines what to do with that list: either rejoin 1732 * all the disks to the pool, or complete the splitting process. To attempt 1733 * the rejoin, each disk that is offlined is marked online again, and 1734 * we do a reopen() call. If the vdev label for every disk that was 1735 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1736 * then we call vdev_split() on each disk, and complete the split. 1737 * 1738 * Otherwise we leave the config alone, with all the vdevs in place in 1739 * the original pool. 
1740 */ 1741static void 1742spa_try_repair(spa_t *spa, nvlist_t *config) 1743{ 1744 uint_t extracted; 1745 uint64_t *glist; 1746 uint_t i, gcount; 1747 nvlist_t *nvl; 1748 vdev_t **vd; 1749 boolean_t attempt_reopen; 1750 1751 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1752 return; 1753 1754 /* check that the config is complete */ 1755 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1756 &glist, &gcount) != 0) 1757 return; 1758 1759 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1760 1761 /* attempt to online all the vdevs & validate */ 1762 attempt_reopen = B_TRUE; 1763 for (i = 0; i < gcount; i++) { 1764 if (glist[i] == 0) /* vdev is hole */ 1765 continue; 1766 1767 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1768 if (vd[i] == NULL) { 1769 /* 1770 * Don't bother attempting to reopen the disks; 1771 * just do the split. 1772 */ 1773 attempt_reopen = B_FALSE; 1774 } else { 1775 /* attempt to re-online it */ 1776 vd[i]->vdev_offline = B_FALSE; 1777 } 1778 } 1779 1780 if (attempt_reopen) { 1781 vdev_reopen(spa->spa_root_vdev); 1782 1783 /* check each device to see what state it's in */ 1784 for (extracted = 0, i = 0; i < gcount; i++) { 1785 if (vd[i] != NULL && 1786 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1787 break; 1788 ++extracted; 1789 } 1790 } 1791 1792 /* 1793 * If every disk has been moved to the new pool, or if we never 1794 * even attempted to look at them, then we split them off for 1795 * good. 
1796 */ 1797 if (!attempt_reopen || gcount == extracted) { 1798 for (i = 0; i < gcount; i++) 1799 if (vd[i] != NULL) 1800 vdev_split(vd[i]); 1801 vdev_reopen(spa->spa_root_vdev); 1802 } 1803 1804 kmem_free(vd, gcount * sizeof (vdev_t *)); 1805} 1806 1807static int 1808spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1809 boolean_t mosconfig) 1810{ 1811 nvlist_t *config = spa->spa_config; 1812 char *ereport = FM_EREPORT_ZFS_POOL; 1813 char *comment; 1814 int error; 1815 uint64_t pool_guid; 1816 nvlist_t *nvl; 1817 1818 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1819 return (EINVAL); 1820 1821 ASSERT(spa->spa_comment == NULL); 1822 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1823 spa->spa_comment = spa_strdup(comment); 1824 1825 /* 1826 * Versioning wasn't explicitly added to the label until later, so if 1827 * it's not present treat it as the initial version. 1828 */ 1829 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1830 &spa->spa_ubsync.ub_version) != 0) 1831 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1832 1833 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1834 &spa->spa_config_txg); 1835 1836 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1837 spa_guid_exists(pool_guid, 0)) { 1838 error = EEXIST; 1839 } else { 1840 spa->spa_config_guid = pool_guid; 1841 1842 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1843 &nvl) == 0) { 1844 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1845 KM_SLEEP) == 0); 1846 } 1847 1848 gethrestime(&spa->spa_loaded_ts); 1849 error = spa_load_impl(spa, pool_guid, config, state, type, 1850 mosconfig, &ereport); 1851 } 1852 1853 spa->spa_minref = refcount_count(&spa->spa_refcount); 1854 if (error) { 1855 if (error != EEXIST) { 1856 spa->spa_loaded_ts.tv_sec = 0; 1857 spa->spa_loaded_ts.tv_nsec = 0; 1858 } 1859 if (error != EBADF) { 1860 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1861 } 1862 } 1863 
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1864 spa->spa_ena = 0; 1865 1866 return (error); 1867} 1868 1869/* 1870 * Load an existing storage pool, using the pool's builtin spa_config as a 1871 * source of configuration information. 1872 */ 1873static int 1874spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1875 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1876 char **ereport) 1877{ 1878 int error = 0; 1879 nvlist_t *nvroot = NULL; 1880 vdev_t *rvd; 1881 uberblock_t *ub = &spa->spa_uberblock; 1882 uint64_t children, config_cache_txg = spa->spa_config_txg; 1883 int orig_mode = spa->spa_mode; 1884 int parse; 1885 uint64_t obj; 1886 1887 /* 1888 * If this is an untrusted config, access the pool in read-only mode. 1889 * This prevents things like resilvering recently removed devices. 1890 */ 1891 if (!mosconfig) 1892 spa->spa_mode = FREAD; 1893 1894 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1895 1896 spa->spa_load_state = state; 1897 1898 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1899 return (EINVAL); 1900 1901 parse = (type == SPA_IMPORT_EXISTING ? 1902 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1903 1904 /* 1905 * Create "The Godfather" zio to hold all async IOs 1906 */ 1907 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1908 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1909 1910 /* 1911 * Parse the configuration into a vdev tree. We explicitly set the 1912 * value that will be returned by spa_version() since parsing the 1913 * configuration requires knowing the version number. 
1914 */ 1915 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1916 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1917 spa_config_exit(spa, SCL_ALL, FTAG); 1918 1919 if (error != 0) 1920 return (error); 1921 1922 ASSERT(spa->spa_root_vdev == rvd); 1923 1924 if (type != SPA_IMPORT_ASSEMBLE) { 1925 ASSERT(spa_guid(spa) == pool_guid); 1926 } 1927 1928 /* 1929 * Try to open all vdevs, loading each label in the process. 1930 */ 1931 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1932 error = vdev_open(rvd); 1933 spa_config_exit(spa, SCL_ALL, FTAG); 1934 if (error != 0) 1935 return (error); 1936 1937 /* 1938 * We need to validate the vdev labels against the configuration that 1939 * we have in hand, which is dependent on the setting of mosconfig. If 1940 * mosconfig is true then we're validating the vdev labels based on 1941 * that config. Otherwise, we're validating against the cached config 1942 * (zpool.cache) that was read when we loaded the zfs module, and then 1943 * later we will recursively call spa_load() and validate against 1944 * the vdev config. 1945 * 1946 * If we're assembling a new pool that's been split off from an 1947 * existing pool, the labels haven't yet been updated so we skip 1948 * validation for now. 1949 */ 1950 if (type != SPA_IMPORT_ASSEMBLE) { 1951 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
| 1314 } 1315 } 1316 1317 if (oldvdevs) 1318 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1319 1320 if (sav->sav_config == NULL) 1321 goto out; 1322 1323 sav->sav_vdevs = newvdevs; 1324 sav->sav_count = (int)nl2cache; 1325 1326 /* 1327 * Recompute the stashed list of l2cache devices, with status 1328 * information this time. 1329 */ 1330 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1331 DATA_TYPE_NVLIST_ARRAY) == 0); 1332 1333 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1334 for (i = 0; i < sav->sav_count; i++) 1335 l2cache[i] = vdev_config_generate(spa, 1336 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1337 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1338 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1339out: 1340 for (i = 0; i < sav->sav_count; i++) 1341 nvlist_free(l2cache[i]); 1342 if (sav->sav_count) 1343 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1344} 1345 1346static int 1347load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1348{ 1349 dmu_buf_t *db; 1350 char *packed = NULL; 1351 size_t nvsize = 0; 1352 int error; 1353 *value = NULL; 1354 1355 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1356 nvsize = *(uint64_t *)db->db_data; 1357 dmu_buf_rele(db, FTAG); 1358 1359 packed = kmem_alloc(nvsize, KM_SLEEP); 1360 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1361 DMU_READ_PREFETCH); 1362 if (error == 0) 1363 error = nvlist_unpack(packed, nvsize, value, 0); 1364 kmem_free(packed, nvsize); 1365 1366 return (error); 1367} 1368 1369/* 1370 * Checks to see if the given vdev could not be opened, in which case we post a 1371 * sysevent to notify the autoreplace code that the device has been removed. 
1372 */ 1373static void 1374spa_check_removed(vdev_t *vd) 1375{ 1376 for (int c = 0; c < vd->vdev_children; c++) 1377 spa_check_removed(vd->vdev_child[c]); 1378 1379 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1380 zfs_post_autoreplace(vd->vdev_spa, vd); 1381 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1382 } 1383} 1384 1385/* 1386 * Validate the current config against the MOS config 1387 */ 1388static boolean_t 1389spa_config_valid(spa_t *spa, nvlist_t *config) 1390{ 1391 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1392 nvlist_t *nv; 1393 1394 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1395 1396 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1397 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1398 1399 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1400 1401 /* 1402 * If we're doing a normal import, then build up any additional 1403 * diagnostic information about missing devices in this config. 1404 * We'll pass this up to the user for further processing. 
1405 */ 1406 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1407 nvlist_t **child, *nv; 1408 uint64_t idx = 0; 1409 1410 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1411 KM_SLEEP); 1412 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1413 1414 for (int c = 0; c < rvd->vdev_children; c++) { 1415 vdev_t *tvd = rvd->vdev_child[c]; 1416 vdev_t *mtvd = mrvd->vdev_child[c]; 1417 1418 if (tvd->vdev_ops == &vdev_missing_ops && 1419 mtvd->vdev_ops != &vdev_missing_ops && 1420 mtvd->vdev_islog) 1421 child[idx++] = vdev_config_generate(spa, mtvd, 1422 B_FALSE, 0); 1423 } 1424 1425 if (idx) { 1426 VERIFY(nvlist_add_nvlist_array(nv, 1427 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1428 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1429 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1430 1431 for (int i = 0; i < idx; i++) 1432 nvlist_free(child[i]); 1433 } 1434 nvlist_free(nv); 1435 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1436 } 1437 1438 /* 1439 * Compare the root vdev tree with the information we have 1440 * from the MOS config (mrvd). Check each top-level vdev 1441 * with the corresponding MOS config top-level (mtvd). 1442 */ 1443 for (int c = 0; c < rvd->vdev_children; c++) { 1444 vdev_t *tvd = rvd->vdev_child[c]; 1445 vdev_t *mtvd = mrvd->vdev_child[c]; 1446 1447 /* 1448 * Resolve any "missing" vdevs in the current configuration. 1449 * If we find that the MOS config has more accurate information 1450 * about the top-level vdev then use that vdev instead. 1451 */ 1452 if (tvd->vdev_ops == &vdev_missing_ops && 1453 mtvd->vdev_ops != &vdev_missing_ops) { 1454 1455 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1456 continue; 1457 1458 /* 1459 * Device specific actions. 
1460 */ 1461 if (mtvd->vdev_islog) { 1462 spa_set_log_state(spa, SPA_LOG_CLEAR); 1463 } else { 1464 /* 1465 * XXX - once we have 'readonly' pool 1466 * support we should be able to handle 1467 * missing data devices by transitioning 1468 * the pool to readonly. 1469 */ 1470 continue; 1471 } 1472 1473 /* 1474 * Swap the missing vdev with the data we were 1475 * able to obtain from the MOS config. 1476 */ 1477 vdev_remove_child(rvd, tvd); 1478 vdev_remove_child(mrvd, mtvd); 1479 1480 vdev_add_child(rvd, mtvd); 1481 vdev_add_child(mrvd, tvd); 1482 1483 spa_config_exit(spa, SCL_ALL, FTAG); 1484 vdev_load(mtvd); 1485 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1486 1487 vdev_reopen(rvd); 1488 } else if (mtvd->vdev_islog) { 1489 /* 1490 * Load the slog device's state from the MOS config 1491 * since it's possible that the label does not 1492 * contain the most up-to-date information. 1493 */ 1494 vdev_load_log_state(tvd, mtvd); 1495 vdev_reopen(tvd); 1496 } 1497 } 1498 vdev_free(mrvd); 1499 spa_config_exit(spa, SCL_ALL, FTAG); 1500 1501 /* 1502 * Ensure we were able to validate the config. 
1503 */ 1504 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1505} 1506 1507/* 1508 * Check for missing log devices 1509 */ 1510static int 1511spa_check_logs(spa_t *spa) 1512{ 1513 switch (spa->spa_log_state) { 1514 case SPA_LOG_MISSING: 1515 /* need to recheck in case slog has been restored */ 1516 case SPA_LOG_UNKNOWN: 1517 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1518 DS_FIND_CHILDREN)) { 1519 spa_set_log_state(spa, SPA_LOG_MISSING); 1520 return (1); 1521 } 1522 break; 1523 } 1524 return (0); 1525} 1526 1527static boolean_t 1528spa_passivate_log(spa_t *spa) 1529{ 1530 vdev_t *rvd = spa->spa_root_vdev; 1531 boolean_t slog_found = B_FALSE; 1532 1533 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1534 1535 if (!spa_has_slogs(spa)) 1536 return (B_FALSE); 1537 1538 for (int c = 0; c < rvd->vdev_children; c++) { 1539 vdev_t *tvd = rvd->vdev_child[c]; 1540 metaslab_group_t *mg = tvd->vdev_mg; 1541 1542 if (tvd->vdev_islog) { 1543 metaslab_group_passivate(mg); 1544 slog_found = B_TRUE; 1545 } 1546 } 1547 1548 return (slog_found); 1549} 1550 1551static void 1552spa_activate_log(spa_t *spa) 1553{ 1554 vdev_t *rvd = spa->spa_root_vdev; 1555 1556 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1557 1558 for (int c = 0; c < rvd->vdev_children; c++) { 1559 vdev_t *tvd = rvd->vdev_child[c]; 1560 metaslab_group_t *mg = tvd->vdev_mg; 1561 1562 if (tvd->vdev_islog) 1563 metaslab_group_activate(mg); 1564 } 1565} 1566 1567int 1568spa_offline_log(spa_t *spa) 1569{ 1570 int error = 0; 1571 1572 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1573 NULL, DS_FIND_CHILDREN)) == 0) { 1574 1575 /* 1576 * We successfully offlined the log device, sync out the 1577 * current txg so that the "stubby" block can be removed 1578 * by zil_sync(). 
1579 */ 1580 txg_wait_synced(spa->spa_dsl_pool, 0); 1581 } 1582 return (error); 1583} 1584 1585static void 1586spa_aux_check_removed(spa_aux_vdev_t *sav) 1587{ 1588 int i; 1589 1590 for (i = 0; i < sav->sav_count; i++) 1591 spa_check_removed(sav->sav_vdevs[i]); 1592} 1593 1594void 1595spa_claim_notify(zio_t *zio) 1596{ 1597 spa_t *spa = zio->io_spa; 1598 1599 if (zio->io_error) 1600 return; 1601 1602 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1603 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1604 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1605 mutex_exit(&spa->spa_props_lock); 1606} 1607 1608typedef struct spa_load_error { 1609 uint64_t sle_meta_count; 1610 uint64_t sle_data_count; 1611} spa_load_error_t; 1612 1613static void 1614spa_load_verify_done(zio_t *zio) 1615{ 1616 blkptr_t *bp = zio->io_bp; 1617 spa_load_error_t *sle = zio->io_private; 1618 dmu_object_type_t type = BP_GET_TYPE(bp); 1619 int error = zio->io_error; 1620 1621 if (error) { 1622 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1623 type != DMU_OT_INTENT_LOG) 1624 atomic_add_64(&sle->sle_meta_count, 1); 1625 else 1626 atomic_add_64(&sle->sle_data_count, 1); 1627 } 1628 zio_data_buf_free(zio->io_data, zio->io_size); 1629} 1630 1631/*ARGSUSED*/ 1632static int 1633spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1634 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1635{ 1636 if (bp != NULL) { 1637 zio_t *rio = arg; 1638 size_t size = BP_GET_PSIZE(bp); 1639 void *data = zio_data_buf_alloc(size); 1640 1641 zio_nowait(zio_read(rio, spa, bp, data, size, 1642 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1643 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1644 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1645 } 1646 return (0); 1647} 1648 1649static int 1650spa_load_verify(spa_t *spa) 1651{ 1652 zio_t *rio; 1653 spa_load_error_t sle = { 0 }; 1654 zpool_rewind_policy_t policy; 1655 boolean_t verify_ok = B_FALSE; 1656 int 
error; 1657 1658 zpool_get_rewind_policy(spa->spa_config, &policy); 1659 1660 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1661 return (0); 1662 1663 rio = zio_root(spa, NULL, &sle, 1664 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1665 1666 error = traverse_pool(spa, spa->spa_verify_min_txg, 1667 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1668 1669 (void) zio_wait(rio); 1670 1671 spa->spa_load_meta_errors = sle.sle_meta_count; 1672 spa->spa_load_data_errors = sle.sle_data_count; 1673 1674 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1675 sle.sle_data_count <= policy.zrp_maxdata) { 1676 int64_t loss = 0; 1677 1678 verify_ok = B_TRUE; 1679 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1680 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1681 1682 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1683 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1684 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1685 VERIFY(nvlist_add_int64(spa->spa_load_info, 1686 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1687 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1688 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1689 } else { 1690 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1691 } 1692 1693 if (error) { 1694 if (error != ENXIO && error != EIO) 1695 error = EIO; 1696 return (error); 1697 } 1698 1699 return (verify_ok ? 0 : EIO); 1700} 1701 1702/* 1703 * Find a value in the pool props object. 1704 */ 1705static void 1706spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1707{ 1708 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1709 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1710} 1711 1712/* 1713 * Find a value in the pool directory object. 
1714 */ 1715static int 1716spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1717{ 1718 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1719 name, sizeof (uint64_t), 1, val)); 1720} 1721 1722static int 1723spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1724{ 1725 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1726 return (err); 1727} 1728 1729/* 1730 * Fix up config after a partly-completed split. This is done with the 1731 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1732 * pool have that entry in their config, but only the splitting one contains 1733 * a list of all the guids of the vdevs that are being split off. 1734 * 1735 * This function determines what to do with that list: either rejoin 1736 * all the disks to the pool, or complete the splitting process. To attempt 1737 * the rejoin, each disk that is offlined is marked online again, and 1738 * we do a reopen() call. If the vdev label for every disk that was 1739 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1740 * then we call vdev_split() on each disk, and complete the split. 1741 * 1742 * Otherwise we leave the config alone, with all the vdevs in place in 1743 * the original pool. 
1744 */ 1745static void 1746spa_try_repair(spa_t *spa, nvlist_t *config) 1747{ 1748 uint_t extracted; 1749 uint64_t *glist; 1750 uint_t i, gcount; 1751 nvlist_t *nvl; 1752 vdev_t **vd; 1753 boolean_t attempt_reopen; 1754 1755 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1756 return; 1757 1758 /* check that the config is complete */ 1759 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1760 &glist, &gcount) != 0) 1761 return; 1762 1763 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1764 1765 /* attempt to online all the vdevs & validate */ 1766 attempt_reopen = B_TRUE; 1767 for (i = 0; i < gcount; i++) { 1768 if (glist[i] == 0) /* vdev is hole */ 1769 continue; 1770 1771 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1772 if (vd[i] == NULL) { 1773 /* 1774 * Don't bother attempting to reopen the disks; 1775 * just do the split. 1776 */ 1777 attempt_reopen = B_FALSE; 1778 } else { 1779 /* attempt to re-online it */ 1780 vd[i]->vdev_offline = B_FALSE; 1781 } 1782 } 1783 1784 if (attempt_reopen) { 1785 vdev_reopen(spa->spa_root_vdev); 1786 1787 /* check each device to see what state it's in */ 1788 for (extracted = 0, i = 0; i < gcount; i++) { 1789 if (vd[i] != NULL && 1790 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1791 break; 1792 ++extracted; 1793 } 1794 } 1795 1796 /* 1797 * If every disk has been moved to the new pool, or if we never 1798 * even attempted to look at them, then we split them off for 1799 * good. 
1800 */ 1801 if (!attempt_reopen || gcount == extracted) { 1802 for (i = 0; i < gcount; i++) 1803 if (vd[i] != NULL) 1804 vdev_split(vd[i]); 1805 vdev_reopen(spa->spa_root_vdev); 1806 } 1807 1808 kmem_free(vd, gcount * sizeof (vdev_t *)); 1809} 1810 1811static int 1812spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1813 boolean_t mosconfig) 1814{ 1815 nvlist_t *config = spa->spa_config; 1816 char *ereport = FM_EREPORT_ZFS_POOL; 1817 char *comment; 1818 int error; 1819 uint64_t pool_guid; 1820 nvlist_t *nvl; 1821 1822 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1823 return (EINVAL); 1824 1825 ASSERT(spa->spa_comment == NULL); 1826 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1827 spa->spa_comment = spa_strdup(comment); 1828 1829 /* 1830 * Versioning wasn't explicitly added to the label until later, so if 1831 * it's not present treat it as the initial version. 1832 */ 1833 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1834 &spa->spa_ubsync.ub_version) != 0) 1835 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1836 1837 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1838 &spa->spa_config_txg); 1839 1840 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1841 spa_guid_exists(pool_guid, 0)) { 1842 error = EEXIST; 1843 } else { 1844 spa->spa_config_guid = pool_guid; 1845 1846 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1847 &nvl) == 0) { 1848 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1849 KM_SLEEP) == 0); 1850 } 1851 1852 gethrestime(&spa->spa_loaded_ts); 1853 error = spa_load_impl(spa, pool_guid, config, state, type, 1854 mosconfig, &ereport); 1855 } 1856 1857 spa->spa_minref = refcount_count(&spa->spa_refcount); 1858 if (error) { 1859 if (error != EEXIST) { 1860 spa->spa_loaded_ts.tv_sec = 0; 1861 spa->spa_loaded_ts.tv_nsec = 0; 1862 } 1863 if (error != EBADF) { 1864 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1865 } 1866 } 1867 
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1868 spa->spa_ena = 0; 1869 1870 return (error); 1871} 1872 1873/* 1874 * Load an existing storage pool, using the pool's builtin spa_config as a 1875 * source of configuration information. 1876 */ 1877static int 1878spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1879 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1880 char **ereport) 1881{ 1882 int error = 0; 1883 nvlist_t *nvroot = NULL; 1884 vdev_t *rvd; 1885 uberblock_t *ub = &spa->spa_uberblock; 1886 uint64_t children, config_cache_txg = spa->spa_config_txg; 1887 int orig_mode = spa->spa_mode; 1888 int parse; 1889 uint64_t obj; 1890 1891 /* 1892 * If this is an untrusted config, access the pool in read-only mode. 1893 * This prevents things like resilvering recently removed devices. 1894 */ 1895 if (!mosconfig) 1896 spa->spa_mode = FREAD; 1897 1898 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1899 1900 spa->spa_load_state = state; 1901 1902 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1903 return (EINVAL); 1904 1905 parse = (type == SPA_IMPORT_EXISTING ? 1906 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1907 1908 /* 1909 * Create "The Godfather" zio to hold all async IOs 1910 */ 1911 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1912 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1913 1914 /* 1915 * Parse the configuration into a vdev tree. We explicitly set the 1916 * value that will be returned by spa_version() since parsing the 1917 * configuration requires knowing the version number. 
1918 */ 1919 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1920 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1921 spa_config_exit(spa, SCL_ALL, FTAG); 1922 1923 if (error != 0) 1924 return (error); 1925 1926 ASSERT(spa->spa_root_vdev == rvd); 1927 1928 if (type != SPA_IMPORT_ASSEMBLE) { 1929 ASSERT(spa_guid(spa) == pool_guid); 1930 } 1931 1932 /* 1933 * Try to open all vdevs, loading each label in the process. 1934 */ 1935 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1936 error = vdev_open(rvd); 1937 spa_config_exit(spa, SCL_ALL, FTAG); 1938 if (error != 0) 1939 return (error); 1940 1941 /* 1942 * We need to validate the vdev labels against the configuration that 1943 * we have in hand, which is dependent on the setting of mosconfig. If 1944 * mosconfig is true then we're validating the vdev labels based on 1945 * that config. Otherwise, we're validating against the cached config 1946 * (zpool.cache) that was read when we loaded the zfs module, and then 1947 * later we will recursively call spa_load() and validate against 1948 * the vdev config. 1949 * 1950 * If we're assembling a new pool that's been split off from an 1951 * existing pool, the labels haven't yet been updated so we skip 1952 * validation for now. 1953 */ 1954 if (type != SPA_IMPORT_ASSEMBLE) { 1955 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
|
1952 error = vdev_validate(rvd);
| 1956 error = vdev_validate(rvd, mosconfig);
|
1953 spa_config_exit(spa, SCL_ALL, FTAG); 1954 1955 if (error != 0) 1956 return (error); 1957 1958 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1959 return (ENXIO); 1960 } 1961 1962 /* 1963 * Find the best uberblock. 1964 */ 1965 vdev_uberblock_load(NULL, rvd, ub); 1966 1967 /* 1968 * If we weren't able to find a single valid uberblock, return failure. 1969 */ 1970 if (ub->ub_txg == 0) 1971 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1972 1973 /* 1974 * If the pool is newer than the code, we can't open it. 1975 */ 1976 if (ub->ub_version > SPA_VERSION) 1977 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1978 1979 /* 1980 * If the vdev guid sum doesn't match the uberblock, we have an 1981 * incomplete configuration. We first check to see if the pool 1982 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1983 * If it is, defer the vdev_guid_sum check till later so we 1984 * can handle missing vdevs. 1985 */ 1986 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1987 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1988 rvd->vdev_guid_sum != ub->ub_guid_sum) 1989 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1990 1991 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1992 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1993 spa_try_repair(spa, config); 1994 spa_config_exit(spa, SCL_ALL, FTAG); 1995 nvlist_free(spa->spa_config_splitting); 1996 spa->spa_config_splitting = NULL; 1997 } 1998 1999 /* 2000 * Initialize internal SPA structures. 2001 */ 2002 spa->spa_state = POOL_STATE_ACTIVE; 2003 spa->spa_ubsync = spa->spa_uberblock; 2004 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2005 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2006 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2007 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2008 spa->spa_claim_max_txg = spa->spa_first_txg; 2009 spa->spa_prev_software_version = ub->ub_software_version; 2010 2011 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2012 if (error) 2013 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2014 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2015 2016 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2017 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2018 2019 if (!mosconfig) { 2020 uint64_t hostid; 2021 nvlist_t *policy = NULL, *nvconfig; 2022 2023 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2024 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2025 2026 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2027 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2028 char *hostname; 2029 unsigned long myhostid = 0; 2030 2031 VERIFY(nvlist_lookup_string(nvconfig, 2032 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2033 2034#ifdef _KERNEL 2035 myhostid = zone_get_hostid(NULL); 2036#else /* _KERNEL */ 2037 /* 2038 * We're emulating the system's hostid in userland, so 2039 * we can't use zone_get_hostid(). 2040 */ 2041 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2042#endif /* _KERNEL */ 2043 if (check_hostid && hostid != 0 && myhostid != 0 && 2044 hostid != myhostid) { 2045 nvlist_free(nvconfig); 2046 cmn_err(CE_WARN, "pool '%s' could not be " 2047 "loaded as it was last accessed by " 2048 "another system (host: %s hostid: 0x%lx). 
" 2049 "See: http://www.sun.com/msg/ZFS-8000-EY", 2050 spa_name(spa), hostname, 2051 (unsigned long)hostid); 2052 return (EBADF); 2053 } 2054 } 2055 if (nvlist_lookup_nvlist(spa->spa_config, 2056 ZPOOL_REWIND_POLICY, &policy) == 0) 2057 VERIFY(nvlist_add_nvlist(nvconfig, 2058 ZPOOL_REWIND_POLICY, policy) == 0); 2059 2060 spa_config_set(spa, nvconfig); 2061 spa_unload(spa); 2062 spa_deactivate(spa); 2063 spa_activate(spa, orig_mode); 2064 2065 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2066 } 2067 2068 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2069 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2070 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2071 if (error != 0) 2072 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2073 2074 /* 2075 * Load the bit that tells us to use the new accounting function 2076 * (raid-z deflation). If we have an older pool, this will not 2077 * be present. 2078 */ 2079 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2080 if (error != 0 && error != ENOENT) 2081 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2082 2083 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2084 &spa->spa_creation_version); 2085 if (error != 0 && error != ENOENT) 2086 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2087 2088 /* 2089 * Load the persistent error log. If we have an older pool, this will 2090 * not be present. 2091 */ 2092 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2093 if (error != 0 && error != ENOENT) 2094 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2095 2096 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2097 &spa->spa_errlog_scrub); 2098 if (error != 0 && error != ENOENT) 2099 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2100 2101 /* 2102 * Load the history object. If we have an older pool, this 2103 * will not be present. 
2104 */ 2105 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2106 if (error != 0 && error != ENOENT) 2107 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2108 2109 /* 2110 * If we're assembling the pool from the split-off vdevs of 2111 * an existing pool, we don't want to attach the spares & cache 2112 * devices. 2113 */ 2114 2115 /* 2116 * Load any hot spares for this pool. 2117 */ 2118 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2119 if (error != 0 && error != ENOENT) 2120 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2121 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2122 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2123 if (load_nvlist(spa, spa->spa_spares.sav_object, 2124 &spa->spa_spares.sav_config) != 0) 2125 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2126 2127 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2128 spa_load_spares(spa); 2129 spa_config_exit(spa, SCL_ALL, FTAG); 2130 } else if (error == 0) { 2131 spa->spa_spares.sav_sync = B_TRUE; 2132 } 2133 2134 /* 2135 * Load any level 2 ARC devices for this pool. 
2136 */ 2137 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2138 &spa->spa_l2cache.sav_object); 2139 if (error != 0 && error != ENOENT) 2140 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2141 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2142 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2143 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2144 &spa->spa_l2cache.sav_config) != 0) 2145 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2146 2147 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2148 spa_load_l2cache(spa); 2149 spa_config_exit(spa, SCL_ALL, FTAG); 2150 } else if (error == 0) { 2151 spa->spa_l2cache.sav_sync = B_TRUE; 2152 } 2153 2154 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2155 2156 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2157 if (error && error != ENOENT) 2158 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2159 2160 if (error == 0) { 2161 uint64_t autoreplace; 2162 2163 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2164 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2165 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2166 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2167 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2168 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2169 &spa->spa_dedup_ditto); 2170 2171 spa->spa_autoreplace = (autoreplace != 0); 2172 } 2173 2174 /* 2175 * If the 'autoreplace' property is set, then post a resource notifying 2176 * the ZFS DE that it should not issue any faults for unopenable 2177 * devices. We also iterate over the vdevs, and post a sysevent for any 2178 * unopenable vdevs so that the normal autoreplace handler can take 2179 * over. 
2180 */ 2181 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2182 spa_check_removed(spa->spa_root_vdev); 2183 /* 2184 * For the import case, this is done in spa_import(), because 2185 * at this point we're using the spare definitions from 2186 * the MOS config, not necessarily from the userland config. 2187 */ 2188 if (state != SPA_LOAD_IMPORT) { 2189 spa_aux_check_removed(&spa->spa_spares); 2190 spa_aux_check_removed(&spa->spa_l2cache); 2191 } 2192 } 2193 2194 /* 2195 * Load the vdev state for all toplevel vdevs. 2196 */ 2197 vdev_load(rvd); 2198 2199 /* 2200 * Propagate the leaf DTLs we just loaded all the way up the tree. 2201 */ 2202 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2203 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2204 spa_config_exit(spa, SCL_ALL, FTAG); 2205 2206 /* 2207 * Load the DDTs (dedup tables). 2208 */ 2209 error = ddt_load(spa); 2210 if (error != 0) 2211 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2212 2213 spa_update_dspace(spa); 2214 2215 /* 2216 * Validate the config, using the MOS config to fill in any 2217 * information which might be missing. If we fail to validate 2218 * the config then declare the pool unfit for use. If we're 2219 * assembling a pool from a split, the log is not transferred 2220 * over. 2221 */ 2222 if (type != SPA_IMPORT_ASSEMBLE) { 2223 nvlist_t *nvconfig; 2224 2225 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2226 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2227 2228 if (!spa_config_valid(spa, nvconfig)) { 2229 nvlist_free(nvconfig); 2230 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2231 ENXIO)); 2232 } 2233 nvlist_free(nvconfig); 2234 2235 /* 2236 * Now that we've validate the config, check the state of the 2237 * root vdev. If it can't be opened, it indicates one or 2238 * more toplevel vdevs are faulted. 
2239 */ 2240 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2241 return (ENXIO); 2242 2243 if (spa_check_logs(spa)) { 2244 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2245 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2246 } 2247 } 2248 2249 /* 2250 * We've successfully opened the pool, verify that we're ready 2251 * to start pushing transactions. 2252 */ 2253 if (state != SPA_LOAD_TRYIMPORT) { 2254 if (error = spa_load_verify(spa)) 2255 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2256 error)); 2257 } 2258 2259 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2260 spa->spa_load_max_txg == UINT64_MAX)) { 2261 dmu_tx_t *tx; 2262 int need_update = B_FALSE; 2263 2264 ASSERT(state != SPA_LOAD_TRYIMPORT); 2265 2266 /* 2267 * Claim log blocks that haven't been committed yet. 2268 * This must all happen in a single txg. 2269 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2270 * invoked from zil_claim_log_block()'s i/o done callback. 2271 * Price of rollback is that we abandon the log. 2272 */ 2273 spa->spa_claiming = B_TRUE; 2274 2275 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2276 spa_first_txg(spa)); 2277 (void) dmu_objset_find(spa_name(spa), 2278 zil_claim, tx, DS_FIND_CHILDREN); 2279 dmu_tx_commit(tx); 2280 2281 spa->spa_claiming = B_FALSE; 2282 2283 spa_set_log_state(spa, SPA_LOG_GOOD); 2284 spa->spa_sync_on = B_TRUE; 2285 txg_sync_start(spa->spa_dsl_pool); 2286 2287 /* 2288 * Wait for all claims to sync. We sync up to the highest 2289 * claimed log block birth time so that claimed log blocks 2290 * don't appear to be from the future. spa_claim_max_txg 2291 * will have been set for us by either zil_check_log_chain() 2292 * (invoked from spa_check_logs()) or zil_claim() above. 2293 */ 2294 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2295 2296 /* 2297 * If the config cache is stale, or we have uninitialized 2298 * metaslabs (see spa_vdev_add()), then update the config. 
2299 * 2300 * If this is a verbatim import, trust the current 2301 * in-core spa_config and update the disk labels. 2302 */ 2303 if (config_cache_txg != spa->spa_config_txg || 2304 state == SPA_LOAD_IMPORT || 2305 state == SPA_LOAD_RECOVER || 2306 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2307 need_update = B_TRUE; 2308 2309 for (int c = 0; c < rvd->vdev_children; c++) 2310 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2311 need_update = B_TRUE; 2312 2313 /* 2314 * Update the config cache asychronously in case we're the 2315 * root pool, in which case the config cache isn't writable yet. 2316 */ 2317 if (need_update) 2318 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2319 2320 /* 2321 * Check all DTLs to see if anything needs resilvering. 2322 */ 2323 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2324 vdev_resilver_needed(rvd, NULL, NULL)) 2325 spa_async_request(spa, SPA_ASYNC_RESILVER); 2326 2327 /* 2328 * Delete any inconsistent datasets. 2329 */ 2330 (void) dmu_objset_find(spa_name(spa), 2331 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2332 2333 /* 2334 * Clean up any stale temporary dataset userrefs. 
2335 */ 2336 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2337 } 2338 2339 return (0); 2340} 2341 2342static int 2343spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2344{ 2345 int mode = spa->spa_mode; 2346 2347 spa_unload(spa); 2348 spa_deactivate(spa); 2349 2350 spa->spa_load_max_txg--; 2351 2352 spa_activate(spa, mode); 2353 spa_async_suspend(spa); 2354 2355 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2356} 2357 2358static int 2359spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2360 uint64_t max_request, int rewind_flags) 2361{ 2362 nvlist_t *config = NULL; 2363 int load_error, rewind_error; 2364 uint64_t safe_rewind_txg; 2365 uint64_t min_txg; 2366 2367 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2368 spa->spa_load_max_txg = spa->spa_load_txg; 2369 spa_set_log_state(spa, SPA_LOG_CLEAR); 2370 } else { 2371 spa->spa_load_max_txg = max_request; 2372 } 2373 2374 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2375 mosconfig); 2376 if (load_error == 0) 2377 return (0); 2378 2379 if (spa->spa_root_vdev != NULL) 2380 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2381 2382 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2383 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2384 2385 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2386 nvlist_free(config); 2387 return (load_error); 2388 } 2389 2390 /* Price of rolling back is discarding txgs, including log */ 2391 if (state == SPA_LOAD_RECOVER) 2392 spa_set_log_state(spa, SPA_LOG_CLEAR); 2393 2394 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2395 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2396 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2397 TXG_INITIAL : safe_rewind_txg; 2398 2399 /* 2400 * Continue as long as we're finding errors, we're still within 2401 * the acceptable rewind range, and we're still finding uberblocks 2402 */ 2403 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2404 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2405 if (spa->spa_load_max_txg < safe_rewind_txg) 2406 spa->spa_extreme_rewind = B_TRUE; 2407 rewind_error = spa_load_retry(spa, state, mosconfig); 2408 } 2409 2410 spa->spa_extreme_rewind = B_FALSE; 2411 spa->spa_load_max_txg = UINT64_MAX; 2412 2413 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2414 spa_config_set(spa, config); 2415 2416 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 2417} 2418 2419/* 2420 * Pool Open/Import 2421 * 2422 * The import case is identical to an open except that the configuration is sent 2423 * down from userland, instead of grabbed from the configuration cache. For the 2424 * case of an open, the pool configuration will exist in the 2425 * POOL_STATE_UNINITIALIZED state. 2426 * 2427 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2428 * the same time open the pool, without having to keep around the spa_t in some 2429 * ambiguous state. 2430 */ 2431static int 2432spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2433 nvlist_t **config) 2434{ 2435 spa_t *spa; 2436 spa_load_state_t state = SPA_LOAD_OPEN; 2437 int error; 2438 int locked = B_FALSE; 2439 int firstopen = B_FALSE; 2440 2441 *spapp = NULL; 2442 2443 /* 2444 * As disgusting as this is, we need to support recursive calls to this 2445 * function because dsl_dir_open() is called during spa_load(), and ends 2446 * up calling spa_open() again. The real fix is to figure out how to 2447 * avoid dsl_dir_open() calling this in the first place. 
2448 */ 2449 if (mutex_owner(&spa_namespace_lock) != curthread) { 2450 mutex_enter(&spa_namespace_lock); 2451 locked = B_TRUE; 2452 } 2453 2454 if ((spa = spa_lookup(pool)) == NULL) { 2455 if (locked) 2456 mutex_exit(&spa_namespace_lock); 2457 return (ENOENT); 2458 } 2459 2460 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2461 zpool_rewind_policy_t policy; 2462 2463 firstopen = B_TRUE; 2464 2465 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2466 &policy); 2467 if (policy.zrp_request & ZPOOL_DO_REWIND) 2468 state = SPA_LOAD_RECOVER; 2469 2470 spa_activate(spa, spa_mode_global); 2471 2472 if (state != SPA_LOAD_RECOVER) 2473 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2474 2475 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2476 policy.zrp_request); 2477 2478 if (error == EBADF) { 2479 /* 2480 * If vdev_validate() returns failure (indicated by 2481 * EBADF), it indicates that one of the vdevs indicates 2482 * that the pool has been exported or destroyed. If 2483 * this is the case, the config cache is out of sync and 2484 * we should remove the pool from the namespace. 2485 */ 2486 spa_unload(spa); 2487 spa_deactivate(spa); 2488 spa_config_sync(spa, B_TRUE, B_TRUE); 2489 spa_remove(spa); 2490 if (locked) 2491 mutex_exit(&spa_namespace_lock); 2492 return (ENOENT); 2493 } 2494 2495 if (error) { 2496 /* 2497 * We can't open the pool, but we still have useful 2498 * information: the state of each vdev after the 2499 * attempted vdev_open(). Return this to the user. 
2500 */ 2501 if (config != NULL && spa->spa_config) { 2502 VERIFY(nvlist_dup(spa->spa_config, config, 2503 KM_SLEEP) == 0); 2504 VERIFY(nvlist_add_nvlist(*config, 2505 ZPOOL_CONFIG_LOAD_INFO, 2506 spa->spa_load_info) == 0); 2507 } 2508 spa_unload(spa); 2509 spa_deactivate(spa); 2510 spa->spa_last_open_failed = error; 2511 if (locked) 2512 mutex_exit(&spa_namespace_lock); 2513 *spapp = NULL; 2514 return (error); 2515 } 2516 } 2517 2518 spa_open_ref(spa, tag); 2519 2520 if (config != NULL) 2521 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2522 2523 /* 2524 * If we've recovered the pool, pass back any information we 2525 * gathered while doing the load. 2526 */ 2527 if (state == SPA_LOAD_RECOVER) { 2528 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2529 spa->spa_load_info) == 0); 2530 } 2531 2532 if (locked) { 2533 spa->spa_last_open_failed = 0; 2534 spa->spa_last_ubsync_txg = 0; 2535 spa->spa_load_txg = 0; 2536 mutex_exit(&spa_namespace_lock); 2537#ifdef __FreeBSD__ 2538#ifdef _KERNEL 2539 if (firstopen) 2540 zvol_create_minors(pool); 2541#endif 2542#endif 2543 } 2544 2545 *spapp = spa; 2546 2547 return (0); 2548} 2549 2550int 2551spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2552 nvlist_t **config) 2553{ 2554 return (spa_open_common(name, spapp, tag, policy, config)); 2555} 2556 2557int 2558spa_open(const char *name, spa_t **spapp, void *tag) 2559{ 2560 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2561} 2562 2563/* 2564 * Lookup the given spa_t, incrementing the inject count in the process, 2565 * preventing it from being exported or destroyed. 
2566 */ 2567spa_t * 2568spa_inject_addref(char *name) 2569{ 2570 spa_t *spa; 2571 2572 mutex_enter(&spa_namespace_lock); 2573 if ((spa = spa_lookup(name)) == NULL) { 2574 mutex_exit(&spa_namespace_lock); 2575 return (NULL); 2576 } 2577 spa->spa_inject_ref++; 2578 mutex_exit(&spa_namespace_lock); 2579 2580 return (spa); 2581} 2582 2583void 2584spa_inject_delref(spa_t *spa) 2585{ 2586 mutex_enter(&spa_namespace_lock); 2587 spa->spa_inject_ref--; 2588 mutex_exit(&spa_namespace_lock); 2589} 2590 2591/* 2592 * Add spares device information to the nvlist. 2593 */ 2594static void 2595spa_add_spares(spa_t *spa, nvlist_t *config) 2596{ 2597 nvlist_t **spares; 2598 uint_t i, nspares; 2599 nvlist_t *nvroot; 2600 uint64_t guid; 2601 vdev_stat_t *vs; 2602 uint_t vsc; 2603 uint64_t pool; 2604 2605 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2606 2607 if (spa->spa_spares.sav_count == 0) 2608 return; 2609 2610 VERIFY(nvlist_lookup_nvlist(config, 2611 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2612 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2613 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2614 if (nspares != 0) { 2615 VERIFY(nvlist_add_nvlist_array(nvroot, 2616 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2617 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2618 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2619 2620 /* 2621 * Go through and find any spares which have since been 2622 * repurposed as an active spare. If this is the case, update 2623 * their status appropriately. 
2624 */ 2625 for (i = 0; i < nspares; i++) { 2626 VERIFY(nvlist_lookup_uint64(spares[i], 2627 ZPOOL_CONFIG_GUID, &guid) == 0); 2628 if (spa_spare_exists(guid, &pool, NULL) && 2629 pool != 0ULL) { 2630 VERIFY(nvlist_lookup_uint64_array( 2631 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2632 (uint64_t **)&vs, &vsc) == 0); 2633 vs->vs_state = VDEV_STATE_CANT_OPEN; 2634 vs->vs_aux = VDEV_AUX_SPARED; 2635 } 2636 } 2637 } 2638} 2639 2640/* 2641 * Add l2cache device information to the nvlist, including vdev stats. 2642 */ 2643static void 2644spa_add_l2cache(spa_t *spa, nvlist_t *config) 2645{ 2646 nvlist_t **l2cache; 2647 uint_t i, j, nl2cache; 2648 nvlist_t *nvroot; 2649 uint64_t guid; 2650 vdev_t *vd; 2651 vdev_stat_t *vs; 2652 uint_t vsc; 2653 2654 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2655 2656 if (spa->spa_l2cache.sav_count == 0) 2657 return; 2658 2659 VERIFY(nvlist_lookup_nvlist(config, 2660 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2661 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2662 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2663 if (nl2cache != 0) { 2664 VERIFY(nvlist_add_nvlist_array(nvroot, 2665 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2666 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2667 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2668 2669 /* 2670 * Update level 2 cache device stats. 
2671 */ 2672 2673 for (i = 0; i < nl2cache; i++) { 2674 VERIFY(nvlist_lookup_uint64(l2cache[i], 2675 ZPOOL_CONFIG_GUID, &guid) == 0); 2676 2677 vd = NULL; 2678 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2679 if (guid == 2680 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2681 vd = spa->spa_l2cache.sav_vdevs[j]; 2682 break; 2683 } 2684 } 2685 ASSERT(vd != NULL); 2686 2687 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2688 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2689 == 0); 2690 vdev_get_stats(vd, vs); 2691 } 2692 } 2693} 2694 2695int 2696spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2697{ 2698 int error; 2699 spa_t *spa; 2700 2701 *config = NULL; 2702 error = spa_open_common(name, &spa, FTAG, NULL, config); 2703 2704 if (spa != NULL) { 2705 /* 2706 * This still leaves a window of inconsistency where the spares 2707 * or l2cache devices could change and the config would be 2708 * self-inconsistent. 2709 */ 2710 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2711 2712 if (*config != NULL) { 2713 uint64_t loadtimes[2]; 2714 2715 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 2716 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 2717 VERIFY(nvlist_add_uint64_array(*config, 2718 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 2719 2720 VERIFY(nvlist_add_uint64(*config, 2721 ZPOOL_CONFIG_ERRCOUNT, 2722 spa_get_errlog_size(spa)) == 0); 2723 2724 if (spa_suspended(spa)) 2725 VERIFY(nvlist_add_uint64(*config, 2726 ZPOOL_CONFIG_SUSPENDED, 2727 spa->spa_failmode) == 0); 2728 2729 spa_add_spares(spa, *config); 2730 spa_add_l2cache(spa, *config); 2731 } 2732 } 2733 2734 /* 2735 * We want to get the alternate root even for faulted pools, so we cheat 2736 * and call spa_lookup() directly. 
2737 */ 2738 if (altroot) { 2739 if (spa == NULL) { 2740 mutex_enter(&spa_namespace_lock); 2741 spa = spa_lookup(name); 2742 if (spa) 2743 spa_altroot(spa, altroot, buflen); 2744 else 2745 altroot[0] = '\0'; 2746 spa = NULL; 2747 mutex_exit(&spa_namespace_lock); 2748 } else { 2749 spa_altroot(spa, altroot, buflen); 2750 } 2751 } 2752 2753 if (spa != NULL) { 2754 spa_config_exit(spa, SCL_CONFIG, FTAG); 2755 spa_close(spa, FTAG); 2756 } 2757 2758 return (error); 2759} 2760 2761/* 2762 * Validate that the auxiliary device array is well formed. We must have an 2763 * array of nvlists, each which describes a valid leaf vdev. If this is an 2764 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2765 * specified, as long as they are well-formed. 2766 */ 2767static int 2768spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2769 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2770 vdev_labeltype_t label) 2771{ 2772 nvlist_t **dev; 2773 uint_t i, ndev; 2774 vdev_t *vd; 2775 int error; 2776 2777 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2778 2779 /* 2780 * It's acceptable to have no devs specified. 2781 */ 2782 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2783 return (0); 2784 2785 if (ndev == 0) 2786 return (EINVAL); 2787 2788 /* 2789 * Make sure the pool is formatted with a version that supports this 2790 * device type. 2791 */ 2792 if (spa_version(spa) < version) 2793 return (ENOTSUP); 2794 2795 /* 2796 * Set the pending device list so we correctly handle device in-use 2797 * checking. 
2798 */ 2799 sav->sav_pending = dev; 2800 sav->sav_npending = ndev; 2801 2802 for (i = 0; i < ndev; i++) { 2803 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2804 mode)) != 0) 2805 goto out; 2806 2807 if (!vd->vdev_ops->vdev_op_leaf) { 2808 vdev_free(vd); 2809 error = EINVAL; 2810 goto out; 2811 } 2812 2813 /* 2814 * The L2ARC currently only supports disk devices in 2815 * kernel context. For user-level testing, we allow it. 2816 */ 2817#ifdef _KERNEL 2818 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2819 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2820 error = ENOTBLK;
| 1957 spa_config_exit(spa, SCL_ALL, FTAG); 1958 1959 if (error != 0) 1960 return (error); 1961 1962 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1963 return (ENXIO); 1964 } 1965 1966 /* 1967 * Find the best uberblock. 1968 */ 1969 vdev_uberblock_load(NULL, rvd, ub); 1970 1971 /* 1972 * If we weren't able to find a single valid uberblock, return failure. 1973 */ 1974 if (ub->ub_txg == 0) 1975 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1976 1977 /* 1978 * If the pool is newer than the code, we can't open it. 1979 */ 1980 if (ub->ub_version > SPA_VERSION) 1981 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1982 1983 /* 1984 * If the vdev guid sum doesn't match the uberblock, we have an 1985 * incomplete configuration. We first check to see if the pool 1986 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1987 * If it is, defer the vdev_guid_sum check till later so we 1988 * can handle missing vdevs. 1989 */ 1990 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1991 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1992 rvd->vdev_guid_sum != ub->ub_guid_sum) 1993 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1994 1995 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1996 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1997 spa_try_repair(spa, config); 1998 spa_config_exit(spa, SCL_ALL, FTAG); 1999 nvlist_free(spa->spa_config_splitting); 2000 spa->spa_config_splitting = NULL; 2001 } 2002 2003 /* 2004 * Initialize internal SPA structures. 2005 */ 2006 spa->spa_state = POOL_STATE_ACTIVE; 2007 spa->spa_ubsync = spa->spa_uberblock; 2008 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2009 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2010 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2011 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2012 spa->spa_claim_max_txg = spa->spa_first_txg; 2013 spa->spa_prev_software_version = ub->ub_software_version; 2014 2015 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2016 if (error) 2017 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2018 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2019 2020 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2021 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2022 2023 if (!mosconfig) { 2024 uint64_t hostid; 2025 nvlist_t *policy = NULL, *nvconfig; 2026 2027 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2028 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2029 2030 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2031 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2032 char *hostname; 2033 unsigned long myhostid = 0; 2034 2035 VERIFY(nvlist_lookup_string(nvconfig, 2036 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2037 2038#ifdef _KERNEL 2039 myhostid = zone_get_hostid(NULL); 2040#else /* _KERNEL */ 2041 /* 2042 * We're emulating the system's hostid in userland, so 2043 * we can't use zone_get_hostid(). 2044 */ 2045 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2046#endif /* _KERNEL */ 2047 if (check_hostid && hostid != 0 && myhostid != 0 && 2048 hostid != myhostid) { 2049 nvlist_free(nvconfig); 2050 cmn_err(CE_WARN, "pool '%s' could not be " 2051 "loaded as it was last accessed by " 2052 "another system (host: %s hostid: 0x%lx). 
" 2053 "See: http://www.sun.com/msg/ZFS-8000-EY", 2054 spa_name(spa), hostname, 2055 (unsigned long)hostid); 2056 return (EBADF); 2057 } 2058 } 2059 if (nvlist_lookup_nvlist(spa->spa_config, 2060 ZPOOL_REWIND_POLICY, &policy) == 0) 2061 VERIFY(nvlist_add_nvlist(nvconfig, 2062 ZPOOL_REWIND_POLICY, policy) == 0); 2063 2064 spa_config_set(spa, nvconfig); 2065 spa_unload(spa); 2066 spa_deactivate(spa); 2067 spa_activate(spa, orig_mode); 2068 2069 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2070 } 2071 2072 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2073 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2074 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2075 if (error != 0) 2076 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2077 2078 /* 2079 * Load the bit that tells us to use the new accounting function 2080 * (raid-z deflation). If we have an older pool, this will not 2081 * be present. 2082 */ 2083 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2084 if (error != 0 && error != ENOENT) 2085 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2086 2087 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2088 &spa->spa_creation_version); 2089 if (error != 0 && error != ENOENT) 2090 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2091 2092 /* 2093 * Load the persistent error log. If we have an older pool, this will 2094 * not be present. 2095 */ 2096 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2097 if (error != 0 && error != ENOENT) 2098 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2099 2100 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2101 &spa->spa_errlog_scrub); 2102 if (error != 0 && error != ENOENT) 2103 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2104 2105 /* 2106 * Load the history object. If we have an older pool, this 2107 * will not be present. 
2108 */ 2109 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2110 if (error != 0 && error != ENOENT) 2111 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2112 2113 /* 2114 * If we're assembling the pool from the split-off vdevs of 2115 * an existing pool, we don't want to attach the spares & cache 2116 * devices. 2117 */ 2118 2119 /* 2120 * Load any hot spares for this pool. 2121 */ 2122 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2123 if (error != 0 && error != ENOENT) 2124 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2125 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2126 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2127 if (load_nvlist(spa, spa->spa_spares.sav_object, 2128 &spa->spa_spares.sav_config) != 0) 2129 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2130 2131 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2132 spa_load_spares(spa); 2133 spa_config_exit(spa, SCL_ALL, FTAG); 2134 } else if (error == 0) { 2135 spa->spa_spares.sav_sync = B_TRUE; 2136 } 2137 2138 /* 2139 * Load any level 2 ARC devices for this pool. 
2140 */ 2141 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2142 &spa->spa_l2cache.sav_object); 2143 if (error != 0 && error != ENOENT) 2144 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2145 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2146 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2147 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2148 &spa->spa_l2cache.sav_config) != 0) 2149 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2150 2151 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2152 spa_load_l2cache(spa); 2153 spa_config_exit(spa, SCL_ALL, FTAG); 2154 } else if (error == 0) { 2155 spa->spa_l2cache.sav_sync = B_TRUE; 2156 } 2157 2158 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2159 2160 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2161 if (error && error != ENOENT) 2162 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2163 2164 if (error == 0) { 2165 uint64_t autoreplace; 2166 2167 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2168 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2169 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2170 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2171 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2172 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2173 &spa->spa_dedup_ditto); 2174 2175 spa->spa_autoreplace = (autoreplace != 0); 2176 } 2177 2178 /* 2179 * If the 'autoreplace' property is set, then post a resource notifying 2180 * the ZFS DE that it should not issue any faults for unopenable 2181 * devices. We also iterate over the vdevs, and post a sysevent for any 2182 * unopenable vdevs so that the normal autoreplace handler can take 2183 * over. 
2184 */ 2185 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2186 spa_check_removed(spa->spa_root_vdev); 2187 /* 2188 * For the import case, this is done in spa_import(), because 2189 * at this point we're using the spare definitions from 2190 * the MOS config, not necessarily from the userland config. 2191 */ 2192 if (state != SPA_LOAD_IMPORT) { 2193 spa_aux_check_removed(&spa->spa_spares); 2194 spa_aux_check_removed(&spa->spa_l2cache); 2195 } 2196 } 2197 2198 /* 2199 * Load the vdev state for all toplevel vdevs. 2200 */ 2201 vdev_load(rvd); 2202 2203 /* 2204 * Propagate the leaf DTLs we just loaded all the way up the tree. 2205 */ 2206 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2207 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2208 spa_config_exit(spa, SCL_ALL, FTAG); 2209 2210 /* 2211 * Load the DDTs (dedup tables). 2212 */ 2213 error = ddt_load(spa); 2214 if (error != 0) 2215 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2216 2217 spa_update_dspace(spa); 2218 2219 /* 2220 * Validate the config, using the MOS config to fill in any 2221 * information which might be missing. If we fail to validate 2222 * the config then declare the pool unfit for use. If we're 2223 * assembling a pool from a split, the log is not transferred 2224 * over. 2225 */ 2226 if (type != SPA_IMPORT_ASSEMBLE) { 2227 nvlist_t *nvconfig; 2228 2229 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2230 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2231 2232 if (!spa_config_valid(spa, nvconfig)) { 2233 nvlist_free(nvconfig); 2234 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2235 ENXIO)); 2236 } 2237 nvlist_free(nvconfig); 2238 2239 /* 2240 * Now that we've validate the config, check the state of the 2241 * root vdev. If it can't be opened, it indicates one or 2242 * more toplevel vdevs are faulted. 
2243 */ 2244 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2245 return (ENXIO); 2246 2247 if (spa_check_logs(spa)) { 2248 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2249 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2250 } 2251 } 2252 2253 /* 2254 * We've successfully opened the pool, verify that we're ready 2255 * to start pushing transactions. 2256 */ 2257 if (state != SPA_LOAD_TRYIMPORT) { 2258 if (error = spa_load_verify(spa)) 2259 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2260 error)); 2261 } 2262 2263 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2264 spa->spa_load_max_txg == UINT64_MAX)) { 2265 dmu_tx_t *tx; 2266 int need_update = B_FALSE; 2267 2268 ASSERT(state != SPA_LOAD_TRYIMPORT); 2269 2270 /* 2271 * Claim log blocks that haven't been committed yet. 2272 * This must all happen in a single txg. 2273 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2274 * invoked from zil_claim_log_block()'s i/o done callback. 2275 * Price of rollback is that we abandon the log. 2276 */ 2277 spa->spa_claiming = B_TRUE; 2278 2279 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2280 spa_first_txg(spa)); 2281 (void) dmu_objset_find(spa_name(spa), 2282 zil_claim, tx, DS_FIND_CHILDREN); 2283 dmu_tx_commit(tx); 2284 2285 spa->spa_claiming = B_FALSE; 2286 2287 spa_set_log_state(spa, SPA_LOG_GOOD); 2288 spa->spa_sync_on = B_TRUE; 2289 txg_sync_start(spa->spa_dsl_pool); 2290 2291 /* 2292 * Wait for all claims to sync. We sync up to the highest 2293 * claimed log block birth time so that claimed log blocks 2294 * don't appear to be from the future. spa_claim_max_txg 2295 * will have been set for us by either zil_check_log_chain() 2296 * (invoked from spa_check_logs()) or zil_claim() above. 2297 */ 2298 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2299 2300 /* 2301 * If the config cache is stale, or we have uninitialized 2302 * metaslabs (see spa_vdev_add()), then update the config. 
2303 * 2304 * If this is a verbatim import, trust the current 2305 * in-core spa_config and update the disk labels. 2306 */ 2307 if (config_cache_txg != spa->spa_config_txg || 2308 state == SPA_LOAD_IMPORT || 2309 state == SPA_LOAD_RECOVER || 2310 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2311 need_update = B_TRUE; 2312 2313 for (int c = 0; c < rvd->vdev_children; c++) 2314 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2315 need_update = B_TRUE; 2316 2317 /* 2318 * Update the config cache asychronously in case we're the 2319 * root pool, in which case the config cache isn't writable yet. 2320 */ 2321 if (need_update) 2322 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2323 2324 /* 2325 * Check all DTLs to see if anything needs resilvering. 2326 */ 2327 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2328 vdev_resilver_needed(rvd, NULL, NULL)) 2329 spa_async_request(spa, SPA_ASYNC_RESILVER); 2330 2331 /* 2332 * Delete any inconsistent datasets. 2333 */ 2334 (void) dmu_objset_find(spa_name(spa), 2335 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2336 2337 /* 2338 * Clean up any stale temporary dataset userrefs. 
2339 */ 2340 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2341 } 2342 2343 return (0); 2344} 2345 2346static int 2347spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2348{ 2349 int mode = spa->spa_mode; 2350 2351 spa_unload(spa); 2352 spa_deactivate(spa); 2353 2354 spa->spa_load_max_txg--; 2355 2356 spa_activate(spa, mode); 2357 spa_async_suspend(spa); 2358 2359 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2360} 2361 2362static int 2363spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2364 uint64_t max_request, int rewind_flags) 2365{ 2366 nvlist_t *config = NULL; 2367 int load_error, rewind_error; 2368 uint64_t safe_rewind_txg; 2369 uint64_t min_txg; 2370 2371 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2372 spa->spa_load_max_txg = spa->spa_load_txg; 2373 spa_set_log_state(spa, SPA_LOG_CLEAR); 2374 } else { 2375 spa->spa_load_max_txg = max_request; 2376 } 2377 2378 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2379 mosconfig); 2380 if (load_error == 0) 2381 return (0); 2382 2383 if (spa->spa_root_vdev != NULL) 2384 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2385 2386 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2387 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2388 2389 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2390 nvlist_free(config); 2391 return (load_error); 2392 } 2393 2394 /* Price of rolling back is discarding txgs, including log */ 2395 if (state == SPA_LOAD_RECOVER) 2396 spa_set_log_state(spa, SPA_LOG_CLEAR); 2397 2398 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2399 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2400 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2401 TXG_INITIAL : safe_rewind_txg; 2402 2403 /* 2404 * Continue as long as we're finding errors, we're still within 2405 * the acceptable rewind range, and we're still finding uberblocks 2406 */ 2407 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2408 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2409 if (spa->spa_load_max_txg < safe_rewind_txg) 2410 spa->spa_extreme_rewind = B_TRUE; 2411 rewind_error = spa_load_retry(spa, state, mosconfig); 2412 } 2413 2414 spa->spa_extreme_rewind = B_FALSE; 2415 spa->spa_load_max_txg = UINT64_MAX; 2416 2417 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2418 spa_config_set(spa, config); 2419 2420 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 2421} 2422 2423/* 2424 * Pool Open/Import 2425 * 2426 * The import case is identical to an open except that the configuration is sent 2427 * down from userland, instead of grabbed from the configuration cache. For the 2428 * case of an open, the pool configuration will exist in the 2429 * POOL_STATE_UNINITIALIZED state. 2430 * 2431 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2432 * the same time open the pool, without having to keep around the spa_t in some 2433 * ambiguous state. 2434 */ 2435static int 2436spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2437 nvlist_t **config) 2438{ 2439 spa_t *spa; 2440 spa_load_state_t state = SPA_LOAD_OPEN; 2441 int error; 2442 int locked = B_FALSE; 2443 int firstopen = B_FALSE; 2444 2445 *spapp = NULL; 2446 2447 /* 2448 * As disgusting as this is, we need to support recursive calls to this 2449 * function because dsl_dir_open() is called during spa_load(), and ends 2450 * up calling spa_open() again. The real fix is to figure out how to 2451 * avoid dsl_dir_open() calling this in the first place. 
2452 */ 2453 if (mutex_owner(&spa_namespace_lock) != curthread) { 2454 mutex_enter(&spa_namespace_lock); 2455 locked = B_TRUE; 2456 } 2457 2458 if ((spa = spa_lookup(pool)) == NULL) { 2459 if (locked) 2460 mutex_exit(&spa_namespace_lock); 2461 return (ENOENT); 2462 } 2463 2464 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2465 zpool_rewind_policy_t policy; 2466 2467 firstopen = B_TRUE; 2468 2469 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2470 &policy); 2471 if (policy.zrp_request & ZPOOL_DO_REWIND) 2472 state = SPA_LOAD_RECOVER; 2473 2474 spa_activate(spa, spa_mode_global); 2475 2476 if (state != SPA_LOAD_RECOVER) 2477 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2478 2479 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2480 policy.zrp_request); 2481 2482 if (error == EBADF) { 2483 /* 2484 * If vdev_validate() returns failure (indicated by 2485 * EBADF), it indicates that one of the vdevs indicates 2486 * that the pool has been exported or destroyed. If 2487 * this is the case, the config cache is out of sync and 2488 * we should remove the pool from the namespace. 2489 */ 2490 spa_unload(spa); 2491 spa_deactivate(spa); 2492 spa_config_sync(spa, B_TRUE, B_TRUE); 2493 spa_remove(spa); 2494 if (locked) 2495 mutex_exit(&spa_namespace_lock); 2496 return (ENOENT); 2497 } 2498 2499 if (error) { 2500 /* 2501 * We can't open the pool, but we still have useful 2502 * information: the state of each vdev after the 2503 * attempted vdev_open(). Return this to the user. 
2504 */ 2505 if (config != NULL && spa->spa_config) { 2506 VERIFY(nvlist_dup(spa->spa_config, config, 2507 KM_SLEEP) == 0); 2508 VERIFY(nvlist_add_nvlist(*config, 2509 ZPOOL_CONFIG_LOAD_INFO, 2510 spa->spa_load_info) == 0); 2511 } 2512 spa_unload(spa); 2513 spa_deactivate(spa); 2514 spa->spa_last_open_failed = error; 2515 if (locked) 2516 mutex_exit(&spa_namespace_lock); 2517 *spapp = NULL; 2518 return (error); 2519 } 2520 } 2521 2522 spa_open_ref(spa, tag); 2523 2524 if (config != NULL) 2525 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2526 2527 /* 2528 * If we've recovered the pool, pass back any information we 2529 * gathered while doing the load. 2530 */ 2531 if (state == SPA_LOAD_RECOVER) { 2532 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2533 spa->spa_load_info) == 0); 2534 } 2535 2536 if (locked) { 2537 spa->spa_last_open_failed = 0; 2538 spa->spa_last_ubsync_txg = 0; 2539 spa->spa_load_txg = 0; 2540 mutex_exit(&spa_namespace_lock); 2541#ifdef __FreeBSD__ 2542#ifdef _KERNEL 2543 if (firstopen) 2544 zvol_create_minors(pool); 2545#endif 2546#endif 2547 } 2548 2549 *spapp = spa; 2550 2551 return (0); 2552} 2553 2554int 2555spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2556 nvlist_t **config) 2557{ 2558 return (spa_open_common(name, spapp, tag, policy, config)); 2559} 2560 2561int 2562spa_open(const char *name, spa_t **spapp, void *tag) 2563{ 2564 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2565} 2566 2567/* 2568 * Lookup the given spa_t, incrementing the inject count in the process, 2569 * preventing it from being exported or destroyed. 
2570 */ 2571spa_t * 2572spa_inject_addref(char *name) 2573{ 2574 spa_t *spa; 2575 2576 mutex_enter(&spa_namespace_lock); 2577 if ((spa = spa_lookup(name)) == NULL) { 2578 mutex_exit(&spa_namespace_lock); 2579 return (NULL); 2580 } 2581 spa->spa_inject_ref++; 2582 mutex_exit(&spa_namespace_lock); 2583 2584 return (spa); 2585} 2586 2587void 2588spa_inject_delref(spa_t *spa) 2589{ 2590 mutex_enter(&spa_namespace_lock); 2591 spa->spa_inject_ref--; 2592 mutex_exit(&spa_namespace_lock); 2593} 2594 2595/* 2596 * Add spares device information to the nvlist. 2597 */ 2598static void 2599spa_add_spares(spa_t *spa, nvlist_t *config) 2600{ 2601 nvlist_t **spares; 2602 uint_t i, nspares; 2603 nvlist_t *nvroot; 2604 uint64_t guid; 2605 vdev_stat_t *vs; 2606 uint_t vsc; 2607 uint64_t pool; 2608 2609 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2610 2611 if (spa->spa_spares.sav_count == 0) 2612 return; 2613 2614 VERIFY(nvlist_lookup_nvlist(config, 2615 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2616 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2617 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2618 if (nspares != 0) { 2619 VERIFY(nvlist_add_nvlist_array(nvroot, 2620 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2621 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2622 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2623 2624 /* 2625 * Go through and find any spares which have since been 2626 * repurposed as an active spare. If this is the case, update 2627 * their status appropriately. 
2628 */ 2629 for (i = 0; i < nspares; i++) { 2630 VERIFY(nvlist_lookup_uint64(spares[i], 2631 ZPOOL_CONFIG_GUID, &guid) == 0); 2632 if (spa_spare_exists(guid, &pool, NULL) && 2633 pool != 0ULL) { 2634 VERIFY(nvlist_lookup_uint64_array( 2635 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2636 (uint64_t **)&vs, &vsc) == 0); 2637 vs->vs_state = VDEV_STATE_CANT_OPEN; 2638 vs->vs_aux = VDEV_AUX_SPARED; 2639 } 2640 } 2641 } 2642} 2643 2644/* 2645 * Add l2cache device information to the nvlist, including vdev stats. 2646 */ 2647static void 2648spa_add_l2cache(spa_t *spa, nvlist_t *config) 2649{ 2650 nvlist_t **l2cache; 2651 uint_t i, j, nl2cache; 2652 nvlist_t *nvroot; 2653 uint64_t guid; 2654 vdev_t *vd; 2655 vdev_stat_t *vs; 2656 uint_t vsc; 2657 2658 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2659 2660 if (spa->spa_l2cache.sav_count == 0) 2661 return; 2662 2663 VERIFY(nvlist_lookup_nvlist(config, 2664 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2665 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2666 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2667 if (nl2cache != 0) { 2668 VERIFY(nvlist_add_nvlist_array(nvroot, 2669 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2670 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2671 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2672 2673 /* 2674 * Update level 2 cache device stats. 
2675 */ 2676 2677 for (i = 0; i < nl2cache; i++) { 2678 VERIFY(nvlist_lookup_uint64(l2cache[i], 2679 ZPOOL_CONFIG_GUID, &guid) == 0); 2680 2681 vd = NULL; 2682 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2683 if (guid == 2684 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2685 vd = spa->spa_l2cache.sav_vdevs[j]; 2686 break; 2687 } 2688 } 2689 ASSERT(vd != NULL); 2690 2691 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2692 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2693 == 0); 2694 vdev_get_stats(vd, vs); 2695 } 2696 } 2697} 2698 2699int 2700spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2701{ 2702 int error; 2703 spa_t *spa; 2704 2705 *config = NULL; 2706 error = spa_open_common(name, &spa, FTAG, NULL, config); 2707 2708 if (spa != NULL) { 2709 /* 2710 * This still leaves a window of inconsistency where the spares 2711 * or l2cache devices could change and the config would be 2712 * self-inconsistent. 2713 */ 2714 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2715 2716 if (*config != NULL) { 2717 uint64_t loadtimes[2]; 2718 2719 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 2720 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 2721 VERIFY(nvlist_add_uint64_array(*config, 2722 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 2723 2724 VERIFY(nvlist_add_uint64(*config, 2725 ZPOOL_CONFIG_ERRCOUNT, 2726 spa_get_errlog_size(spa)) == 0); 2727 2728 if (spa_suspended(spa)) 2729 VERIFY(nvlist_add_uint64(*config, 2730 ZPOOL_CONFIG_SUSPENDED, 2731 spa->spa_failmode) == 0); 2732 2733 spa_add_spares(spa, *config); 2734 spa_add_l2cache(spa, *config); 2735 } 2736 } 2737 2738 /* 2739 * We want to get the alternate root even for faulted pools, so we cheat 2740 * and call spa_lookup() directly. 
2741 */ 2742 if (altroot) { 2743 if (spa == NULL) { 2744 mutex_enter(&spa_namespace_lock); 2745 spa = spa_lookup(name); 2746 if (spa) 2747 spa_altroot(spa, altroot, buflen); 2748 else 2749 altroot[0] = '\0'; 2750 spa = NULL; 2751 mutex_exit(&spa_namespace_lock); 2752 } else { 2753 spa_altroot(spa, altroot, buflen); 2754 } 2755 } 2756 2757 if (spa != NULL) { 2758 spa_config_exit(spa, SCL_CONFIG, FTAG); 2759 spa_close(spa, FTAG); 2760 } 2761 2762 return (error); 2763} 2764 2765/* 2766 * Validate that the auxiliary device array is well formed. We must have an 2767 * array of nvlists, each which describes a valid leaf vdev. If this is an 2768 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2769 * specified, as long as they are well-formed. 2770 */ 2771static int 2772spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2773 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2774 vdev_labeltype_t label) 2775{ 2776 nvlist_t **dev; 2777 uint_t i, ndev; 2778 vdev_t *vd; 2779 int error; 2780 2781 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2782 2783 /* 2784 * It's acceptable to have no devs specified. 2785 */ 2786 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2787 return (0); 2788 2789 if (ndev == 0) 2790 return (EINVAL); 2791 2792 /* 2793 * Make sure the pool is formatted with a version that supports this 2794 * device type. 2795 */ 2796 if (spa_version(spa) < version) 2797 return (ENOTSUP); 2798 2799 /* 2800 * Set the pending device list so we correctly handle device in-use 2801 * checking. 
2802 */ 2803 sav->sav_pending = dev; 2804 sav->sav_npending = ndev; 2805 2806 for (i = 0; i < ndev; i++) { 2807 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2808 mode)) != 0) 2809 goto out; 2810 2811 if (!vd->vdev_ops->vdev_op_leaf) { 2812 vdev_free(vd); 2813 error = EINVAL; 2814 goto out; 2815 } 2816 2817 /* 2818 * The L2ARC currently only supports disk devices in 2819 * kernel context. For user-level testing, we allow it. 2820 */ 2821#ifdef _KERNEL 2822 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2823 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2824 error = ENOTBLK;
|
| 2825 vdev_free(vd);
|
2821 goto out; 2822 } 2823#endif 2824 vd->vdev_top = vd; 2825 2826 if ((error = vdev_open(vd)) == 0 && 2827 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2828 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2829 vd->vdev_guid) == 0); 2830 } 2831 2832 vdev_free(vd); 2833 2834 if (error && 2835 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2836 goto out; 2837 else 2838 error = 0; 2839 } 2840 2841out: 2842 sav->sav_pending = NULL; 2843 sav->sav_npending = 0; 2844 return (error); 2845} 2846 2847static int 2848spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2849{ 2850 int error; 2851 2852 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2853 2854 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2855 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2856 VDEV_LABEL_SPARE)) != 0) { 2857 return (error); 2858 } 2859 2860 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2861 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2862 VDEV_LABEL_L2CACHE)); 2863} 2864 2865static void 2866spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2867 const char *config) 2868{ 2869 int i; 2870 2871 if (sav->sav_config != NULL) { 2872 nvlist_t **olddevs; 2873 uint_t oldndevs; 2874 nvlist_t **newdevs; 2875 2876 /* 2877 * Generate new dev list by concatentating with the 2878 * current dev list. 
2879 */ 2880 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2881 &olddevs, &oldndevs) == 0); 2882 2883 newdevs = kmem_alloc(sizeof (void *) * 2884 (ndevs + oldndevs), KM_SLEEP); 2885 for (i = 0; i < oldndevs; i++) 2886 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2887 KM_SLEEP) == 0); 2888 for (i = 0; i < ndevs; i++) 2889 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2890 KM_SLEEP) == 0); 2891 2892 VERIFY(nvlist_remove(sav->sav_config, config, 2893 DATA_TYPE_NVLIST_ARRAY) == 0); 2894 2895 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2896 config, newdevs, ndevs + oldndevs) == 0); 2897 for (i = 0; i < oldndevs + ndevs; i++) 2898 nvlist_free(newdevs[i]); 2899 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2900 } else { 2901 /* 2902 * Generate a new dev list. 2903 */ 2904 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2905 KM_SLEEP) == 0); 2906 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2907 devs, ndevs) == 0); 2908 } 2909} 2910 2911/* 2912 * Stop and drop level 2 ARC devices 2913 */ 2914void 2915spa_l2cache_drop(spa_t *spa) 2916{ 2917 vdev_t *vd; 2918 int i; 2919 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2920 2921 for (i = 0; i < sav->sav_count; i++) { 2922 uint64_t pool; 2923 2924 vd = sav->sav_vdevs[i]; 2925 ASSERT(vd != NULL); 2926 2927 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2928 pool != 0ULL && l2arc_vdev_present(vd)) 2929 l2arc_remove_vdev(vd);
| 2826 goto out; 2827 } 2828#endif 2829 vd->vdev_top = vd; 2830 2831 if ((error = vdev_open(vd)) == 0 && 2832 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2833 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2834 vd->vdev_guid) == 0); 2835 } 2836 2837 vdev_free(vd); 2838 2839 if (error && 2840 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2841 goto out; 2842 else 2843 error = 0; 2844 } 2845 2846out: 2847 sav->sav_pending = NULL; 2848 sav->sav_npending = 0; 2849 return (error); 2850} 2851 2852static int 2853spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2854{ 2855 int error; 2856 2857 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2858 2859 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2860 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2861 VDEV_LABEL_SPARE)) != 0) { 2862 return (error); 2863 } 2864 2865 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2866 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2867 VDEV_LABEL_L2CACHE)); 2868} 2869 2870static void 2871spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2872 const char *config) 2873{ 2874 int i; 2875 2876 if (sav->sav_config != NULL) { 2877 nvlist_t **olddevs; 2878 uint_t oldndevs; 2879 nvlist_t **newdevs; 2880 2881 /* 2882 * Generate new dev list by concatentating with the 2883 * current dev list. 
2884 */ 2885 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2886 &olddevs, &oldndevs) == 0); 2887 2888 newdevs = kmem_alloc(sizeof (void *) * 2889 (ndevs + oldndevs), KM_SLEEP); 2890 for (i = 0; i < oldndevs; i++) 2891 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2892 KM_SLEEP) == 0); 2893 for (i = 0; i < ndevs; i++) 2894 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2895 KM_SLEEP) == 0); 2896 2897 VERIFY(nvlist_remove(sav->sav_config, config, 2898 DATA_TYPE_NVLIST_ARRAY) == 0); 2899 2900 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2901 config, newdevs, ndevs + oldndevs) == 0); 2902 for (i = 0; i < oldndevs + ndevs; i++) 2903 nvlist_free(newdevs[i]); 2904 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2905 } else { 2906 /* 2907 * Generate a new dev list. 2908 */ 2909 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2910 KM_SLEEP) == 0); 2911 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2912 devs, ndevs) == 0); 2913 } 2914} 2915 2916/* 2917 * Stop and drop level 2 ARC devices 2918 */ 2919void 2920spa_l2cache_drop(spa_t *spa) 2921{ 2922 vdev_t *vd; 2923 int i; 2924 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2925 2926 for (i = 0; i < sav->sav_count; i++) { 2927 uint64_t pool; 2928 2929 vd = sav->sav_vdevs[i]; 2930 ASSERT(vd != NULL); 2931 2932 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2933 pool != 0ULL && l2arc_vdev_present(vd)) 2934 l2arc_remove_vdev(vd);
|
2930 if (vd->vdev_isl2cache) 2931 spa_l2cache_remove(vd); 2932 vdev_clear_stats(vd); 2933 (void) vdev_close(vd);
| |
2934 } 2935} 2936 2937/* 2938 * Pool Creation 2939 */ 2940int 2941spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2942 const char *history_str, nvlist_t *zplprops) 2943{ 2944 spa_t *spa; 2945 char *altroot = NULL; 2946 vdev_t *rvd; 2947 dsl_pool_t *dp; 2948 dmu_tx_t *tx; 2949 int error = 0; 2950 uint64_t txg = TXG_INITIAL; 2951 nvlist_t **spares, **l2cache; 2952 uint_t nspares, nl2cache; 2953 uint64_t version, obj; 2954 2955 /* 2956 * If this pool already exists, return failure. 2957 */ 2958 mutex_enter(&spa_namespace_lock); 2959 if (spa_lookup(pool) != NULL) { 2960 mutex_exit(&spa_namespace_lock); 2961 return (EEXIST); 2962 } 2963 2964 /* 2965 * Allocate a new spa_t structure. 2966 */ 2967 (void) nvlist_lookup_string(props, 2968 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2969 spa = spa_add(pool, NULL, altroot); 2970 spa_activate(spa, spa_mode_global); 2971 2972 if (props && (error = spa_prop_validate(spa, props))) { 2973 spa_deactivate(spa); 2974 spa_remove(spa); 2975 mutex_exit(&spa_namespace_lock); 2976 return (error); 2977 } 2978 2979 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2980 &version) != 0) 2981 version = SPA_VERSION; 2982 ASSERT(version <= SPA_VERSION); 2983 2984 spa->spa_first_txg = txg; 2985 spa->spa_uberblock.ub_txg = txg - 1; 2986 spa->spa_uberblock.ub_version = version; 2987 spa->spa_ubsync = spa->spa_uberblock; 2988 2989 /* 2990 * Create "The Godfather" zio to hold all async IOs 2991 */ 2992 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2993 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2994 2995 /* 2996 * Create the root vdev. 
2997 */ 2998 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2999 3000 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3001 3002 ASSERT(error != 0 || rvd != NULL); 3003 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3004 3005 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3006 error = EINVAL; 3007 3008 if (error == 0 && 3009 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3010 (error = spa_validate_aux(spa, nvroot, txg, 3011 VDEV_ALLOC_ADD)) == 0) { 3012 for (int c = 0; c < rvd->vdev_children; c++) { 3013 vdev_metaslab_set_size(rvd->vdev_child[c]); 3014 vdev_expand(rvd->vdev_child[c], txg); 3015 } 3016 } 3017 3018 spa_config_exit(spa, SCL_ALL, FTAG); 3019 3020 if (error != 0) { 3021 spa_unload(spa); 3022 spa_deactivate(spa); 3023 spa_remove(spa); 3024 mutex_exit(&spa_namespace_lock); 3025 return (error); 3026 } 3027 3028 /* 3029 * Get the list of spares, if specified. 3030 */ 3031 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3032 &spares, &nspares) == 0) { 3033 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3034 KM_SLEEP) == 0); 3035 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3036 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3037 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3038 spa_load_spares(spa); 3039 spa_config_exit(spa, SCL_ALL, FTAG); 3040 spa->spa_spares.sav_sync = B_TRUE; 3041 } 3042 3043 /* 3044 * Get the list of level 2 cache devices, if specified. 
3045 */ 3046 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3047 &l2cache, &nl2cache) == 0) { 3048 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3049 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3050 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3051 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3052 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3053 spa_load_l2cache(spa); 3054 spa_config_exit(spa, SCL_ALL, FTAG); 3055 spa->spa_l2cache.sav_sync = B_TRUE; 3056 } 3057 3058 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3059 spa->spa_meta_objset = dp->dp_meta_objset; 3060 3061 /* 3062 * Create DDTs (dedup tables). 3063 */ 3064 ddt_create(spa); 3065 3066 spa_update_dspace(spa); 3067 3068 tx = dmu_tx_create_assigned(dp, txg); 3069 3070 /* 3071 * Create the pool config object. 3072 */ 3073 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3074 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3075 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3076 3077 if (zap_add(spa->spa_meta_objset, 3078 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3079 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3080 cmn_err(CE_PANIC, "failed to add pool config"); 3081 } 3082 3083 if (zap_add(spa->spa_meta_objset, 3084 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3085 sizeof (uint64_t), 1, &version, tx) != 0) { 3086 cmn_err(CE_PANIC, "failed to add pool version"); 3087 } 3088 3089 /* Newly created pools with the right version are always deflated. */ 3090 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3091 spa->spa_deflate = TRUE; 3092 if (zap_add(spa->spa_meta_objset, 3093 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3094 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3095 cmn_err(CE_PANIC, "failed to add deflate"); 3096 } 3097 } 3098 3099 /* 3100 * Create the deferred-free bpobj. Turn off compression 3101 * because sync-to-convergence takes longer if the blocksize 3102 * keeps changing. 
3103 */ 3104 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3105 dmu_object_set_compress(spa->spa_meta_objset, obj, 3106 ZIO_COMPRESS_OFF, tx); 3107 if (zap_add(spa->spa_meta_objset, 3108 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3109 sizeof (uint64_t), 1, &obj, tx) != 0) { 3110 cmn_err(CE_PANIC, "failed to add bpobj"); 3111 } 3112 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3113 spa->spa_meta_objset, obj)); 3114 3115 /* 3116 * Create the pool's history object. 3117 */ 3118 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3119 spa_history_create_obj(spa, tx); 3120 3121 /* 3122 * Set pool properties. 3123 */ 3124 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3125 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3126 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3127 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3128 3129 if (props != NULL) { 3130 spa_configfile_set(spa, props, B_FALSE); 3131 spa_sync_props(spa, props, tx); 3132 } 3133 3134 dmu_tx_commit(tx); 3135 3136 spa->spa_sync_on = B_TRUE; 3137 txg_sync_start(spa->spa_dsl_pool); 3138 3139 /* 3140 * We explicitly wait for the first transaction to complete so that our 3141 * bean counters are appropriately updated. 3142 */ 3143 txg_wait_synced(spa->spa_dsl_pool, txg); 3144 3145 spa_config_sync(spa, B_FALSE, B_TRUE); 3146 3147 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3148 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3149 spa_history_log_version(spa, LOG_POOL_CREATE); 3150 3151 spa->spa_minref = refcount_count(&spa->spa_refcount); 3152 3153 mutex_exit(&spa_namespace_lock); 3154 3155 return (0); 3156} 3157 3158#if defined(sun) 3159#ifdef _KERNEL 3160/* 3161 * Get the root pool information from the root disk, then import the root pool 3162 * during the system boot up time. 
3163 */ 3164extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3165 3166static nvlist_t * 3167spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3168{ 3169 nvlist_t *config; 3170 nvlist_t *nvtop, *nvroot; 3171 uint64_t pgid; 3172 3173 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3174 return (NULL); 3175 3176 /* 3177 * Add this top-level vdev to the child array. 3178 */ 3179 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3180 &nvtop) == 0); 3181 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3182 &pgid) == 0); 3183 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3184 3185 /* 3186 * Put this pool's top-level vdevs into a root vdev. 3187 */ 3188 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3189 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3190 VDEV_TYPE_ROOT) == 0); 3191 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3192 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3193 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3194 &nvtop, 1) == 0); 3195 3196 /* 3197 * Replace the existing vdev_tree with the new root vdev in 3198 * this pool's configuration (remove the old, add the new). 3199 */ 3200 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3201 nvlist_free(nvroot); 3202 return (config); 3203} 3204 3205/* 3206 * Walk the vdev tree and see if we can find a device with "better" 3207 * configuration. A configuration is "better" if the label on that 3208 * device has a more recent txg. 
3209 */ 3210static void 3211spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3212{ 3213 for (int c = 0; c < vd->vdev_children; c++) 3214 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3215 3216 if (vd->vdev_ops->vdev_op_leaf) { 3217 nvlist_t *label; 3218 uint64_t label_txg; 3219 3220 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3221 &label) != 0) 3222 return; 3223 3224 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3225 &label_txg) == 0); 3226 3227 /* 3228 * Do we have a better boot device? 3229 */ 3230 if (label_txg > *txg) { 3231 *txg = label_txg; 3232 *avd = vd; 3233 } 3234 nvlist_free(label); 3235 } 3236} 3237 3238/* 3239 * Import a root pool. 3240 * 3241 * For x86. devpath_list will consist of devid and/or physpath name of 3242 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3243 * The GRUB "findroot" command will return the vdev we should boot. 3244 * 3245 * For Sparc, devpath_list consists the physpath name of the booting device 3246 * no matter the rootpool is a single device pool or a mirrored pool. 3247 * e.g. 3248 * "/pci@1f,0/ide@d/disk@0,0:a" 3249 */ 3250int 3251spa_import_rootpool(char *devpath, char *devid) 3252{ 3253 spa_t *spa; 3254 vdev_t *rvd, *bvd, *avd = NULL; 3255 nvlist_t *config, *nvtop; 3256 uint64_t guid, txg; 3257 char *pname; 3258 int error; 3259 3260 /* 3261 * Read the label from the boot device and generate a configuration. 
3262 */ 3263 config = spa_generate_rootconf(devpath, devid, &guid); 3264#if defined(_OBP) && defined(_KERNEL) 3265 if (config == NULL) { 3266 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3267 /* iscsi boot */ 3268 get_iscsi_bootpath_phy(devpath); 3269 config = spa_generate_rootconf(devpath, devid, &guid); 3270 } 3271 } 3272#endif 3273 if (config == NULL) { 3274 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 3275 devpath); 3276 return (EIO); 3277 } 3278 3279 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3280 &pname) == 0); 3281 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3282 3283 mutex_enter(&spa_namespace_lock); 3284 if ((spa = spa_lookup(pname)) != NULL) { 3285 /* 3286 * Remove the existing root pool from the namespace so that we 3287 * can replace it with the correct config we just read in. 3288 */ 3289 spa_remove(spa); 3290 } 3291 3292 spa = spa_add(pname, config, NULL); 3293 spa->spa_is_root = B_TRUE; 3294 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3295 3296 /* 3297 * Build up a vdev tree based on the boot device's label config. 3298 */ 3299 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3300 &nvtop) == 0); 3301 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3302 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3303 VDEV_ALLOC_ROOTPOOL); 3304 spa_config_exit(spa, SCL_ALL, FTAG); 3305 if (error) { 3306 mutex_exit(&spa_namespace_lock); 3307 nvlist_free(config); 3308 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3309 pname); 3310 return (error); 3311 } 3312 3313 /* 3314 * Get the boot vdev. 3315 */ 3316 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3317 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3318 (u_longlong_t)guid); 3319 error = ENOENT; 3320 goto out; 3321 } 3322 3323 /* 3324 * Determine if there is a better boot device. 
3325 */ 3326 avd = bvd; 3327 spa_alt_rootvdev(rvd, &avd, &txg); 3328 if (avd != bvd) { 3329 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3330 "try booting from '%s'", avd->vdev_path); 3331 error = EINVAL; 3332 goto out; 3333 } 3334 3335 /* 3336 * If the boot device is part of a spare vdev then ensure that 3337 * we're booting off the active spare. 3338 */ 3339 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3340 !bvd->vdev_isspare) { 3341 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3342 "try booting from '%s'", 3343 bvd->vdev_parent-> 3344 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3345 error = EINVAL; 3346 goto out; 3347 } 3348 3349 error = 0; 3350 spa_history_log_version(spa, LOG_POOL_IMPORT); 3351out: 3352 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3353 vdev_free(rvd); 3354 spa_config_exit(spa, SCL_ALL, FTAG); 3355 mutex_exit(&spa_namespace_lock); 3356 3357 nvlist_free(config); 3358 return (error); 3359} 3360 3361#endif 3362#endif /* sun */ 3363 3364/* 3365 * Import a non-root pool into the system. 3366 */ 3367int 3368spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3369{ 3370 spa_t *spa; 3371 char *altroot = NULL; 3372 spa_load_state_t state = SPA_LOAD_IMPORT; 3373 zpool_rewind_policy_t policy; 3374 uint64_t mode = spa_mode_global; 3375 uint64_t readonly = B_FALSE; 3376 int error; 3377 nvlist_t *nvroot; 3378 nvlist_t **spares, **l2cache; 3379 uint_t nspares, nl2cache; 3380 3381 /* 3382 * If a pool with this name exists, return failure. 3383 */ 3384 mutex_enter(&spa_namespace_lock); 3385 if (spa_lookup(pool) != NULL) { 3386 mutex_exit(&spa_namespace_lock); 3387 return (EEXIST); 3388 } 3389 3390 /* 3391 * Create and initialize the spa structure. 
3392 */ 3393 (void) nvlist_lookup_string(props, 3394 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3395 (void) nvlist_lookup_uint64(props, 3396 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3397 if (readonly) 3398 mode = FREAD; 3399 spa = spa_add(pool, config, altroot); 3400 spa->spa_import_flags = flags; 3401 3402 /* 3403 * Verbatim import - Take a pool and insert it into the namespace 3404 * as if it had been loaded at boot. 3405 */ 3406 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3407 if (props != NULL) 3408 spa_configfile_set(spa, props, B_FALSE); 3409 3410 spa_config_sync(spa, B_FALSE, B_TRUE); 3411 3412 mutex_exit(&spa_namespace_lock); 3413 spa_history_log_version(spa, LOG_POOL_IMPORT); 3414 3415 return (0); 3416 } 3417 3418 spa_activate(spa, mode); 3419 3420 /* 3421 * Don't start async tasks until we know everything is healthy. 3422 */ 3423 spa_async_suspend(spa); 3424 3425 zpool_get_rewind_policy(config, &policy); 3426 if (policy.zrp_request & ZPOOL_DO_REWIND) 3427 state = SPA_LOAD_RECOVER; 3428 3429 /* 3430 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3431 * because the user-supplied config is actually the one to trust when 3432 * doing an import. 3433 */ 3434 if (state != SPA_LOAD_RECOVER) 3435 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3436 3437 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3438 policy.zrp_request); 3439 3440 /* 3441 * Propagate anything learned while loading the pool and pass it 3442 * back to caller (i.e. rewind info, missing devices, etc). 3443 */ 3444 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3445 spa->spa_load_info) == 0); 3446 3447 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3448 /* 3449 * Toss any existing sparelist, as it doesn't have any validity 3450 * anymore, and conflicts with spa_has_spare(). 
3451 */ 3452 if (spa->spa_spares.sav_config) { 3453 nvlist_free(spa->spa_spares.sav_config); 3454 spa->spa_spares.sav_config = NULL; 3455 spa_load_spares(spa); 3456 } 3457 if (spa->spa_l2cache.sav_config) { 3458 nvlist_free(spa->spa_l2cache.sav_config); 3459 spa->spa_l2cache.sav_config = NULL; 3460 spa_load_l2cache(spa); 3461 } 3462 3463 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3464 &nvroot) == 0); 3465 if (error == 0) 3466 error = spa_validate_aux(spa, nvroot, -1ULL, 3467 VDEV_ALLOC_SPARE); 3468 if (error == 0) 3469 error = spa_validate_aux(spa, nvroot, -1ULL, 3470 VDEV_ALLOC_L2CACHE); 3471 spa_config_exit(spa, SCL_ALL, FTAG); 3472 3473 if (props != NULL) 3474 spa_configfile_set(spa, props, B_FALSE); 3475 3476 if (error != 0 || (props && spa_writeable(spa) && 3477 (error = spa_prop_set(spa, props)))) { 3478 spa_unload(spa); 3479 spa_deactivate(spa); 3480 spa_remove(spa); 3481 mutex_exit(&spa_namespace_lock); 3482 return (error); 3483 } 3484 3485 spa_async_resume(spa); 3486 3487 /* 3488 * Override any spares and level 2 cache devices as specified by 3489 * the user, as these may have correct device names/devids, etc. 
3490 */ 3491 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3492 &spares, &nspares) == 0) { 3493 if (spa->spa_spares.sav_config) 3494 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3495 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3496 else 3497 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3498 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3499 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3500 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3501 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3502 spa_load_spares(spa); 3503 spa_config_exit(spa, SCL_ALL, FTAG); 3504 spa->spa_spares.sav_sync = B_TRUE; 3505 } 3506 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3507 &l2cache, &nl2cache) == 0) { 3508 if (spa->spa_l2cache.sav_config) 3509 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3510 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3511 else 3512 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3513 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3514 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3515 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3516 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3517 spa_load_l2cache(spa); 3518 spa_config_exit(spa, SCL_ALL, FTAG); 3519 spa->spa_l2cache.sav_sync = B_TRUE; 3520 } 3521 3522 /* 3523 * Check for any removed devices. 3524 */ 3525 if (spa->spa_autoreplace) { 3526 spa_aux_check_removed(&spa->spa_spares); 3527 spa_aux_check_removed(&spa->spa_l2cache); 3528 } 3529 3530 if (spa_writeable(spa)) { 3531 /* 3532 * Update the config cache to include the newly-imported pool. 3533 */ 3534 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3535 } 3536 3537 /* 3538 * It's possible that the pool was expanded while it was exported. 3539 * We kick off an async task to handle this for us. 
3540 */ 3541 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3542 3543 mutex_exit(&spa_namespace_lock); 3544 spa_history_log_version(spa, LOG_POOL_IMPORT); 3545 3546#ifdef __FreeBSD__ 3547#ifdef _KERNEL 3548 zvol_create_minors(pool); 3549#endif 3550#endif 3551 return (0); 3552} 3553 3554nvlist_t * 3555spa_tryimport(nvlist_t *tryconfig) 3556{ 3557 nvlist_t *config = NULL; 3558 char *poolname; 3559 spa_t *spa; 3560 uint64_t state; 3561 int error; 3562 3563 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3564 return (NULL); 3565 3566 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3567 return (NULL); 3568 3569 /* 3570 * Create and initialize the spa structure. 3571 */ 3572 mutex_enter(&spa_namespace_lock); 3573 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3574 spa_activate(spa, FREAD); 3575 3576 /* 3577 * Pass off the heavy lifting to spa_load(). 3578 * Pass TRUE for mosconfig because the user-supplied config 3579 * is actually the one to trust when doing an import. 3580 */ 3581 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3582 3583 /* 3584 * If 'tryconfig' was at least parsable, return the current config. 3585 */ 3586 if (spa->spa_root_vdev != NULL) { 3587 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3588 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3589 poolname) == 0); 3590 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3591 state) == 0); 3592 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3593 spa->spa_uberblock.ub_timestamp) == 0); 3594 3595 /* 3596 * If the bootfs property exists on this pool then we 3597 * copy it out so that external consumers can tell which 3598 * pools are bootable. 3599 */ 3600 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3601 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3602 3603 /* 3604 * We have to play games with the name since the 3605 * pool was opened as TRYIMPORT_NAME. 
3606 */ 3607 if (dsl_dsobj_to_dsname(spa_name(spa), 3608 spa->spa_bootfs, tmpname) == 0) { 3609 char *cp; 3610 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3611 3612 cp = strchr(tmpname, '/'); 3613 if (cp == NULL) { 3614 (void) strlcpy(dsname, tmpname, 3615 MAXPATHLEN); 3616 } else { 3617 (void) snprintf(dsname, MAXPATHLEN, 3618 "%s/%s", poolname, ++cp); 3619 } 3620 VERIFY(nvlist_add_string(config, 3621 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3622 kmem_free(dsname, MAXPATHLEN); 3623 } 3624 kmem_free(tmpname, MAXPATHLEN); 3625 } 3626 3627 /* 3628 * Add the list of hot spares and level 2 cache devices. 3629 */ 3630 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3631 spa_add_spares(spa, config); 3632 spa_add_l2cache(spa, config); 3633 spa_config_exit(spa, SCL_CONFIG, FTAG); 3634 } 3635 3636 spa_unload(spa); 3637 spa_deactivate(spa); 3638 spa_remove(spa); 3639 mutex_exit(&spa_namespace_lock); 3640 3641 return (config); 3642} 3643 3644/* 3645 * Pool export/destroy 3646 * 3647 * The act of destroying or exporting a pool is very simple. We make sure there 3648 * is no more pending I/O and any references to the pool are gone. Then, we 3649 * update the pool state and sync all the labels to disk, removing the 3650 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3651 * we don't sync the labels or remove the configuration cache. 3652 */ 3653static int 3654spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3655 boolean_t force, boolean_t hardforce) 3656{ 3657 spa_t *spa; 3658 3659 if (oldconfig) 3660 *oldconfig = NULL; 3661 3662 if (!(spa_mode_global & FWRITE)) 3663 return (EROFS); 3664 3665 mutex_enter(&spa_namespace_lock); 3666 if ((spa = spa_lookup(pool)) == NULL) { 3667 mutex_exit(&spa_namespace_lock); 3668 return (ENOENT); 3669 } 3670 3671 /* 3672 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3673 * reacquire the namespace lock, and see if we can export. 
3674 */ 3675 spa_open_ref(spa, FTAG); 3676 mutex_exit(&spa_namespace_lock); 3677 spa_async_suspend(spa); 3678 mutex_enter(&spa_namespace_lock); 3679 spa_close(spa, FTAG); 3680 3681 /* 3682 * The pool will be in core if it's openable, 3683 * in which case we can modify its state. 3684 */ 3685 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3686 /* 3687 * Objsets may be open only because they're dirty, so we 3688 * have to force it to sync before checking spa_refcnt. 3689 */ 3690 txg_wait_synced(spa->spa_dsl_pool, 0); 3691 3692 /* 3693 * A pool cannot be exported or destroyed if there are active 3694 * references. If we are resetting a pool, allow references by 3695 * fault injection handlers. 3696 */ 3697 if (!spa_refcount_zero(spa) || 3698 (spa->spa_inject_ref != 0 && 3699 new_state != POOL_STATE_UNINITIALIZED)) { 3700 spa_async_resume(spa); 3701 mutex_exit(&spa_namespace_lock); 3702 return (EBUSY); 3703 } 3704 3705 /* 3706 * A pool cannot be exported if it has an active shared spare. 3707 * This is to prevent other pools stealing the active spare 3708 * from an exported pool. At user's own will, such pool can 3709 * be forcedly exported. 3710 */ 3711 if (!force && new_state == POOL_STATE_EXPORTED && 3712 spa_has_active_shared_spare(spa)) { 3713 spa_async_resume(spa); 3714 mutex_exit(&spa_namespace_lock); 3715 return (EXDEV); 3716 } 3717 3718 /* 3719 * We want this to be reflected on every label, 3720 * so mark them all dirty. spa_unload() will do the 3721 * final sync that pushes these changes out. 
3722 */ 3723 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3724 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3725 spa->spa_state = new_state; 3726 spa->spa_final_txg = spa_last_synced_txg(spa) + 3727 TXG_DEFER_SIZE + 1; 3728 vdev_config_dirty(spa->spa_root_vdev); 3729 spa_config_exit(spa, SCL_ALL, FTAG); 3730 } 3731 } 3732 3733 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3734 3735 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3736 spa_unload(spa); 3737 spa_deactivate(spa); 3738 } 3739 3740 if (oldconfig && spa->spa_config) 3741 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3742 3743 if (new_state != POOL_STATE_UNINITIALIZED) { 3744 if (!hardforce) 3745 spa_config_sync(spa, B_TRUE, B_TRUE); 3746 spa_remove(spa); 3747 } 3748 mutex_exit(&spa_namespace_lock); 3749 3750 return (0); 3751} 3752 3753/* 3754 * Destroy a storage pool. 3755 */ 3756int 3757spa_destroy(char *pool) 3758{ 3759 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3760 B_FALSE, B_FALSE)); 3761} 3762 3763/* 3764 * Export a storage pool. 3765 */ 3766int 3767spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3768 boolean_t hardforce) 3769{ 3770 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3771 force, hardforce)); 3772} 3773 3774/* 3775 * Similar to spa_export(), this unloads the spa_t without actually removing it 3776 * from the namespace in any way. 3777 */ 3778int 3779spa_reset(char *pool) 3780{ 3781 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3782 B_FALSE, B_FALSE)); 3783} 3784 3785/* 3786 * ========================================================================== 3787 * Device manipulation 3788 * ========================================================================== 3789 */ 3790 3791/* 3792 * Add a device to a storage pool. 
3793 */ 3794int 3795spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3796{ 3797 uint64_t txg, id; 3798 int error; 3799 vdev_t *rvd = spa->spa_root_vdev; 3800 vdev_t *vd, *tvd; 3801 nvlist_t **spares, **l2cache; 3802 uint_t nspares, nl2cache; 3803 3804 ASSERT(spa_writeable(spa)); 3805 3806 txg = spa_vdev_enter(spa); 3807 3808 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3809 VDEV_ALLOC_ADD)) != 0) 3810 return (spa_vdev_exit(spa, NULL, txg, error)); 3811 3812 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3813 3814 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3815 &nspares) != 0) 3816 nspares = 0; 3817 3818 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3819 &nl2cache) != 0) 3820 nl2cache = 0; 3821 3822 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3823 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3824 3825 if (vd->vdev_children != 0 && 3826 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3827 return (spa_vdev_exit(spa, vd, txg, error)); 3828 3829 /* 3830 * We must validate the spares and l2cache devices after checking the 3831 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3832 */ 3833 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3834 return (spa_vdev_exit(spa, vd, txg, error)); 3835 3836 /* 3837 * Transfer each new top-level vdev from vd to rvd. 3838 */ 3839 for (int c = 0; c < vd->vdev_children; c++) { 3840 3841 /* 3842 * Set the vdev id to the first hole, if one exists. 
3843 */ 3844 for (id = 0; id < rvd->vdev_children; id++) { 3845 if (rvd->vdev_child[id]->vdev_ishole) { 3846 vdev_free(rvd->vdev_child[id]); 3847 break; 3848 } 3849 } 3850 tvd = vd->vdev_child[c]; 3851 vdev_remove_child(vd, tvd); 3852 tvd->vdev_id = id; 3853 vdev_add_child(rvd, tvd); 3854 vdev_config_dirty(tvd); 3855 } 3856 3857 if (nspares != 0) { 3858 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3859 ZPOOL_CONFIG_SPARES); 3860 spa_load_spares(spa); 3861 spa->spa_spares.sav_sync = B_TRUE; 3862 } 3863 3864 if (nl2cache != 0) { 3865 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3866 ZPOOL_CONFIG_L2CACHE); 3867 spa_load_l2cache(spa); 3868 spa->spa_l2cache.sav_sync = B_TRUE; 3869 } 3870 3871 /* 3872 * We have to be careful when adding new vdevs to an existing pool. 3873 * If other threads start allocating from these vdevs before we 3874 * sync the config cache, and we lose power, then upon reboot we may 3875 * fail to open the pool because there are DVAs that the config cache 3876 * can't translate. Therefore, we first add the vdevs without 3877 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3878 * and then let spa_config_update() initialize the new metaslabs. 3879 * 3880 * spa_load() checks for added-but-not-initialized vdevs, so that 3881 * if we lose power at any point in this sequence, the remaining 3882 * steps will be completed the next time we load the pool. 3883 */ 3884 (void) spa_vdev_exit(spa, vd, txg, 0); 3885 3886 mutex_enter(&spa_namespace_lock); 3887 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3888 mutex_exit(&spa_namespace_lock); 3889 3890 return (0); 3891} 3892 3893/* 3894 * Attach a device to a mirror. The arguments are the path to any device 3895 * in the mirror, and the nvroot for the new device. If the path specifies 3896 * a device that is not mirrored, we automatically insert the mirror vdev. 
3897 * 3898 * If 'replacing' is specified, the new device is intended to replace the 3899 * existing device; in this case the two devices are made into their own 3900 * mirror using the 'replacing' vdev, which is functionally identical to 3901 * the mirror vdev (it actually reuses all the same ops) but has a few 3902 * extra rules: you can't attach to it after it's been created, and upon 3903 * completion of resilvering, the first disk (the one being replaced) 3904 * is automatically detached. 3905 */ 3906int 3907spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3908{ 3909 uint64_t txg, dtl_max_txg; 3910 vdev_t *rvd = spa->spa_root_vdev; 3911 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3912 vdev_ops_t *pvops; 3913 char *oldvdpath, *newvdpath; 3914 int newvd_isspare; 3915 int error; 3916 3917 ASSERT(spa_writeable(spa)); 3918 3919 txg = spa_vdev_enter(spa); 3920 3921 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3922 3923 if (oldvd == NULL) 3924 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3925 3926 if (!oldvd->vdev_ops->vdev_op_leaf) 3927 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3928 3929 pvd = oldvd->vdev_parent; 3930 3931 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
| 2935 } 2936} 2937 2938/* 2939 * Pool Creation 2940 */ 2941int 2942spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2943 const char *history_str, nvlist_t *zplprops) 2944{ 2945 spa_t *spa; 2946 char *altroot = NULL; 2947 vdev_t *rvd; 2948 dsl_pool_t *dp; 2949 dmu_tx_t *tx; 2950 int error = 0; 2951 uint64_t txg = TXG_INITIAL; 2952 nvlist_t **spares, **l2cache; 2953 uint_t nspares, nl2cache; 2954 uint64_t version, obj; 2955 2956 /* 2957 * If this pool already exists, return failure. 2958 */ 2959 mutex_enter(&spa_namespace_lock); 2960 if (spa_lookup(pool) != NULL) { 2961 mutex_exit(&spa_namespace_lock); 2962 return (EEXIST); 2963 } 2964 2965 /* 2966 * Allocate a new spa_t structure. 2967 */ 2968 (void) nvlist_lookup_string(props, 2969 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2970 spa = spa_add(pool, NULL, altroot); 2971 spa_activate(spa, spa_mode_global); 2972 2973 if (props && (error = spa_prop_validate(spa, props))) { 2974 spa_deactivate(spa); 2975 spa_remove(spa); 2976 mutex_exit(&spa_namespace_lock); 2977 return (error); 2978 } 2979 2980 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2981 &version) != 0) 2982 version = SPA_VERSION; 2983 ASSERT(version <= SPA_VERSION); 2984 2985 spa->spa_first_txg = txg; 2986 spa->spa_uberblock.ub_txg = txg - 1; 2987 spa->spa_uberblock.ub_version = version; 2988 spa->spa_ubsync = spa->spa_uberblock; 2989 2990 /* 2991 * Create "The Godfather" zio to hold all async IOs 2992 */ 2993 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2994 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2995 2996 /* 2997 * Create the root vdev. 
2998 */ 2999 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3000 3001 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3002 3003 ASSERT(error != 0 || rvd != NULL); 3004 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3005 3006 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3007 error = EINVAL; 3008 3009 if (error == 0 && 3010 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3011 (error = spa_validate_aux(spa, nvroot, txg, 3012 VDEV_ALLOC_ADD)) == 0) { 3013 for (int c = 0; c < rvd->vdev_children; c++) { 3014 vdev_metaslab_set_size(rvd->vdev_child[c]); 3015 vdev_expand(rvd->vdev_child[c], txg); 3016 } 3017 } 3018 3019 spa_config_exit(spa, SCL_ALL, FTAG); 3020 3021 if (error != 0) { 3022 spa_unload(spa); 3023 spa_deactivate(spa); 3024 spa_remove(spa); 3025 mutex_exit(&spa_namespace_lock); 3026 return (error); 3027 } 3028 3029 /* 3030 * Get the list of spares, if specified. 3031 */ 3032 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3033 &spares, &nspares) == 0) { 3034 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3035 KM_SLEEP) == 0); 3036 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3037 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3038 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3039 spa_load_spares(spa); 3040 spa_config_exit(spa, SCL_ALL, FTAG); 3041 spa->spa_spares.sav_sync = B_TRUE; 3042 } 3043 3044 /* 3045 * Get the list of level 2 cache devices, if specified. 
3046 */ 3047 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3048 &l2cache, &nl2cache) == 0) { 3049 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3050 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3051 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3052 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3053 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3054 spa_load_l2cache(spa); 3055 spa_config_exit(spa, SCL_ALL, FTAG); 3056 spa->spa_l2cache.sav_sync = B_TRUE; 3057 } 3058 3059 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3060 spa->spa_meta_objset = dp->dp_meta_objset; 3061 3062 /* 3063 * Create DDTs (dedup tables). 3064 */ 3065 ddt_create(spa); 3066 3067 spa_update_dspace(spa); 3068 3069 tx = dmu_tx_create_assigned(dp, txg); 3070 3071 /* 3072 * Create the pool config object. 3073 */ 3074 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3075 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3076 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3077 3078 if (zap_add(spa->spa_meta_objset, 3079 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3080 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3081 cmn_err(CE_PANIC, "failed to add pool config"); 3082 } 3083 3084 if (zap_add(spa->spa_meta_objset, 3085 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3086 sizeof (uint64_t), 1, &version, tx) != 0) { 3087 cmn_err(CE_PANIC, "failed to add pool version"); 3088 } 3089 3090 /* Newly created pools with the right version are always deflated. */ 3091 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3092 spa->spa_deflate = TRUE; 3093 if (zap_add(spa->spa_meta_objset, 3094 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3095 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3096 cmn_err(CE_PANIC, "failed to add deflate"); 3097 } 3098 } 3099 3100 /* 3101 * Create the deferred-free bpobj. Turn off compression 3102 * because sync-to-convergence takes longer if the blocksize 3103 * keeps changing. 
3104 */ 3105 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3106 dmu_object_set_compress(spa->spa_meta_objset, obj, 3107 ZIO_COMPRESS_OFF, tx); 3108 if (zap_add(spa->spa_meta_objset, 3109 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3110 sizeof (uint64_t), 1, &obj, tx) != 0) { 3111 cmn_err(CE_PANIC, "failed to add bpobj"); 3112 } 3113 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3114 spa->spa_meta_objset, obj)); 3115 3116 /* 3117 * Create the pool's history object. 3118 */ 3119 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3120 spa_history_create_obj(spa, tx); 3121 3122 /* 3123 * Set pool properties. 3124 */ 3125 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3126 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3127 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3128 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3129 3130 if (props != NULL) { 3131 spa_configfile_set(spa, props, B_FALSE); 3132 spa_sync_props(spa, props, tx); 3133 } 3134 3135 dmu_tx_commit(tx); 3136 3137 spa->spa_sync_on = B_TRUE; 3138 txg_sync_start(spa->spa_dsl_pool); 3139 3140 /* 3141 * We explicitly wait for the first transaction to complete so that our 3142 * bean counters are appropriately updated. 3143 */ 3144 txg_wait_synced(spa->spa_dsl_pool, txg); 3145 3146 spa_config_sync(spa, B_FALSE, B_TRUE); 3147 3148 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3149 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3150 spa_history_log_version(spa, LOG_POOL_CREATE); 3151 3152 spa->spa_minref = refcount_count(&spa->spa_refcount); 3153 3154 mutex_exit(&spa_namespace_lock); 3155 3156 return (0); 3157} 3158 3159#if defined(sun) 3160#ifdef _KERNEL 3161/* 3162 * Get the root pool information from the root disk, then import the root pool 3163 * during the system boot up time. 
3164 */ 3165extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3166 3167static nvlist_t * 3168spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3169{ 3170 nvlist_t *config; 3171 nvlist_t *nvtop, *nvroot; 3172 uint64_t pgid; 3173 3174 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3175 return (NULL); 3176 3177 /* 3178 * Add this top-level vdev to the child array. 3179 */ 3180 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3181 &nvtop) == 0); 3182 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3183 &pgid) == 0); 3184 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3185 3186 /* 3187 * Put this pool's top-level vdevs into a root vdev. 3188 */ 3189 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3190 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3191 VDEV_TYPE_ROOT) == 0); 3192 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3193 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3194 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3195 &nvtop, 1) == 0); 3196 3197 /* 3198 * Replace the existing vdev_tree with the new root vdev in 3199 * this pool's configuration (remove the old, add the new). 3200 */ 3201 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3202 nvlist_free(nvroot); 3203 return (config); 3204} 3205 3206/* 3207 * Walk the vdev tree and see if we can find a device with "better" 3208 * configuration. A configuration is "better" if the label on that 3209 * device has a more recent txg. 
3210 */ 3211static void 3212spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3213{ 3214 for (int c = 0; c < vd->vdev_children; c++) 3215 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3216 3217 if (vd->vdev_ops->vdev_op_leaf) { 3218 nvlist_t *label; 3219 uint64_t label_txg; 3220 3221 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3222 &label) != 0) 3223 return; 3224 3225 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3226 &label_txg) == 0); 3227 3228 /* 3229 * Do we have a better boot device? 3230 */ 3231 if (label_txg > *txg) { 3232 *txg = label_txg; 3233 *avd = vd; 3234 } 3235 nvlist_free(label); 3236 } 3237} 3238 3239/* 3240 * Import a root pool. 3241 * 3242 * For x86. devpath_list will consist of devid and/or physpath name of 3243 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3244 * The GRUB "findroot" command will return the vdev we should boot. 3245 * 3246 * For Sparc, devpath_list consists the physpath name of the booting device 3247 * no matter the rootpool is a single device pool or a mirrored pool. 3248 * e.g. 3249 * "/pci@1f,0/ide@d/disk@0,0:a" 3250 */ 3251int 3252spa_import_rootpool(char *devpath, char *devid) 3253{ 3254 spa_t *spa; 3255 vdev_t *rvd, *bvd, *avd = NULL; 3256 nvlist_t *config, *nvtop; 3257 uint64_t guid, txg; 3258 char *pname; 3259 int error; 3260 3261 /* 3262 * Read the label from the boot device and generate a configuration. 
3263 */ 3264 config = spa_generate_rootconf(devpath, devid, &guid); 3265#if defined(_OBP) && defined(_KERNEL) 3266 if (config == NULL) { 3267 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3268 /* iscsi boot */ 3269 get_iscsi_bootpath_phy(devpath); 3270 config = spa_generate_rootconf(devpath, devid, &guid); 3271 } 3272 } 3273#endif 3274 if (config == NULL) { 3275 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 3276 devpath); 3277 return (EIO); 3278 } 3279 3280 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3281 &pname) == 0); 3282 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3283 3284 mutex_enter(&spa_namespace_lock); 3285 if ((spa = spa_lookup(pname)) != NULL) { 3286 /* 3287 * Remove the existing root pool from the namespace so that we 3288 * can replace it with the correct config we just read in. 3289 */ 3290 spa_remove(spa); 3291 } 3292 3293 spa = spa_add(pname, config, NULL); 3294 spa->spa_is_root = B_TRUE; 3295 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3296 3297 /* 3298 * Build up a vdev tree based on the boot device's label config. 3299 */ 3300 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3301 &nvtop) == 0); 3302 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3303 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3304 VDEV_ALLOC_ROOTPOOL); 3305 spa_config_exit(spa, SCL_ALL, FTAG); 3306 if (error) { 3307 mutex_exit(&spa_namespace_lock); 3308 nvlist_free(config); 3309 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3310 pname); 3311 return (error); 3312 } 3313 3314 /* 3315 * Get the boot vdev. 3316 */ 3317 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3318 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3319 (u_longlong_t)guid); 3320 error = ENOENT; 3321 goto out; 3322 } 3323 3324 /* 3325 * Determine if there is a better boot device. 
3326 */ 3327 avd = bvd; 3328 spa_alt_rootvdev(rvd, &avd, &txg); 3329 if (avd != bvd) { 3330 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3331 "try booting from '%s'", avd->vdev_path); 3332 error = EINVAL; 3333 goto out; 3334 } 3335 3336 /* 3337 * If the boot device is part of a spare vdev then ensure that 3338 * we're booting off the active spare. 3339 */ 3340 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3341 !bvd->vdev_isspare) { 3342 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3343 "try booting from '%s'", 3344 bvd->vdev_parent-> 3345 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3346 error = EINVAL; 3347 goto out; 3348 } 3349 3350 error = 0; 3351 spa_history_log_version(spa, LOG_POOL_IMPORT); 3352out: 3353 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3354 vdev_free(rvd); 3355 spa_config_exit(spa, SCL_ALL, FTAG); 3356 mutex_exit(&spa_namespace_lock); 3357 3358 nvlist_free(config); 3359 return (error); 3360} 3361 3362#endif 3363#endif /* sun */ 3364 3365/* 3366 * Import a non-root pool into the system. 3367 */ 3368int 3369spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3370{ 3371 spa_t *spa; 3372 char *altroot = NULL; 3373 spa_load_state_t state = SPA_LOAD_IMPORT; 3374 zpool_rewind_policy_t policy; 3375 uint64_t mode = spa_mode_global; 3376 uint64_t readonly = B_FALSE; 3377 int error; 3378 nvlist_t *nvroot; 3379 nvlist_t **spares, **l2cache; 3380 uint_t nspares, nl2cache; 3381 3382 /* 3383 * If a pool with this name exists, return failure. 3384 */ 3385 mutex_enter(&spa_namespace_lock); 3386 if (spa_lookup(pool) != NULL) { 3387 mutex_exit(&spa_namespace_lock); 3388 return (EEXIST); 3389 } 3390 3391 /* 3392 * Create and initialize the spa structure. 
3393 */ 3394 (void) nvlist_lookup_string(props, 3395 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3396 (void) nvlist_lookup_uint64(props, 3397 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3398 if (readonly) 3399 mode = FREAD; 3400 spa = spa_add(pool, config, altroot); 3401 spa->spa_import_flags = flags; 3402 3403 /* 3404 * Verbatim import - Take a pool and insert it into the namespace 3405 * as if it had been loaded at boot. 3406 */ 3407 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3408 if (props != NULL) 3409 spa_configfile_set(spa, props, B_FALSE); 3410 3411 spa_config_sync(spa, B_FALSE, B_TRUE); 3412 3413 mutex_exit(&spa_namespace_lock); 3414 spa_history_log_version(spa, LOG_POOL_IMPORT); 3415 3416 return (0); 3417 } 3418 3419 spa_activate(spa, mode); 3420 3421 /* 3422 * Don't start async tasks until we know everything is healthy. 3423 */ 3424 spa_async_suspend(spa); 3425 3426 zpool_get_rewind_policy(config, &policy); 3427 if (policy.zrp_request & ZPOOL_DO_REWIND) 3428 state = SPA_LOAD_RECOVER; 3429 3430 /* 3431 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3432 * because the user-supplied config is actually the one to trust when 3433 * doing an import. 3434 */ 3435 if (state != SPA_LOAD_RECOVER) 3436 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3437 3438 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3439 policy.zrp_request); 3440 3441 /* 3442 * Propagate anything learned while loading the pool and pass it 3443 * back to caller (i.e. rewind info, missing devices, etc). 3444 */ 3445 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3446 spa->spa_load_info) == 0); 3447 3448 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3449 /* 3450 * Toss any existing sparelist, as it doesn't have any validity 3451 * anymore, and conflicts with spa_has_spare(). 
3452 */ 3453 if (spa->spa_spares.sav_config) { 3454 nvlist_free(spa->spa_spares.sav_config); 3455 spa->spa_spares.sav_config = NULL; 3456 spa_load_spares(spa); 3457 } 3458 if (spa->spa_l2cache.sav_config) { 3459 nvlist_free(spa->spa_l2cache.sav_config); 3460 spa->spa_l2cache.sav_config = NULL; 3461 spa_load_l2cache(spa); 3462 } 3463 3464 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3465 &nvroot) == 0); 3466 if (error == 0) 3467 error = spa_validate_aux(spa, nvroot, -1ULL, 3468 VDEV_ALLOC_SPARE); 3469 if (error == 0) 3470 error = spa_validate_aux(spa, nvroot, -1ULL, 3471 VDEV_ALLOC_L2CACHE); 3472 spa_config_exit(spa, SCL_ALL, FTAG); 3473 3474 if (props != NULL) 3475 spa_configfile_set(spa, props, B_FALSE); 3476 3477 if (error != 0 || (props && spa_writeable(spa) && 3478 (error = spa_prop_set(spa, props)))) { 3479 spa_unload(spa); 3480 spa_deactivate(spa); 3481 spa_remove(spa); 3482 mutex_exit(&spa_namespace_lock); 3483 return (error); 3484 } 3485 3486 spa_async_resume(spa); 3487 3488 /* 3489 * Override any spares and level 2 cache devices as specified by 3490 * the user, as these may have correct device names/devids, etc. 
3491 */ 3492 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3493 &spares, &nspares) == 0) { 3494 if (spa->spa_spares.sav_config) 3495 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3496 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3497 else 3498 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3499 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3500 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3501 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3502 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3503 spa_load_spares(spa); 3504 spa_config_exit(spa, SCL_ALL, FTAG); 3505 spa->spa_spares.sav_sync = B_TRUE; 3506 } 3507 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3508 &l2cache, &nl2cache) == 0) { 3509 if (spa->spa_l2cache.sav_config) 3510 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3511 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3512 else 3513 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3514 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3515 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3516 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3517 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3518 spa_load_l2cache(spa); 3519 spa_config_exit(spa, SCL_ALL, FTAG); 3520 spa->spa_l2cache.sav_sync = B_TRUE; 3521 } 3522 3523 /* 3524 * Check for any removed devices. 3525 */ 3526 if (spa->spa_autoreplace) { 3527 spa_aux_check_removed(&spa->spa_spares); 3528 spa_aux_check_removed(&spa->spa_l2cache); 3529 } 3530 3531 if (spa_writeable(spa)) { 3532 /* 3533 * Update the config cache to include the newly-imported pool. 3534 */ 3535 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3536 } 3537 3538 /* 3539 * It's possible that the pool was expanded while it was exported. 3540 * We kick off an async task to handle this for us. 
3541 */ 3542 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3543 3544 mutex_exit(&spa_namespace_lock); 3545 spa_history_log_version(spa, LOG_POOL_IMPORT); 3546 3547#ifdef __FreeBSD__ 3548#ifdef _KERNEL 3549 zvol_create_minors(pool); 3550#endif 3551#endif 3552 return (0); 3553} 3554 3555nvlist_t * 3556spa_tryimport(nvlist_t *tryconfig) 3557{ 3558 nvlist_t *config = NULL; 3559 char *poolname; 3560 spa_t *spa; 3561 uint64_t state; 3562 int error; 3563 3564 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3565 return (NULL); 3566 3567 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3568 return (NULL); 3569 3570 /* 3571 * Create and initialize the spa structure. 3572 */ 3573 mutex_enter(&spa_namespace_lock); 3574 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3575 spa_activate(spa, FREAD); 3576 3577 /* 3578 * Pass off the heavy lifting to spa_load(). 3579 * Pass TRUE for mosconfig because the user-supplied config 3580 * is actually the one to trust when doing an import. 3581 */ 3582 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3583 3584 /* 3585 * If 'tryconfig' was at least parsable, return the current config. 3586 */ 3587 if (spa->spa_root_vdev != NULL) { 3588 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3589 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3590 poolname) == 0); 3591 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3592 state) == 0); 3593 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3594 spa->spa_uberblock.ub_timestamp) == 0); 3595 3596 /* 3597 * If the bootfs property exists on this pool then we 3598 * copy it out so that external consumers can tell which 3599 * pools are bootable. 3600 */ 3601 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3602 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3603 3604 /* 3605 * We have to play games with the name since the 3606 * pool was opened as TRYIMPORT_NAME. 
3607 */ 3608 if (dsl_dsobj_to_dsname(spa_name(spa), 3609 spa->spa_bootfs, tmpname) == 0) { 3610 char *cp; 3611 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3612 3613 cp = strchr(tmpname, '/'); 3614 if (cp == NULL) { 3615 (void) strlcpy(dsname, tmpname, 3616 MAXPATHLEN); 3617 } else { 3618 (void) snprintf(dsname, MAXPATHLEN, 3619 "%s/%s", poolname, ++cp); 3620 } 3621 VERIFY(nvlist_add_string(config, 3622 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3623 kmem_free(dsname, MAXPATHLEN); 3624 } 3625 kmem_free(tmpname, MAXPATHLEN); 3626 } 3627 3628 /* 3629 * Add the list of hot spares and level 2 cache devices. 3630 */ 3631 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3632 spa_add_spares(spa, config); 3633 spa_add_l2cache(spa, config); 3634 spa_config_exit(spa, SCL_CONFIG, FTAG); 3635 } 3636 3637 spa_unload(spa); 3638 spa_deactivate(spa); 3639 spa_remove(spa); 3640 mutex_exit(&spa_namespace_lock); 3641 3642 return (config); 3643} 3644 3645/* 3646 * Pool export/destroy 3647 * 3648 * The act of destroying or exporting a pool is very simple. We make sure there 3649 * is no more pending I/O and any references to the pool are gone. Then, we 3650 * update the pool state and sync all the labels to disk, removing the 3651 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3652 * we don't sync the labels or remove the configuration cache. 3653 */ 3654static int 3655spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3656 boolean_t force, boolean_t hardforce) 3657{ 3658 spa_t *spa; 3659 3660 if (oldconfig) 3661 *oldconfig = NULL; 3662 3663 if (!(spa_mode_global & FWRITE)) 3664 return (EROFS); 3665 3666 mutex_enter(&spa_namespace_lock); 3667 if ((spa = spa_lookup(pool)) == NULL) { 3668 mutex_exit(&spa_namespace_lock); 3669 return (ENOENT); 3670 } 3671 3672 /* 3673 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3674 * reacquire the namespace lock, and see if we can export. 
3675 */ 3676 spa_open_ref(spa, FTAG); 3677 mutex_exit(&spa_namespace_lock); 3678 spa_async_suspend(spa); 3679 mutex_enter(&spa_namespace_lock); 3680 spa_close(spa, FTAG); 3681 3682 /* 3683 * The pool will be in core if it's openable, 3684 * in which case we can modify its state. 3685 */ 3686 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3687 /* 3688 * Objsets may be open only because they're dirty, so we 3689 * have to force it to sync before checking spa_refcnt. 3690 */ 3691 txg_wait_synced(spa->spa_dsl_pool, 0); 3692 3693 /* 3694 * A pool cannot be exported or destroyed if there are active 3695 * references. If we are resetting a pool, allow references by 3696 * fault injection handlers. 3697 */ 3698 if (!spa_refcount_zero(spa) || 3699 (spa->spa_inject_ref != 0 && 3700 new_state != POOL_STATE_UNINITIALIZED)) { 3701 spa_async_resume(spa); 3702 mutex_exit(&spa_namespace_lock); 3703 return (EBUSY); 3704 } 3705 3706 /* 3707 * A pool cannot be exported if it has an active shared spare. 3708 * This is to prevent other pools stealing the active spare 3709 * from an exported pool. At user's own will, such pool can 3710 * be forcedly exported. 3711 */ 3712 if (!force && new_state == POOL_STATE_EXPORTED && 3713 spa_has_active_shared_spare(spa)) { 3714 spa_async_resume(spa); 3715 mutex_exit(&spa_namespace_lock); 3716 return (EXDEV); 3717 } 3718 3719 /* 3720 * We want this to be reflected on every label, 3721 * so mark them all dirty. spa_unload() will do the 3722 * final sync that pushes these changes out. 
3723 */ 3724 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3725 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3726 spa->spa_state = new_state; 3727 spa->spa_final_txg = spa_last_synced_txg(spa) + 3728 TXG_DEFER_SIZE + 1; 3729 vdev_config_dirty(spa->spa_root_vdev); 3730 spa_config_exit(spa, SCL_ALL, FTAG); 3731 } 3732 } 3733 3734 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3735 3736 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3737 spa_unload(spa); 3738 spa_deactivate(spa); 3739 } 3740 3741 if (oldconfig && spa->spa_config) 3742 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3743 3744 if (new_state != POOL_STATE_UNINITIALIZED) { 3745 if (!hardforce) 3746 spa_config_sync(spa, B_TRUE, B_TRUE); 3747 spa_remove(spa); 3748 } 3749 mutex_exit(&spa_namespace_lock); 3750 3751 return (0); 3752} 3753 3754/* 3755 * Destroy a storage pool. 3756 */ 3757int 3758spa_destroy(char *pool) 3759{ 3760 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3761 B_FALSE, B_FALSE)); 3762} 3763 3764/* 3765 * Export a storage pool. 3766 */ 3767int 3768spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3769 boolean_t hardforce) 3770{ 3771 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3772 force, hardforce)); 3773} 3774 3775/* 3776 * Similar to spa_export(), this unloads the spa_t without actually removing it 3777 * from the namespace in any way. 3778 */ 3779int 3780spa_reset(char *pool) 3781{ 3782 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3783 B_FALSE, B_FALSE)); 3784} 3785 3786/* 3787 * ========================================================================== 3788 * Device manipulation 3789 * ========================================================================== 3790 */ 3791 3792/* 3793 * Add a device to a storage pool. 
3794 */ 3795int 3796spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3797{ 3798 uint64_t txg, id; 3799 int error; 3800 vdev_t *rvd = spa->spa_root_vdev; 3801 vdev_t *vd, *tvd; 3802 nvlist_t **spares, **l2cache; 3803 uint_t nspares, nl2cache; 3804 3805 ASSERT(spa_writeable(spa)); 3806 3807 txg = spa_vdev_enter(spa); 3808 3809 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3810 VDEV_ALLOC_ADD)) != 0) 3811 return (spa_vdev_exit(spa, NULL, txg, error)); 3812 3813 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3814 3815 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3816 &nspares) != 0) 3817 nspares = 0; 3818 3819 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3820 &nl2cache) != 0) 3821 nl2cache = 0; 3822 3823 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3824 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3825 3826 if (vd->vdev_children != 0 && 3827 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3828 return (spa_vdev_exit(spa, vd, txg, error)); 3829 3830 /* 3831 * We must validate the spares and l2cache devices after checking the 3832 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3833 */ 3834 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3835 return (spa_vdev_exit(spa, vd, txg, error)); 3836 3837 /* 3838 * Transfer each new top-level vdev from vd to rvd. 3839 */ 3840 for (int c = 0; c < vd->vdev_children; c++) { 3841 3842 /* 3843 * Set the vdev id to the first hole, if one exists. 
3844 */ 3845 for (id = 0; id < rvd->vdev_children; id++) { 3846 if (rvd->vdev_child[id]->vdev_ishole) { 3847 vdev_free(rvd->vdev_child[id]); 3848 break; 3849 } 3850 } 3851 tvd = vd->vdev_child[c]; 3852 vdev_remove_child(vd, tvd); 3853 tvd->vdev_id = id; 3854 vdev_add_child(rvd, tvd); 3855 vdev_config_dirty(tvd); 3856 } 3857 3858 if (nspares != 0) { 3859 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3860 ZPOOL_CONFIG_SPARES); 3861 spa_load_spares(spa); 3862 spa->spa_spares.sav_sync = B_TRUE; 3863 } 3864 3865 if (nl2cache != 0) { 3866 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3867 ZPOOL_CONFIG_L2CACHE); 3868 spa_load_l2cache(spa); 3869 spa->spa_l2cache.sav_sync = B_TRUE; 3870 } 3871 3872 /* 3873 * We have to be careful when adding new vdevs to an existing pool. 3874 * If other threads start allocating from these vdevs before we 3875 * sync the config cache, and we lose power, then upon reboot we may 3876 * fail to open the pool because there are DVAs that the config cache 3877 * can't translate. Therefore, we first add the vdevs without 3878 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3879 * and then let spa_config_update() initialize the new metaslabs. 3880 * 3881 * spa_load() checks for added-but-not-initialized vdevs, so that 3882 * if we lose power at any point in this sequence, the remaining 3883 * steps will be completed the next time we load the pool. 3884 */ 3885 (void) spa_vdev_exit(spa, vd, txg, 0); 3886 3887 mutex_enter(&spa_namespace_lock); 3888 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3889 mutex_exit(&spa_namespace_lock); 3890 3891 return (0); 3892} 3893 3894/* 3895 * Attach a device to a mirror. The arguments are the path to any device 3896 * in the mirror, and the nvroot for the new device. If the path specifies 3897 * a device that is not mirrored, we automatically insert the mirror vdev. 
3898 * 3899 * If 'replacing' is specified, the new device is intended to replace the 3900 * existing device; in this case the two devices are made into their own 3901 * mirror using the 'replacing' vdev, which is functionally identical to 3902 * the mirror vdev (it actually reuses all the same ops) but has a few 3903 * extra rules: you can't attach to it after it's been created, and upon 3904 * completion of resilvering, the first disk (the one being replaced) 3905 * is automatically detached. 3906 */ 3907int 3908spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3909{ 3910 uint64_t txg, dtl_max_txg; 3911 vdev_t *rvd = spa->spa_root_vdev; 3912 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3913 vdev_ops_t *pvops; 3914 char *oldvdpath, *newvdpath; 3915 int newvd_isspare; 3916 int error; 3917 3918 ASSERT(spa_writeable(spa)); 3919 3920 txg = spa_vdev_enter(spa); 3921 3922 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3923 3924 if (oldvd == NULL) 3925 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3926 3927 if (!oldvd->vdev_ops->vdev_op_leaf) 3928 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3929 3930 pvd = oldvd->vdev_parent; 3931 3932 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
|
3932 VDEV_ALLOC_ADD)) != 0)
| 3933 VDEV_ALLOC_ATTACH)) != 0)
|
3933 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3934 3935 if (newrootvd->vdev_children != 1) 3936 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3937 3938 newvd = newrootvd->vdev_child[0]; 3939 3940 if (!newvd->vdev_ops->vdev_op_leaf) 3941 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3942 3943 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3944 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3945 3946 /* 3947 * Spares can't replace logs 3948 */ 3949 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3950 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3951 3952 if (!replacing) { 3953 /* 3954 * For attach, the only allowable parent is a mirror or the root 3955 * vdev. 3956 */ 3957 if (pvd->vdev_ops != &vdev_mirror_ops && 3958 pvd->vdev_ops != &vdev_root_ops) 3959 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3960 3961 pvops = &vdev_mirror_ops; 3962 } else { 3963 /* 3964 * Active hot spares can only be replaced by inactive hot 3965 * spares. 3966 */ 3967 if (pvd->vdev_ops == &vdev_spare_ops && 3968 oldvd->vdev_isspare && 3969 !spa_has_spare(spa, newvd->vdev_guid)) 3970 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3971 3972 /* 3973 * If the source is a hot spare, and the parent isn't already a 3974 * spare, then we want to create a new hot spare. Otherwise, we 3975 * want to create a replacing vdev. The user is not allowed to 3976 * attach to a spared vdev child unless the 'isspare' state is 3977 * the same (spare replaces spare, non-spare replaces 3978 * non-spare). 
3979 */ 3980 if (pvd->vdev_ops == &vdev_replacing_ops && 3981 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3982 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3983 } else if (pvd->vdev_ops == &vdev_spare_ops && 3984 newvd->vdev_isspare != oldvd->vdev_isspare) { 3985 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3986 } 3987 3988 if (newvd->vdev_isspare) 3989 pvops = &vdev_spare_ops; 3990 else 3991 pvops = &vdev_replacing_ops; 3992 } 3993 3994 /* 3995 * Make sure the new device is big enough. 3996 */ 3997 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3998 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3999 4000 /* 4001 * The new device cannot have a higher alignment requirement 4002 * than the top-level vdev. 4003 */ 4004 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4005 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4006 4007 /* 4008 * If this is an in-place replacement, update oldvd's path and devid 4009 * to make it distinguishable from newvd, and unopenable from now on. 4010 */ 4011 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4012 spa_strfree(oldvd->vdev_path); 4013 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4014 KM_SLEEP); 4015 (void) sprintf(oldvd->vdev_path, "%s/%s", 4016 newvd->vdev_path, "old"); 4017 if (oldvd->vdev_devid != NULL) { 4018 spa_strfree(oldvd->vdev_devid); 4019 oldvd->vdev_devid = NULL; 4020 } 4021 } 4022 4023 /* mark the device being resilvered */ 4024 newvd->vdev_resilvering = B_TRUE; 4025 4026 /* 4027 * If the parent is not a mirror, or if we're replacing, insert the new 4028 * mirror/replacing/spare vdev above oldvd. 4029 */ 4030 if (pvd->vdev_ops != pvops) 4031 pvd = vdev_add_parent(oldvd, pvops); 4032 4033 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4034 ASSERT(pvd->vdev_ops == pvops); 4035 ASSERT(oldvd->vdev_parent == pvd); 4036 4037 /* 4038 * Extract the new device from its root and add it to pvd. 
4039 */ 4040 vdev_remove_child(newrootvd, newvd); 4041 newvd->vdev_id = pvd->vdev_children; 4042 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4043 vdev_add_child(pvd, newvd); 4044 4045 tvd = newvd->vdev_top; 4046 ASSERT(pvd->vdev_top == tvd); 4047 ASSERT(tvd->vdev_parent == rvd); 4048 4049 vdev_config_dirty(tvd); 4050 4051 /* 4052 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4053 * for any dmu_sync-ed blocks. It will propagate upward when 4054 * spa_vdev_exit() calls vdev_dtl_reassess(). 4055 */ 4056 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4057 4058 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4059 dtl_max_txg - TXG_INITIAL); 4060 4061 if (newvd->vdev_isspare) { 4062 spa_spare_activate(newvd); 4063 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4064 } 4065 4066 oldvdpath = spa_strdup(oldvd->vdev_path); 4067 newvdpath = spa_strdup(newvd->vdev_path); 4068 newvd_isspare = newvd->vdev_isspare; 4069 4070 /* 4071 * Mark newvd's DTL dirty in this txg. 4072 */ 4073 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4074 4075 /* 4076 * Restart the resilver 4077 */ 4078 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4079 4080 /* 4081 * Commit the config 4082 */ 4083 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4084 4085 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4086 "%s vdev=%s %s vdev=%s", 4087 replacing && newvd_isspare ? "spare in" : 4088 replacing ? "replace" : "attach", newvdpath, 4089 replacing ? "for" : "to", oldvdpath); 4090 4091 spa_strfree(oldvdpath); 4092 spa_strfree(newvdpath); 4093 4094 if (spa->spa_bootfs) 4095 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4096 4097 return (0); 4098} 4099 4100/* 4101 * Detach a device from a mirror or replacing vdev. 4102 * If 'replace_done' is specified, only detach if the parent 4103 * is a replacing vdev. 
4104 */ 4105int 4106spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4107{ 4108 uint64_t txg; 4109 int error; 4110 vdev_t *rvd = spa->spa_root_vdev; 4111 vdev_t *vd, *pvd, *cvd, *tvd; 4112 boolean_t unspare = B_FALSE; 4113 uint64_t unspare_guid; 4114 char *vdpath; 4115 4116 ASSERT(spa_writeable(spa)); 4117 4118 txg = spa_vdev_enter(spa); 4119 4120 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4121 4122 if (vd == NULL) 4123 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4124 4125 if (!vd->vdev_ops->vdev_op_leaf) 4126 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4127 4128 pvd = vd->vdev_parent; 4129 4130 /* 4131 * If the parent/child relationship is not as expected, don't do it. 4132 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4133 * vdev that's replacing B with C. The user's intent in replacing 4134 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4135 * the replace by detaching C, the expected behavior is to end up 4136 * M(A,B). But suppose that right after deciding to detach C, 4137 * the replacement of B completes. We would have M(A,C), and then 4138 * ask to detach C, which would leave us with just A -- not what 4139 * the user wanted. To prevent this, we make sure that the 4140 * parent/child relationship hasn't changed -- in this example, 4141 * that C's parent is still the replacing vdev R. 4142 */ 4143 if (pvd->vdev_guid != pguid && pguid != 0) 4144 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4145 4146 /* 4147 * Only 'replacing' or 'spare' vdevs can be replaced. 4148 */ 4149 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4150 pvd->vdev_ops != &vdev_spare_ops) 4151 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4152 4153 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4154 spa_version(spa) >= SPA_VERSION_SPARES); 4155 4156 /* 4157 * Only mirror, replacing, and spare vdevs support detach. 
4158 */ 4159 if (pvd->vdev_ops != &vdev_replacing_ops && 4160 pvd->vdev_ops != &vdev_mirror_ops && 4161 pvd->vdev_ops != &vdev_spare_ops) 4162 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4163 4164 /* 4165 * If this device has the only valid copy of some data, 4166 * we cannot safely detach it. 4167 */ 4168 if (vdev_dtl_required(vd)) 4169 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4170 4171 ASSERT(pvd->vdev_children >= 2); 4172 4173 /* 4174 * If we are detaching the second disk from a replacing vdev, then 4175 * check to see if we changed the original vdev's path to have "/old" 4176 * at the end in spa_vdev_attach(). If so, undo that change now. 4177 */ 4178 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4179 vd->vdev_path != NULL) { 4180 size_t len = strlen(vd->vdev_path); 4181 4182 for (int c = 0; c < pvd->vdev_children; c++) { 4183 cvd = pvd->vdev_child[c]; 4184 4185 if (cvd == vd || cvd->vdev_path == NULL) 4186 continue; 4187 4188 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4189 strcmp(cvd->vdev_path + len, "/old") == 0) { 4190 spa_strfree(cvd->vdev_path); 4191 cvd->vdev_path = spa_strdup(vd->vdev_path); 4192 break; 4193 } 4194 } 4195 } 4196 4197 /* 4198 * If we are detaching the original disk from a spare, then it implies 4199 * that the spare should become a real disk, and be removed from the 4200 * active spare list for the pool. 4201 */ 4202 if (pvd->vdev_ops == &vdev_spare_ops && 4203 vd->vdev_id == 0 && 4204 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4205 unspare = B_TRUE; 4206 4207 /* 4208 * Erase the disk labels so the disk can be used for other things. 4209 * This must be done after all other error cases are handled, 4210 * but before we disembowel vd (so we can still do I/O to it). 4211 * But if we can't do it, don't treat the error as fatal -- 4212 * it may be that the unwritability of the disk is the reason 4213 * it's being detached! 
4214 */ 4215 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4216 4217 /* 4218 * Remove vd from its parent and compact the parent's children. 4219 */ 4220 vdev_remove_child(pvd, vd); 4221 vdev_compact_children(pvd); 4222 4223 /* 4224 * Remember one of the remaining children so we can get tvd below. 4225 */ 4226 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4227 4228 /* 4229 * If we need to remove the remaining child from the list of hot spares, 4230 * do it now, marking the vdev as no longer a spare in the process. 4231 * We must do this before vdev_remove_parent(), because that can 4232 * change the GUID if it creates a new toplevel GUID. For a similar 4233 * reason, we must remove the spare now, in the same txg as the detach; 4234 * otherwise someone could attach a new sibling, change the GUID, and 4235 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4236 */ 4237 if (unspare) { 4238 ASSERT(cvd->vdev_isspare); 4239 spa_spare_remove(cvd); 4240 unspare_guid = cvd->vdev_guid; 4241 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4242 cvd->vdev_unspare = B_TRUE; 4243 } 4244 4245 /* 4246 * If the parent mirror/replacing vdev only has one child, 4247 * the parent is no longer needed. Remove it from the tree. 4248 */ 4249 if (pvd->vdev_children == 1) { 4250 if (pvd->vdev_ops == &vdev_spare_ops) 4251 cvd->vdev_unspare = B_FALSE; 4252 vdev_remove_parent(cvd); 4253 cvd->vdev_resilvering = B_FALSE; 4254 } 4255 4256 4257 /* 4258 * We don't set tvd until now because the parent we just removed 4259 * may have been the previous top-level vdev. 4260 */ 4261 tvd = cvd->vdev_top; 4262 ASSERT(tvd->vdev_parent == rvd); 4263 4264 /* 4265 * Reevaluate the parent vdev state. 4266 */ 4267 vdev_propagate_state(cvd); 4268 4269 /* 4270 * If the 'autoexpand' property is set on the pool then automatically 4271 * try to expand the size of the pool. 
For example if the device we 4272 * just detached was smaller than the others, it may be possible to 4273 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4274 * first so that we can obtain the updated sizes of the leaf vdevs. 4275 */ 4276 if (spa->spa_autoexpand) { 4277 vdev_reopen(tvd); 4278 vdev_expand(tvd, txg); 4279 } 4280 4281 vdev_config_dirty(tvd); 4282 4283 /* 4284 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4285 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4286 * But first make sure we're not on any *other* txg's DTL list, to 4287 * prevent vd from being accessed after it's freed. 4288 */ 4289 vdpath = spa_strdup(vd->vdev_path); 4290 for (int t = 0; t < TXG_SIZE; t++) 4291 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4292 vd->vdev_detached = B_TRUE; 4293 vdev_dirty(tvd, VDD_DTL, vd, txg); 4294 4295 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4296 4297 /* hang on to the spa before we release the lock */ 4298 spa_open_ref(spa, FTAG); 4299 4300 error = spa_vdev_exit(spa, vd, txg, 0); 4301 4302 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4303 "vdev=%s", vdpath); 4304 spa_strfree(vdpath); 4305 4306 /* 4307 * If this was the removal of the original device in a hot spare vdev, 4308 * then we want to go through and remove the device from the hot spare 4309 * list of every other pool. 
4310 */ 4311 if (unspare) { 4312 spa_t *altspa = NULL; 4313 4314 mutex_enter(&spa_namespace_lock); 4315 while ((altspa = spa_next(altspa)) != NULL) { 4316 if (altspa->spa_state != POOL_STATE_ACTIVE || 4317 altspa == spa) 4318 continue; 4319 4320 spa_open_ref(altspa, FTAG); 4321 mutex_exit(&spa_namespace_lock); 4322 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4323 mutex_enter(&spa_namespace_lock); 4324 spa_close(altspa, FTAG); 4325 } 4326 mutex_exit(&spa_namespace_lock); 4327 4328 /* search the rest of the vdevs for spares to remove */ 4329 spa_vdev_resilver_done(spa); 4330 } 4331 4332 /* all done with the spa; OK to release */ 4333 mutex_enter(&spa_namespace_lock); 4334 spa_close(spa, FTAG); 4335 mutex_exit(&spa_namespace_lock); 4336 4337 return (error); 4338} 4339 4340/* 4341 * Split a set of devices from their mirrors, and create a new pool from them. 4342 */ 4343int 4344spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4345 nvlist_t *props, boolean_t exp) 4346{ 4347 int error = 0; 4348 uint64_t txg, *glist; 4349 spa_t *newspa; 4350 uint_t c, children, lastlog; 4351 nvlist_t **child, *nvl, *tmp; 4352 dmu_tx_t *tx; 4353 char *altroot = NULL; 4354 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4355 boolean_t activate_slog; 4356 4357 ASSERT(spa_writeable(spa)); 4358 4359 txg = spa_vdev_enter(spa); 4360 4361 /* clear the log and flush everything up to now */ 4362 activate_slog = spa_passivate_log(spa); 4363 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4364 error = spa_offline_log(spa); 4365 txg = spa_vdev_config_enter(spa); 4366 4367 if (activate_slog) 4368 spa_activate_log(spa); 4369 4370 if (error != 0) 4371 return (spa_vdev_exit(spa, NULL, txg, error)); 4372 4373 /* check new spa name before going any further */ 4374 if (spa_lookup(newname) != NULL) 4375 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4376 4377 /* 4378 * scan through all the children to ensure they're all mirrors 4379 */ 4380 if (nvlist_lookup_nvlist(config, 
ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4381 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4382 &children) != 0) 4383 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4384 4385 /* first, check to ensure we've got the right child count */ 4386 rvd = spa->spa_root_vdev; 4387 lastlog = 0; 4388 for (c = 0; c < rvd->vdev_children; c++) { 4389 vdev_t *vd = rvd->vdev_child[c]; 4390 4391 /* don't count the holes & logs as children */ 4392 if (vd->vdev_islog || vd->vdev_ishole) { 4393 if (lastlog == 0) 4394 lastlog = c; 4395 continue; 4396 } 4397 4398 lastlog = 0; 4399 } 4400 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4401 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4402 4403 /* next, ensure no spare or cache devices are part of the split */ 4404 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4405 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4406 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4407 4408 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4409 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4410 4411 /* then, loop over each vdev and validate it */ 4412 for (c = 0; c < children; c++) { 4413 uint64_t is_hole = 0; 4414 4415 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4416 &is_hole); 4417 4418 if (is_hole != 0) { 4419 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4420 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4421 continue; 4422 } else { 4423 error = EINVAL; 4424 break; 4425 } 4426 } 4427 4428 /* which disk is going to be split? 
*/ 4429 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4430 &glist[c]) != 0) { 4431 error = EINVAL; 4432 break; 4433 } 4434 4435 /* look it up in the spa */ 4436 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4437 if (vml[c] == NULL) { 4438 error = ENODEV; 4439 break; 4440 } 4441 4442 /* make sure there's nothing stopping the split */ 4443 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4444 vml[c]->vdev_islog || 4445 vml[c]->vdev_ishole || 4446 vml[c]->vdev_isspare || 4447 vml[c]->vdev_isl2cache || 4448 !vdev_writeable(vml[c]) || 4449 vml[c]->vdev_children != 0 || 4450 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4451 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4452 error = EINVAL; 4453 break; 4454 } 4455 4456 if (vdev_dtl_required(vml[c])) { 4457 error = EBUSY; 4458 break; 4459 } 4460 4461 /* we need certain info from the top level */ 4462 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4463 vml[c]->vdev_top->vdev_ms_array) == 0); 4464 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4465 vml[c]->vdev_top->vdev_ms_shift) == 0); 4466 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4467 vml[c]->vdev_top->vdev_asize) == 0); 4468 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4469 vml[c]->vdev_top->vdev_ashift) == 0); 4470 } 4471 4472 if (error != 0) { 4473 kmem_free(vml, children * sizeof (vdev_t *)); 4474 kmem_free(glist, children * sizeof (uint64_t)); 4475 return (spa_vdev_exit(spa, NULL, txg, error)); 4476 } 4477 4478 /* stop writers from using the disks */ 4479 for (c = 0; c < children; c++) { 4480 if (vml[c] != NULL) 4481 vml[c]->vdev_offline = B_TRUE; 4482 } 4483 vdev_reopen(spa->spa_root_vdev); 4484 4485 /* 4486 * Temporarily record the splitting vdevs in the spa config. This 4487 * will disappear once the config is regenerated. 
4488 */ 4489 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4490 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4491 glist, children) == 0); 4492 kmem_free(glist, children * sizeof (uint64_t)); 4493 4494 mutex_enter(&spa->spa_props_lock); 4495 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4496 nvl) == 0); 4497 mutex_exit(&spa->spa_props_lock); 4498 spa->spa_config_splitting = nvl; 4499 vdev_config_dirty(spa->spa_root_vdev); 4500 4501 /* configure and create the new pool */ 4502 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4503 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4504 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4505 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4506 spa_version(spa)) == 0); 4507 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4508 spa->spa_config_txg) == 0); 4509 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4510 spa_generate_guid(NULL)) == 0); 4511 (void) nvlist_lookup_string(props, 4512 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4513 4514 /* add the new pool to the namespace */ 4515 newspa = spa_add(newname, config, altroot); 4516 newspa->spa_config_txg = spa->spa_config_txg; 4517 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4518 4519 /* release the spa config lock, retaining the namespace lock */ 4520 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4521 4522 if (zio_injection_enabled) 4523 zio_handle_panic_injection(spa, FTAG, 1); 4524 4525 spa_activate(newspa, spa_mode_global); 4526 spa_async_suspend(newspa); 4527 4528#ifndef sun 4529 /* mark that we are creating new spa by splitting */ 4530 newspa->spa_splitting_newspa = B_TRUE; 4531#endif 4532 /* create the new pool from the disks of the original pool */ 4533 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4534#ifndef sun 4535 newspa->spa_splitting_newspa = B_FALSE; 4536#endif 4537 if (error) 4538 goto out; 4539 4540 /* if that worked, 
generate a real config for the new pool */ 4541 if (newspa->spa_root_vdev != NULL) { 4542 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4543 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4544 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4545 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4546 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4547 B_TRUE)); 4548 } 4549 4550 /* set the props */ 4551 if (props != NULL) { 4552 spa_configfile_set(newspa, props, B_FALSE); 4553 error = spa_prop_set(newspa, props); 4554 if (error) 4555 goto out; 4556 } 4557 4558 /* flush everything */ 4559 txg = spa_vdev_config_enter(newspa); 4560 vdev_config_dirty(newspa->spa_root_vdev); 4561 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4562 4563 if (zio_injection_enabled) 4564 zio_handle_panic_injection(spa, FTAG, 2); 4565 4566 spa_async_resume(newspa); 4567 4568 /* finally, update the original pool's config */ 4569 txg = spa_vdev_config_enter(spa); 4570 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4571 error = dmu_tx_assign(tx, TXG_WAIT); 4572 if (error != 0) 4573 dmu_tx_abort(tx); 4574 for (c = 0; c < children; c++) { 4575 if (vml[c] != NULL) { 4576 vdev_split(vml[c]); 4577 if (error == 0) 4578 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4579 spa, tx, "vdev=%s", 4580 vml[c]->vdev_path); 4581 vdev_free(vml[c]); 4582 } 4583 } 4584 vdev_config_dirty(spa->spa_root_vdev); 4585 spa->spa_config_splitting = NULL; 4586 nvlist_free(nvl); 4587 if (error == 0) 4588 dmu_tx_commit(tx); 4589 (void) spa_vdev_exit(spa, NULL, txg, 0); 4590 4591 if (zio_injection_enabled) 4592 zio_handle_panic_injection(spa, FTAG, 3); 4593 4594 /* split is complete; log a history record */ 4595 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4596 "split new pool %s from pool %s", newname, spa_name(spa)); 4597 4598 kmem_free(vml, children * sizeof (vdev_t *)); 4599 4600 /* if we're not going to mount the filesystems in userland, export */ 4601 if (exp) 4602 error = 
spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4603 B_FALSE, B_FALSE); 4604 4605 return (error); 4606 4607out: 4608 spa_unload(newspa); 4609 spa_deactivate(newspa); 4610 spa_remove(newspa); 4611 4612 txg = spa_vdev_config_enter(spa); 4613 4614 /* re-online all offlined disks */ 4615 for (c = 0; c < children; c++) { 4616 if (vml[c] != NULL) 4617 vml[c]->vdev_offline = B_FALSE; 4618 } 4619 vdev_reopen(spa->spa_root_vdev); 4620 4621 nvlist_free(spa->spa_config_splitting); 4622 spa->spa_config_splitting = NULL; 4623 (void) spa_vdev_exit(spa, NULL, txg, error); 4624 4625 kmem_free(vml, children * sizeof (vdev_t *)); 4626 return (error); 4627} 4628 4629static nvlist_t * 4630spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4631{ 4632 for (int i = 0; i < count; i++) { 4633 uint64_t guid; 4634 4635 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4636 &guid) == 0); 4637 4638 if (guid == target_guid) 4639 return (nvpp[i]); 4640 } 4641 4642 return (NULL); 4643} 4644 4645static void 4646spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4647 nvlist_t *dev_to_remove) 4648{ 4649 nvlist_t **newdev = NULL; 4650 4651 if (count > 1) 4652 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4653 4654 for (int i = 0, j = 0; i < count; i++) { 4655 if (dev[i] == dev_to_remove) 4656 continue; 4657 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4658 } 4659 4660 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4661 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4662 4663 for (int i = 0; i < count - 1; i++) 4664 nvlist_free(newdev[i]); 4665 4666 if (count > 1) 4667 kmem_free(newdev, (count - 1) * sizeof (void *)); 4668} 4669 4670/* 4671 * Evacuate the device. 
4672 */ 4673static int 4674spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4675{ 4676 uint64_t txg; 4677 int error = 0; 4678 4679 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4680 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4681 ASSERT(vd == vd->vdev_top); 4682 4683 /* 4684 * Evacuate the device. We don't hold the config lock as writer 4685 * since we need to do I/O but we do keep the 4686 * spa_namespace_lock held. Once this completes the device 4687 * should no longer have any blocks allocated on it. 4688 */ 4689 if (vd->vdev_islog) { 4690 if (vd->vdev_stat.vs_alloc != 0) 4691 error = spa_offline_log(spa); 4692 } else { 4693 error = ENOTSUP; 4694 } 4695 4696 if (error) 4697 return (error); 4698 4699 /* 4700 * The evacuation succeeded. Remove any remaining MOS metadata 4701 * associated with this vdev, and wait for these changes to sync. 4702 */ 4703 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4704 txg = spa_vdev_config_enter(spa); 4705 vd->vdev_removing = B_TRUE; 4706 vdev_dirty(vd, 0, NULL, txg); 4707 vdev_config_dirty(vd); 4708 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4709 4710 return (0); 4711} 4712 4713/* 4714 * Complete the removal by cleaning up the namespace. 4715 */ 4716static void 4717spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4718{ 4719 vdev_t *rvd = spa->spa_root_vdev; 4720 uint64_t id = vd->vdev_id; 4721 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4722 4723 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4724 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4725 ASSERT(vd == vd->vdev_top); 4726 4727 /* 4728 * Only remove any devices which are empty. 
4729 */ 4730 if (vd->vdev_stat.vs_alloc != 0) 4731 return; 4732 4733 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4734 4735 if (list_link_active(&vd->vdev_state_dirty_node)) 4736 vdev_state_clean(vd); 4737 if (list_link_active(&vd->vdev_config_dirty_node)) 4738 vdev_config_clean(vd); 4739 4740 vdev_free(vd); 4741 4742 if (last_vdev) { 4743 vdev_compact_children(rvd); 4744 } else { 4745 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4746 vdev_add_child(rvd, vd); 4747 } 4748 vdev_config_dirty(rvd); 4749 4750 /* 4751 * Reassess the health of our root vdev. 4752 */ 4753 vdev_reopen(rvd); 4754} 4755 4756/* 4757 * Remove a device from the pool - 4758 * 4759 * Removing a device from the vdev namespace requires several steps 4760 * and can take a significant amount of time. As a result we use 4761 * the spa_vdev_config_[enter/exit] functions which allow us to 4762 * grab and release the spa_config_lock while still holding the namespace 4763 * lock. During each step the configuration is synced out. 4764 */ 4765 4766/* 4767 * Remove a device from the pool. Currently, this supports removing only hot 4768 * spares, slogs, and level 2 ARC devices. 4769 */ 4770int 4771spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4772{ 4773 vdev_t *vd; 4774 metaslab_group_t *mg; 4775 nvlist_t **spares, **l2cache, *nv; 4776 uint64_t txg = 0; 4777 uint_t nspares, nl2cache; 4778 int error = 0; 4779 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4780 4781 ASSERT(spa_writeable(spa)); 4782 4783 if (!locked) 4784 txg = spa_vdev_enter(spa); 4785 4786 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4787 4788 if (spa->spa_spares.sav_vdevs != NULL && 4789 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4790 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4791 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4792 /* 4793 * Only remove the hot spare if it's not currently in use 4794 * in this pool. 
4795 */ 4796 if (vd == NULL || unspare) { 4797 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4798 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4799 spa_load_spares(spa); 4800 spa->spa_spares.sav_sync = B_TRUE; 4801 } else { 4802 error = EBUSY; 4803 } 4804 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4805 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4806 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4807 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4808 /* 4809 * Cache devices can always be removed. 4810 */ 4811 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4812 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4813 spa_load_l2cache(spa); 4814 spa->spa_l2cache.sav_sync = B_TRUE; 4815 } else if (vd != NULL && vd->vdev_islog) { 4816 ASSERT(!locked); 4817 ASSERT(vd == vd->vdev_top); 4818 4819 /* 4820 * XXX - Once we have bp-rewrite this should 4821 * become the common case. 4822 */ 4823 4824 mg = vd->vdev_mg; 4825 4826 /* 4827 * Stop allocating from this vdev. 4828 */ 4829 metaslab_group_passivate(mg); 4830 4831 /* 4832 * Wait for the youngest allocations and frees to sync, 4833 * and then wait for the deferral of those frees to finish. 4834 */ 4835 spa_vdev_config_exit(spa, NULL, 4836 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4837 4838 /* 4839 * Attempt to evacuate the vdev. 4840 */ 4841 error = spa_vdev_remove_evacuate(spa, vd); 4842 4843 txg = spa_vdev_config_enter(spa); 4844 4845 /* 4846 * If we couldn't evacuate the vdev, unwind. 4847 */ 4848 if (error) { 4849 metaslab_group_activate(mg); 4850 return (spa_vdev_exit(spa, NULL, txg, error)); 4851 } 4852 4853 /* 4854 * Clean up the vdev namespace. 4855 */ 4856 spa_vdev_remove_from_namespace(spa, vd); 4857 4858 } else if (vd != NULL) { 4859 /* 4860 * Normal vdevs cannot be removed (yet). 4861 */ 4862 error = ENOTSUP; 4863 } else { 4864 /* 4865 * There is no vdev of any kind with the specified guid. 
4866 */ 4867 error = ENOENT; 4868 } 4869 4870 if (!locked) 4871 return (spa_vdev_exit(spa, NULL, txg, error)); 4872 4873 return (error); 4874} 4875 4876/* 4877 * Find any device that's done replacing, or a vdev marked 'unspare' that's 4878 * current spared, so we can detach it. 4879 */ 4880static vdev_t * 4881spa_vdev_resilver_done_hunt(vdev_t *vd) 4882{ 4883 vdev_t *newvd, *oldvd; 4884 4885 for (int c = 0; c < vd->vdev_children; c++) { 4886 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4887 if (oldvd != NULL) 4888 return (oldvd); 4889 } 4890 4891 /* 4892 * Check for a completed replacement. We always consider the first 4893 * vdev in the list to be the oldest vdev, and the last one to be 4894 * the newest (see spa_vdev_attach() for how that works). In 4895 * the case where the newest vdev is faulted, we will not automatically 4896 * remove it after a resilver completes. This is OK as it will require 4897 * user intervention to determine which disk the admin wishes to keep. 4898 */ 4899 if (vd->vdev_ops == &vdev_replacing_ops) { 4900 ASSERT(vd->vdev_children > 1); 4901 4902 newvd = vd->vdev_child[vd->vdev_children - 1]; 4903 oldvd = vd->vdev_child[0]; 4904 4905 if (vdev_dtl_empty(newvd, DTL_MISSING) && 4906 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4907 !vdev_dtl_required(oldvd)) 4908 return (oldvd); 4909 } 4910 4911 /* 4912 * Check for a completed resilver with the 'unspare' flag set. 
4913 */ 4914 if (vd->vdev_ops == &vdev_spare_ops) { 4915 vdev_t *first = vd->vdev_child[0]; 4916 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 4917 4918 if (last->vdev_unspare) { 4919 oldvd = first; 4920 newvd = last; 4921 } else if (first->vdev_unspare) { 4922 oldvd = last; 4923 newvd = first; 4924 } else { 4925 oldvd = NULL; 4926 } 4927 4928 if (oldvd != NULL && 4929 vdev_dtl_empty(newvd, DTL_MISSING) && 4930 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4931 !vdev_dtl_required(oldvd)) 4932 return (oldvd); 4933 4934 /* 4935 * If there are more than two spares attached to a disk, 4936 * and those spares are not required, then we want to 4937 * attempt to free them up now so that they can be used 4938 * by other pools. Once we're back down to a single 4939 * disk+spare, we stop removing them. 4940 */ 4941 if (vd->vdev_children > 2) { 4942 newvd = vd->vdev_child[1]; 4943 4944 if (newvd->vdev_isspare && last->vdev_isspare && 4945 vdev_dtl_empty(last, DTL_MISSING) && 4946 vdev_dtl_empty(last, DTL_OUTAGE) && 4947 !vdev_dtl_required(newvd)) 4948 return (newvd); 4949 } 4950 } 4951 4952 return (NULL); 4953} 4954 4955static void 4956spa_vdev_resilver_done(spa_t *spa) 4957{ 4958 vdev_t *vd, *pvd, *ppvd; 4959 uint64_t guid, sguid, pguid, ppguid; 4960 4961 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4962 4963 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4964 pvd = vd->vdev_parent; 4965 ppvd = pvd->vdev_parent; 4966 guid = vd->vdev_guid; 4967 pguid = pvd->vdev_guid; 4968 ppguid = ppvd->vdev_guid; 4969 sguid = 0; 4970 /* 4971 * If we have just finished replacing a hot spared device, then 4972 * we need to detach the parent's first child (the original hot 4973 * spare) as well. 
4974 */ 4975 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4976 ppvd->vdev_children == 2) { 4977 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4978 sguid = ppvd->vdev_child[1]->vdev_guid; 4979 } 4980 spa_config_exit(spa, SCL_ALL, FTAG); 4981 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4982 return; 4983 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4984 return; 4985 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4986 } 4987 4988 spa_config_exit(spa, SCL_ALL, FTAG); 4989} 4990 4991/* 4992 * Update the stored path or FRU for this vdev. 4993 */ 4994int 4995spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4996 boolean_t ispath) 4997{ 4998 vdev_t *vd; 4999 boolean_t sync = B_FALSE; 5000 5001 ASSERT(spa_writeable(spa)); 5002 5003 spa_vdev_state_enter(spa, SCL_ALL); 5004 5005 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5006 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5007 5008 if (!vd->vdev_ops->vdev_op_leaf) 5009 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5010 5011 if (ispath) { 5012 if (strcmp(value, vd->vdev_path) != 0) { 5013 spa_strfree(vd->vdev_path); 5014 vd->vdev_path = spa_strdup(value); 5015 sync = B_TRUE; 5016 } 5017 } else { 5018 if (vd->vdev_fru == NULL) { 5019 vd->vdev_fru = spa_strdup(value); 5020 sync = B_TRUE; 5021 } else if (strcmp(value, vd->vdev_fru) != 0) { 5022 spa_strfree(vd->vdev_fru); 5023 vd->vdev_fru = spa_strdup(value); 5024 sync = B_TRUE; 5025 } 5026 } 5027 5028 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5029} 5030 5031int 5032spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5033{ 5034 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5035} 5036 5037int 5038spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5039{ 5040 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5041} 5042 5043/* 5044 * ========================================================================== 5045 * SPA Scanning 5046 * ========================================================================== 5047 */ 5048 5049int 5050spa_scan_stop(spa_t *spa) 5051{ 5052 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5053 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5054 return (EBUSY); 5055 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5056} 5057 5058int 5059spa_scan(spa_t *spa, pool_scan_func_t func) 5060{ 5061 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5062 5063 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5064 return (ENOTSUP); 5065 5066 /* 5067 * If a resilver was requested, but there is no DTL on a 5068 * writeable leaf device, we have nothing to do. 
5069 */ 5070 if (func == POOL_SCAN_RESILVER && 5071 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5072 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5073 return (0); 5074 } 5075 5076 return (dsl_scan(spa->spa_dsl_pool, func)); 5077} 5078 5079/* 5080 * ========================================================================== 5081 * SPA async task processing 5082 * ========================================================================== 5083 */ 5084 5085static void 5086spa_async_remove(spa_t *spa, vdev_t *vd) 5087{ 5088 if (vd->vdev_remove_wanted) { 5089 vd->vdev_remove_wanted = B_FALSE; 5090 vd->vdev_delayed_close = B_FALSE; 5091 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5092 5093 /* 5094 * We want to clear the stats, but we don't want to do a full 5095 * vdev_clear() as that will cause us to throw away 5096 * degraded/faulted state as well as attempt to reopen the 5097 * device, all of which is a waste. 5098 */ 5099 vd->vdev_stat.vs_read_errors = 0; 5100 vd->vdev_stat.vs_write_errors = 0; 5101 vd->vdev_stat.vs_checksum_errors = 0; 5102 5103 vdev_state_dirty(vd->vdev_top); 5104 } 5105 5106 for (int c = 0; c < vd->vdev_children; c++) 5107 spa_async_remove(spa, vd->vdev_child[c]); 5108} 5109 5110static void 5111spa_async_probe(spa_t *spa, vdev_t *vd) 5112{ 5113 if (vd->vdev_probe_wanted) { 5114 vd->vdev_probe_wanted = B_FALSE; 5115 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5116 } 5117 5118 for (int c = 0; c < vd->vdev_children; c++) 5119 spa_async_probe(spa, vd->vdev_child[c]); 5120} 5121 5122static void 5123spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5124{ 5125 sysevent_id_t eid; 5126 nvlist_t *attr; 5127 char *physpath; 5128 5129 if (!spa->spa_autoexpand) 5130 return; 5131 5132 for (int c = 0; c < vd->vdev_children; c++) { 5133 vdev_t *cvd = vd->vdev_child[c]; 5134 spa_async_autoexpand(spa, cvd); 5135 } 5136 5137 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5138 return; 5139 5140 physpath = 
kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5141 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5142 5143 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5144 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5145 5146 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5147 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5148 5149 nvlist_free(attr); 5150 kmem_free(physpath, MAXPATHLEN); 5151} 5152 5153static void 5154spa_async_thread(void *arg) 5155{ 5156 spa_t *spa = arg; 5157 int tasks; 5158 5159 ASSERT(spa->spa_sync_on); 5160 5161 mutex_enter(&spa->spa_async_lock); 5162 tasks = spa->spa_async_tasks; 5163 spa->spa_async_tasks = 0; 5164 mutex_exit(&spa->spa_async_lock); 5165 5166 /* 5167 * See if the config needs to be updated. 5168 */ 5169 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5170 uint64_t old_space, new_space; 5171 5172 mutex_enter(&spa_namespace_lock); 5173 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5174 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5175 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5176 mutex_exit(&spa_namespace_lock); 5177 5178 /* 5179 * If the pool grew as a result of the config update, 5180 * then log an internal history event. 5181 */ 5182 if (new_space != old_space) { 5183 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5184 spa, NULL, 5185 "pool '%s' size: %llu(+%llu)", 5186 spa_name(spa), new_space, new_space - old_space); 5187 } 5188 } 5189 5190 /* 5191 * See if any devices need to be marked REMOVED. 
5192 */ 5193 if (tasks & SPA_ASYNC_REMOVE) { 5194 spa_vdev_state_enter(spa, SCL_NONE); 5195 spa_async_remove(spa, spa->spa_root_vdev); 5196 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5197 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5198 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5199 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5200 (void) spa_vdev_state_exit(spa, NULL, 0); 5201 } 5202 5203 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5204 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5205 spa_async_autoexpand(spa, spa->spa_root_vdev); 5206 spa_config_exit(spa, SCL_CONFIG, FTAG); 5207 } 5208 5209 /* 5210 * See if any devices need to be probed. 5211 */ 5212 if (tasks & SPA_ASYNC_PROBE) { 5213 spa_vdev_state_enter(spa, SCL_NONE); 5214 spa_async_probe(spa, spa->spa_root_vdev); 5215 (void) spa_vdev_state_exit(spa, NULL, 0); 5216 } 5217 5218 /* 5219 * If any devices are done replacing, detach them. 5220 */ 5221 if (tasks & SPA_ASYNC_RESILVER_DONE) 5222 spa_vdev_resilver_done(spa); 5223 5224 /* 5225 * Kick off a resilver. 5226 */ 5227 if (tasks & SPA_ASYNC_RESILVER) 5228 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5229 5230 /* 5231 * Let the world know that we're done. 
5232 */ 5233 mutex_enter(&spa->spa_async_lock); 5234 spa->spa_async_thread = NULL; 5235 cv_broadcast(&spa->spa_async_cv); 5236 mutex_exit(&spa->spa_async_lock); 5237 thread_exit(); 5238} 5239 5240void 5241spa_async_suspend(spa_t *spa) 5242{ 5243 mutex_enter(&spa->spa_async_lock); 5244 spa->spa_async_suspended++; 5245 while (spa->spa_async_thread != NULL) 5246 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5247 mutex_exit(&spa->spa_async_lock); 5248} 5249 5250void 5251spa_async_resume(spa_t *spa) 5252{ 5253 mutex_enter(&spa->spa_async_lock); 5254 ASSERT(spa->spa_async_suspended != 0); 5255 spa->spa_async_suspended--; 5256 mutex_exit(&spa->spa_async_lock); 5257} 5258 5259static void 5260spa_async_dispatch(spa_t *spa) 5261{ 5262 mutex_enter(&spa->spa_async_lock); 5263 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5264 spa->spa_async_thread == NULL && 5265 rootdir != NULL && !vn_is_readonly(rootdir)) 5266 spa->spa_async_thread = thread_create(NULL, 0, 5267 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5268 mutex_exit(&spa->spa_async_lock); 5269} 5270 5271void 5272spa_async_request(spa_t *spa, int task) 5273{ 5274 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5275 mutex_enter(&spa->spa_async_lock); 5276 spa->spa_async_tasks |= task; 5277 mutex_exit(&spa->spa_async_lock); 5278} 5279 5280/* 5281 * ========================================================================== 5282 * SPA syncing routines 5283 * ========================================================================== 5284 */ 5285 5286static int 5287bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5288{ 5289 bpobj_t *bpo = arg; 5290 bpobj_enqueue(bpo, bp, tx); 5291 return (0); 5292} 5293 5294static int 5295spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5296{ 5297 zio_t *zio = arg; 5298 5299 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5300 zio->io_flags)); 5301 return (0); 5302} 5303 5304static void 
5305spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5306{ 5307 char *packed = NULL; 5308 size_t bufsize; 5309 size_t nvsize = 0; 5310 dmu_buf_t *db; 5311 5312 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5313 5314 /* 5315 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5316 * information. This avoids the dbuf_will_dirty() path and 5317 * saves us a pre-read to get data we don't actually care about. 5318 */ 5319 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5320 packed = kmem_alloc(bufsize, KM_SLEEP); 5321 5322 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5323 KM_SLEEP) == 0); 5324 bzero(packed + nvsize, bufsize - nvsize); 5325 5326 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5327 5328 kmem_free(packed, bufsize); 5329 5330 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5331 dmu_buf_will_dirty(db, tx); 5332 *(uint64_t *)db->db_data = nvsize; 5333 dmu_buf_rele(db, FTAG); 5334} 5335 5336static void 5337spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5338 const char *config, const char *entry) 5339{ 5340 nvlist_t *nvroot; 5341 nvlist_t **list; 5342 int i; 5343 5344 if (!sav->sav_sync) 5345 return; 5346 5347 /* 5348 * Update the MOS nvlist describing the list of available devices. 5349 * spa_validate_aux() will have already made sure this nvlist is 5350 * valid and the vdevs are labeled appropriately. 
5351 */ 5352 if (sav->sav_object == 0) { 5353 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5354 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5355 sizeof (uint64_t), tx); 5356 VERIFY(zap_update(spa->spa_meta_objset, 5357 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5358 &sav->sav_object, tx) == 0); 5359 } 5360 5361 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5362 if (sav->sav_count == 0) { 5363 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5364 } else { 5365 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5366 for (i = 0; i < sav->sav_count; i++) 5367 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5368 B_FALSE, VDEV_CONFIG_L2CACHE); 5369 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5370 sav->sav_count) == 0); 5371 for (i = 0; i < sav->sav_count; i++) 5372 nvlist_free(list[i]); 5373 kmem_free(list, sav->sav_count * sizeof (void *)); 5374 } 5375 5376 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5377 nvlist_free(nvroot); 5378 5379 sav->sav_sync = B_FALSE; 5380} 5381 5382static void 5383spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5384{ 5385 nvlist_t *config; 5386 5387 if (list_is_empty(&spa->spa_config_dirty_list)) 5388 return; 5389 5390 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5391 5392 config = spa_config_generate(spa, spa->spa_root_vdev, 5393 dmu_tx_get_txg(tx), B_FALSE); 5394 5395 spa_config_exit(spa, SCL_STATE, FTAG); 5396 5397 if (spa->spa_config_syncing) 5398 nvlist_free(spa->spa_config_syncing); 5399 spa->spa_config_syncing = config; 5400 5401 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5402} 5403 5404/* 5405 * Set zpool properties. 
5406 */ 5407static void 5408spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5409{ 5410 spa_t *spa = arg1; 5411 objset_t *mos = spa->spa_meta_objset; 5412 nvlist_t *nvp = arg2; 5413 nvpair_t *elem; 5414 uint64_t intval; 5415 char *strval; 5416 zpool_prop_t prop; 5417 const char *propname; 5418 zprop_type_t proptype; 5419 5420 mutex_enter(&spa->spa_props_lock); 5421 5422 elem = NULL; 5423 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5424 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5425 case ZPOOL_PROP_VERSION: 5426 /* 5427 * Only set version for non-zpool-creation cases 5428 * (set/import). spa_create() needs special care 5429 * for version setting. 5430 */ 5431 if (tx->tx_txg != TXG_INITIAL) { 5432 VERIFY(nvpair_value_uint64(elem, 5433 &intval) == 0); 5434 ASSERT(intval <= SPA_VERSION); 5435 ASSERT(intval >= spa_version(spa)); 5436 spa->spa_uberblock.ub_version = intval; 5437 vdev_config_dirty(spa->spa_root_vdev); 5438 } 5439 break; 5440 5441 case ZPOOL_PROP_ALTROOT: 5442 /* 5443 * 'altroot' is a non-persistent property. It should 5444 * have been set temporarily at creation or import time. 5445 */ 5446 ASSERT(spa->spa_root != NULL); 5447 break; 5448 5449 case ZPOOL_PROP_READONLY: 5450 case ZPOOL_PROP_CACHEFILE: 5451 /* 5452 * 'readonly' and 'cachefile' are also non-persisitent 5453 * properties. 5454 */ 5455 break; 5456 case ZPOOL_PROP_COMMENT: 5457 VERIFY(nvpair_value_string(elem, &strval) == 0); 5458 if (spa->spa_comment != NULL) 5459 spa_strfree(spa->spa_comment); 5460 spa->spa_comment = spa_strdup(strval); 5461 /* 5462 * We need to dirty the configuration on all the vdevs 5463 * so that their labels get updated. It's unnecessary 5464 * to do this for pool creation since the vdev's 5465 * configuratoin has already been dirtied. 5466 */ 5467 if (tx->tx_txg != TXG_INITIAL) 5468 vdev_config_dirty(spa->spa_root_vdev); 5469 break; 5470 default: 5471 /* 5472 * Set pool property values in the poolprops mos object. 
5473 */ 5474 if (spa->spa_pool_props_object == 0) { 5475 VERIFY((spa->spa_pool_props_object = 5476 zap_create(mos, DMU_OT_POOL_PROPS, 5477 DMU_OT_NONE, 0, tx)) > 0); 5478 5479 VERIFY(zap_update(mos, 5480 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 5481 8, 1, &spa->spa_pool_props_object, tx) 5482 == 0); 5483 } 5484 5485 /* normalize the property name */ 5486 propname = zpool_prop_to_name(prop); 5487 proptype = zpool_prop_get_type(prop); 5488 5489 if (nvpair_type(elem) == DATA_TYPE_STRING) { 5490 ASSERT(proptype == PROP_TYPE_STRING); 5491 VERIFY(nvpair_value_string(elem, &strval) == 0); 5492 VERIFY(zap_update(mos, 5493 spa->spa_pool_props_object, propname, 5494 1, strlen(strval) + 1, strval, tx) == 0); 5495 5496 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 5497 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5498 5499 if (proptype == PROP_TYPE_INDEX) { 5500 const char *unused; 5501 VERIFY(zpool_prop_index_to_string( 5502 prop, intval, &unused) == 0); 5503 } 5504 VERIFY(zap_update(mos, 5505 spa->spa_pool_props_object, propname, 5506 8, 1, &intval, tx) == 0); 5507 } else { 5508 ASSERT(0); /* not allowed */ 5509 } 5510 5511 switch (prop) { 5512 case ZPOOL_PROP_DELEGATION: 5513 spa->spa_delegation = intval; 5514 break; 5515 case ZPOOL_PROP_BOOTFS: 5516 spa->spa_bootfs = intval; 5517 break; 5518 case ZPOOL_PROP_FAILUREMODE: 5519 spa->spa_failmode = intval; 5520 break; 5521 case ZPOOL_PROP_AUTOEXPAND: 5522 spa->spa_autoexpand = intval; 5523 if (tx->tx_txg != TXG_INITIAL) 5524 spa_async_request(spa, 5525 SPA_ASYNC_AUTOEXPAND); 5526 break; 5527 case ZPOOL_PROP_DEDUPDITTO: 5528 spa->spa_dedup_ditto = intval; 5529 break; 5530 default: 5531 break; 5532 } 5533 } 5534 5535 /* log internal history if this is not a zpool create */ 5536 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 5537 tx->tx_txg != TXG_INITIAL) { 5538 spa_history_log_internal(LOG_POOL_PROPSET, 5539 spa, tx, "%s %lld %s", 5540 nvpair_name(elem), intval, spa_name(spa)); 5541 } 5542 } 5543 5544 
mutex_exit(&spa->spa_props_lock); 5545} 5546 5547/* 5548 * Perform one-time upgrade on-disk changes. spa_version() does not 5549 * reflect the new version this txg, so there must be no changes this 5550 * txg to anything that the upgrade code depends on after it executes. 5551 * Therefore this must be called after dsl_pool_sync() does the sync 5552 * tasks. 5553 */ 5554static void 5555spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5556{ 5557 dsl_pool_t *dp = spa->spa_dsl_pool; 5558 5559 ASSERT(spa->spa_sync_pass == 1); 5560 5561 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5562 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5563 dsl_pool_create_origin(dp, tx); 5564 5565 /* Keeping the origin open increases spa_minref */ 5566 spa->spa_minref += 3; 5567 } 5568 5569 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5570 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5571 dsl_pool_upgrade_clones(dp, tx); 5572 } 5573 5574 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5575 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5576 dsl_pool_upgrade_dir_clones(dp, tx); 5577 5578 /* Keeping the freedir open increases spa_minref */ 5579 spa->spa_minref += 3; 5580 } 5581} 5582 5583/* 5584 * Sync the specified transaction group. New blocks may be dirtied as 5585 * part of the process, so we iterate until it converges. 5586 */ 5587void 5588spa_sync(spa_t *spa, uint64_t txg) 5589{ 5590 dsl_pool_t *dp = spa->spa_dsl_pool; 5591 objset_t *mos = spa->spa_meta_objset; 5592 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5593 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5594 vdev_t *rvd = spa->spa_root_vdev; 5595 vdev_t *vd; 5596 dmu_tx_t *tx; 5597 int error; 5598 5599 VERIFY(spa_writeable(spa)); 5600 5601 /* 5602 * Lock out configuration changes. 
5603 */ 5604 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5605 5606 spa->spa_syncing_txg = txg; 5607 spa->spa_sync_pass = 0; 5608 5609 /* 5610 * If there are any pending vdev state changes, convert them 5611 * into config changes that go out with this transaction group. 5612 */ 5613 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5614 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5615 /* 5616 * We need the write lock here because, for aux vdevs, 5617 * calling vdev_config_dirty() modifies sav_config. 5618 * This is ugly and will become unnecessary when we 5619 * eliminate the aux vdev wart by integrating all vdevs 5620 * into the root vdev tree. 5621 */ 5622 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5623 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5624 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5625 vdev_state_clean(vd); 5626 vdev_config_dirty(vd); 5627 } 5628 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5629 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5630 } 5631 spa_config_exit(spa, SCL_STATE, FTAG); 5632 5633 tx = dmu_tx_create_assigned(dp, txg); 5634 5635 /* 5636 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5637 * set spa_deflate if we have no raid-z vdevs. 
5638 */ 5639 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5640 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5641 int i; 5642 5643 for (i = 0; i < rvd->vdev_children; i++) { 5644 vd = rvd->vdev_child[i]; 5645 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5646 break; 5647 } 5648 if (i == rvd->vdev_children) { 5649 spa->spa_deflate = TRUE; 5650 VERIFY(0 == zap_add(spa->spa_meta_objset, 5651 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5652 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5653 } 5654 } 5655 5656 /* 5657 * If anything has changed in this txg, or if someone is waiting 5658 * for this txg to sync (eg, spa_vdev_remove()), push the 5659 * deferred frees from the previous txg. If not, leave them 5660 * alone so that we don't generate work on an otherwise idle 5661 * system. 5662 */ 5663 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5664 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5665 !txg_list_empty(&dp->dp_sync_tasks, txg) || 5666 ((dsl_scan_active(dp->dp_scan) || 5667 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 5668 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5669 VERIFY3U(bpobj_iterate(defer_bpo, 5670 spa_free_sync_cb, zio, tx), ==, 0); 5671 VERIFY3U(zio_wait(zio), ==, 0); 5672 } 5673 5674 /* 5675 * Iterate to convergence. 
5676 */ 5677 do { 5678 int pass = ++spa->spa_sync_pass; 5679 5680 spa_sync_config_object(spa, tx); 5681 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5682 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5683 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5684 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5685 spa_errlog_sync(spa, txg); 5686 dsl_pool_sync(dp, txg); 5687 5688 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5689 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5690 bplist_iterate(free_bpl, spa_free_sync_cb, 5691 zio, tx); 5692 VERIFY(zio_wait(zio) == 0); 5693 } else { 5694 bplist_iterate(free_bpl, bpobj_enqueue_cb, 5695 defer_bpo, tx); 5696 } 5697 5698 ddt_sync(spa, txg); 5699 dsl_scan_sync(dp, tx); 5700 5701 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5702 vdev_sync(vd, txg); 5703 5704 if (pass == 1) 5705 spa_sync_upgrades(spa, tx); 5706 5707 } while (dmu_objset_is_dirty(mos, txg)); 5708 5709 /* 5710 * Rewrite the vdev configuration (which includes the uberblock) 5711 * to commit the transaction group. 5712 * 5713 * If there are no dirty vdevs, we sync the uberblock to a few 5714 * random top-level vdevs that are known to be visible in the 5715 * config cache (see spa_vdev_add() for a complete description). 5716 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5717 */ 5718 for (;;) { 5719 /* 5720 * We hold SCL_STATE to prevent vdev open/close/etc. 5721 * while we're attempting to write the vdev labels. 
5722 */ 5723 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5724 5725 if (list_is_empty(&spa->spa_config_dirty_list)) { 5726 vdev_t *svd[SPA_DVAS_PER_BP]; 5727 int svdcount = 0; 5728 int children = rvd->vdev_children; 5729 int c0 = spa_get_random(children); 5730 5731 for (int c = 0; c < children; c++) { 5732 vd = rvd->vdev_child[(c0 + c) % children]; 5733 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5734 continue; 5735 svd[svdcount++] = vd; 5736 if (svdcount == SPA_DVAS_PER_BP) 5737 break; 5738 } 5739 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5740 if (error != 0) 5741 error = vdev_config_sync(svd, svdcount, txg, 5742 B_TRUE); 5743 } else { 5744 error = vdev_config_sync(rvd->vdev_child, 5745 rvd->vdev_children, txg, B_FALSE); 5746 if (error != 0) 5747 error = vdev_config_sync(rvd->vdev_child, 5748 rvd->vdev_children, txg, B_TRUE); 5749 } 5750 5751 spa_config_exit(spa, SCL_STATE, FTAG); 5752 5753 if (error == 0) 5754 break; 5755 zio_suspend(spa, NULL); 5756 zio_resume_wait(spa); 5757 } 5758 dmu_tx_commit(tx); 5759 5760 /* 5761 * Clear the dirty config list. 5762 */ 5763 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5764 vdev_config_clean(vd); 5765 5766 /* 5767 * Now that the new config has synced transactionally, 5768 * let it become visible to the config cache. 5769 */ 5770 if (spa->spa_config_syncing != NULL) { 5771 spa_config_set(spa, spa->spa_config_syncing); 5772 spa->spa_config_txg = txg; 5773 spa->spa_config_syncing = NULL; 5774 } 5775 5776 spa->spa_ubsync = spa->spa_uberblock; 5777 5778 dsl_pool_sync_done(dp, txg); 5779 5780 /* 5781 * Update usable space statistics. 5782 */ 5783 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5784 vdev_sync_done(vd, txg); 5785 5786 spa_update_dspace(spa); 5787 5788 /* 5789 * It had better be the case that we didn't dirty anything 5790 * since vdev_config_sync(). 
5791 */ 5792 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5793 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5794 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5795 5796 spa->spa_sync_pass = 0; 5797 5798 spa_config_exit(spa, SCL_CONFIG, FTAG); 5799 5800 spa_handle_ignored_writes(spa); 5801 5802 /* 5803 * If any async tasks have been requested, kick them off. 5804 */ 5805 spa_async_dispatch(spa); 5806} 5807 5808/* 5809 * Sync all pools. We don't want to hold the namespace lock across these 5810 * operations, so we take a reference on the spa_t and drop the lock during the 5811 * sync. 5812 */ 5813void 5814spa_sync_allpools(void) 5815{ 5816 spa_t *spa = NULL; 5817 mutex_enter(&spa_namespace_lock); 5818 while ((spa = spa_next(spa)) != NULL) { 5819 if (spa_state(spa) != POOL_STATE_ACTIVE || 5820 !spa_writeable(spa) || spa_suspended(spa)) 5821 continue; 5822 spa_open_ref(spa, FTAG); 5823 mutex_exit(&spa_namespace_lock); 5824 txg_wait_synced(spa_get_dsl(spa), 0); 5825 mutex_enter(&spa_namespace_lock); 5826 spa_close(spa, FTAG); 5827 } 5828 mutex_exit(&spa_namespace_lock); 5829} 5830 5831/* 5832 * ========================================================================== 5833 * Miscellaneous routines 5834 * ========================================================================== 5835 */ 5836 5837/* 5838 * Remove all pools in the system. 5839 */ 5840void 5841spa_evict_all(void) 5842{ 5843 spa_t *spa; 5844 5845 /* 5846 * Remove all cached state. All pools should be closed now, 5847 * so every spa in the AVL tree should be unreferenced. 5848 */ 5849 mutex_enter(&spa_namespace_lock); 5850 while ((spa = spa_next(NULL)) != NULL) { 5851 /* 5852 * Stop async tasks. The async thread may need to detach 5853 * a device that's been replaced, which requires grabbing 5854 * spa_namespace_lock, so we must drop it here. 
5855 */ 5856 spa_open_ref(spa, FTAG); 5857 mutex_exit(&spa_namespace_lock); 5858 spa_async_suspend(spa); 5859 mutex_enter(&spa_namespace_lock); 5860 spa_close(spa, FTAG); 5861 5862 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5863 spa_unload(spa); 5864 spa_deactivate(spa); 5865 } 5866 spa_remove(spa); 5867 } 5868 mutex_exit(&spa_namespace_lock); 5869} 5870 5871vdev_t * 5872spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5873{ 5874 vdev_t *vd; 5875 int i; 5876 5877 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5878 return (vd); 5879 5880 if (aux) { 5881 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5882 vd = spa->spa_l2cache.sav_vdevs[i]; 5883 if (vd->vdev_guid == guid) 5884 return (vd); 5885 } 5886 5887 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5888 vd = spa->spa_spares.sav_vdevs[i]; 5889 if (vd->vdev_guid == guid) 5890 return (vd); 5891 } 5892 } 5893 5894 return (NULL); 5895} 5896 5897void 5898spa_upgrade(spa_t *spa, uint64_t version) 5899{ 5900 ASSERT(spa_writeable(spa)); 5901 5902 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5903 5904 /* 5905 * This should only be called for a non-faulted pool, and since a 5906 * future version would result in an unopenable pool, this shouldn't be 5907 * possible. 
5908 */ 5909 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5910 ASSERT(version >= spa->spa_uberblock.ub_version); 5911 5912 spa->spa_uberblock.ub_version = version; 5913 vdev_config_dirty(spa->spa_root_vdev); 5914 5915 spa_config_exit(spa, SCL_ALL, FTAG); 5916 5917 txg_wait_synced(spa_get_dsl(spa), 0); 5918} 5919 5920boolean_t 5921spa_has_spare(spa_t *spa, uint64_t guid) 5922{ 5923 int i; 5924 uint64_t spareguid; 5925 spa_aux_vdev_t *sav = &spa->spa_spares; 5926 5927 for (i = 0; i < sav->sav_count; i++) 5928 if (sav->sav_vdevs[i]->vdev_guid == guid) 5929 return (B_TRUE); 5930 5931 for (i = 0; i < sav->sav_npending; i++) { 5932 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5933 &spareguid) == 0 && spareguid == guid) 5934 return (B_TRUE); 5935 } 5936 5937 return (B_FALSE); 5938} 5939 5940/* 5941 * Check if a pool has an active shared spare device. 5942 * Note: reference count of an active spare is 2, as a spare and as a replace 5943 */ 5944static boolean_t 5945spa_has_active_shared_spare(spa_t *spa) 5946{ 5947 int i, refcnt; 5948 uint64_t pool; 5949 spa_aux_vdev_t *sav = &spa->spa_spares; 5950 5951 for (i = 0; i < sav->sav_count; i++) { 5952 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 5953 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 5954 refcnt > 2) 5955 return (B_TRUE); 5956 } 5957 5958 return (B_FALSE); 5959} 5960 5961/* 5962 * Post a sysevent corresponding to the given event. The 'name' must be one of 5963 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 5964 * filled in from the spa and (optionally) the vdev. This doesn't do anything 5965 * in the userland libzpool, as we don't want consumers to misinterpret ztest 5966 * or zdb as real changes. 
5967 */ 5968void 5969spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 5970{ 5971#ifdef _KERNEL 5972 sysevent_t *ev; 5973 sysevent_attr_list_t *attr = NULL; 5974 sysevent_value_t value; 5975 sysevent_id_t eid; 5976 5977 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5978 SE_SLEEP); 5979 5980 value.value_type = SE_DATA_TYPE_STRING; 5981 value.value.sv_string = spa_name(spa); 5982 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5983 goto done; 5984 5985 value.value_type = SE_DATA_TYPE_UINT64; 5986 value.value.sv_uint64 = spa_guid(spa); 5987 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5988 goto done; 5989 5990 if (vd) { 5991 value.value_type = SE_DATA_TYPE_UINT64; 5992 value.value.sv_uint64 = vd->vdev_guid; 5993 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5994 SE_SLEEP) != 0) 5995 goto done; 5996 5997 if (vd->vdev_path) { 5998 value.value_type = SE_DATA_TYPE_STRING; 5999 value.value.sv_string = vd->vdev_path; 6000 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6001 &value, SE_SLEEP) != 0) 6002 goto done; 6003 } 6004 } 6005 6006 if (sysevent_attach_attributes(ev, attr) != 0) 6007 goto done; 6008 attr = NULL; 6009 6010 (void) log_sysevent(ev, SE_SLEEP, &eid); 6011 6012done: 6013 if (attr) 6014 sysevent_free_attr(attr); 6015 sysevent_free(ev); 6016#endif 6017}
| 3934 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3935 3936 if (newrootvd->vdev_children != 1) 3937 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3938 3939 newvd = newrootvd->vdev_child[0]; 3940 3941 if (!newvd->vdev_ops->vdev_op_leaf) 3942 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3943 3944 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3945 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3946 3947 /* 3948 * Spares can't replace logs 3949 */ 3950 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3951 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3952 3953 if (!replacing) { 3954 /* 3955 * For attach, the only allowable parent is a mirror or the root 3956 * vdev. 3957 */ 3958 if (pvd->vdev_ops != &vdev_mirror_ops && 3959 pvd->vdev_ops != &vdev_root_ops) 3960 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3961 3962 pvops = &vdev_mirror_ops; 3963 } else { 3964 /* 3965 * Active hot spares can only be replaced by inactive hot 3966 * spares. 3967 */ 3968 if (pvd->vdev_ops == &vdev_spare_ops && 3969 oldvd->vdev_isspare && 3970 !spa_has_spare(spa, newvd->vdev_guid)) 3971 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3972 3973 /* 3974 * If the source is a hot spare, and the parent isn't already a 3975 * spare, then we want to create a new hot spare. Otherwise, we 3976 * want to create a replacing vdev. The user is not allowed to 3977 * attach to a spared vdev child unless the 'isspare' state is 3978 * the same (spare replaces spare, non-spare replaces 3979 * non-spare). 
3980 */ 3981 if (pvd->vdev_ops == &vdev_replacing_ops && 3982 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3983 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3984 } else if (pvd->vdev_ops == &vdev_spare_ops && 3985 newvd->vdev_isspare != oldvd->vdev_isspare) { 3986 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3987 } 3988 3989 if (newvd->vdev_isspare) 3990 pvops = &vdev_spare_ops; 3991 else 3992 pvops = &vdev_replacing_ops; 3993 } 3994 3995 /* 3996 * Make sure the new device is big enough. 3997 */ 3998 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3999 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4000 4001 /* 4002 * The new device cannot have a higher alignment requirement 4003 * than the top-level vdev. 4004 */ 4005 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4006 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4007 4008 /* 4009 * If this is an in-place replacement, update oldvd's path and devid 4010 * to make it distinguishable from newvd, and unopenable from now on. 4011 */ 4012 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4013 spa_strfree(oldvd->vdev_path); 4014 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4015 KM_SLEEP); 4016 (void) sprintf(oldvd->vdev_path, "%s/%s", 4017 newvd->vdev_path, "old"); 4018 if (oldvd->vdev_devid != NULL) { 4019 spa_strfree(oldvd->vdev_devid); 4020 oldvd->vdev_devid = NULL; 4021 } 4022 } 4023 4024 /* mark the device being resilvered */ 4025 newvd->vdev_resilvering = B_TRUE; 4026 4027 /* 4028 * If the parent is not a mirror, or if we're replacing, insert the new 4029 * mirror/replacing/spare vdev above oldvd. 4030 */ 4031 if (pvd->vdev_ops != pvops) 4032 pvd = vdev_add_parent(oldvd, pvops); 4033 4034 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4035 ASSERT(pvd->vdev_ops == pvops); 4036 ASSERT(oldvd->vdev_parent == pvd); 4037 4038 /* 4039 * Extract the new device from its root and add it to pvd. 
4040 */ 4041 vdev_remove_child(newrootvd, newvd); 4042 newvd->vdev_id = pvd->vdev_children; 4043 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4044 vdev_add_child(pvd, newvd); 4045 4046 tvd = newvd->vdev_top; 4047 ASSERT(pvd->vdev_top == tvd); 4048 ASSERT(tvd->vdev_parent == rvd); 4049 4050 vdev_config_dirty(tvd); 4051 4052 /* 4053 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4054 * for any dmu_sync-ed blocks. It will propagate upward when 4055 * spa_vdev_exit() calls vdev_dtl_reassess(). 4056 */ 4057 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4058 4059 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4060 dtl_max_txg - TXG_INITIAL); 4061 4062 if (newvd->vdev_isspare) { 4063 spa_spare_activate(newvd); 4064 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4065 } 4066 4067 oldvdpath = spa_strdup(oldvd->vdev_path); 4068 newvdpath = spa_strdup(newvd->vdev_path); 4069 newvd_isspare = newvd->vdev_isspare; 4070 4071 /* 4072 * Mark newvd's DTL dirty in this txg. 4073 */ 4074 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4075 4076 /* 4077 * Restart the resilver 4078 */ 4079 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4080 4081 /* 4082 * Commit the config 4083 */ 4084 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4085 4086 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4087 "%s vdev=%s %s vdev=%s", 4088 replacing && newvd_isspare ? "spare in" : 4089 replacing ? "replace" : "attach", newvdpath, 4090 replacing ? "for" : "to", oldvdpath); 4091 4092 spa_strfree(oldvdpath); 4093 spa_strfree(newvdpath); 4094 4095 if (spa->spa_bootfs) 4096 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4097 4098 return (0); 4099} 4100 4101/* 4102 * Detach a device from a mirror or replacing vdev. 4103 * If 'replace_done' is specified, only detach if the parent 4104 * is a replacing vdev. 
4105 */ 4106int 4107spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4108{ 4109 uint64_t txg; 4110 int error; 4111 vdev_t *rvd = spa->spa_root_vdev; 4112 vdev_t *vd, *pvd, *cvd, *tvd; 4113 boolean_t unspare = B_FALSE; 4114 uint64_t unspare_guid; 4115 char *vdpath; 4116 4117 ASSERT(spa_writeable(spa)); 4118 4119 txg = spa_vdev_enter(spa); 4120 4121 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4122 4123 if (vd == NULL) 4124 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4125 4126 if (!vd->vdev_ops->vdev_op_leaf) 4127 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4128 4129 pvd = vd->vdev_parent; 4130 4131 /* 4132 * If the parent/child relationship is not as expected, don't do it. 4133 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4134 * vdev that's replacing B with C. The user's intent in replacing 4135 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4136 * the replace by detaching C, the expected behavior is to end up 4137 * M(A,B). But suppose that right after deciding to detach C, 4138 * the replacement of B completes. We would have M(A,C), and then 4139 * ask to detach C, which would leave us with just A -- not what 4140 * the user wanted. To prevent this, we make sure that the 4141 * parent/child relationship hasn't changed -- in this example, 4142 * that C's parent is still the replacing vdev R. 4143 */ 4144 if (pvd->vdev_guid != pguid && pguid != 0) 4145 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4146 4147 /* 4148 * Only 'replacing' or 'spare' vdevs can be replaced. 4149 */ 4150 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4151 pvd->vdev_ops != &vdev_spare_ops) 4152 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4153 4154 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4155 spa_version(spa) >= SPA_VERSION_SPARES); 4156 4157 /* 4158 * Only mirror, replacing, and spare vdevs support detach. 
4159 */ 4160 if (pvd->vdev_ops != &vdev_replacing_ops && 4161 pvd->vdev_ops != &vdev_mirror_ops && 4162 pvd->vdev_ops != &vdev_spare_ops) 4163 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4164 4165 /* 4166 * If this device has the only valid copy of some data, 4167 * we cannot safely detach it. 4168 */ 4169 if (vdev_dtl_required(vd)) 4170 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4171 4172 ASSERT(pvd->vdev_children >= 2); 4173 4174 /* 4175 * If we are detaching the second disk from a replacing vdev, then 4176 * check to see if we changed the original vdev's path to have "/old" 4177 * at the end in spa_vdev_attach(). If so, undo that change now. 4178 */ 4179 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4180 vd->vdev_path != NULL) { 4181 size_t len = strlen(vd->vdev_path); 4182 4183 for (int c = 0; c < pvd->vdev_children; c++) { 4184 cvd = pvd->vdev_child[c]; 4185 4186 if (cvd == vd || cvd->vdev_path == NULL) 4187 continue; 4188 4189 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4190 strcmp(cvd->vdev_path + len, "/old") == 0) { 4191 spa_strfree(cvd->vdev_path); 4192 cvd->vdev_path = spa_strdup(vd->vdev_path); 4193 break; 4194 } 4195 } 4196 } 4197 4198 /* 4199 * If we are detaching the original disk from a spare, then it implies 4200 * that the spare should become a real disk, and be removed from the 4201 * active spare list for the pool. 4202 */ 4203 if (pvd->vdev_ops == &vdev_spare_ops && 4204 vd->vdev_id == 0 && 4205 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4206 unspare = B_TRUE; 4207 4208 /* 4209 * Erase the disk labels so the disk can be used for other things. 4210 * This must be done after all other error cases are handled, 4211 * but before we disembowel vd (so we can still do I/O to it). 4212 * But if we can't do it, don't treat the error as fatal -- 4213 * it may be that the unwritability of the disk is the reason 4214 * it's being detached! 
4215 */ 4216 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4217 4218 /* 4219 * Remove vd from its parent and compact the parent's children. 4220 */ 4221 vdev_remove_child(pvd, vd); 4222 vdev_compact_children(pvd); 4223 4224 /* 4225 * Remember one of the remaining children so we can get tvd below. 4226 */ 4227 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4228 4229 /* 4230 * If we need to remove the remaining child from the list of hot spares, 4231 * do it now, marking the vdev as no longer a spare in the process. 4232 * We must do this before vdev_remove_parent(), because that can 4233 * change the GUID if it creates a new toplevel GUID. For a similar 4234 * reason, we must remove the spare now, in the same txg as the detach; 4235 * otherwise someone could attach a new sibling, change the GUID, and 4236 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4237 */ 4238 if (unspare) { 4239 ASSERT(cvd->vdev_isspare); 4240 spa_spare_remove(cvd); 4241 unspare_guid = cvd->vdev_guid; 4242 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4243 cvd->vdev_unspare = B_TRUE; 4244 } 4245 4246 /* 4247 * If the parent mirror/replacing vdev only has one child, 4248 * the parent is no longer needed. Remove it from the tree. 4249 */ 4250 if (pvd->vdev_children == 1) { 4251 if (pvd->vdev_ops == &vdev_spare_ops) 4252 cvd->vdev_unspare = B_FALSE; 4253 vdev_remove_parent(cvd); 4254 cvd->vdev_resilvering = B_FALSE; 4255 } 4256 4257 4258 /* 4259 * We don't set tvd until now because the parent we just removed 4260 * may have been the previous top-level vdev. 4261 */ 4262 tvd = cvd->vdev_top; 4263 ASSERT(tvd->vdev_parent == rvd); 4264 4265 /* 4266 * Reevaluate the parent vdev state. 4267 */ 4268 vdev_propagate_state(cvd); 4269 4270 /* 4271 * If the 'autoexpand' property is set on the pool then automatically 4272 * try to expand the size of the pool. 
For example if the device we 4273 * just detached was smaller than the others, it may be possible to 4274 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4275 * first so that we can obtain the updated sizes of the leaf vdevs. 4276 */ 4277 if (spa->spa_autoexpand) { 4278 vdev_reopen(tvd); 4279 vdev_expand(tvd, txg); 4280 } 4281 4282 vdev_config_dirty(tvd); 4283 4284 /* 4285 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4286 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4287 * But first make sure we're not on any *other* txg's DTL list, to 4288 * prevent vd from being accessed after it's freed. 4289 */ 4290 vdpath = spa_strdup(vd->vdev_path); 4291 for (int t = 0; t < TXG_SIZE; t++) 4292 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4293 vd->vdev_detached = B_TRUE; 4294 vdev_dirty(tvd, VDD_DTL, vd, txg); 4295 4296 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4297 4298 /* hang on to the spa before we release the lock */ 4299 spa_open_ref(spa, FTAG); 4300 4301 error = spa_vdev_exit(spa, vd, txg, 0); 4302 4303 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4304 "vdev=%s", vdpath); 4305 spa_strfree(vdpath); 4306 4307 /* 4308 * If this was the removal of the original device in a hot spare vdev, 4309 * then we want to go through and remove the device from the hot spare 4310 * list of every other pool. 
4311 */ 4312 if (unspare) { 4313 spa_t *altspa = NULL; 4314 4315 mutex_enter(&spa_namespace_lock); 4316 while ((altspa = spa_next(altspa)) != NULL) { 4317 if (altspa->spa_state != POOL_STATE_ACTIVE || 4318 altspa == spa) 4319 continue; 4320 4321 spa_open_ref(altspa, FTAG); 4322 mutex_exit(&spa_namespace_lock); 4323 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4324 mutex_enter(&spa_namespace_lock); 4325 spa_close(altspa, FTAG); 4326 } 4327 mutex_exit(&spa_namespace_lock); 4328 4329 /* search the rest of the vdevs for spares to remove */ 4330 spa_vdev_resilver_done(spa); 4331 } 4332 4333 /* all done with the spa; OK to release */ 4334 mutex_enter(&spa_namespace_lock); 4335 spa_close(spa, FTAG); 4336 mutex_exit(&spa_namespace_lock); 4337 4338 return (error); 4339} 4340 4341/* 4342 * Split a set of devices from their mirrors, and create a new pool from them. 4343 */ 4344int 4345spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4346 nvlist_t *props, boolean_t exp) 4347{ 4348 int error = 0; 4349 uint64_t txg, *glist; 4350 spa_t *newspa; 4351 uint_t c, children, lastlog; 4352 nvlist_t **child, *nvl, *tmp; 4353 dmu_tx_t *tx; 4354 char *altroot = NULL; 4355 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4356 boolean_t activate_slog; 4357 4358 ASSERT(spa_writeable(spa)); 4359 4360 txg = spa_vdev_enter(spa); 4361 4362 /* clear the log and flush everything up to now */ 4363 activate_slog = spa_passivate_log(spa); 4364 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4365 error = spa_offline_log(spa); 4366 txg = spa_vdev_config_enter(spa); 4367 4368 if (activate_slog) 4369 spa_activate_log(spa); 4370 4371 if (error != 0) 4372 return (spa_vdev_exit(spa, NULL, txg, error)); 4373 4374 /* check new spa name before going any further */ 4375 if (spa_lookup(newname) != NULL) 4376 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4377 4378 /* 4379 * scan through all the children to ensure they're all mirrors 4380 */ 4381 if (nvlist_lookup_nvlist(config, 
ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4382 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4383 &children) != 0) 4384 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4385 4386 /* first, check to ensure we've got the right child count */ 4387 rvd = spa->spa_root_vdev; 4388 lastlog = 0; 4389 for (c = 0; c < rvd->vdev_children; c++) { 4390 vdev_t *vd = rvd->vdev_child[c]; 4391 4392 /* don't count the holes & logs as children */ 4393 if (vd->vdev_islog || vd->vdev_ishole) { 4394 if (lastlog == 0) 4395 lastlog = c; 4396 continue; 4397 } 4398 4399 lastlog = 0; 4400 } 4401 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4402 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4403 4404 /* next, ensure no spare or cache devices are part of the split */ 4405 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4406 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4407 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4408 4409 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4410 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4411 4412 /* then, loop over each vdev and validate it */ 4413 for (c = 0; c < children; c++) { 4414 uint64_t is_hole = 0; 4415 4416 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4417 &is_hole); 4418 4419 if (is_hole != 0) { 4420 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4421 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4422 continue; 4423 } else { 4424 error = EINVAL; 4425 break; 4426 } 4427 } 4428 4429 /* which disk is going to be split? 
*/ 4430 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4431 &glist[c]) != 0) { 4432 error = EINVAL; 4433 break; 4434 } 4435 4436 /* look it up in the spa */ 4437 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4438 if (vml[c] == NULL) { 4439 error = ENODEV; 4440 break; 4441 } 4442 4443 /* make sure there's nothing stopping the split */ 4444 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4445 vml[c]->vdev_islog || 4446 vml[c]->vdev_ishole || 4447 vml[c]->vdev_isspare || 4448 vml[c]->vdev_isl2cache || 4449 !vdev_writeable(vml[c]) || 4450 vml[c]->vdev_children != 0 || 4451 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4452 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4453 error = EINVAL; 4454 break; 4455 } 4456 4457 if (vdev_dtl_required(vml[c])) { 4458 error = EBUSY; 4459 break; 4460 } 4461 4462 /* we need certain info from the top level */ 4463 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4464 vml[c]->vdev_top->vdev_ms_array) == 0); 4465 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4466 vml[c]->vdev_top->vdev_ms_shift) == 0); 4467 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4468 vml[c]->vdev_top->vdev_asize) == 0); 4469 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4470 vml[c]->vdev_top->vdev_ashift) == 0); 4471 } 4472 4473 if (error != 0) { 4474 kmem_free(vml, children * sizeof (vdev_t *)); 4475 kmem_free(glist, children * sizeof (uint64_t)); 4476 return (spa_vdev_exit(spa, NULL, txg, error)); 4477 } 4478 4479 /* stop writers from using the disks */ 4480 for (c = 0; c < children; c++) { 4481 if (vml[c] != NULL) 4482 vml[c]->vdev_offline = B_TRUE; 4483 } 4484 vdev_reopen(spa->spa_root_vdev); 4485 4486 /* 4487 * Temporarily record the splitting vdevs in the spa config. This 4488 * will disappear once the config is regenerated. 
4489 */ 4490 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4491 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4492 glist, children) == 0); 4493 kmem_free(glist, children * sizeof (uint64_t)); 4494 4495 mutex_enter(&spa->spa_props_lock); 4496 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4497 nvl) == 0); 4498 mutex_exit(&spa->spa_props_lock); 4499 spa->spa_config_splitting = nvl; 4500 vdev_config_dirty(spa->spa_root_vdev); 4501 4502 /* configure and create the new pool */ 4503 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4504 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4505 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4506 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4507 spa_version(spa)) == 0); 4508 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4509 spa->spa_config_txg) == 0); 4510 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4511 spa_generate_guid(NULL)) == 0); 4512 (void) nvlist_lookup_string(props, 4513 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4514 4515 /* add the new pool to the namespace */ 4516 newspa = spa_add(newname, config, altroot); 4517 newspa->spa_config_txg = spa->spa_config_txg; 4518 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4519 4520 /* release the spa config lock, retaining the namespace lock */ 4521 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4522 4523 if (zio_injection_enabled) 4524 zio_handle_panic_injection(spa, FTAG, 1); 4525 4526 spa_activate(newspa, spa_mode_global); 4527 spa_async_suspend(newspa); 4528 4529#ifndef sun 4530 /* mark that we are creating new spa by splitting */ 4531 newspa->spa_splitting_newspa = B_TRUE; 4532#endif 4533 /* create the new pool from the disks of the original pool */ 4534 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4535#ifndef sun 4536 newspa->spa_splitting_newspa = B_FALSE; 4537#endif 4538 if (error) 4539 goto out; 4540 4541 /* if that worked, 
generate a real config for the new pool */ 4542 if (newspa->spa_root_vdev != NULL) { 4543 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4544 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4545 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4546 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4547 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4548 B_TRUE)); 4549 } 4550 4551 /* set the props */ 4552 if (props != NULL) { 4553 spa_configfile_set(newspa, props, B_FALSE); 4554 error = spa_prop_set(newspa, props); 4555 if (error) 4556 goto out; 4557 } 4558 4559 /* flush everything */ 4560 txg = spa_vdev_config_enter(newspa); 4561 vdev_config_dirty(newspa->spa_root_vdev); 4562 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4563 4564 if (zio_injection_enabled) 4565 zio_handle_panic_injection(spa, FTAG, 2); 4566 4567 spa_async_resume(newspa); 4568 4569 /* finally, update the original pool's config */ 4570 txg = spa_vdev_config_enter(spa); 4571 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4572 error = dmu_tx_assign(tx, TXG_WAIT); 4573 if (error != 0) 4574 dmu_tx_abort(tx); 4575 for (c = 0; c < children; c++) { 4576 if (vml[c] != NULL) { 4577 vdev_split(vml[c]); 4578 if (error == 0) 4579 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4580 spa, tx, "vdev=%s", 4581 vml[c]->vdev_path); 4582 vdev_free(vml[c]); 4583 } 4584 } 4585 vdev_config_dirty(spa->spa_root_vdev); 4586 spa->spa_config_splitting = NULL; 4587 nvlist_free(nvl); 4588 if (error == 0) 4589 dmu_tx_commit(tx); 4590 (void) spa_vdev_exit(spa, NULL, txg, 0); 4591 4592 if (zio_injection_enabled) 4593 zio_handle_panic_injection(spa, FTAG, 3); 4594 4595 /* split is complete; log a history record */ 4596 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4597 "split new pool %s from pool %s", newname, spa_name(spa)); 4598 4599 kmem_free(vml, children * sizeof (vdev_t *)); 4600 4601 /* if we're not going to mount the filesystems in userland, export */ 4602 if (exp) 4603 error = 
spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4604 B_FALSE, B_FALSE); 4605 4606 return (error); 4607 4608out: 4609 spa_unload(newspa); 4610 spa_deactivate(newspa); 4611 spa_remove(newspa); 4612 4613 txg = spa_vdev_config_enter(spa); 4614 4615 /* re-online all offlined disks */ 4616 for (c = 0; c < children; c++) { 4617 if (vml[c] != NULL) 4618 vml[c]->vdev_offline = B_FALSE; 4619 } 4620 vdev_reopen(spa->spa_root_vdev); 4621 4622 nvlist_free(spa->spa_config_splitting); 4623 spa->spa_config_splitting = NULL; 4624 (void) spa_vdev_exit(spa, NULL, txg, error); 4625 4626 kmem_free(vml, children * sizeof (vdev_t *)); 4627 return (error); 4628} 4629 4630static nvlist_t * 4631spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4632{ 4633 for (int i = 0; i < count; i++) { 4634 uint64_t guid; 4635 4636 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4637 &guid) == 0); 4638 4639 if (guid == target_guid) 4640 return (nvpp[i]); 4641 } 4642 4643 return (NULL); 4644} 4645 4646static void 4647spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4648 nvlist_t *dev_to_remove) 4649{ 4650 nvlist_t **newdev = NULL; 4651 4652 if (count > 1) 4653 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4654 4655 for (int i = 0, j = 0; i < count; i++) { 4656 if (dev[i] == dev_to_remove) 4657 continue; 4658 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4659 } 4660 4661 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4662 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4663 4664 for (int i = 0; i < count - 1; i++) 4665 nvlist_free(newdev[i]); 4666 4667 if (count > 1) 4668 kmem_free(newdev, (count - 1) * sizeof (void *)); 4669} 4670 4671/* 4672 * Evacuate the device. 
4673 */ 4674static int 4675spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4676{ 4677 uint64_t txg; 4678 int error = 0; 4679 4680 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4681 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4682 ASSERT(vd == vd->vdev_top); 4683 4684 /* 4685 * Evacuate the device. We don't hold the config lock as writer 4686 * since we need to do I/O but we do keep the 4687 * spa_namespace_lock held. Once this completes the device 4688 * should no longer have any blocks allocated on it. 4689 */ 4690 if (vd->vdev_islog) { 4691 if (vd->vdev_stat.vs_alloc != 0) 4692 error = spa_offline_log(spa); 4693 } else { 4694 error = ENOTSUP; 4695 } 4696 4697 if (error) 4698 return (error); 4699 4700 /* 4701 * The evacuation succeeded. Remove any remaining MOS metadata 4702 * associated with this vdev, and wait for these changes to sync. 4703 */ 4704 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4705 txg = spa_vdev_config_enter(spa); 4706 vd->vdev_removing = B_TRUE; 4707 vdev_dirty(vd, 0, NULL, txg); 4708 vdev_config_dirty(vd); 4709 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4710 4711 return (0); 4712} 4713 4714/* 4715 * Complete the removal by cleaning up the namespace. 4716 */ 4717static void 4718spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4719{ 4720 vdev_t *rvd = spa->spa_root_vdev; 4721 uint64_t id = vd->vdev_id; 4722 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4723 4724 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4725 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4726 ASSERT(vd == vd->vdev_top); 4727 4728 /* 4729 * Only remove any devices which are empty. 
4730 */ 4731 if (vd->vdev_stat.vs_alloc != 0) 4732 return; 4733 4734 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4735 4736 if (list_link_active(&vd->vdev_state_dirty_node)) 4737 vdev_state_clean(vd); 4738 if (list_link_active(&vd->vdev_config_dirty_node)) 4739 vdev_config_clean(vd); 4740 4741 vdev_free(vd); 4742 4743 if (last_vdev) { 4744 vdev_compact_children(rvd); 4745 } else { 4746 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4747 vdev_add_child(rvd, vd); 4748 } 4749 vdev_config_dirty(rvd); 4750 4751 /* 4752 * Reassess the health of our root vdev. 4753 */ 4754 vdev_reopen(rvd); 4755} 4756 4757/* 4758 * Remove a device from the pool - 4759 * 4760 * Removing a device from the vdev namespace requires several steps 4761 * and can take a significant amount of time. As a result we use 4762 * the spa_vdev_config_[enter/exit] functions which allow us to 4763 * grab and release the spa_config_lock while still holding the namespace 4764 * lock. During each step the configuration is synced out. 4765 */ 4766 4767/* 4768 * Remove a device from the pool. Currently, this supports removing only hot 4769 * spares, slogs, and level 2 ARC devices. 4770 */ 4771int 4772spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4773{ 4774 vdev_t *vd; 4775 metaslab_group_t *mg; 4776 nvlist_t **spares, **l2cache, *nv; 4777 uint64_t txg = 0; 4778 uint_t nspares, nl2cache; 4779 int error = 0; 4780 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4781 4782 ASSERT(spa_writeable(spa)); 4783 4784 if (!locked) 4785 txg = spa_vdev_enter(spa); 4786 4787 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4788 4789 if (spa->spa_spares.sav_vdevs != NULL && 4790 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4791 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4792 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4793 /* 4794 * Only remove the hot spare if it's not currently in use 4795 * in this pool. 
4796 */ 4797 if (vd == NULL || unspare) { 4798 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4799 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4800 spa_load_spares(spa); 4801 spa->spa_spares.sav_sync = B_TRUE; 4802 } else { 4803 error = EBUSY; 4804 } 4805 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4806 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4807 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4808 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4809 /* 4810 * Cache devices can always be removed. 4811 */ 4812 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4813 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4814 spa_load_l2cache(spa); 4815 spa->spa_l2cache.sav_sync = B_TRUE; 4816 } else if (vd != NULL && vd->vdev_islog) { 4817 ASSERT(!locked); 4818 ASSERT(vd == vd->vdev_top); 4819 4820 /* 4821 * XXX - Once we have bp-rewrite this should 4822 * become the common case. 4823 */ 4824 4825 mg = vd->vdev_mg; 4826 4827 /* 4828 * Stop allocating from this vdev. 4829 */ 4830 metaslab_group_passivate(mg); 4831 4832 /* 4833 * Wait for the youngest allocations and frees to sync, 4834 * and then wait for the deferral of those frees to finish. 4835 */ 4836 spa_vdev_config_exit(spa, NULL, 4837 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4838 4839 /* 4840 * Attempt to evacuate the vdev. 4841 */ 4842 error = spa_vdev_remove_evacuate(spa, vd); 4843 4844 txg = spa_vdev_config_enter(spa); 4845 4846 /* 4847 * If we couldn't evacuate the vdev, unwind. 4848 */ 4849 if (error) { 4850 metaslab_group_activate(mg); 4851 return (spa_vdev_exit(spa, NULL, txg, error)); 4852 } 4853 4854 /* 4855 * Clean up the vdev namespace. 4856 */ 4857 spa_vdev_remove_from_namespace(spa, vd); 4858 4859 } else if (vd != NULL) { 4860 /* 4861 * Normal vdevs cannot be removed (yet). 4862 */ 4863 error = ENOTSUP; 4864 } else { 4865 /* 4866 * There is no vdev of any kind with the specified guid. 
4867 */ 4868 error = ENOENT; 4869 } 4870 4871 if (!locked) 4872 return (spa_vdev_exit(spa, NULL, txg, error)); 4873 4874 return (error); 4875} 4876 4877/* 4878 * Find any device that's done replacing, or a vdev marked 'unspare' that's 4879 * current spared, so we can detach it. 4880 */ 4881static vdev_t * 4882spa_vdev_resilver_done_hunt(vdev_t *vd) 4883{ 4884 vdev_t *newvd, *oldvd; 4885 4886 for (int c = 0; c < vd->vdev_children; c++) { 4887 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4888 if (oldvd != NULL) 4889 return (oldvd); 4890 } 4891 4892 /* 4893 * Check for a completed replacement. We always consider the first 4894 * vdev in the list to be the oldest vdev, and the last one to be 4895 * the newest (see spa_vdev_attach() for how that works). In 4896 * the case where the newest vdev is faulted, we will not automatically 4897 * remove it after a resilver completes. This is OK as it will require 4898 * user intervention to determine which disk the admin wishes to keep. 4899 */ 4900 if (vd->vdev_ops == &vdev_replacing_ops) { 4901 ASSERT(vd->vdev_children > 1); 4902 4903 newvd = vd->vdev_child[vd->vdev_children - 1]; 4904 oldvd = vd->vdev_child[0]; 4905 4906 if (vdev_dtl_empty(newvd, DTL_MISSING) && 4907 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4908 !vdev_dtl_required(oldvd)) 4909 return (oldvd); 4910 } 4911 4912 /* 4913 * Check for a completed resilver with the 'unspare' flag set. 
4914 */ 4915 if (vd->vdev_ops == &vdev_spare_ops) { 4916 vdev_t *first = vd->vdev_child[0]; 4917 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 4918 4919 if (last->vdev_unspare) { 4920 oldvd = first; 4921 newvd = last; 4922 } else if (first->vdev_unspare) { 4923 oldvd = last; 4924 newvd = first; 4925 } else { 4926 oldvd = NULL; 4927 } 4928 4929 if (oldvd != NULL && 4930 vdev_dtl_empty(newvd, DTL_MISSING) && 4931 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4932 !vdev_dtl_required(oldvd)) 4933 return (oldvd); 4934 4935 /* 4936 * If there are more than two spares attached to a disk, 4937 * and those spares are not required, then we want to 4938 * attempt to free them up now so that they can be used 4939 * by other pools. Once we're back down to a single 4940 * disk+spare, we stop removing them. 4941 */ 4942 if (vd->vdev_children > 2) { 4943 newvd = vd->vdev_child[1]; 4944 4945 if (newvd->vdev_isspare && last->vdev_isspare && 4946 vdev_dtl_empty(last, DTL_MISSING) && 4947 vdev_dtl_empty(last, DTL_OUTAGE) && 4948 !vdev_dtl_required(newvd)) 4949 return (newvd); 4950 } 4951 } 4952 4953 return (NULL); 4954} 4955 4956static void 4957spa_vdev_resilver_done(spa_t *spa) 4958{ 4959 vdev_t *vd, *pvd, *ppvd; 4960 uint64_t guid, sguid, pguid, ppguid; 4961 4962 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4963 4964 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4965 pvd = vd->vdev_parent; 4966 ppvd = pvd->vdev_parent; 4967 guid = vd->vdev_guid; 4968 pguid = pvd->vdev_guid; 4969 ppguid = ppvd->vdev_guid; 4970 sguid = 0; 4971 /* 4972 * If we have just finished replacing a hot spared device, then 4973 * we need to detach the parent's first child (the original hot 4974 * spare) as well. 
4975 */ 4976 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4977 ppvd->vdev_children == 2) { 4978 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4979 sguid = ppvd->vdev_child[1]->vdev_guid; 4980 } 4981 spa_config_exit(spa, SCL_ALL, FTAG); 4982 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4983 return; 4984 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4985 return; 4986 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4987 } 4988 4989 spa_config_exit(spa, SCL_ALL, FTAG); 4990} 4991 4992/* 4993 * Update the stored path or FRU for this vdev. 4994 */ 4995int 4996spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4997 boolean_t ispath) 4998{ 4999 vdev_t *vd; 5000 boolean_t sync = B_FALSE; 5001 5002 ASSERT(spa_writeable(spa)); 5003 5004 spa_vdev_state_enter(spa, SCL_ALL); 5005 5006 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5007 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5008 5009 if (!vd->vdev_ops->vdev_op_leaf) 5010 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5011 5012 if (ispath) { 5013 if (strcmp(value, vd->vdev_path) != 0) { 5014 spa_strfree(vd->vdev_path); 5015 vd->vdev_path = spa_strdup(value); 5016 sync = B_TRUE; 5017 } 5018 } else { 5019 if (vd->vdev_fru == NULL) { 5020 vd->vdev_fru = spa_strdup(value); 5021 sync = B_TRUE; 5022 } else if (strcmp(value, vd->vdev_fru) != 0) { 5023 spa_strfree(vd->vdev_fru); 5024 vd->vdev_fru = spa_strdup(value); 5025 sync = B_TRUE; 5026 } 5027 } 5028 5029 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5030} 5031 5032int 5033spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5034{ 5035 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5036} 5037 5038int 5039spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5040{ 5041 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5042} 5043 5044/* 5045 * ========================================================================== 5046 * SPA Scanning 5047 * ========================================================================== 5048 */ 5049 5050int 5051spa_scan_stop(spa_t *spa) 5052{ 5053 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5054 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5055 return (EBUSY); 5056 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5057} 5058 5059int 5060spa_scan(spa_t *spa, pool_scan_func_t func) 5061{ 5062 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5063 5064 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5065 return (ENOTSUP); 5066 5067 /* 5068 * If a resilver was requested, but there is no DTL on a 5069 * writeable leaf device, we have nothing to do. 
5070 */ 5071 if (func == POOL_SCAN_RESILVER && 5072 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5073 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5074 return (0); 5075 } 5076 5077 return (dsl_scan(spa->spa_dsl_pool, func)); 5078} 5079 5080/* 5081 * ========================================================================== 5082 * SPA async task processing 5083 * ========================================================================== 5084 */ 5085 5086static void 5087spa_async_remove(spa_t *spa, vdev_t *vd) 5088{ 5089 if (vd->vdev_remove_wanted) { 5090 vd->vdev_remove_wanted = B_FALSE; 5091 vd->vdev_delayed_close = B_FALSE; 5092 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5093 5094 /* 5095 * We want to clear the stats, but we don't want to do a full 5096 * vdev_clear() as that will cause us to throw away 5097 * degraded/faulted state as well as attempt to reopen the 5098 * device, all of which is a waste. 5099 */ 5100 vd->vdev_stat.vs_read_errors = 0; 5101 vd->vdev_stat.vs_write_errors = 0; 5102 vd->vdev_stat.vs_checksum_errors = 0; 5103 5104 vdev_state_dirty(vd->vdev_top); 5105 } 5106 5107 for (int c = 0; c < vd->vdev_children; c++) 5108 spa_async_remove(spa, vd->vdev_child[c]); 5109} 5110 5111static void 5112spa_async_probe(spa_t *spa, vdev_t *vd) 5113{ 5114 if (vd->vdev_probe_wanted) { 5115 vd->vdev_probe_wanted = B_FALSE; 5116 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5117 } 5118 5119 for (int c = 0; c < vd->vdev_children; c++) 5120 spa_async_probe(spa, vd->vdev_child[c]); 5121} 5122 5123static void 5124spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5125{ 5126 sysevent_id_t eid; 5127 nvlist_t *attr; 5128 char *physpath; 5129 5130 if (!spa->spa_autoexpand) 5131 return; 5132 5133 for (int c = 0; c < vd->vdev_children; c++) { 5134 vdev_t *cvd = vd->vdev_child[c]; 5135 spa_async_autoexpand(spa, cvd); 5136 } 5137 5138 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5139 return; 5140 5141 physpath = 
kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5142 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5143 5144 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5145 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5146 5147 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5148 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5149 5150 nvlist_free(attr); 5151 kmem_free(physpath, MAXPATHLEN); 5152} 5153 5154static void 5155spa_async_thread(void *arg) 5156{ 5157 spa_t *spa = arg; 5158 int tasks; 5159 5160 ASSERT(spa->spa_sync_on); 5161 5162 mutex_enter(&spa->spa_async_lock); 5163 tasks = spa->spa_async_tasks; 5164 spa->spa_async_tasks = 0; 5165 mutex_exit(&spa->spa_async_lock); 5166 5167 /* 5168 * See if the config needs to be updated. 5169 */ 5170 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5171 uint64_t old_space, new_space; 5172 5173 mutex_enter(&spa_namespace_lock); 5174 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5175 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5176 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5177 mutex_exit(&spa_namespace_lock); 5178 5179 /* 5180 * If the pool grew as a result of the config update, 5181 * then log an internal history event. 5182 */ 5183 if (new_space != old_space) { 5184 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5185 spa, NULL, 5186 "pool '%s' size: %llu(+%llu)", 5187 spa_name(spa), new_space, new_space - old_space); 5188 } 5189 } 5190 5191 /* 5192 * See if any devices need to be marked REMOVED. 
5193 */ 5194 if (tasks & SPA_ASYNC_REMOVE) { 5195 spa_vdev_state_enter(spa, SCL_NONE); 5196 spa_async_remove(spa, spa->spa_root_vdev); 5197 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5198 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5199 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5200 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5201 (void) spa_vdev_state_exit(spa, NULL, 0); 5202 } 5203 5204 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5205 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5206 spa_async_autoexpand(spa, spa->spa_root_vdev); 5207 spa_config_exit(spa, SCL_CONFIG, FTAG); 5208 } 5209 5210 /* 5211 * See if any devices need to be probed. 5212 */ 5213 if (tasks & SPA_ASYNC_PROBE) { 5214 spa_vdev_state_enter(spa, SCL_NONE); 5215 spa_async_probe(spa, spa->spa_root_vdev); 5216 (void) spa_vdev_state_exit(spa, NULL, 0); 5217 } 5218 5219 /* 5220 * If any devices are done replacing, detach them. 5221 */ 5222 if (tasks & SPA_ASYNC_RESILVER_DONE) 5223 spa_vdev_resilver_done(spa); 5224 5225 /* 5226 * Kick off a resilver. 5227 */ 5228 if (tasks & SPA_ASYNC_RESILVER) 5229 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5230 5231 /* 5232 * Let the world know that we're done. 
5233 */ 5234 mutex_enter(&spa->spa_async_lock); 5235 spa->spa_async_thread = NULL; 5236 cv_broadcast(&spa->spa_async_cv); 5237 mutex_exit(&spa->spa_async_lock); 5238 thread_exit(); 5239} 5240 5241void 5242spa_async_suspend(spa_t *spa) 5243{ 5244 mutex_enter(&spa->spa_async_lock); 5245 spa->spa_async_suspended++; 5246 while (spa->spa_async_thread != NULL) 5247 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5248 mutex_exit(&spa->spa_async_lock); 5249} 5250 5251void 5252spa_async_resume(spa_t *spa) 5253{ 5254 mutex_enter(&spa->spa_async_lock); 5255 ASSERT(spa->spa_async_suspended != 0); 5256 spa->spa_async_suspended--; 5257 mutex_exit(&spa->spa_async_lock); 5258} 5259 5260static void 5261spa_async_dispatch(spa_t *spa) 5262{ 5263 mutex_enter(&spa->spa_async_lock); 5264 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5265 spa->spa_async_thread == NULL && 5266 rootdir != NULL && !vn_is_readonly(rootdir)) 5267 spa->spa_async_thread = thread_create(NULL, 0, 5268 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5269 mutex_exit(&spa->spa_async_lock); 5270} 5271 5272void 5273spa_async_request(spa_t *spa, int task) 5274{ 5275 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5276 mutex_enter(&spa->spa_async_lock); 5277 spa->spa_async_tasks |= task; 5278 mutex_exit(&spa->spa_async_lock); 5279} 5280 5281/* 5282 * ========================================================================== 5283 * SPA syncing routines 5284 * ========================================================================== 5285 */ 5286 5287static int 5288bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5289{ 5290 bpobj_t *bpo = arg; 5291 bpobj_enqueue(bpo, bp, tx); 5292 return (0); 5293} 5294 5295static int 5296spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5297{ 5298 zio_t *zio = arg; 5299 5300 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5301 zio->io_flags)); 5302 return (0); 5303} 5304 5305static void 
5306spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5307{ 5308 char *packed = NULL; 5309 size_t bufsize; 5310 size_t nvsize = 0; 5311 dmu_buf_t *db; 5312 5313 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5314 5315 /* 5316 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5317 * information. This avoids the dbuf_will_dirty() path and 5318 * saves us a pre-read to get data we don't actually care about. 5319 */ 5320 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5321 packed = kmem_alloc(bufsize, KM_SLEEP); 5322 5323 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5324 KM_SLEEP) == 0); 5325 bzero(packed + nvsize, bufsize - nvsize); 5326 5327 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5328 5329 kmem_free(packed, bufsize); 5330 5331 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5332 dmu_buf_will_dirty(db, tx); 5333 *(uint64_t *)db->db_data = nvsize; 5334 dmu_buf_rele(db, FTAG); 5335} 5336 5337static void 5338spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5339 const char *config, const char *entry) 5340{ 5341 nvlist_t *nvroot; 5342 nvlist_t **list; 5343 int i; 5344 5345 if (!sav->sav_sync) 5346 return; 5347 5348 /* 5349 * Update the MOS nvlist describing the list of available devices. 5350 * spa_validate_aux() will have already made sure this nvlist is 5351 * valid and the vdevs are labeled appropriately. 
5352 */ 5353 if (sav->sav_object == 0) { 5354 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5355 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5356 sizeof (uint64_t), tx); 5357 VERIFY(zap_update(spa->spa_meta_objset, 5358 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5359 &sav->sav_object, tx) == 0); 5360 } 5361 5362 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5363 if (sav->sav_count == 0) { 5364 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5365 } else { 5366 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5367 for (i = 0; i < sav->sav_count; i++) 5368 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5369 B_FALSE, VDEV_CONFIG_L2CACHE); 5370 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5371 sav->sav_count) == 0); 5372 for (i = 0; i < sav->sav_count; i++) 5373 nvlist_free(list[i]); 5374 kmem_free(list, sav->sav_count * sizeof (void *)); 5375 } 5376 5377 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5378 nvlist_free(nvroot); 5379 5380 sav->sav_sync = B_FALSE; 5381} 5382 5383static void 5384spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5385{ 5386 nvlist_t *config; 5387 5388 if (list_is_empty(&spa->spa_config_dirty_list)) 5389 return; 5390 5391 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5392 5393 config = spa_config_generate(spa, spa->spa_root_vdev, 5394 dmu_tx_get_txg(tx), B_FALSE); 5395 5396 spa_config_exit(spa, SCL_STATE, FTAG); 5397 5398 if (spa->spa_config_syncing) 5399 nvlist_free(spa->spa_config_syncing); 5400 spa->spa_config_syncing = config; 5401 5402 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5403} 5404 5405/* 5406 * Set zpool properties. 
5407 */ 5408static void 5409spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5410{ 5411 spa_t *spa = arg1; 5412 objset_t *mos = spa->spa_meta_objset; 5413 nvlist_t *nvp = arg2; 5414 nvpair_t *elem; 5415 uint64_t intval; 5416 char *strval; 5417 zpool_prop_t prop; 5418 const char *propname; 5419 zprop_type_t proptype; 5420 5421 mutex_enter(&spa->spa_props_lock); 5422 5423 elem = NULL; 5424 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5425 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5426 case ZPOOL_PROP_VERSION: 5427 /* 5428 * Only set version for non-zpool-creation cases 5429 * (set/import). spa_create() needs special care 5430 * for version setting. 5431 */ 5432 if (tx->tx_txg != TXG_INITIAL) { 5433 VERIFY(nvpair_value_uint64(elem, 5434 &intval) == 0); 5435 ASSERT(intval <= SPA_VERSION); 5436 ASSERT(intval >= spa_version(spa)); 5437 spa->spa_uberblock.ub_version = intval; 5438 vdev_config_dirty(spa->spa_root_vdev); 5439 } 5440 break; 5441 5442 case ZPOOL_PROP_ALTROOT: 5443 /* 5444 * 'altroot' is a non-persistent property. It should 5445 * have been set temporarily at creation or import time. 5446 */ 5447 ASSERT(spa->spa_root != NULL); 5448 break; 5449 5450 case ZPOOL_PROP_READONLY: 5451 case ZPOOL_PROP_CACHEFILE: 5452 /* 5453 * 'readonly' and 'cachefile' are also non-persisitent 5454 * properties. 5455 */ 5456 break; 5457 case ZPOOL_PROP_COMMENT: 5458 VERIFY(nvpair_value_string(elem, &strval) == 0); 5459 if (spa->spa_comment != NULL) 5460 spa_strfree(spa->spa_comment); 5461 spa->spa_comment = spa_strdup(strval); 5462 /* 5463 * We need to dirty the configuration on all the vdevs 5464 * so that their labels get updated. It's unnecessary 5465 * to do this for pool creation since the vdev's 5466 * configuratoin has already been dirtied. 5467 */ 5468 if (tx->tx_txg != TXG_INITIAL) 5469 vdev_config_dirty(spa->spa_root_vdev); 5470 break; 5471 default: 5472 /* 5473 * Set pool property values in the poolprops mos object. 
5474 */ 5475 if (spa->spa_pool_props_object == 0) { 5476 VERIFY((spa->spa_pool_props_object = 5477 zap_create(mos, DMU_OT_POOL_PROPS, 5478 DMU_OT_NONE, 0, tx)) > 0); 5479 5480 VERIFY(zap_update(mos, 5481 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 5482 8, 1, &spa->spa_pool_props_object, tx) 5483 == 0); 5484 } 5485 5486 /* normalize the property name */ 5487 propname = zpool_prop_to_name(prop); 5488 proptype = zpool_prop_get_type(prop); 5489 5490 if (nvpair_type(elem) == DATA_TYPE_STRING) { 5491 ASSERT(proptype == PROP_TYPE_STRING); 5492 VERIFY(nvpair_value_string(elem, &strval) == 0); 5493 VERIFY(zap_update(mos, 5494 spa->spa_pool_props_object, propname, 5495 1, strlen(strval) + 1, strval, tx) == 0); 5496 5497 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 5498 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5499 5500 if (proptype == PROP_TYPE_INDEX) { 5501 const char *unused; 5502 VERIFY(zpool_prop_index_to_string( 5503 prop, intval, &unused) == 0); 5504 } 5505 VERIFY(zap_update(mos, 5506 spa->spa_pool_props_object, propname, 5507 8, 1, &intval, tx) == 0); 5508 } else { 5509 ASSERT(0); /* not allowed */ 5510 } 5511 5512 switch (prop) { 5513 case ZPOOL_PROP_DELEGATION: 5514 spa->spa_delegation = intval; 5515 break; 5516 case ZPOOL_PROP_BOOTFS: 5517 spa->spa_bootfs = intval; 5518 break; 5519 case ZPOOL_PROP_FAILUREMODE: 5520 spa->spa_failmode = intval; 5521 break; 5522 case ZPOOL_PROP_AUTOEXPAND: 5523 spa->spa_autoexpand = intval; 5524 if (tx->tx_txg != TXG_INITIAL) 5525 spa_async_request(spa, 5526 SPA_ASYNC_AUTOEXPAND); 5527 break; 5528 case ZPOOL_PROP_DEDUPDITTO: 5529 spa->spa_dedup_ditto = intval; 5530 break; 5531 default: 5532 break; 5533 } 5534 } 5535 5536 /* log internal history if this is not a zpool create */ 5537 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 5538 tx->tx_txg != TXG_INITIAL) { 5539 spa_history_log_internal(LOG_POOL_PROPSET, 5540 spa, tx, "%s %lld %s", 5541 nvpair_name(elem), intval, spa_name(spa)); 5542 } 5543 } 5544 5545 
mutex_exit(&spa->spa_props_lock); 5546} 5547 5548/* 5549 * Perform one-time upgrade on-disk changes. spa_version() does not 5550 * reflect the new version this txg, so there must be no changes this 5551 * txg to anything that the upgrade code depends on after it executes. 5552 * Therefore this must be called after dsl_pool_sync() does the sync 5553 * tasks. 5554 */ 5555static void 5556spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5557{ 5558 dsl_pool_t *dp = spa->spa_dsl_pool; 5559 5560 ASSERT(spa->spa_sync_pass == 1); 5561 5562 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5563 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5564 dsl_pool_create_origin(dp, tx); 5565 5566 /* Keeping the origin open increases spa_minref */ 5567 spa->spa_minref += 3; 5568 } 5569 5570 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5571 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5572 dsl_pool_upgrade_clones(dp, tx); 5573 } 5574 5575 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5576 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5577 dsl_pool_upgrade_dir_clones(dp, tx); 5578 5579 /* Keeping the freedir open increases spa_minref */ 5580 spa->spa_minref += 3; 5581 } 5582} 5583 5584/* 5585 * Sync the specified transaction group. New blocks may be dirtied as 5586 * part of the process, so we iterate until it converges. 5587 */ 5588void 5589spa_sync(spa_t *spa, uint64_t txg) 5590{ 5591 dsl_pool_t *dp = spa->spa_dsl_pool; 5592 objset_t *mos = spa->spa_meta_objset; 5593 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5594 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5595 vdev_t *rvd = spa->spa_root_vdev; 5596 vdev_t *vd; 5597 dmu_tx_t *tx; 5598 int error; 5599 5600 VERIFY(spa_writeable(spa)); 5601 5602 /* 5603 * Lock out configuration changes. 
5604 */ 5605 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5606 5607 spa->spa_syncing_txg = txg; 5608 spa->spa_sync_pass = 0; 5609 5610 /* 5611 * If there are any pending vdev state changes, convert them 5612 * into config changes that go out with this transaction group. 5613 */ 5614 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5615 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5616 /* 5617 * We need the write lock here because, for aux vdevs, 5618 * calling vdev_config_dirty() modifies sav_config. 5619 * This is ugly and will become unnecessary when we 5620 * eliminate the aux vdev wart by integrating all vdevs 5621 * into the root vdev tree. 5622 */ 5623 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5624 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5625 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5626 vdev_state_clean(vd); 5627 vdev_config_dirty(vd); 5628 } 5629 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5630 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5631 } 5632 spa_config_exit(spa, SCL_STATE, FTAG); 5633 5634 tx = dmu_tx_create_assigned(dp, txg); 5635 5636 /* 5637 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5638 * set spa_deflate if we have no raid-z vdevs. 
5639 */ 5640 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5641 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5642 int i; 5643 5644 for (i = 0; i < rvd->vdev_children; i++) { 5645 vd = rvd->vdev_child[i]; 5646 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5647 break; 5648 } 5649 if (i == rvd->vdev_children) { 5650 spa->spa_deflate = TRUE; 5651 VERIFY(0 == zap_add(spa->spa_meta_objset, 5652 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5653 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5654 } 5655 } 5656 5657 /* 5658 * If anything has changed in this txg, or if someone is waiting 5659 * for this txg to sync (eg, spa_vdev_remove()), push the 5660 * deferred frees from the previous txg. If not, leave them 5661 * alone so that we don't generate work on an otherwise idle 5662 * system. 5663 */ 5664 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5665 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5666 !txg_list_empty(&dp->dp_sync_tasks, txg) || 5667 ((dsl_scan_active(dp->dp_scan) || 5668 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 5669 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5670 VERIFY3U(bpobj_iterate(defer_bpo, 5671 spa_free_sync_cb, zio, tx), ==, 0); 5672 VERIFY3U(zio_wait(zio), ==, 0); 5673 } 5674 5675 /* 5676 * Iterate to convergence. 
5677 */ 5678 do { 5679 int pass = ++spa->spa_sync_pass; 5680 5681 spa_sync_config_object(spa, tx); 5682 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5683 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5684 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5685 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5686 spa_errlog_sync(spa, txg); 5687 dsl_pool_sync(dp, txg); 5688 5689 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5690 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5691 bplist_iterate(free_bpl, spa_free_sync_cb, 5692 zio, tx); 5693 VERIFY(zio_wait(zio) == 0); 5694 } else { 5695 bplist_iterate(free_bpl, bpobj_enqueue_cb, 5696 defer_bpo, tx); 5697 } 5698 5699 ddt_sync(spa, txg); 5700 dsl_scan_sync(dp, tx); 5701 5702 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5703 vdev_sync(vd, txg); 5704 5705 if (pass == 1) 5706 spa_sync_upgrades(spa, tx); 5707 5708 } while (dmu_objset_is_dirty(mos, txg)); 5709 5710 /* 5711 * Rewrite the vdev configuration (which includes the uberblock) 5712 * to commit the transaction group. 5713 * 5714 * If there are no dirty vdevs, we sync the uberblock to a few 5715 * random top-level vdevs that are known to be visible in the 5716 * config cache (see spa_vdev_add() for a complete description). 5717 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5718 */ 5719 for (;;) { 5720 /* 5721 * We hold SCL_STATE to prevent vdev open/close/etc. 5722 * while we're attempting to write the vdev labels. 
5723 */ 5724 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5725 5726 if (list_is_empty(&spa->spa_config_dirty_list)) { 5727 vdev_t *svd[SPA_DVAS_PER_BP]; 5728 int svdcount = 0; 5729 int children = rvd->vdev_children; 5730 int c0 = spa_get_random(children); 5731 5732 for (int c = 0; c < children; c++) { 5733 vd = rvd->vdev_child[(c0 + c) % children]; 5734 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5735 continue; 5736 svd[svdcount++] = vd; 5737 if (svdcount == SPA_DVAS_PER_BP) 5738 break; 5739 } 5740 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5741 if (error != 0) 5742 error = vdev_config_sync(svd, svdcount, txg, 5743 B_TRUE); 5744 } else { 5745 error = vdev_config_sync(rvd->vdev_child, 5746 rvd->vdev_children, txg, B_FALSE); 5747 if (error != 0) 5748 error = vdev_config_sync(rvd->vdev_child, 5749 rvd->vdev_children, txg, B_TRUE); 5750 } 5751 5752 spa_config_exit(spa, SCL_STATE, FTAG); 5753 5754 if (error == 0) 5755 break; 5756 zio_suspend(spa, NULL); 5757 zio_resume_wait(spa); 5758 } 5759 dmu_tx_commit(tx); 5760 5761 /* 5762 * Clear the dirty config list. 5763 */ 5764 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5765 vdev_config_clean(vd); 5766 5767 /* 5768 * Now that the new config has synced transactionally, 5769 * let it become visible to the config cache. 5770 */ 5771 if (spa->spa_config_syncing != NULL) { 5772 spa_config_set(spa, spa->spa_config_syncing); 5773 spa->spa_config_txg = txg; 5774 spa->spa_config_syncing = NULL; 5775 } 5776 5777 spa->spa_ubsync = spa->spa_uberblock; 5778 5779 dsl_pool_sync_done(dp, txg); 5780 5781 /* 5782 * Update usable space statistics. 5783 */ 5784 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5785 vdev_sync_done(vd, txg); 5786 5787 spa_update_dspace(spa); 5788 5789 /* 5790 * It had better be the case that we didn't dirty anything 5791 * since vdev_config_sync(). 
5792 */ 5793 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5794 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5795 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5796 5797 spa->spa_sync_pass = 0; 5798 5799 spa_config_exit(spa, SCL_CONFIG, FTAG); 5800 5801 spa_handle_ignored_writes(spa); 5802 5803 /* 5804 * If any async tasks have been requested, kick them off. 5805 */ 5806 spa_async_dispatch(spa); 5807} 5808 5809/* 5810 * Sync all pools. We don't want to hold the namespace lock across these 5811 * operations, so we take a reference on the spa_t and drop the lock during the 5812 * sync. 5813 */ 5814void 5815spa_sync_allpools(void) 5816{ 5817 spa_t *spa = NULL; 5818 mutex_enter(&spa_namespace_lock); 5819 while ((spa = spa_next(spa)) != NULL) { 5820 if (spa_state(spa) != POOL_STATE_ACTIVE || 5821 !spa_writeable(spa) || spa_suspended(spa)) 5822 continue; 5823 spa_open_ref(spa, FTAG); 5824 mutex_exit(&spa_namespace_lock); 5825 txg_wait_synced(spa_get_dsl(spa), 0); 5826 mutex_enter(&spa_namespace_lock); 5827 spa_close(spa, FTAG); 5828 } 5829 mutex_exit(&spa_namespace_lock); 5830} 5831 5832/* 5833 * ========================================================================== 5834 * Miscellaneous routines 5835 * ========================================================================== 5836 */ 5837 5838/* 5839 * Remove all pools in the system. 5840 */ 5841void 5842spa_evict_all(void) 5843{ 5844 spa_t *spa; 5845 5846 /* 5847 * Remove all cached state. All pools should be closed now, 5848 * so every spa in the AVL tree should be unreferenced. 5849 */ 5850 mutex_enter(&spa_namespace_lock); 5851 while ((spa = spa_next(NULL)) != NULL) { 5852 /* 5853 * Stop async tasks. The async thread may need to detach 5854 * a device that's been replaced, which requires grabbing 5855 * spa_namespace_lock, so we must drop it here. 
5856 */ 5857 spa_open_ref(spa, FTAG); 5858 mutex_exit(&spa_namespace_lock); 5859 spa_async_suspend(spa); 5860 mutex_enter(&spa_namespace_lock); 5861 spa_close(spa, FTAG); 5862 5863 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5864 spa_unload(spa); 5865 spa_deactivate(spa); 5866 } 5867 spa_remove(spa); 5868 } 5869 mutex_exit(&spa_namespace_lock); 5870} 5871 5872vdev_t * 5873spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5874{ 5875 vdev_t *vd; 5876 int i; 5877 5878 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5879 return (vd); 5880 5881 if (aux) { 5882 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5883 vd = spa->spa_l2cache.sav_vdevs[i]; 5884 if (vd->vdev_guid == guid) 5885 return (vd); 5886 } 5887 5888 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5889 vd = spa->spa_spares.sav_vdevs[i]; 5890 if (vd->vdev_guid == guid) 5891 return (vd); 5892 } 5893 } 5894 5895 return (NULL); 5896} 5897 5898void 5899spa_upgrade(spa_t *spa, uint64_t version) 5900{ 5901 ASSERT(spa_writeable(spa)); 5902 5903 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5904 5905 /* 5906 * This should only be called for a non-faulted pool, and since a 5907 * future version would result in an unopenable pool, this shouldn't be 5908 * possible. 
5909 */ 5910 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5911 ASSERT(version >= spa->spa_uberblock.ub_version); 5912 5913 spa->spa_uberblock.ub_version = version; 5914 vdev_config_dirty(spa->spa_root_vdev); 5915 5916 spa_config_exit(spa, SCL_ALL, FTAG); 5917 5918 txg_wait_synced(spa_get_dsl(spa), 0); 5919} 5920 5921boolean_t 5922spa_has_spare(spa_t *spa, uint64_t guid) 5923{ 5924 int i; 5925 uint64_t spareguid; 5926 spa_aux_vdev_t *sav = &spa->spa_spares; 5927 5928 for (i = 0; i < sav->sav_count; i++) 5929 if (sav->sav_vdevs[i]->vdev_guid == guid) 5930 return (B_TRUE); 5931 5932 for (i = 0; i < sav->sav_npending; i++) { 5933 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5934 &spareguid) == 0 && spareguid == guid) 5935 return (B_TRUE); 5936 } 5937 5938 return (B_FALSE); 5939} 5940 5941/* 5942 * Check if a pool has an active shared spare device. 5943 * Note: reference count of an active spare is 2, as a spare and as a replace 5944 */ 5945static boolean_t 5946spa_has_active_shared_spare(spa_t *spa) 5947{ 5948 int i, refcnt; 5949 uint64_t pool; 5950 spa_aux_vdev_t *sav = &spa->spa_spares; 5951 5952 for (i = 0; i < sav->sav_count; i++) { 5953 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 5954 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 5955 refcnt > 2) 5956 return (B_TRUE); 5957 } 5958 5959 return (B_FALSE); 5960} 5961 5962/* 5963 * Post a sysevent corresponding to the given event. The 'name' must be one of 5964 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 5965 * filled in from the spa and (optionally) the vdev. This doesn't do anything 5966 * in the userland libzpool, as we don't want consumers to misinterpret ztest 5967 * or zdb as real changes. 
5968 */ 5969void 5970spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 5971{ 5972#ifdef _KERNEL 5973 sysevent_t *ev; 5974 sysevent_attr_list_t *attr = NULL; 5975 sysevent_value_t value; 5976 sysevent_id_t eid; 5977 5978 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5979 SE_SLEEP); 5980 5981 value.value_type = SE_DATA_TYPE_STRING; 5982 value.value.sv_string = spa_name(spa); 5983 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5984 goto done; 5985 5986 value.value_type = SE_DATA_TYPE_UINT64; 5987 value.value.sv_uint64 = spa_guid(spa); 5988 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5989 goto done; 5990 5991 if (vd) { 5992 value.value_type = SE_DATA_TYPE_UINT64; 5993 value.value.sv_uint64 = vd->vdev_guid; 5994 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5995 SE_SLEEP) != 0) 5996 goto done; 5997 5998 if (vd->vdev_path) { 5999 value.value_type = SE_DATA_TYPE_STRING; 6000 value.value.sv_string = vd->vdev_path; 6001 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6002 &value, SE_SLEEP) != 0) 6003 goto done; 6004 } 6005 } 6006 6007 if (sysevent_attach_attributes(ev, attr) != 0) 6008 goto done; 6009 attr = NULL; 6010 6011 (void) log_sysevent(ev, SE_SLEEP, &eid); 6012 6013done: 6014 if (attr) 6015 sysevent_free_attr(attr); 6016 sysevent_free(ev); 6017#endif 6018}
|