1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011 by Delphix. All rights reserved. 25 */ 26 27/* 28 * This file contains all the routines used when modifying on-disk SPA state. 29 * This includes opening, importing, destroying, exporting a pool, and syncing a 30 * pool. 31 */ 32 33#include <sys/zfs_context.h> 34#include <sys/fm/fs/zfs.h> 35#include <sys/spa_impl.h> 36#include <sys/zio.h> 37#include <sys/zio_checksum.h> 38#include <sys/dmu.h> 39#include <sys/dmu_tx.h> 40#include <sys/zap.h> 41#include <sys/zil.h> 42#include <sys/ddt.h> 43#include <sys/vdev_impl.h> 44#include <sys/metaslab.h> 45#include <sys/metaslab_impl.h> 46#include <sys/uberblock_impl.h> 47#include <sys/txg.h> 48#include <sys/avl.h> 49#include <sys/dmu_traverse.h> 50#include <sys/dmu_objset.h> 51#include <sys/unique.h> 52#include <sys/dsl_pool.h> 53#include <sys/dsl_dataset.h> 54#include <sys/dsl_dir.h> 55#include <sys/dsl_prop.h> 56#include <sys/dsl_synctask.h> 57#include <sys/fs/zfs.h> 58#include <sys/arc.h> 59#include <sys/callb.h> 60#include <sys/spa_boot.h> 61#include <sys/zfs_ioctl.h> 62#include <sys/dsl_scan.h> 63#include <sys/zvol.h> 64 65#ifdef _KERNEL 66#include <sys/callb.h> 67#include <sys/cpupart.h> 68#include <sys/zone.h> 69#endif /* _KERNEL */ 70 71#include "zfs_prop.h" 72#include "zfs_comutil.h" 73 74/* Check hostid on import? 
*/ 75static int check_hostid = 1; 76 77SYSCTL_DECL(_vfs_zfs); 78TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 79SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 80 "Check hostid on import?"); 81 82typedef enum zti_modes { 83 zti_mode_fixed, /* value is # of threads (min 1) */ 84 zti_mode_online_percent, /* value is % of online CPUs */ 85 zti_mode_batch, /* cpu-intensive; value is ignored */ 86 zti_mode_null, /* don't create a taskq */ 87 zti_nmodes 88} zti_modes_t; 89 90#define ZTI_FIX(n) { zti_mode_fixed, (n) } 91#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 92#define ZTI_BATCH { zti_mode_batch, 0 } 93#define ZTI_NULL { zti_mode_null, 0 } 94 95#define ZTI_ONE ZTI_FIX(1) 96 97typedef struct zio_taskq_info { 98 enum zti_modes zti_mode; 99 uint_t zti_value; 100} zio_taskq_info_t; 101 102static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 103 "issue", "issue_high", "intr", "intr_high" 104}; 105 106/* 107 * Define the taskq threads for the following I/O types: 108 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 109 */ 110const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 111 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 112 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 113 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 114 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 115 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 116 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 117 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 118}; 119 120static dsl_syncfunc_t spa_sync_props; 121static boolean_t spa_has_active_shared_spare(spa_t *spa); 122static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 123 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 124 char **ereport); 125static void spa_vdev_resilver_done(spa_t *spa); 126 127uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 128#ifdef PSRSET_BIND 129id_t zio_taskq_psrset_bind = PS_NONE; 130#endif 131#ifdef SYSDC 132boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 133#endif 134uint_t zio_taskq_basedc = 80; /* base duty cycle */ 135 136boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 137 138/* 139 * This (illegal) pool name is used when temporarily importing a spa_t in order 140 * to get the vdev stats associated with the imported devices. 141 */ 142#define TRYIMPORT_NAME "$import" 143 144/* 145 * ========================================================================== 146 * SPA properties routines 147 * ========================================================================== 148 */ 149 150/* 151 * Add a (source=src, propname=propval) list to an nvlist. 152 */ 153static void 154spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 155 uint64_t intval, zprop_source_t src) 156{ 157 const char *propname = zpool_prop_to_name(prop); 158 nvlist_t *propval; 159 160 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 161 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 162 163 if (strval != NULL) 164 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 165 else 166 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 167 168 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 169 nvlist_free(propval); 170} 171 172/* 173 * Get property values from the spa configuration. 
174 */ 175static void 176spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 177{ 178 uint64_t size; 179 uint64_t alloc; 180 uint64_t cap, version; 181 zprop_source_t src = ZPROP_SRC_NONE; 182 spa_config_dirent_t *dp; 183 184 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 185 186 if (spa->spa_root_vdev != NULL) { 187 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 188 size = metaslab_class_get_space(spa_normal_class(spa)); 189 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 190 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 191 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 192 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 193 size - alloc, src); 194 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 195 (spa_mode(spa) == FREAD), src); 196 197 cap = (size == 0) ? 0 : (alloc * 100 / size); 198 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 199 200 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 201 ddt_get_pool_dedup_ratio(spa), src); 202 203 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 204 spa->spa_root_vdev->vdev_state, src); 205 206 version = spa_version(spa); 207 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 208 src = ZPROP_SRC_DEFAULT; 209 else 210 src = ZPROP_SRC_LOCAL; 211 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 212 } 213 214 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 215 216 if (spa->spa_comment != NULL) { 217 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 218 0, ZPROP_SRC_LOCAL); 219 } 220 221 if (spa->spa_root != NULL) 222 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 223 0, ZPROP_SRC_LOCAL); 224 225 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 226 if (dp->scd_path == NULL) { 227 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 228 "none", 0, ZPROP_SRC_LOCAL); 229 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 230 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 231 dp->scd_path, 0, ZPROP_SRC_LOCAL); 232 } 233 } 234} 235 236/* 237 * Get zpool property values. 238 */ 239int 240spa_prop_get(spa_t *spa, nvlist_t **nvp) 241{ 242 objset_t *mos = spa->spa_meta_objset; 243 zap_cursor_t zc; 244 zap_attribute_t za; 245 int err; 246 247 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 248 249 mutex_enter(&spa->spa_props_lock); 250 251 /* 252 * Get properties from the spa config. 253 */ 254 spa_prop_get_config(spa, nvp); 255 256 /* If no pool property object, no more prop to get. */ 257 if (mos == NULL || spa->spa_pool_props_object == 0) { 258 mutex_exit(&spa->spa_props_lock); 259 return (0); 260 } 261 262 /* 263 * Get properties from the MOS pool property object. 
264 */ 265 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 266 (err = zap_cursor_retrieve(&zc, &za)) == 0; 267 zap_cursor_advance(&zc)) { 268 uint64_t intval = 0; 269 char *strval = NULL; 270 zprop_source_t src = ZPROP_SRC_DEFAULT; 271 zpool_prop_t prop; 272 273 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 274 continue; 275 276 switch (za.za_integer_length) { 277 case 8: 278 /* integer property */ 279 if (za.za_first_integer != 280 zpool_prop_default_numeric(prop)) 281 src = ZPROP_SRC_LOCAL; 282 283 if (prop == ZPOOL_PROP_BOOTFS) { 284 dsl_pool_t *dp; 285 dsl_dataset_t *ds = NULL; 286 287 dp = spa_get_dsl(spa); 288 rw_enter(&dp->dp_config_rwlock, RW_READER); 289 if (err = dsl_dataset_hold_obj(dp, 290 za.za_first_integer, FTAG, &ds)) { 291 rw_exit(&dp->dp_config_rwlock); 292 break; 293 } 294 295 strval = kmem_alloc( 296 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 297 KM_SLEEP); 298 dsl_dataset_name(ds, strval); 299 dsl_dataset_rele(ds, FTAG); 300 rw_exit(&dp->dp_config_rwlock); 301 } else { 302 strval = NULL; 303 intval = za.za_first_integer; 304 } 305 306 spa_prop_add_list(*nvp, prop, strval, intval, src); 307 308 if (strval != NULL) 309 kmem_free(strval, 310 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 311 312 break; 313 314 case 1: 315 /* string property */ 316 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 317 err = zap_lookup(mos, spa->spa_pool_props_object, 318 za.za_name, 1, za.za_num_integers, strval); 319 if (err) { 320 kmem_free(strval, za.za_num_integers); 321 break; 322 } 323 spa_prop_add_list(*nvp, prop, strval, 0, src); 324 kmem_free(strval, za.za_num_integers); 325 break; 326 327 default: 328 break; 329 } 330 } 331 zap_cursor_fini(&zc); 332 mutex_exit(&spa->spa_props_lock); 333out: 334 if (err && err != ENOENT) { 335 nvlist_free(*nvp); 336 *nvp = NULL; 337 return (err); 338 } 339 340 return (0); 341} 342 343/* 344 * Validate the given pool properties nvlist and modify the list 345 * for the property values to be set. 346 */ 347static int 348spa_prop_validate(spa_t *spa, nvlist_t *props) 349{ 350 nvpair_t *elem; 351 int error = 0, reset_bootfs = 0; 352 uint64_t objnum; 353 354 elem = NULL; 355 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 356 zpool_prop_t prop; 357 char *propname, *strval; 358 uint64_t intval; 359 objset_t *os; 360 char *slash, *check; 361 362 propname = nvpair_name(elem); 363 364 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 365 return (EINVAL); 366 367 switch (prop) { 368 case ZPOOL_PROP_VERSION: 369 error = nvpair_value_uint64(elem, &intval); 370 if (!error && 371 (intval < spa_version(spa) || intval > SPA_VERSION)) 372 error = EINVAL; 373 break; 374 375 case ZPOOL_PROP_DELEGATION: 376 case ZPOOL_PROP_AUTOREPLACE: 377 case ZPOOL_PROP_LISTSNAPS: 378 case ZPOOL_PROP_AUTOEXPAND: 379 error = nvpair_value_uint64(elem, &intval); 380 if (!error && intval > 1) 381 error = EINVAL; 382 break; 383 384 case ZPOOL_PROP_BOOTFS: 385 /* 386 * If the pool version is less than SPA_VERSION_BOOTFS, 387 * or the pool is still being created (version == 0), 388 * the bootfs property cannot be set. 
389 */ 390 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 391 error = ENOTSUP; 392 break; 393 } 394 395 /* 396 * Make sure the vdev config is bootable 397 */ 398 if (!vdev_is_bootable(spa->spa_root_vdev)) { 399 error = ENOTSUP; 400 break; 401 } 402 403 reset_bootfs = 1; 404 405 error = nvpair_value_string(elem, &strval); 406 407 if (!error) { 408 uint64_t compress; 409 410 if (strval == NULL || strval[0] == '\0') { 411 objnum = zpool_prop_default_numeric( 412 ZPOOL_PROP_BOOTFS); 413 break; 414 } 415 416 if (error = dmu_objset_hold(strval, FTAG, &os)) 417 break; 418 419 /* Must be ZPL and not gzip compressed. */ 420 421 if (dmu_objset_type(os) != DMU_OST_ZFS) { 422 error = ENOTSUP; 423 } else if ((error = dsl_prop_get_integer(strval, 424 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 425 &compress, NULL)) == 0 && 426 !BOOTFS_COMPRESS_VALID(compress)) { 427 error = ENOTSUP; 428 } else { 429 objnum = dmu_objset_id(os); 430 } 431 dmu_objset_rele(os, FTAG); 432 } 433 break; 434 435 case ZPOOL_PROP_FAILUREMODE: 436 error = nvpair_value_uint64(elem, &intval); 437 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 438 intval > ZIO_FAILURE_MODE_PANIC)) 439 error = EINVAL; 440 441 /* 442 * This is a special case which only occurs when 443 * the pool has completely failed. This allows 444 * the user to change the in-core failmode property 445 * without syncing it out to disk (I/Os might 446 * currently be blocked). We do this by returning 447 * EIO to the caller (spa_prop_set) to trick it 448 * into thinking we encountered a property validation 449 * error. 450 */ 451 if (!error && spa_suspended(spa)) { 452 spa->spa_failmode = intval; 453 error = EIO; 454 } 455 break; 456 457 case ZPOOL_PROP_CACHEFILE: 458 if ((error = nvpair_value_string(elem, &strval)) != 0) 459 break; 460 461 if (strval[0] == '\0') 462 break; 463 464 if (strcmp(strval, "none") == 0) 465 break; 466 467 if (strval[0] != '/') { 468 error = EINVAL; 469 break; 470 } 471 472 slash = strrchr(strval, '/'); 473 ASSERT(slash != NULL); 474 475 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 476 strcmp(slash, "/..") == 0) 477 error = EINVAL; 478 break; 479 480 case ZPOOL_PROP_COMMENT: 481 if ((error = nvpair_value_string(elem, &strval)) != 0) 482 break; 483 for (check = strval; *check != '\0'; check++) { 484 /* 485 * The kernel doesn't have an easy isprint() 486 * check. For this kernel check, we merely 487 * check ASCII apart from DEL. Fix this if 488 * there is an easy-to-use kernel isprint(). 
 */
				if (*check >= 0x7f) {
					error = EINVAL;
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
608 */ 609int 610spa_change_guid(spa_t *spa) 611{ 612 uint64_t oldguid, newguid; 613 uint64_t txg; 614 615 if (!(spa_mode_global & FWRITE)) 616 return (EROFS); 617 618 txg = spa_vdev_enter(spa); 619 620 if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) 621 return (spa_vdev_exit(spa, NULL, txg, ENXIO)); 622 623 oldguid = spa_guid(spa); 624 newguid = spa_generate_guid(NULL); 625 ASSERT3U(oldguid, !=, newguid); 626 627 spa->spa_root_vdev->vdev_guid = newguid; 628 spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); 629 630 vdev_config_dirty(spa->spa_root_vdev); 631 632 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 633 634 return (spa_vdev_exit(spa, NULL, txg, 0)); 635} 636 637/* 638 * ========================================================================== 639 * SPA state manipulation (open/create/destroy/import/export) 640 * ========================================================================== 641 */ 642 643static int 644spa_error_entry_compare(const void *a, const void *b) 645{ 646 spa_error_entry_t *sa = (spa_error_entry_t *)a; 647 spa_error_entry_t *sb = (spa_error_entry_t *)b; 648 int ret; 649 650 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 651 sizeof (zbookmark_t)); 652 653 if (ret < 0) 654 return (-1); 655 else if (ret > 0) 656 return (1); 657 else 658 return (0); 659} 660 661/* 662 * Utility function which retrieves copies of the current logs and 663 * re-initializes them in the process. 664 */ 665void 666spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 667{ 668 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 669 670 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 671 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 672 673 avl_create(&spa->spa_errlist_scrub, 674 spa_error_entry_compare, sizeof (spa_error_entry_t), 675 offsetof(spa_error_entry_t, se_avl)); 676 avl_create(&spa->spa_errlist_last, 677 spa_error_entry_compare, sizeof (spa_error_entry_t), 678 offsetof(spa_error_entry_t, se_avl)); 679} 680 681static taskq_t * 682spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 683 uint_t value) 684{ 685 uint_t flags = TASKQ_PREPOPULATE; 686 boolean_t batch = B_FALSE; 687 688 switch (mode) { 689 case zti_mode_null: 690 return (NULL); /* no taskq needed */ 691 692 case zti_mode_fixed: 693 ASSERT3U(value, >=, 1); 694 value = MAX(value, 1); 695 break; 696 697 case zti_mode_batch: 698 batch = B_TRUE; 699 flags |= TASKQ_THREADS_CPU_PCT; 700 value = zio_taskq_batch_pct; 701 break; 702 703 case zti_mode_online_percent: 704 flags |= TASKQ_THREADS_CPU_PCT; 705 break; 706 707 default: 708 panic("unrecognized mode for %s taskq (%u:%u) in " 709 "spa_activate()", 710 name, mode, value); 711 break; 712 } 713 714#ifdef SYSDC 715 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 716 if (batch) 717 flags |= TASKQ_DC_BATCH; 718 719 return (taskq_create_sysdc(name, value, 50, INT_MAX, 720 spa->spa_proc, zio_taskq_basedc, flags)); 721 } 722#endif 723 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 724 spa->spa_proc, flags)); 725} 726 727static void 728spa_create_zio_taskqs(spa_t *spa) 729{ 730 for (int t = 0; t < ZIO_TYPES; t++) { 731 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 732 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 733 enum zti_modes mode = ztip->zti_mode; 734 uint_t value = ztip->zti_value; 735 char name[32]; 736 737 (void) snprintf(name, sizeof (name), 738 "%s_%s", zio_type_name[t], zio_taskq_types[q]); 739 740 spa->spa_zio_taskq[t][q] = 741 spa_taskq_create(spa, name, mode, value); 742 } 743 } 744} 
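
/*
 * Worked example of how the zio_taskqs table above drives taskq creation:
 * the WRITE row { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) } gives
 * the write "issue" queue a batch taskq sized as a percentage of online
 * CPUs (TASKQ_THREADS_CPU_PCT, scaled by zio_taskq_batch_pct), plus fixed
 * taskqs of 5, 8 and 5 threads for issue_high, intr and intr_high.  A
 * ZTI_NULL entry (the high-priority slots of most other rows) means no
 * taskq is created for that slot and the corresponding spa_zio_taskq
 * pointer stays NULL.
 */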
745 746#ifdef _KERNEL 747#ifdef SPA_PROCESS 748static void 749spa_thread(void *arg) 750{ 751 callb_cpr_t cprinfo; 752 753 spa_t *spa = arg; 754 user_t *pu = PTOU(curproc); 755 756 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 757 spa->spa_name); 758 759 ASSERT(curproc != &p0); 760 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 761 "zpool-%s", spa->spa_name); 762 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 763 764#ifdef PSRSET_BIND 765 /* bind this thread to the requested psrset */ 766 if (zio_taskq_psrset_bind != PS_NONE) { 767 pool_lock(); 768 mutex_enter(&cpu_lock); 769 mutex_enter(&pidlock); 770 mutex_enter(&curproc->p_lock); 771 772 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 773 0, NULL, NULL) == 0) { 774 curthread->t_bind_pset = zio_taskq_psrset_bind; 775 } else { 776 cmn_err(CE_WARN, 777 "Couldn't bind process for zfs pool \"%s\" to " 778 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 779 } 780 781 mutex_exit(&curproc->p_lock); 782 mutex_exit(&pidlock); 783 mutex_exit(&cpu_lock); 784 pool_unlock(); 785 } 786#endif 787 788#ifdef SYSDC 789 if (zio_taskq_sysdc) { 790 sysdc_thread_enter(curthread, 100, 0); 791 } 792#endif 793 794 spa->spa_proc = curproc; 795 spa->spa_did = curthread->t_did; 796 797 spa_create_zio_taskqs(spa); 798 799 mutex_enter(&spa->spa_proc_lock); 800 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 801 802 spa->spa_proc_state = SPA_PROC_ACTIVE; 803 cv_broadcast(&spa->spa_proc_cv); 804 805 CALLB_CPR_SAFE_BEGIN(&cprinfo); 806 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 807 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 808 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 809 810 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 811 spa->spa_proc_state = SPA_PROC_GONE; 812 spa->spa_proc = &p0; 813 cv_broadcast(&spa->spa_proc_cv); 814 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 815 816 mutex_enter(&curproc->p_lock); 817 lwp_exit(); 818} 819#endif /* SPA_PROCESS */ 820#endif 821 822/* 823 * Activate an uninitialized pool. 824 */ 825static void 826spa_activate(spa_t *spa, int mode) 827{ 828 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 829 830 spa->spa_state = POOL_STATE_ACTIVE; 831 spa->spa_mode = mode; 832 833 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 834 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 835 836 /* Try to create a covering process */ 837 mutex_enter(&spa->spa_proc_lock); 838 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 839 ASSERT(spa->spa_proc == &p0); 840 spa->spa_did = 0; 841 842#ifdef SPA_PROCESS 843 /* Only create a process if we're going to be around a while. */ 844 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 845 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 846 NULL, 0) == 0) { 847 spa->spa_proc_state = SPA_PROC_CREATED; 848 while (spa->spa_proc_state == SPA_PROC_CREATED) { 849 cv_wait(&spa->spa_proc_cv, 850 &spa->spa_proc_lock); 851 } 852 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 853 ASSERT(spa->spa_proc != &p0); 854 ASSERT(spa->spa_did != 0); 855 } else { 856#ifdef _KERNEL 857 cmn_err(CE_WARN, 858 "Couldn't create process for zfs pool \"%s\"\n", 859 spa->spa_name); 860#endif 861 } 862 } 863#endif /* SPA_PROCESS */ 864 mutex_exit(&spa->spa_proc_lock); 865 866 /* If we didn't create a process, we need to create our taskqs. 
*/ 867 ASSERT(spa->spa_proc == &p0); 868 if (spa->spa_proc == &p0) { 869 spa_create_zio_taskqs(spa); 870 } 871 872 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 873 offsetof(vdev_t, vdev_config_dirty_node)); 874 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 875 offsetof(vdev_t, vdev_state_dirty_node)); 876 877 txg_list_create(&spa->spa_vdev_txg_list, 878 offsetof(struct vdev, vdev_txg_node)); 879 880 avl_create(&spa->spa_errlist_scrub, 881 spa_error_entry_compare, sizeof (spa_error_entry_t), 882 offsetof(spa_error_entry_t, se_avl)); 883 avl_create(&spa->spa_errlist_last, 884 spa_error_entry_compare, sizeof (spa_error_entry_t), 885 offsetof(spa_error_entry_t, se_avl)); 886} 887 888/* 889 * Opposite of spa_activate(). 890 */ 891static void 892spa_deactivate(spa_t *spa) 893{ 894 ASSERT(spa->spa_sync_on == B_FALSE); 895 ASSERT(spa->spa_dsl_pool == NULL); 896 ASSERT(spa->spa_root_vdev == NULL); 897 ASSERT(spa->spa_async_zio_root == NULL); 898 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 899 900 txg_list_destroy(&spa->spa_vdev_txg_list); 901 902 list_destroy(&spa->spa_config_dirty_list); 903 list_destroy(&spa->spa_state_dirty_list); 904 905 for (int t = 0; t < ZIO_TYPES; t++) { 906 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 907 if (spa->spa_zio_taskq[t][q] != NULL) 908 taskq_destroy(spa->spa_zio_taskq[t][q]); 909 spa->spa_zio_taskq[t][q] = NULL; 910 } 911 } 912 913 metaslab_class_destroy(spa->spa_normal_class); 914 spa->spa_normal_class = NULL; 915 916 metaslab_class_destroy(spa->spa_log_class); 917 spa->spa_log_class = NULL; 918 919 /* 920 * If this was part of an import or the open otherwise failed, we may 921 * still have errors left in the queues. Empty them just in case. 922 */ 923 spa_errlog_drain(spa); 924 925 avl_destroy(&spa->spa_errlist_scrub); 926 avl_destroy(&spa->spa_errlist_last); 927 928 spa->spa_state = POOL_STATE_UNINITIALIZED; 929 930 mutex_enter(&spa->spa_proc_lock); 931 if (spa->spa_proc_state != SPA_PROC_NONE) { 932 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 933 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 934 cv_broadcast(&spa->spa_proc_cv); 935 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 936 ASSERT(spa->spa_proc != &p0); 937 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 938 } 939 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 940 spa->spa_proc_state = SPA_PROC_NONE; 941 } 942 ASSERT(spa->spa_proc == &p0); 943 mutex_exit(&spa->spa_proc_lock); 944 945#ifdef SPA_PROCESS 946 /* 947 * We want to make sure spa_thread() has actually exited the ZFS 948 * module, so that the module can't be unloaded out from underneath 949 * it. 950 */ 951 if (spa->spa_did != 0) { 952 thread_join(spa->spa_did); 953 spa->spa_did = 0; 954 } 955#endif /* SPA_PROCESS */ 956} 957 958/* 959 * Verify a pool configuration, and construct the vdev tree appropriately. This 960 * will create all the necessary vdevs in the appropriate layout, with each vdev 961 * in the CLOSED state. This will prep the pool before open/creation/import. 962 * All vdev validation is done by the vdev_alloc() routine. 
963 */ 964static int 965spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 966 uint_t id, int atype) 967{ 968 nvlist_t **child; 969 uint_t children; 970 int error; 971 972 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 973 return (error); 974 975 if ((*vdp)->vdev_ops->vdev_op_leaf) 976 return (0); 977 978 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 979 &child, &children); 980 981 if (error == ENOENT) 982 return (0); 983 984 if (error) { 985 vdev_free(*vdp); 986 *vdp = NULL; 987 return (EINVAL); 988 } 989 990 for (int c = 0; c < children; c++) { 991 vdev_t *vd; 992 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 993 atype)) != 0) { 994 vdev_free(*vdp); 995 *vdp = NULL; 996 return (error); 997 } 998 } 999 1000 ASSERT(*vdp != NULL); 1001 1002 return (0); 1003} 1004 1005/* 1006 * Opposite of spa_load(). 1007 */ 1008static void 1009spa_unload(spa_t *spa) 1010{ 1011 int i; 1012 1013 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1014 1015 /* 1016 * Stop async tasks. 1017 */ 1018 spa_async_suspend(spa); 1019 1020 /* 1021 * Stop syncing. 1022 */ 1023 if (spa->spa_sync_on) { 1024 txg_sync_stop(spa->spa_dsl_pool); 1025 spa->spa_sync_on = B_FALSE; 1026 } 1027 1028 /* 1029 * Wait for any outstanding async I/O to complete. 1030 */ 1031 if (spa->spa_async_zio_root != NULL) { 1032 (void) zio_wait(spa->spa_async_zio_root); 1033 spa->spa_async_zio_root = NULL; 1034 } 1035 1036 bpobj_close(&spa->spa_deferred_bpobj); 1037 1038 /* 1039 * Close the dsl pool. 1040 */ 1041 if (spa->spa_dsl_pool) { 1042 dsl_pool_close(spa->spa_dsl_pool); 1043 spa->spa_dsl_pool = NULL; 1044 spa->spa_meta_objset = NULL; 1045 } 1046 1047 ddt_unload(spa); 1048 1049 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1050 1051 /* 1052 * Drop and purge level 2 cache 1053 */ 1054 spa_l2cache_drop(spa); 1055 1056 /* 1057 * Close all vdevs. 1058 */ 1059 if (spa->spa_root_vdev) 1060 vdev_free(spa->spa_root_vdev); 1061 ASSERT(spa->spa_root_vdev == NULL); 1062 1063 for (i = 0; i < spa->spa_spares.sav_count; i++) 1064 vdev_free(spa->spa_spares.sav_vdevs[i]); 1065 if (spa->spa_spares.sav_vdevs) { 1066 kmem_free(spa->spa_spares.sav_vdevs, 1067 spa->spa_spares.sav_count * sizeof (void *)); 1068 spa->spa_spares.sav_vdevs = NULL; 1069 } 1070 if (spa->spa_spares.sav_config) { 1071 nvlist_free(spa->spa_spares.sav_config); 1072 spa->spa_spares.sav_config = NULL; 1073 } 1074 spa->spa_spares.sav_count = 0; 1075
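	/*
	 * Free the l2cache vdevs, clearing each vdev's space statistics
	 * before it is freed, then release the backing array and the
	 * cached config nvlist.
	 */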
	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
1080 if (spa->spa_l2cache.sav_vdevs) { 1081 kmem_free(spa->spa_l2cache.sav_vdevs, 1082 spa->spa_l2cache.sav_count * sizeof (void *)); 1083 spa->spa_l2cache.sav_vdevs = NULL; 1084 } 1085 if (spa->spa_l2cache.sav_config) { 1086 nvlist_free(spa->spa_l2cache.sav_config); 1087 spa->spa_l2cache.sav_config = NULL; 1088 } 1089 spa->spa_l2cache.sav_count = 0; 1090 1091 spa->spa_async_suspended = 0; 1092 1093 if (spa->spa_comment != NULL) { 1094 spa_strfree(spa->spa_comment); 1095 spa->spa_comment = NULL; 1096 } 1097 1098 spa_config_exit(spa, SCL_ALL, FTAG); 1099} 1100 1101/* 1102 * Load (or re-load) the current list of vdevs describing the active spares for 1103 * this pool. When this is called, we have some form of basic information in 1104 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1105 * then re-generate a more complete list including status information. 1106 */ 1107static void 1108spa_load_spares(spa_t *spa) 1109{ 1110 nvlist_t **spares; 1111 uint_t nspares; 1112 int i; 1113 vdev_t *vd, *tvd; 1114 1115 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1116 1117 /* 1118 * First, close and free any existing spare vdevs. 1119 */ 1120 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1121 vd = spa->spa_spares.sav_vdevs[i]; 1122 1123 /* Undo the call to spa_activate() below */ 1124 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1125 B_FALSE)) != NULL && tvd->vdev_isspare) 1126 spa_spare_remove(tvd); 1127 vdev_close(vd); 1128 vdev_free(vd); 1129 } 1130 1131 if (spa->spa_spares.sav_vdevs) 1132 kmem_free(spa->spa_spares.sav_vdevs, 1133 spa->spa_spares.sav_count * sizeof (void *)); 1134 1135 if (spa->spa_spares.sav_config == NULL) 1136 nspares = 0; 1137 else 1138 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1139 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1140 1141 spa->spa_spares.sav_count = (int)nspares; 1142 spa->spa_spares.sav_vdevs = NULL; 1143 1144 if (nspares == 0) 1145 return; 1146 1147 /* 1148 * Construct the array of vdevs, opening them to get status in the 1149 * process. For each spare, there is potentially two different vdev_t 1150 * structures associated with it: one in the list of spares (used only 1151 * for basic validation purposes) and one in the active vdev 1152 * configuration (if it's spared in). During this phase we open and 1153 * validate each vdev on the spare list. If the vdev also exists in the 1154 * active configuration, then we also mark this vdev as an active spare. 1155 */ 1156 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1157 KM_SLEEP); 1158 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1159 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1160 VDEV_ALLOC_SPARE) == 0); 1161 ASSERT(vd != NULL); 1162 1163 spa->spa_spares.sav_vdevs[i] = vd; 1164 1165 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1166 B_FALSE)) != NULL) { 1167 if (!tvd->vdev_isspare) 1168 spa_spare_add(tvd); 1169 1170 /* 1171 * We only mark the spare active if we were successfully 1172 * able to load the vdev. Otherwise, importing a pool 1173 * with a bad active spare would result in strange 1174 * behavior, because multiple pool would think the spare 1175 * is actively in use. 1176 * 1177 * There is a vulnerability here to an equally bizarre 1178 * circumstance, where a dead active spare is later 1179 * brought back to life (onlined or otherwise). Given 1180 * the rarity of this scenario, and the extra complexity 1181 * it adds, we ignore the possibility. 
1182 */ 1183 if (!vdev_is_dead(tvd)) 1184 spa_spare_activate(tvd); 1185 } 1186 1187 vd->vdev_top = vd; 1188 vd->vdev_aux = &spa->spa_spares; 1189 1190 if (vdev_open(vd) != 0) 1191 continue; 1192 1193 if (vdev_validate_aux(vd) == 0) 1194 spa_spare_add(vd); 1195 } 1196 1197 /* 1198 * Recompute the stashed list of spares, with status information 1199 * this time. 1200 */ 1201 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1202 DATA_TYPE_NVLIST_ARRAY) == 0); 1203 1204 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1205 KM_SLEEP); 1206 for (i = 0; i < spa->spa_spares.sav_count; i++) 1207 spares[i] = vdev_config_generate(spa, 1208 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1209 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1210 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1211 for (i = 0; i < spa->spa_spares.sav_count; i++) 1212 nvlist_free(spares[i]); 1213 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1214} 1215 1216/* 1217 * Load (or re-load) the current list of vdevs describing the active l2cache for 1218 * this pool. When this is called, we have some form of basic information in 1219 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1220 * then re-generate a more complete list including status information. 1221 * Devices which are already active have their details maintained, and are 1222 * not re-opened. 1223 */ 1224static void 1225spa_load_l2cache(spa_t *spa) 1226{ 1227 nvlist_t **l2cache; 1228 uint_t nl2cache; 1229 int i, j, oldnvdevs; 1230 uint64_t guid; 1231 vdev_t *vd, **oldvdevs, **newvdevs; 1232 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1233 1234 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1235 1236 if (sav->sav_config != NULL) { 1237 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1238 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1239 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1240 } else { 1241 nl2cache = 0; 1242 } 1243 1244 oldvdevs = sav->sav_vdevs; 1245 oldnvdevs = sav->sav_count; 1246 sav->sav_vdevs = NULL; 1247 sav->sav_count = 0; 1248 1249 /* 1250 * Process new nvlist of vdevs. 1251 */ 1252 for (i = 0; i < nl2cache; i++) { 1253 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1254 &guid) == 0); 1255 1256 newvdevs[i] = NULL; 1257 for (j = 0; j < oldnvdevs; j++) { 1258 vd = oldvdevs[j]; 1259 if (vd != NULL && guid == vd->vdev_guid) { 1260 /* 1261 * Retain previous vdev for add/remove ops. 1262 */ 1263 newvdevs[i] = vd; 1264 oldvdevs[j] = NULL; 1265 break; 1266 } 1267 } 1268 1269 if (newvdevs[i] == NULL) { 1270 /* 1271 * Create new vdev 1272 */ 1273 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1274 VDEV_ALLOC_L2CACHE) == 0); 1275 ASSERT(vd != NULL); 1276 newvdevs[i] = vd; 1277 1278 /* 1279 * Commit this vdev as an l2cache device, 1280 * even if it fails to open. 1281 */ 1282 spa_l2cache_add(vd); 1283 1284 vd->vdev_top = vd; 1285 vd->vdev_aux = sav; 1286 1287 spa_l2cache_activate(vd); 1288 1289 if (vdev_open(vd) != 0) 1290 continue; 1291 1292 (void) vdev_validate_aux(vd); 1293 1294 if (!vdev_is_dead(vd)) 1295 l2arc_add_vdev(spa, vd); 1296 } 1297 } 1298 1299 /* 1300 * Purge vdevs that were dropped 1301 */ 1302 for (i = 0; i < oldnvdevs; i++) { 1303 uint64_t pool; 1304 1305 vd = oldvdevs[i]; 1306 if (vd != NULL) {
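			/*
			 * This cache device was dropped from the new config:
			 * detach it from the l2arc if it is still registered
			 * there, then clear its stats and free it.
			 */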
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);

			vdev_clear_stats(vd);
			vdev_free(vd);
1314 } 1315 } 1316 1317 if (oldvdevs) 1318 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1319 1320 if (sav->sav_config == NULL) 1321 goto out; 1322 1323 sav->sav_vdevs = newvdevs; 1324 sav->sav_count = (int)nl2cache; 1325 1326 /* 1327 * Recompute the stashed list of l2cache devices, with status 1328 * information this time. 1329 */ 1330 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1331 DATA_TYPE_NVLIST_ARRAY) == 0); 1332 1333 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1334 for (i = 0; i < sav->sav_count; i++) 1335 l2cache[i] = vdev_config_generate(spa, 1336 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1337 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1338 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1339out: 1340 for (i = 0; i < sav->sav_count; i++) 1341 nvlist_free(l2cache[i]); 1342 if (sav->sav_count) 1343 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1344} 1345 1346static int 1347load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1348{ 1349 dmu_buf_t *db; 1350 char *packed = NULL; 1351 size_t nvsize = 0; 1352 int error; 1353 *value = NULL; 1354 1355 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1356 nvsize = *(uint64_t *)db->db_data; 1357 dmu_buf_rele(db, FTAG); 1358 1359 packed = kmem_alloc(nvsize, KM_SLEEP); 1360 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1361 DMU_READ_PREFETCH); 1362 if (error == 0) 1363 error = nvlist_unpack(packed, nvsize, value, 0); 1364 kmem_free(packed, nvsize); 1365 1366 return (error); 1367} 1368 1369/* 1370 * Checks to see if the given vdev could not be opened, in which case we post a 1371 * sysevent to notify the autoreplace code that the device has been removed. 1372 */ 1373static void 1374spa_check_removed(vdev_t *vd) 1375{ 1376 for (int c = 0; c < vd->vdev_children; c++) 1377 spa_check_removed(vd->vdev_child[c]); 1378 1379 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1380 zfs_post_autoreplace(vd->vdev_spa, vd); 1381 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1382 } 1383} 1384 1385/* 1386 * Validate the current config against the MOS config 1387 */ 1388static boolean_t 1389spa_config_valid(spa_t *spa, nvlist_t *config) 1390{ 1391 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1392 nvlist_t *nv; 1393 1394 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1395 1396 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1397 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1398 1399 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1400 1401 /* 1402 * If we're doing a normal import, then build up any additional 1403 * diagnostic information about missing devices in this config. 1404 * We'll pass this up to the user for further processing. 
1405 */ 1406 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1407 nvlist_t **child, *nv; 1408 uint64_t idx = 0; 1409 1410 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1411 KM_SLEEP); 1412 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1413 1414 for (int c = 0; c < rvd->vdev_children; c++) { 1415 vdev_t *tvd = rvd->vdev_child[c]; 1416 vdev_t *mtvd = mrvd->vdev_child[c]; 1417 1418 if (tvd->vdev_ops == &vdev_missing_ops && 1419 mtvd->vdev_ops != &vdev_missing_ops && 1420 mtvd->vdev_islog) 1421 child[idx++] = vdev_config_generate(spa, mtvd, 1422 B_FALSE, 0); 1423 } 1424 1425 if (idx) { 1426 VERIFY(nvlist_add_nvlist_array(nv, 1427 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1428 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1429 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1430 1431 for (int i = 0; i < idx; i++) 1432 nvlist_free(child[i]); 1433 } 1434 nvlist_free(nv); 1435 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1436 } 1437 1438 /* 1439 * Compare the root vdev tree with the information we have 1440 * from the MOS config (mrvd). Check each top-level vdev 1441 * with the corresponding MOS config top-level (mtvd). 1442 */ 1443 for (int c = 0; c < rvd->vdev_children; c++) { 1444 vdev_t *tvd = rvd->vdev_child[c]; 1445 vdev_t *mtvd = mrvd->vdev_child[c]; 1446 1447 /* 1448 * Resolve any "missing" vdevs in the current configuration. 1449 * If we find that the MOS config has more accurate information 1450 * about the top-level vdev then use that vdev instead. 1451 */ 1452 if (tvd->vdev_ops == &vdev_missing_ops && 1453 mtvd->vdev_ops != &vdev_missing_ops) { 1454 1455 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1456 continue; 1457 1458 /* 1459 * Device specific actions. 1460 */ 1461 if (mtvd->vdev_islog) { 1462 spa_set_log_state(spa, SPA_LOG_CLEAR); 1463 } else { 1464 /* 1465 * XXX - once we have 'readonly' pool 1466 * support we should be able to handle 1467 * missing data devices by transitioning 1468 * the pool to readonly. 1469 */ 1470 continue; 1471 } 1472 1473 /* 1474 * Swap the missing vdev with the data we were 1475 * able to obtain from the MOS config. 1476 */ 1477 vdev_remove_child(rvd, tvd); 1478 vdev_remove_child(mrvd, mtvd); 1479 1480 vdev_add_child(rvd, mtvd); 1481 vdev_add_child(mrvd, tvd); 1482 1483 spa_config_exit(spa, SCL_ALL, FTAG); 1484 vdev_load(mtvd); 1485 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1486 1487 vdev_reopen(rvd); 1488 } else if (mtvd->vdev_islog) { 1489 /* 1490 * Load the slog device's state from the MOS config 1491 * since it's possible that the label does not 1492 * contain the most up-to-date information. 1493 */ 1494 vdev_load_log_state(tvd, mtvd); 1495 vdev_reopen(tvd); 1496 } 1497 } 1498 vdev_free(mrvd); 1499 spa_config_exit(spa, SCL_ALL, FTAG); 1500 1501 /* 1502 * Ensure we were able to validate the config. 
1503 */ 1504 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1505} 1506 1507/* 1508 * Check for missing log devices 1509 */ 1510static int 1511spa_check_logs(spa_t *spa) 1512{ 1513 switch (spa->spa_log_state) { 1514 case SPA_LOG_MISSING: 1515 /* need to recheck in case slog has been restored */ 1516 case SPA_LOG_UNKNOWN: 1517 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1518 DS_FIND_CHILDREN)) { 1519 spa_set_log_state(spa, SPA_LOG_MISSING); 1520 return (1); 1521 } 1522 break; 1523 } 1524 return (0); 1525} 1526 1527static boolean_t 1528spa_passivate_log(spa_t *spa) 1529{ 1530 vdev_t *rvd = spa->spa_root_vdev; 1531 boolean_t slog_found = B_FALSE; 1532 1533 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1534 1535 if (!spa_has_slogs(spa)) 1536 return (B_FALSE); 1537 1538 for (int c = 0; c < rvd->vdev_children; c++) { 1539 vdev_t *tvd = rvd->vdev_child[c]; 1540 metaslab_group_t *mg = tvd->vdev_mg; 1541 1542 if (tvd->vdev_islog) { 1543 metaslab_group_passivate(mg); 1544 slog_found = B_TRUE; 1545 } 1546 } 1547 1548 return (slog_found); 1549} 1550 1551static void 1552spa_activate_log(spa_t *spa) 1553{ 1554 vdev_t *rvd = spa->spa_root_vdev; 1555 1556 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1557 1558 for (int c = 0; c < rvd->vdev_children; c++) { 1559 vdev_t *tvd = rvd->vdev_child[c]; 1560 metaslab_group_t *mg = tvd->vdev_mg; 1561 1562 if (tvd->vdev_islog) 1563 metaslab_group_activate(mg); 1564 } 1565} 1566 1567int 1568spa_offline_log(spa_t *spa) 1569{ 1570 int error = 0; 1571 1572 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1573 NULL, DS_FIND_CHILDREN)) == 0) { 1574 1575 /* 1576 * We successfully offlined the log device, sync out the 1577 * current txg so that the "stubby" block can be removed 1578 * by zil_sync(). 
1579 */ 1580 txg_wait_synced(spa->spa_dsl_pool, 0); 1581 } 1582 return (error); 1583} 1584 1585static void 1586spa_aux_check_removed(spa_aux_vdev_t *sav) 1587{ 1588 int i; 1589 1590 for (i = 0; i < sav->sav_count; i++) 1591 spa_check_removed(sav->sav_vdevs[i]); 1592} 1593 1594void 1595spa_claim_notify(zio_t *zio) 1596{ 1597 spa_t *spa = zio->io_spa; 1598 1599 if (zio->io_error) 1600 return; 1601 1602 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1603 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1604 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1605 mutex_exit(&spa->spa_props_lock); 1606} 1607 1608typedef struct spa_load_error { 1609 uint64_t sle_meta_count; 1610 uint64_t sle_data_count; 1611} spa_load_error_t; 1612 1613static void 1614spa_load_verify_done(zio_t *zio) 1615{ 1616 blkptr_t *bp = zio->io_bp; 1617 spa_load_error_t *sle = zio->io_private; 1618 dmu_object_type_t type = BP_GET_TYPE(bp); 1619 int error = zio->io_error; 1620 1621 if (error) { 1622 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1623 type != DMU_OT_INTENT_LOG) 1624 atomic_add_64(&sle->sle_meta_count, 1); 1625 else 1626 atomic_add_64(&sle->sle_data_count, 1); 1627 } 1628 zio_data_buf_free(zio->io_data, zio->io_size); 1629} 1630 1631/*ARGSUSED*/ 1632static int 1633spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1634 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1635{ 1636 if (bp != NULL) { 1637 zio_t *rio = arg; 1638 size_t size = BP_GET_PSIZE(bp); 1639 void *data = zio_data_buf_alloc(size); 1640 1641 zio_nowait(zio_read(rio, spa, bp, data, size, 1642 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1643 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1644 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1645 } 1646 return (0); 1647} 1648 1649static int 1650spa_load_verify(spa_t *spa) 1651{ 1652 zio_t *rio; 1653 spa_load_error_t sle = { 0 }; 1654 zpool_rewind_policy_t policy; 1655 boolean_t verify_ok = B_FALSE; 1656 int error; 1657 1658 zpool_get_rewind_policy(spa->spa_config, &policy); 1659 1660 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1661 return (0); 1662 1663 rio = zio_root(spa, NULL, &sle, 1664 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1665 1666 error = traverse_pool(spa, spa->spa_verify_min_txg, 1667 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1668 1669 (void) zio_wait(rio); 1670 1671 spa->spa_load_meta_errors = sle.sle_meta_count; 1672 spa->spa_load_data_errors = sle.sle_data_count; 1673 1674 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1675 sle.sle_data_count <= policy.zrp_maxdata) { 1676 int64_t loss = 0; 1677 1678 verify_ok = B_TRUE; 1679 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1680 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1681 1682 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1683 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1684 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1685 VERIFY(nvlist_add_int64(spa->spa_load_info, 1686 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1687 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1688 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1689 } else { 1690 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1691 } 1692 1693 if (error) { 1694 if (error != ENXIO && error != EIO) 1695 error = EIO; 1696 return (error); 1697 } 1698 1699 return (verify_ok ? 0 : EIO); 1700} 1701 1702/* 1703 * Find a value in the pool props object. 
1704 */ 1705static void 1706spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1707{ 1708 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1709 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1710} 1711 1712/* 1713 * Find a value in the pool directory object. 1714 */ 1715static int 1716spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1717{ 1718 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1719 name, sizeof (uint64_t), 1, val)); 1720} 1721 1722static int 1723spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1724{ 1725 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1726 return (err); 1727} 1728 1729/* 1730 * Fix up config after a partly-completed split. This is done with the 1731 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1732 * pool have that entry in their config, but only the splitting one contains 1733 * a list of all the guids of the vdevs that are being split off. 1734 * 1735 * This function determines what to do with that list: either rejoin 1736 * all the disks to the pool, or complete the splitting process. To attempt 1737 * the rejoin, each disk that is offlined is marked online again, and 1738 * we do a reopen() call. If the vdev label for every disk that was 1739 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1740 * then we call vdev_split() on each disk, and complete the split. 1741 * 1742 * Otherwise we leave the config alone, with all the vdevs in place in 1743 * the original pool. 1744 */ 1745static void 1746spa_try_repair(spa_t *spa, nvlist_t *config) 1747{ 1748 uint_t extracted; 1749 uint64_t *glist; 1750 uint_t i, gcount; 1751 nvlist_t *nvl; 1752 vdev_t **vd; 1753 boolean_t attempt_reopen; 1754 1755 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1756 return; 1757 1758 /* check that the config is complete */ 1759 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1760 &glist, &gcount) != 0) 1761 return; 1762 1763 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1764 1765 /* attempt to online all the vdevs & validate */ 1766 attempt_reopen = B_TRUE; 1767 for (i = 0; i < gcount; i++) { 1768 if (glist[i] == 0) /* vdev is hole */ 1769 continue; 1770 1771 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1772 if (vd[i] == NULL) { 1773 /* 1774 * Don't bother attempting to reopen the disks; 1775 * just do the split. 1776 */ 1777 attempt_reopen = B_FALSE; 1778 } else { 1779 /* attempt to re-online it */ 1780 vd[i]->vdev_offline = B_FALSE; 1781 } 1782 } 1783 1784 if (attempt_reopen) { 1785 vdev_reopen(spa->spa_root_vdev); 1786 1787 /* check each device to see what state it's in */ 1788 for (extracted = 0, i = 0; i < gcount; i++) { 1789 if (vd[i] != NULL && 1790 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1791 break; 1792 ++extracted; 1793 } 1794 } 1795 1796 /* 1797 * If every disk has been moved to the new pool, or if we never 1798 * even attempted to look at them, then we split them off for 1799 * good. 
1800 */ 1801 if (!attempt_reopen || gcount == extracted) { 1802 for (i = 0; i < gcount; i++) 1803 if (vd[i] != NULL) 1804 vdev_split(vd[i]); 1805 vdev_reopen(spa->spa_root_vdev); 1806 } 1807 1808 kmem_free(vd, gcount * sizeof (vdev_t *)); 1809} 1810 1811static int 1812spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1813 boolean_t mosconfig) 1814{ 1815 nvlist_t *config = spa->spa_config; 1816 char *ereport = FM_EREPORT_ZFS_POOL; 1817 char *comment; 1818 int error; 1819 uint64_t pool_guid; 1820 nvlist_t *nvl; 1821 1822 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1823 return (EINVAL); 1824 1825 ASSERT(spa->spa_comment == NULL); 1826 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1827 spa->spa_comment = spa_strdup(comment); 1828 1829 /* 1830 * Versioning wasn't explicitly added to the label until later, so if 1831 * it's not present treat it as the initial version. 1832 */ 1833 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1834 &spa->spa_ubsync.ub_version) != 0) 1835 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1836 1837 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1838 &spa->spa_config_txg); 1839 1840 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1841 spa_guid_exists(pool_guid, 0)) { 1842 error = EEXIST; 1843 } else { 1844 spa->spa_config_guid = pool_guid; 1845 1846 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1847 &nvl) == 0) { 1848 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1849 KM_SLEEP) == 0); 1850 } 1851 1852 gethrestime(&spa->spa_loaded_ts); 1853 error = spa_load_impl(spa, pool_guid, config, state, type, 1854 mosconfig, &ereport); 1855 } 1856 1857 spa->spa_minref = refcount_count(&spa->spa_refcount); 1858 if (error) { 1859 if (error != EEXIST) { 1860 spa->spa_loaded_ts.tv_sec = 0; 1861 spa->spa_loaded_ts.tv_nsec = 0; 1862 } 1863 if (error != EBADF) { 1864 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1865 } 1866 } 1867 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1868 spa->spa_ena = 0; 1869 1870 return (error); 1871} 1872 1873/* 1874 * Load an existing storage pool, using the pool's builtin spa_config as a 1875 * source of configuration information. 1876 */ 1877static int 1878spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1879 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1880 char **ereport) 1881{ 1882 int error = 0; 1883 nvlist_t *nvroot = NULL; 1884 vdev_t *rvd; 1885 uberblock_t *ub = &spa->spa_uberblock; 1886 uint64_t children, config_cache_txg = spa->spa_config_txg; 1887 int orig_mode = spa->spa_mode; 1888 int parse; 1889 uint64_t obj; 1890 1891 /* 1892 * If this is an untrusted config, access the pool in read-only mode. 1893 * This prevents things like resilvering recently removed devices. 1894 */ 1895 if (!mosconfig) 1896 spa->spa_mode = FREAD; 1897 1898 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1899 1900 spa->spa_load_state = state; 1901 1902 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1903 return (EINVAL); 1904 1905 parse = (type == SPA_IMPORT_EXISTING ? 1906 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1907 1908 /* 1909 * Create "The Godfather" zio to hold all async IOs 1910 */ 1911 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1912 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1913 1914 /* 1915 * Parse the configuration into a vdev tree. 
We explicitly set the 1916 * value that will be returned by spa_version() since parsing the 1917 * configuration requires knowing the version number. 1918 */ 1919 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1920 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1921 spa_config_exit(spa, SCL_ALL, FTAG); 1922 1923 if (error != 0) 1924 return (error); 1925 1926 ASSERT(spa->spa_root_vdev == rvd); 1927 1928 if (type != SPA_IMPORT_ASSEMBLE) { 1929 ASSERT(spa_guid(spa) == pool_guid); 1930 } 1931 1932 /* 1933 * Try to open all vdevs, loading each label in the process. 1934 */ 1935 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1936 error = vdev_open(rvd); 1937 spa_config_exit(spa, SCL_ALL, FTAG); 1938 if (error != 0) 1939 return (error); 1940 1941 /* 1942 * We need to validate the vdev labels against the configuration that 1943 * we have in hand, which is dependent on the setting of mosconfig. If 1944 * mosconfig is true then we're validating the vdev labels based on 1945 * that config. Otherwise, we're validating against the cached config 1946 * (zpool.cache) that was read when we loaded the zfs module, and then 1947 * later we will recursively call spa_load() and validate against 1948 * the vdev config. 1949 * 1950 * If we're assembling a new pool that's been split off from an 1951 * existing pool, the labels haven't yet been updated so we skip 1952 * validation for now. 1953 */ 1954 if (type != SPA_IMPORT_ASSEMBLE) { 1955 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		error = vdev_validate(rvd, mosconfig);
1957 spa_config_exit(spa, SCL_ALL, FTAG); 1958 1959 if (error != 0) 1960 return (error); 1961 1962 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1963 return (ENXIO); 1964 } 1965 1966 /* 1967 * Find the best uberblock. 1968 */ 1969 vdev_uberblock_load(NULL, rvd, ub); 1970 1971 /* 1972 * If we weren't able to find a single valid uberblock, return failure. 1973 */ 1974 if (ub->ub_txg == 0) 1975 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1976 1977 /* 1978 * If the pool is newer than the code, we can't open it. 1979 */ 1980 if (ub->ub_version > SPA_VERSION) 1981 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1982 1983 /* 1984 * If the vdev guid sum doesn't match the uberblock, we have an 1985 * incomplete configuration. We first check to see if the pool 1986 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1987 * If it is, defer the vdev_guid_sum check till later so we 1988 * can handle missing vdevs. 1989 */ 1990 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1991 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1992 rvd->vdev_guid_sum != ub->ub_guid_sum) 1993 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1994 1995 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1996 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1997 spa_try_repair(spa, config); 1998 spa_config_exit(spa, SCL_ALL, FTAG); 1999 nvlist_free(spa->spa_config_splitting); 2000 spa->spa_config_splitting = NULL; 2001 } 2002 2003 /* 2004 * Initialize internal SPA structures. 2005 */ 2006 spa->spa_state = POOL_STATE_ACTIVE; 2007 spa->spa_ubsync = spa->spa_uberblock; 2008 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2009 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2010 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 2011 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2012 spa->spa_claim_max_txg = spa->spa_first_txg; 2013 spa->spa_prev_software_version = ub->ub_software_version; 2014 2015 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2016 if (error) 2017 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2018 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2019 2020 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2021 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2022 2023 if (!mosconfig) { 2024 uint64_t hostid; 2025 nvlist_t *policy = NULL, *nvconfig; 2026 2027 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2028 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2029 2030 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2031 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2032 char *hostname; 2033 unsigned long myhostid = 0; 2034 2035 VERIFY(nvlist_lookup_string(nvconfig, 2036 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2037 2038#ifdef _KERNEL 2039 myhostid = zone_get_hostid(NULL); 2040#else /* _KERNEL */ 2041 /* 2042 * We're emulating the system's hostid in userland, so 2043 * we can't use zone_get_hostid(). 2044 */ 2045 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2046#endif /* _KERNEL */ 2047 if (check_hostid && hostid != 0 && myhostid != 0 && 2048 hostid != myhostid) { 2049 nvlist_free(nvconfig); 2050 cmn_err(CE_WARN, "pool '%s' could not be " 2051 "loaded as it was last accessed by " 2052 "another system (host: %s hostid: 0x%lx). 
" 2053 "See: http://www.sun.com/msg/ZFS-8000-EY", 2054 spa_name(spa), hostname, 2055 (unsigned long)hostid); 2056 return (EBADF); 2057 } 2058 } 2059 if (nvlist_lookup_nvlist(spa->spa_config, 2060 ZPOOL_REWIND_POLICY, &policy) == 0) 2061 VERIFY(nvlist_add_nvlist(nvconfig, 2062 ZPOOL_REWIND_POLICY, policy) == 0); 2063 2064 spa_config_set(spa, nvconfig); 2065 spa_unload(spa); 2066 spa_deactivate(spa); 2067 spa_activate(spa, orig_mode); 2068 2069 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2070 } 2071 2072 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2073 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2074 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2075 if (error != 0) 2076 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2077 2078 /* 2079 * Load the bit that tells us to use the new accounting function 2080 * (raid-z deflation). If we have an older pool, this will not 2081 * be present. 2082 */ 2083 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2084 if (error != 0 && error != ENOENT) 2085 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2086 2087 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2088 &spa->spa_creation_version); 2089 if (error != 0 && error != ENOENT) 2090 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2091 2092 /* 2093 * Load the persistent error log. If we have an older pool, this will 2094 * not be present. 2095 */ 2096 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2097 if (error != 0 && error != ENOENT) 2098 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2099 2100 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2101 &spa->spa_errlog_scrub); 2102 if (error != 0 && error != ENOENT) 2103 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2104 2105 /* 2106 * Load the history object. If we have an older pool, this 2107 * will not be present. 2108 */ 2109 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2110 if (error != 0 && error != ENOENT) 2111 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2112 2113 /* 2114 * If we're assembling the pool from the split-off vdevs of 2115 * an existing pool, we don't want to attach the spares & cache 2116 * devices. 2117 */ 2118 2119 /* 2120 * Load any hot spares for this pool. 2121 */ 2122 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2123 if (error != 0 && error != ENOENT) 2124 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2125 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2126 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2127 if (load_nvlist(spa, spa->spa_spares.sav_object, 2128 &spa->spa_spares.sav_config) != 0) 2129 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2130 2131 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2132 spa_load_spares(spa); 2133 spa_config_exit(spa, SCL_ALL, FTAG); 2134 } else if (error == 0) { 2135 spa->spa_spares.sav_sync = B_TRUE; 2136 } 2137 2138 /* 2139 * Load any level 2 ARC devices for this pool. 
2140 */ 2141 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2142 &spa->spa_l2cache.sav_object); 2143 if (error != 0 && error != ENOENT) 2144 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2145 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2146 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2147 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2148 &spa->spa_l2cache.sav_config) != 0) 2149 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2150 2151 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2152 spa_load_l2cache(spa); 2153 spa_config_exit(spa, SCL_ALL, FTAG); 2154 } else if (error == 0) { 2155 spa->spa_l2cache.sav_sync = B_TRUE; 2156 } 2157 2158 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2159 2160 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2161 if (error && error != ENOENT) 2162 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2163 2164 if (error == 0) { 2165 uint64_t autoreplace; 2166 2167 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2168 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2169 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2170 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2171 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2172 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2173 &spa->spa_dedup_ditto); 2174 2175 spa->spa_autoreplace = (autoreplace != 0); 2176 } 2177 2178 /* 2179 * If the 'autoreplace' property is set, then post a resource notifying 2180 * the ZFS DE that it should not issue any faults for unopenable 2181 * devices. We also iterate over the vdevs, and post a sysevent for any 2182 * unopenable vdevs so that the normal autoreplace handler can take 2183 * over. 2184 */ 2185 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2186 spa_check_removed(spa->spa_root_vdev); 2187 /* 2188 * For the import case, this is done in spa_import(), because 2189 * at this point we're using the spare definitions from 2190 * the MOS config, not necessarily from the userland config. 2191 */ 2192 if (state != SPA_LOAD_IMPORT) { 2193 spa_aux_check_removed(&spa->spa_spares); 2194 spa_aux_check_removed(&spa->spa_l2cache); 2195 } 2196 } 2197 2198 /* 2199 * Load the vdev state for all toplevel vdevs. 2200 */ 2201 vdev_load(rvd); 2202 2203 /* 2204 * Propagate the leaf DTLs we just loaded all the way up the tree. 2205 */ 2206 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2207 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2208 spa_config_exit(spa, SCL_ALL, FTAG); 2209 2210 /* 2211 * Load the DDTs (dedup tables). 2212 */ 2213 error = ddt_load(spa); 2214 if (error != 0) 2215 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2216 2217 spa_update_dspace(spa); 2218 2219 /* 2220 * Validate the config, using the MOS config to fill in any 2221 * information which might be missing. If we fail to validate 2222 * the config then declare the pool unfit for use. If we're 2223 * assembling a pool from a split, the log is not transferred 2224 * over. 2225 */ 2226 if (type != SPA_IMPORT_ASSEMBLE) { 2227 nvlist_t *nvconfig; 2228 2229 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2230 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2231 2232 if (!spa_config_valid(spa, nvconfig)) { 2233 nvlist_free(nvconfig); 2234 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2235 ENXIO)); 2236 } 2237 nvlist_free(nvconfig); 2238 2239 /* 2240 * Now that we've validated the config, check the state of the 2241 * root vdev.
If it can't be opened, it indicates one or 2242 * more toplevel vdevs are faulted. 2243 */ 2244 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2245 return (ENXIO); 2246 2247 if (spa_check_logs(spa)) { 2248 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2249 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2250 } 2251 } 2252 2253 /* 2254 * We've successfully opened the pool; verify that we're ready 2255 * to start pushing transactions. 2256 */ 2257 if (state != SPA_LOAD_TRYIMPORT) { 2258 if (error = spa_load_verify(spa)) 2259 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2260 error)); 2261 } 2262 2263 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2264 spa->spa_load_max_txg == UINT64_MAX)) { 2265 dmu_tx_t *tx; 2266 int need_update = B_FALSE; 2267 2268 ASSERT(state != SPA_LOAD_TRYIMPORT); 2269 2270 /* 2271 * Claim log blocks that haven't been committed yet. 2272 * This must all happen in a single txg. 2273 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2274 * invoked from zil_claim_log_block()'s i/o done callback. 2275 * Price of rollback is that we abandon the log. 2276 */ 2277 spa->spa_claiming = B_TRUE; 2278 2279 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2280 spa_first_txg(spa)); 2281 (void) dmu_objset_find(spa_name(spa), 2282 zil_claim, tx, DS_FIND_CHILDREN); 2283 dmu_tx_commit(tx); 2284 2285 spa->spa_claiming = B_FALSE; 2286 2287 spa_set_log_state(spa, SPA_LOG_GOOD); 2288 spa->spa_sync_on = B_TRUE; 2289 txg_sync_start(spa->spa_dsl_pool); 2290 2291 /* 2292 * Wait for all claims to sync. We sync up to the highest 2293 * claimed log block birth time so that claimed log blocks 2294 * don't appear to be from the future. spa_claim_max_txg 2295 * will have been set for us by either zil_check_log_chain() 2296 * (invoked from spa_check_logs()) or zil_claim() above. 2297 */ 2298 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2299 2300 /* 2301 * If the config cache is stale, or we have uninitialized 2302 * metaslabs (see spa_vdev_add()), then update the config. 2303 * 2304 * If this is a verbatim import, trust the current 2305 * in-core spa_config and update the disk labels. 2306 */ 2307 if (config_cache_txg != spa->spa_config_txg || 2308 state == SPA_LOAD_IMPORT || 2309 state == SPA_LOAD_RECOVER || 2310 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2311 need_update = B_TRUE; 2312 2313 for (int c = 0; c < rvd->vdev_children; c++) 2314 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2315 need_update = B_TRUE; 2316 2317 /* 2318 * Update the config cache asynchronously in case we're the 2319 * root pool, in which case the config cache isn't writable yet. 2320 */ 2321 if (need_update) 2322 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2323 2324 /* 2325 * Check all DTLs to see if anything needs resilvering. 2326 */ 2327 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2328 vdev_resilver_needed(rvd, NULL, NULL)) 2329 spa_async_request(spa, SPA_ASYNC_RESILVER); 2330 2331 /* 2332 * Delete any inconsistent datasets. 2333 */ 2334 (void) dmu_objset_find(spa_name(spa), 2335 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2336 2337 /* 2338 * Clean up any stale temporary dataset userrefs.
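 * (An editorial, hedged note: these are assumed to be the temporary
 * holds placed on datasets, e.g. by an in-flight 'zfs send', whose
 * holder never got the chance to release them.)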
2339 */ 2340 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2341 } 2342 2343 return (0); 2344} 2345 2346static int 2347spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2348{ 2349 int mode = spa->spa_mode; 2350 2351 spa_unload(spa); 2352 spa_deactivate(spa); 2353 2354 spa->spa_load_max_txg--; 2355 2356 spa_activate(spa, mode); 2357 spa_async_suspend(spa); 2358 2359 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2360} 2361 2362static int 2363spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2364 uint64_t max_request, int rewind_flags) 2365{ 2366 nvlist_t *config = NULL; 2367 int load_error, rewind_error; 2368 uint64_t safe_rewind_txg; 2369 uint64_t min_txg; 2370 2371 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2372 spa->spa_load_max_txg = spa->spa_load_txg; 2373 spa_set_log_state(spa, SPA_LOG_CLEAR); 2374 } else { 2375 spa->spa_load_max_txg = max_request; 2376 } 2377 2378 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2379 mosconfig); 2380 if (load_error == 0) 2381 return (0); 2382 2383 if (spa->spa_root_vdev != NULL) 2384 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2385 2386 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2387 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2388 2389 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2390 nvlist_free(config); 2391 return (load_error); 2392 } 2393 2394 /* Price of rolling back is discarding txgs, including log */ 2395 if (state == SPA_LOAD_RECOVER) 2396 spa_set_log_state(spa, SPA_LOG_CLEAR); 2397 2398 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2399 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2400 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2401 TXG_INITIAL : safe_rewind_txg; 2402 2403 /* 2404 * Continue as long as we're finding errors, we're still within 2405 * the acceptable rewind range, and we're still finding uberblocks 2406 */ 2407 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2408 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2409 if (spa->spa_load_max_txg < safe_rewind_txg) 2410 spa->spa_extreme_rewind = B_TRUE; 2411 rewind_error = spa_load_retry(spa, state, mosconfig); 2412 } 2413 2414 spa->spa_extreme_rewind = B_FALSE; 2415 spa->spa_load_max_txg = UINT64_MAX; 2416 2417 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2418 spa_config_set(spa, config); 2419 2420 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 2421} 2422 2423/* 2424 * Pool Open/Import 2425 * 2426 * The import case is identical to an open except that the configuration is sent 2427 * down from userland, instead of grabbed from the configuration cache. For the 2428 * case of an open, the pool configuration will exist in the 2429 * POOL_STATE_UNINITIALIZED state. 2430 * 2431 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2432 * the same time we open the pool, without having to keep around the spa_t in some 2433 * ambiguous state. 2434 */ 2435static int 2436spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2437 nvlist_t **config) 2438{ 2439 spa_t *spa; 2440 spa_load_state_t state = SPA_LOAD_OPEN; 2441 int error; 2442 int locked = B_FALSE; 2443 int firstopen = B_FALSE; 2444 2445 *spapp = NULL; 2446 2447 /* 2448 * As disgusting as this is, we need to support recursive calls to this 2449 * function because dsl_dir_open() is called during spa_load(), and ends 2450 * up calling spa_open() again.
The real fix is to figure out how to 2451 * avoid dsl_dir_open() calling this in the first place. 2452 */ 2453 if (mutex_owner(&spa_namespace_lock) != curthread) { 2454 mutex_enter(&spa_namespace_lock); 2455 locked = B_TRUE; 2456 } 2457 2458 if ((spa = spa_lookup(pool)) == NULL) { 2459 if (locked) 2460 mutex_exit(&spa_namespace_lock); 2461 return (ENOENT); 2462 } 2463 2464 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2465 zpool_rewind_policy_t policy; 2466 2467 firstopen = B_TRUE; 2468 2469 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2470 &policy); 2471 if (policy.zrp_request & ZPOOL_DO_REWIND) 2472 state = SPA_LOAD_RECOVER; 2473 2474 spa_activate(spa, spa_mode_global); 2475 2476 if (state != SPA_LOAD_RECOVER) 2477 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2478 2479 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2480 policy.zrp_request); 2481 2482 if (error == EBADF) { 2483 /* 2484 * If vdev_validate() returns failure (indicated by 2485 * EBADF), one of the vdev labels indicates 2486 * that the pool has been exported or destroyed. If 2487 * this is the case, the config cache is out of sync and 2488 * we should remove the pool from the namespace. 2489 */ 2490 spa_unload(spa); 2491 spa_deactivate(spa); 2492 spa_config_sync(spa, B_TRUE, B_TRUE); 2493 spa_remove(spa); 2494 if (locked) 2495 mutex_exit(&spa_namespace_lock); 2496 return (ENOENT); 2497 } 2498 2499 if (error) { 2500 /* 2501 * We can't open the pool, but we still have useful 2502 * information: the state of each vdev after the 2503 * attempted vdev_open(). Return this to the user. 2504 */ 2505 if (config != NULL && spa->spa_config) { 2506 VERIFY(nvlist_dup(spa->spa_config, config, 2507 KM_SLEEP) == 0); 2508 VERIFY(nvlist_add_nvlist(*config, 2509 ZPOOL_CONFIG_LOAD_INFO, 2510 spa->spa_load_info) == 0); 2511 } 2512 spa_unload(spa); 2513 spa_deactivate(spa); 2514 spa->spa_last_open_failed = error; 2515 if (locked) 2516 mutex_exit(&spa_namespace_lock); 2517 *spapp = NULL; 2518 return (error); 2519 } 2520 } 2521 2522 spa_open_ref(spa, tag); 2523 2524 if (config != NULL) 2525 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2526 2527 /* 2528 * If we've recovered the pool, pass back any information we 2529 * gathered while doing the load. 2530 */ 2531 if (state == SPA_LOAD_RECOVER) { 2532 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2533 spa->spa_load_info) == 0); 2534 } 2535 2536 if (locked) { 2537 spa->spa_last_open_failed = 0; 2538 spa->spa_last_ubsync_txg = 0; 2539 spa->spa_load_txg = 0; 2540 mutex_exit(&spa_namespace_lock); 2541#ifdef __FreeBSD__ 2542#ifdef _KERNEL 2543 if (firstopen) 2544 zvol_create_minors(pool); 2545#endif 2546#endif 2547 } 2548 2549 *spapp = spa; 2550 2551 return (0); 2552} 2553 2554int 2555spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2556 nvlist_t **config) 2557{ 2558 return (spa_open_common(name, spapp, tag, policy, config)); 2559} 2560 2561int 2562spa_open(const char *name, spa_t **spapp, void *tag) 2563{ 2564 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2565} 2566 2567/* 2568 * Lookup the given spa_t, incrementing the inject count in the process, 2569 * preventing it from being exported or destroyed.
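 *
 * Illustrative pairing, a sketch rather than text from the original
 * source (the fault-injection ioctls are the assumed consumer):
 *
 *	spa_t *spa = spa_inject_addref(name);
 *	if (spa != NULL) {
 *		(register or tear down the injection handler here)
 *		spa_inject_delref(spa);
 *	}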
2570 */ 2571spa_t * 2572spa_inject_addref(char *name) 2573{ 2574 spa_t *spa; 2575 2576 mutex_enter(&spa_namespace_lock); 2577 if ((spa = spa_lookup(name)) == NULL) { 2578 mutex_exit(&spa_namespace_lock); 2579 return (NULL); 2580 } 2581 spa->spa_inject_ref++; 2582 mutex_exit(&spa_namespace_lock); 2583 2584 return (spa); 2585} 2586 2587void 2588spa_inject_delref(spa_t *spa) 2589{ 2590 mutex_enter(&spa_namespace_lock); 2591 spa->spa_inject_ref--; 2592 mutex_exit(&spa_namespace_lock); 2593} 2594 2595/* 2596 * Add spares device information to the nvlist. 2597 */ 2598static void 2599spa_add_spares(spa_t *spa, nvlist_t *config) 2600{ 2601 nvlist_t **spares; 2602 uint_t i, nspares; 2603 nvlist_t *nvroot; 2604 uint64_t guid; 2605 vdev_stat_t *vs; 2606 uint_t vsc; 2607 uint64_t pool; 2608 2609 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2610 2611 if (spa->spa_spares.sav_count == 0) 2612 return; 2613 2614 VERIFY(nvlist_lookup_nvlist(config, 2615 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2616 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2617 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2618 if (nspares != 0) { 2619 VERIFY(nvlist_add_nvlist_array(nvroot, 2620 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2621 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2622 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2623 2624 /* 2625 * Go through and find any spares which have since been 2626 * repurposed as an active spare. If this is the case, update 2627 * their status appropriately. 2628 */ 2629 for (i = 0; i < nspares; i++) { 2630 VERIFY(nvlist_lookup_uint64(spares[i], 2631 ZPOOL_CONFIG_GUID, &guid) == 0); 2632 if (spa_spare_exists(guid, &pool, NULL) && 2633 pool != 0ULL) { 2634 VERIFY(nvlist_lookup_uint64_array( 2635 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2636 (uint64_t **)&vs, &vsc) == 0); 2637 vs->vs_state = VDEV_STATE_CANT_OPEN; 2638 vs->vs_aux = VDEV_AUX_SPARED; 2639 } 2640 } 2641 } 2642} 2643 2644/* 2645 * Add l2cache device information to the nvlist, including vdev stats. 2646 */ 2647static void 2648spa_add_l2cache(spa_t *spa, nvlist_t *config) 2649{ 2650 nvlist_t **l2cache; 2651 uint_t i, j, nl2cache; 2652 nvlist_t *nvroot; 2653 uint64_t guid; 2654 vdev_t *vd; 2655 vdev_stat_t *vs; 2656 uint_t vsc; 2657 2658 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2659 2660 if (spa->spa_l2cache.sav_count == 0) 2661 return; 2662 2663 VERIFY(nvlist_lookup_nvlist(config, 2664 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2665 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2666 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2667 if (nl2cache != 0) { 2668 VERIFY(nvlist_add_nvlist_array(nvroot, 2669 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2670 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2671 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2672 2673 /* 2674 * Update level 2 cache device stats. 
2675 */ 2676 2677 for (i = 0; i < nl2cache; i++) { 2678 VERIFY(nvlist_lookup_uint64(l2cache[i], 2679 ZPOOL_CONFIG_GUID, &guid) == 0); 2680 2681 vd = NULL; 2682 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2683 if (guid == 2684 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2685 vd = spa->spa_l2cache.sav_vdevs[j]; 2686 break; 2687 } 2688 } 2689 ASSERT(vd != NULL); 2690 2691 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2692 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2693 == 0); 2694 vdev_get_stats(vd, vs); 2695 } 2696 } 2697} 2698 2699int 2700spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2701{ 2702 int error; 2703 spa_t *spa; 2704 2705 *config = NULL; 2706 error = spa_open_common(name, &spa, FTAG, NULL, config); 2707 2708 if (spa != NULL) { 2709 /* 2710 * This still leaves a window of inconsistency where the spares 2711 * or l2cache devices could change and the config would be 2712 * self-inconsistent. 2713 */ 2714 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2715 2716 if (*config != NULL) { 2717 uint64_t loadtimes[2]; 2718 2719 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 2720 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 2721 VERIFY(nvlist_add_uint64_array(*config, 2722 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 2723 2724 VERIFY(nvlist_add_uint64(*config, 2725 ZPOOL_CONFIG_ERRCOUNT, 2726 spa_get_errlog_size(spa)) == 0); 2727 2728 if (spa_suspended(spa)) 2729 VERIFY(nvlist_add_uint64(*config, 2730 ZPOOL_CONFIG_SUSPENDED, 2731 spa->spa_failmode) == 0); 2732 2733 spa_add_spares(spa, *config); 2734 spa_add_l2cache(spa, *config); 2735 } 2736 } 2737 2738 /* 2739 * We want to get the alternate root even for faulted pools, so we cheat 2740 * and call spa_lookup() directly. 2741 */ 2742 if (altroot) { 2743 if (spa == NULL) { 2744 mutex_enter(&spa_namespace_lock); 2745 spa = spa_lookup(name); 2746 if (spa) 2747 spa_altroot(spa, altroot, buflen); 2748 else 2749 altroot[0] = '\0'; 2750 spa = NULL; 2751 mutex_exit(&spa_namespace_lock); 2752 } else { 2753 spa_altroot(spa, altroot, buflen); 2754 } 2755 } 2756 2757 if (spa != NULL) { 2758 spa_config_exit(spa, SCL_CONFIG, FTAG); 2759 spa_close(spa, FTAG); 2760 } 2761 2762 return (error); 2763} 2764 2765/* 2766 * Validate that the auxiliary device array is well formed. We must have an 2767 * array of nvlists, each of which describes a valid leaf vdev. If this is an 2768 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2769 * specified, as long as they are well-formed. 2770 */ 2771static int 2772spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2773 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2774 vdev_labeltype_t label) 2775{ 2776 nvlist_t **dev; 2777 uint_t i, ndev; 2778 vdev_t *vd; 2779 int error; 2780 2781 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2782 2783 /* 2784 * It's acceptable to have no devs specified. 2785 */ 2786 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2787 return (0); 2788 2789 if (ndev == 0) 2790 return (EINVAL); 2791 2792 /* 2793 * Make sure the pool is formatted with a version that supports this 2794 * device type. 2795 */ 2796 if (spa_version(spa) < version) 2797 return (ENOTSUP); 2798 2799 /* 2800 * Set the pending device list so we correctly handle device in-use 2801 * checking.
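 * (Hedged editorial note: the in-use checks are assumed to consult
 * sav_pending so that the devices being added by this very call are
 * not reported as busy against themselves.)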
2802 */ 2803 sav->sav_pending = dev; 2804 sav->sav_npending = ndev; 2805 2806 for (i = 0; i < ndev; i++) { 2807 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2808 mode)) != 0) 2809 goto out; 2810 2811 if (!vd->vdev_ops->vdev_op_leaf) { 2812 vdev_free(vd); 2813 error = EINVAL; 2814 goto out; 2815 } 2816 2817 /* 2818 * The L2ARC currently only supports disk devices in 2819 * kernel context. For user-level testing, we allow it. 2820 */ 2821#ifdef _KERNEL 2822 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2823 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2824 error = ENOTBLK;
2825 vdev_free(vd);
2826 goto out; 2827 } 2828#endif 2829 vd->vdev_top = vd; 2830 2831 if ((error = vdev_open(vd)) == 0 && 2832 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2833 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2834 vd->vdev_guid) == 0); 2835 } 2836 2837 vdev_free(vd); 2838 2839 if (error && 2840 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2841 goto out; 2842 else 2843 error = 0; 2844 } 2845 2846out: 2847 sav->sav_pending = NULL; 2848 sav->sav_npending = 0; 2849 return (error); 2850} 2851 2852static int 2853spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2854{ 2855 int error; 2856 2857 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2858 2859 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2860 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2861 VDEV_LABEL_SPARE)) != 0) { 2862 return (error); 2863 } 2864 2865 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2866 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2867 VDEV_LABEL_L2CACHE)); 2868} 2869 2870static void 2871spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2872 const char *config) 2873{ 2874 int i; 2875 2876 if (sav->sav_config != NULL) { 2877 nvlist_t **olddevs; 2878 uint_t oldndevs; 2879 nvlist_t **newdevs; 2880 2881 /* 2882 * Generate new dev list by concatenating with the 2883 * current dev list. 2884 */ 2885 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2886 &olddevs, &oldndevs) == 0); 2887 2888 newdevs = kmem_alloc(sizeof (void *) * 2889 (ndevs + oldndevs), KM_SLEEP); 2890 for (i = 0; i < oldndevs; i++) 2891 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2892 KM_SLEEP) == 0); 2893 for (i = 0; i < ndevs; i++) 2894 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2895 KM_SLEEP) == 0); 2896 2897 VERIFY(nvlist_remove(sav->sav_config, config, 2898 DATA_TYPE_NVLIST_ARRAY) == 0); 2899 2900 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2901 config, newdevs, ndevs + oldndevs) == 0); 2902 for (i = 0; i < oldndevs + ndevs; i++) 2903 nvlist_free(newdevs[i]); 2904 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2905 } else { 2906 /* 2907 * Generate a new dev list. 2908 */ 2909 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2910 KM_SLEEP) == 0); 2911 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2912 devs, ndevs) == 0); 2913 } 2914} 2915 2916/* 2917 * Stop and drop level 2 ARC devices 2918 */ 2919void 2920spa_l2cache_drop(spa_t *spa) 2921{ 2922 vdev_t *vd; 2923 int i; 2924 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2925 2926 for (i = 0; i < sav->sav_count; i++) { 2927 uint64_t pool; 2928 2929 vd = sav->sav_vdevs[i]; 2930 ASSERT(vd != NULL); 2931 2932 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2933 pool != 0ULL && l2arc_vdev_present(vd)) 2934 l2arc_remove_vdev(vd);
2935 } 2936} 2937 2938/* 2939 * Pool Creation 2940 */ 2941int 2942spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2943 const char *history_str, nvlist_t *zplprops) 2944{ 2945 spa_t *spa; 2946 char *altroot = NULL; 2947 vdev_t *rvd; 2948 dsl_pool_t *dp; 2949 dmu_tx_t *tx; 2950 int error = 0; 2951 uint64_t txg = TXG_INITIAL; 2952 nvlist_t **spares, **l2cache; 2953 uint_t nspares, nl2cache; 2954 uint64_t version, obj; 2955 2956 /* 2957 * If this pool already exists, return failure. 2958 */ 2959 mutex_enter(&spa_namespace_lock); 2960 if (spa_lookup(pool) != NULL) { 2961 mutex_exit(&spa_namespace_lock); 2962 return (EEXIST); 2963 } 2964 2965 /* 2966 * Allocate a new spa_t structure. 2967 */ 2968 (void) nvlist_lookup_string(props, 2969 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2970 spa = spa_add(pool, NULL, altroot); 2971 spa_activate(spa, spa_mode_global); 2972 2973 if (props && (error = spa_prop_validate(spa, props))) { 2974 spa_deactivate(spa); 2975 spa_remove(spa); 2976 mutex_exit(&spa_namespace_lock); 2977 return (error); 2978 } 2979 2980 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2981 &version) != 0) 2982 version = SPA_VERSION; 2983 ASSERT(version <= SPA_VERSION); 2984 2985 spa->spa_first_txg = txg; 2986 spa->spa_uberblock.ub_txg = txg - 1; 2987 spa->spa_uberblock.ub_version = version; 2988 spa->spa_ubsync = spa->spa_uberblock; 2989 2990 /* 2991 * Create "The Godfather" zio to hold all async IOs 2992 */ 2993 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2994 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2995 2996 /* 2997 * Create the root vdev. 2998 */ 2999 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3000 3001 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3002 3003 ASSERT(error != 0 || rvd != NULL); 3004 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3005 3006 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3007 error = EINVAL; 3008 3009 if (error == 0 && 3010 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3011 (error = spa_validate_aux(spa, nvroot, txg, 3012 VDEV_ALLOC_ADD)) == 0) { 3013 for (int c = 0; c < rvd->vdev_children; c++) { 3014 vdev_metaslab_set_size(rvd->vdev_child[c]); 3015 vdev_expand(rvd->vdev_child[c], txg); 3016 } 3017 } 3018 3019 spa_config_exit(spa, SCL_ALL, FTAG); 3020 3021 if (error != 0) { 3022 spa_unload(spa); 3023 spa_deactivate(spa); 3024 spa_remove(spa); 3025 mutex_exit(&spa_namespace_lock); 3026 return (error); 3027 } 3028 3029 /* 3030 * Get the list of spares, if specified. 3031 */ 3032 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3033 &spares, &nspares) == 0) { 3034 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3035 KM_SLEEP) == 0); 3036 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3037 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3038 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3039 spa_load_spares(spa); 3040 spa_config_exit(spa, SCL_ALL, FTAG); 3041 spa->spa_spares.sav_sync = B_TRUE; 3042 } 3043 3044 /* 3045 * Get the list of level 2 cache devices, if specified. 
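 *
 * As an editorial sketch of the nvroot shape assumed by this lookup and
 * the spares lookup above (reconstructed from the ZPOOL_CONFIG_* keys
 * used here, not copied from the original comments):
 *
 *	nvroot
 *	    ZPOOL_CONFIG_SPARES  -> array of leaf vdev nvlists
 *	    ZPOOL_CONFIG_L2CACHE -> array of leaf vdev nvlists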
3046 */ 3047 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3048 &l2cache, &nl2cache) == 0) { 3049 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3050 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3051 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3052 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3053 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3054 spa_load_l2cache(spa); 3055 spa_config_exit(spa, SCL_ALL, FTAG); 3056 spa->spa_l2cache.sav_sync = B_TRUE; 3057 } 3058 3059 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3060 spa->spa_meta_objset = dp->dp_meta_objset; 3061 3062 /* 3063 * Create DDTs (dedup tables). 3064 */ 3065 ddt_create(spa); 3066 3067 spa_update_dspace(spa); 3068 3069 tx = dmu_tx_create_assigned(dp, txg); 3070 3071 /* 3072 * Create the pool config object. 3073 */ 3074 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3075 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3076 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3077 3078 if (zap_add(spa->spa_meta_objset, 3079 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3080 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3081 cmn_err(CE_PANIC, "failed to add pool config"); 3082 } 3083 3084 if (zap_add(spa->spa_meta_objset, 3085 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3086 sizeof (uint64_t), 1, &version, tx) != 0) { 3087 cmn_err(CE_PANIC, "failed to add pool version"); 3088 } 3089 3090 /* Newly created pools with the right version are always deflated. */ 3091 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3092 spa->spa_deflate = TRUE; 3093 if (zap_add(spa->spa_meta_objset, 3094 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3095 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3096 cmn_err(CE_PANIC, "failed to add deflate"); 3097 } 3098 } 3099 3100 /* 3101 * Create the deferred-free bpobj. Turn off compression 3102 * because sync-to-convergence takes longer if the blocksize 3103 * keeps changing. 3104 */ 3105 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3106 dmu_object_set_compress(spa->spa_meta_objset, obj, 3107 ZIO_COMPRESS_OFF, tx); 3108 if (zap_add(spa->spa_meta_objset, 3109 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3110 sizeof (uint64_t), 1, &obj, tx) != 0) { 3111 cmn_err(CE_PANIC, "failed to add bpobj"); 3112 } 3113 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3114 spa->spa_meta_objset, obj)); 3115 3116 /* 3117 * Create the pool's history object. 3118 */ 3119 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3120 spa_history_create_obj(spa, tx); 3121 3122 /* 3123 * Set pool properties. 3124 */ 3125 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3126 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3127 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3128 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3129 3130 if (props != NULL) { 3131 spa_configfile_set(spa, props, B_FALSE); 3132 spa_sync_props(spa, props, tx); 3133 } 3134 3135 dmu_tx_commit(tx); 3136 3137 spa->spa_sync_on = B_TRUE; 3138 txg_sync_start(spa->spa_dsl_pool); 3139 3140 /* 3141 * We explicitly wait for the first transaction to complete so that our 3142 * bean counters are appropriately updated. 
3143 */ 3144 txg_wait_synced(spa->spa_dsl_pool, txg); 3145 3146 spa_config_sync(spa, B_FALSE, B_TRUE); 3147 3148 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3149 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3150 spa_history_log_version(spa, LOG_POOL_CREATE); 3151 3152 spa->spa_minref = refcount_count(&spa->spa_refcount); 3153 3154 mutex_exit(&spa_namespace_lock); 3155 3156 return (0); 3157} 3158 3159#if defined(sun) 3160#ifdef _KERNEL 3161/* 3162 * Get the root pool information from the root disk, then import the root pool 3163 * during system boot. 3164 */ 3165extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3166 3167static nvlist_t * 3168spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3169{ 3170 nvlist_t *config; 3171 nvlist_t *nvtop, *nvroot; 3172 uint64_t pgid; 3173 3174 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3175 return (NULL); 3176 3177 /* 3178 * Add this top-level vdev to the child array. 3179 */ 3180 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3181 &nvtop) == 0); 3182 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3183 &pgid) == 0); 3184 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3185 3186 /* 3187 * Put this pool's top-level vdevs into a root vdev. 3188 */ 3189 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3190 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3191 VDEV_TYPE_ROOT) == 0); 3192 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3193 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3194 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3195 &nvtop, 1) == 0); 3196 3197 /* 3198 * Replace the existing vdev_tree with the new root vdev in 3199 * this pool's configuration (remove the old, add the new). 3200 */ 3201 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3202 nvlist_free(nvroot); 3203 return (config); 3204} 3205 3206/* 3207 * Walk the vdev tree and see if we can find a device with "better" 3208 * configuration. A configuration is "better" if the label on that 3209 * device has a more recent txg. 3210 */ 3211static void 3212spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3213{ 3214 for (int c = 0; c < vd->vdev_children; c++) 3215 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3216 3217 if (vd->vdev_ops->vdev_op_leaf) { 3218 nvlist_t *label; 3219 uint64_t label_txg; 3220 3221 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3222 &label) != 0) 3223 return; 3224 3225 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3226 &label_txg) == 0); 3227 3228 /* 3229 * Do we have a better boot device? 3230 */ 3231 if (label_txg > *txg) { 3232 *txg = label_txg; 3233 *avd = vd; 3234 } 3235 nvlist_free(label); 3236 } 3237} 3238 3239/* 3240 * Import a root pool. 3241 * 3242 * For x86, devpath_list will consist of the devid and/or physpath name of 3243 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3244 * The GRUB "findroot" command will return the vdev we should boot. 3245 * 3246 * For SPARC, devpath_list consists of the physpath name of the booting device, 3247 * no matter whether the root pool is a single-device pool or a mirrored pool, 3248 * e.g.
3249 * "/pci@1f,0/ide@d/disk@0,0:a" 3250 */ 3251int 3252spa_import_rootpool(char *devpath, char *devid) 3253{ 3254 spa_t *spa; 3255 vdev_t *rvd, *bvd, *avd = NULL; 3256 nvlist_t *config, *nvtop; 3257 uint64_t guid, txg; 3258 char *pname; 3259 int error; 3260 3261 /* 3262 * Read the label from the boot device and generate a configuration. 3263 */ 3264 config = spa_generate_rootconf(devpath, devid, &guid); 3265#if defined(_OBP) && defined(_KERNEL) 3266 if (config == NULL) { 3267 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3268 /* iscsi boot */ 3269 get_iscsi_bootpath_phy(devpath); 3270 config = spa_generate_rootconf(devpath, devid, &guid); 3271 } 3272 } 3273#endif 3274 if (config == NULL) { 3275 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 3276 devpath); 3277 return (EIO); 3278 } 3279 3280 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3281 &pname) == 0); 3282 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3283 3284 mutex_enter(&spa_namespace_lock); 3285 if ((spa = spa_lookup(pname)) != NULL) { 3286 /* 3287 * Remove the existing root pool from the namespace so that we 3288 * can replace it with the correct config we just read in. 3289 */ 3290 spa_remove(spa); 3291 } 3292 3293 spa = spa_add(pname, config, NULL); 3294 spa->spa_is_root = B_TRUE; 3295 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3296 3297 /* 3298 * Build up a vdev tree based on the boot device's label config. 3299 */ 3300 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3301 &nvtop) == 0); 3302 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3303 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3304 VDEV_ALLOC_ROOTPOOL); 3305 spa_config_exit(spa, SCL_ALL, FTAG); 3306 if (error) { 3307 mutex_exit(&spa_namespace_lock); 3308 nvlist_free(config); 3309 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3310 pname); 3311 return (error); 3312 } 3313 3314 /* 3315 * Get the boot vdev. 3316 */ 3317 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3318 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3319 (u_longlong_t)guid); 3320 error = ENOENT; 3321 goto out; 3322 } 3323 3324 /* 3325 * Determine if there is a better boot device. 3326 */ 3327 avd = bvd; 3328 spa_alt_rootvdev(rvd, &avd, &txg); 3329 if (avd != bvd) { 3330 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3331 "try booting from '%s'", avd->vdev_path); 3332 error = EINVAL; 3333 goto out; 3334 } 3335 3336 /* 3337 * If the boot device is part of a spare vdev then ensure that 3338 * we're booting off the active spare. 3339 */ 3340 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3341 !bvd->vdev_isspare) { 3342 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3343 "try booting from '%s'", 3344 bvd->vdev_parent-> 3345 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3346 error = EINVAL; 3347 goto out; 3348 } 3349 3350 error = 0; 3351 spa_history_log_version(spa, LOG_POOL_IMPORT); 3352out: 3353 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3354 vdev_free(rvd); 3355 spa_config_exit(spa, SCL_ALL, FTAG); 3356 mutex_exit(&spa_namespace_lock); 3357 3358 nvlist_free(config); 3359 return (error); 3360} 3361 3362#endif 3363#endif /* sun */ 3364 3365/* 3366 * Import a non-root pool into the system. 
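 *
 * A minimal call sketch, assuming a config nvlist already assembled by
 * userland import discovery (the literal pool name and zero flags value
 * are illustrative only):
 *
 *	error = spa_import("tank", config, NULL, 0);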
3367 */ 3368int 3369spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3370{ 3371 spa_t *spa; 3372 char *altroot = NULL; 3373 spa_load_state_t state = SPA_LOAD_IMPORT; 3374 zpool_rewind_policy_t policy; 3375 uint64_t mode = spa_mode_global; 3376 uint64_t readonly = B_FALSE; 3377 int error; 3378 nvlist_t *nvroot; 3379 nvlist_t **spares, **l2cache; 3380 uint_t nspares, nl2cache; 3381 3382 /* 3383 * If a pool with this name exists, return failure. 3384 */ 3385 mutex_enter(&spa_namespace_lock); 3386 if (spa_lookup(pool) != NULL) { 3387 mutex_exit(&spa_namespace_lock); 3388 return (EEXIST); 3389 } 3390 3391 /* 3392 * Create and initialize the spa structure. 3393 */ 3394 (void) nvlist_lookup_string(props, 3395 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3396 (void) nvlist_lookup_uint64(props, 3397 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3398 if (readonly) 3399 mode = FREAD; 3400 spa = spa_add(pool, config, altroot); 3401 spa->spa_import_flags = flags; 3402 3403 /* 3404 * Verbatim import - Take a pool and insert it into the namespace 3405 * as if it had been loaded at boot. 3406 */ 3407 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3408 if (props != NULL) 3409 spa_configfile_set(spa, props, B_FALSE); 3410 3411 spa_config_sync(spa, B_FALSE, B_TRUE); 3412 3413 mutex_exit(&spa_namespace_lock); 3414 spa_history_log_version(spa, LOG_POOL_IMPORT); 3415 3416 return (0); 3417 } 3418 3419 spa_activate(spa, mode); 3420 3421 /* 3422 * Don't start async tasks until we know everything is healthy. 3423 */ 3424 spa_async_suspend(spa); 3425 3426 zpool_get_rewind_policy(config, &policy); 3427 if (policy.zrp_request & ZPOOL_DO_REWIND) 3428 state = SPA_LOAD_RECOVER; 3429 3430 /* 3431 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3432 * because the user-supplied config is actually the one to trust when 3433 * doing an import. 3434 */ 3435 if (state != SPA_LOAD_RECOVER) 3436 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3437 3438 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3439 policy.zrp_request); 3440 3441 /* 3442 * Propagate anything learned while loading the pool and pass it 3443 * back to caller (i.e. rewind info, missing devices, etc). 3444 */ 3445 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3446 spa->spa_load_info) == 0); 3447 3448 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3449 /* 3450 * Toss any existing sparelist, as it doesn't have any validity 3451 * anymore, and conflicts with spa_has_spare(). 
3452 */ 3453 if (spa->spa_spares.sav_config) { 3454 nvlist_free(spa->spa_spares.sav_config); 3455 spa->spa_spares.sav_config = NULL; 3456 spa_load_spares(spa); 3457 } 3458 if (spa->spa_l2cache.sav_config) { 3459 nvlist_free(spa->spa_l2cache.sav_config); 3460 spa->spa_l2cache.sav_config = NULL; 3461 spa_load_l2cache(spa); 3462 } 3463 3464 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3465 &nvroot) == 0); 3466 if (error == 0) 3467 error = spa_validate_aux(spa, nvroot, -1ULL, 3468 VDEV_ALLOC_SPARE); 3469 if (error == 0) 3470 error = spa_validate_aux(spa, nvroot, -1ULL, 3471 VDEV_ALLOC_L2CACHE); 3472 spa_config_exit(spa, SCL_ALL, FTAG); 3473 3474 if (props != NULL) 3475 spa_configfile_set(spa, props, B_FALSE); 3476 3477 if (error != 0 || (props && spa_writeable(spa) && 3478 (error = spa_prop_set(spa, props)))) { 3479 spa_unload(spa); 3480 spa_deactivate(spa); 3481 spa_remove(spa); 3482 mutex_exit(&spa_namespace_lock); 3483 return (error); 3484 } 3485 3486 spa_async_resume(spa); 3487 3488 /* 3489 * Override any spares and level 2 cache devices as specified by 3490 * the user, as these may have correct device names/devids, etc. 3491 */ 3492 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3493 &spares, &nspares) == 0) { 3494 if (spa->spa_spares.sav_config) 3495 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3496 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3497 else 3498 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3499 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3500 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3501 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3502 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3503 spa_load_spares(spa); 3504 spa_config_exit(spa, SCL_ALL, FTAG); 3505 spa->spa_spares.sav_sync = B_TRUE; 3506 } 3507 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3508 &l2cache, &nl2cache) == 0) { 3509 if (spa->spa_l2cache.sav_config) 3510 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3511 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3512 else 3513 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3514 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3515 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3516 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3517 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3518 spa_load_l2cache(spa); 3519 spa_config_exit(spa, SCL_ALL, FTAG); 3520 spa->spa_l2cache.sav_sync = B_TRUE; 3521 } 3522 3523 /* 3524 * Check for any removed devices. 3525 */ 3526 if (spa->spa_autoreplace) { 3527 spa_aux_check_removed(&spa->spa_spares); 3528 spa_aux_check_removed(&spa->spa_l2cache); 3529 } 3530 3531 if (spa_writeable(spa)) { 3532 /* 3533 * Update the config cache to include the newly-imported pool. 3534 */ 3535 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3536 } 3537 3538 /* 3539 * It's possible that the pool was expanded while it was exported. 3540 * We kick off an async task to handle this for us. 
3541 */ 3542 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3543 3544 mutex_exit(&spa_namespace_lock); 3545 spa_history_log_version(spa, LOG_POOL_IMPORT); 3546 3547#ifdef __FreeBSD__ 3548#ifdef _KERNEL 3549 zvol_create_minors(pool); 3550#endif 3551#endif 3552 return (0); 3553} 3554 3555nvlist_t * 3556spa_tryimport(nvlist_t *tryconfig) 3557{ 3558 nvlist_t *config = NULL; 3559 char *poolname; 3560 spa_t *spa; 3561 uint64_t state; 3562 int error; 3563 3564 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3565 return (NULL); 3566 3567 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3568 return (NULL); 3569 3570 /* 3571 * Create and initialize the spa structure. 3572 */ 3573 mutex_enter(&spa_namespace_lock); 3574 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3575 spa_activate(spa, FREAD); 3576 3577 /* 3578 * Pass off the heavy lifting to spa_load(). 3579 * Pass TRUE for mosconfig because the user-supplied config 3580 * is actually the one to trust when doing an import. 3581 */ 3582 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3583 3584 /* 3585 * If 'tryconfig' was at least parsable, return the current config. 3586 */ 3587 if (spa->spa_root_vdev != NULL) { 3588 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3589 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3590 poolname) == 0); 3591 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3592 state) == 0); 3593 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3594 spa->spa_uberblock.ub_timestamp) == 0); 3595 3596 /* 3597 * If the bootfs property exists on this pool then we 3598 * copy it out so that external consumers can tell which 3599 * pools are bootable. 3600 */ 3601 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3602 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3603 3604 /* 3605 * We have to play games with the name since the 3606 * pool was opened as TRYIMPORT_NAME. 3607 */ 3608 if (dsl_dsobj_to_dsname(spa_name(spa), 3609 spa->spa_bootfs, tmpname) == 0) { 3610 char *cp; 3611 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3612 3613 cp = strchr(tmpname, '/'); 3614 if (cp == NULL) { 3615 (void) strlcpy(dsname, tmpname, 3616 MAXPATHLEN); 3617 } else { 3618 (void) snprintf(dsname, MAXPATHLEN, 3619 "%s/%s", poolname, ++cp); 3620 } 3621 VERIFY(nvlist_add_string(config, 3622 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3623 kmem_free(dsname, MAXPATHLEN); 3624 } 3625 kmem_free(tmpname, MAXPATHLEN); 3626 } 3627 3628 /* 3629 * Add the list of hot spares and level 2 cache devices. 3630 */ 3631 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3632 spa_add_spares(spa, config); 3633 spa_add_l2cache(spa, config); 3634 spa_config_exit(spa, SCL_CONFIG, FTAG); 3635 } 3636 3637 spa_unload(spa); 3638 spa_deactivate(spa); 3639 spa_remove(spa); 3640 mutex_exit(&spa_namespace_lock); 3641 3642 return (config); 3643} 3644 3645/* 3646 * Pool export/destroy 3647 * 3648 * The act of destroying or exporting a pool is very simple. We make sure there 3649 * is no more pending I/O and any references to the pool are gone. Then, we 3650 * update the pool state and sync all the labels to disk, removing the 3651 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3652 * we don't sync the labels or remove the configuration cache. 
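 *
 * For reference, the wrappers that follow in this file map onto
 * spa_export_common() as:
 *	spa_destroy()	-> new_state == POOL_STATE_DESTROYED
 *	spa_export()	-> new_state == POOL_STATE_EXPORTED
 *	spa_reset()	-> new_state == POOL_STATE_UNINITIALIZED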
3653 */ 3654static int 3655spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3656 boolean_t force, boolean_t hardforce) 3657{ 3658 spa_t *spa; 3659 3660 if (oldconfig) 3661 *oldconfig = NULL; 3662 3663 if (!(spa_mode_global & FWRITE)) 3664 return (EROFS); 3665 3666 mutex_enter(&spa_namespace_lock); 3667 if ((spa = spa_lookup(pool)) == NULL) { 3668 mutex_exit(&spa_namespace_lock); 3669 return (ENOENT); 3670 } 3671 3672 /* 3673 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3674 * reacquire the namespace lock, and see if we can export. 3675 */ 3676 spa_open_ref(spa, FTAG); 3677 mutex_exit(&spa_namespace_lock); 3678 spa_async_suspend(spa); 3679 mutex_enter(&spa_namespace_lock); 3680 spa_close(spa, FTAG); 3681 3682 /* 3683 * The pool will be in core if it's openable, 3684 * in which case we can modify its state. 3685 */ 3686 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3687 /* 3688 * Objsets may be open only because they're dirty, so we 3689 * have to force it to sync before checking spa_refcnt. 3690 */ 3691 txg_wait_synced(spa->spa_dsl_pool, 0); 3692 3693 /* 3694 * A pool cannot be exported or destroyed if there are active 3695 * references. If we are resetting a pool, allow references by 3696 * fault injection handlers. 3697 */ 3698 if (!spa_refcount_zero(spa) || 3699 (spa->spa_inject_ref != 0 && 3700 new_state != POOL_STATE_UNINITIALIZED)) { 3701 spa_async_resume(spa); 3702 mutex_exit(&spa_namespace_lock); 3703 return (EBUSY); 3704 } 3705 3706 /* 3707 * A pool cannot be exported if it has an active shared spare. 3708 * This is to prevent other pools stealing the active spare 3709 * from an exported pool. At the user's explicit request, such a pool can 3710 * be forcibly exported. 3711 */ 3712 if (!force && new_state == POOL_STATE_EXPORTED && 3713 spa_has_active_shared_spare(spa)) { 3714 spa_async_resume(spa); 3715 mutex_exit(&spa_namespace_lock); 3716 return (EXDEV); 3717 } 3718 3719 /* 3720 * We want this to be reflected on every label, 3721 * so mark them all dirty. spa_unload() will do the 3722 * final sync that pushes these changes out. 3723 */ 3724 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3725 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3726 spa->spa_state = new_state; 3727 spa->spa_final_txg = spa_last_synced_txg(spa) + 3728 TXG_DEFER_SIZE + 1; 3729 vdev_config_dirty(spa->spa_root_vdev); 3730 spa_config_exit(spa, SCL_ALL, FTAG); 3731 } 3732 } 3733 3734 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3735 3736 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3737 spa_unload(spa); 3738 spa_deactivate(spa); 3739 } 3740 3741 if (oldconfig && spa->spa_config) 3742 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3743 3744 if (new_state != POOL_STATE_UNINITIALIZED) { 3745 if (!hardforce) 3746 spa_config_sync(spa, B_TRUE, B_TRUE); 3747 spa_remove(spa); 3748 } 3749 mutex_exit(&spa_namespace_lock); 3750 3751 return (0); 3752} 3753 3754/* 3755 * Destroy a storage pool. 3756 */ 3757int 3758spa_destroy(char *pool) 3759{ 3760 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3761 B_FALSE, B_FALSE)); 3762} 3763 3764/* 3765 * Export a storage pool. 3766 */ 3767int 3768spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3769 boolean_t hardforce) 3770{ 3771 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3772 force, hardforce)); 3773} 3774 3775/* 3776 * Similar to spa_export(), this unloads the spa_t without actually removing it 3777 * from the namespace in any way.
3778 */ 3779int 3780spa_reset(char *pool) 3781{ 3782 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3783 B_FALSE, B_FALSE)); 3784} 3785 3786/* 3787 * ========================================================================== 3788 * Device manipulation 3789 * ========================================================================== 3790 */ 3791 3792/* 3793 * Add a device to a storage pool. 3794 */ 3795int 3796spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3797{ 3798 uint64_t txg, id; 3799 int error; 3800 vdev_t *rvd = spa->spa_root_vdev; 3801 vdev_t *vd, *tvd; 3802 nvlist_t **spares, **l2cache; 3803 uint_t nspares, nl2cache; 3804 3805 ASSERT(spa_writeable(spa)); 3806 3807 txg = spa_vdev_enter(spa); 3808 3809 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3810 VDEV_ALLOC_ADD)) != 0) 3811 return (spa_vdev_exit(spa, NULL, txg, error)); 3812 3813 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3814 3815 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3816 &nspares) != 0) 3817 nspares = 0; 3818 3819 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3820 &nl2cache) != 0) 3821 nl2cache = 0; 3822 3823 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3824 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3825 3826 if (vd->vdev_children != 0 && 3827 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3828 return (spa_vdev_exit(spa, vd, txg, error)); 3829 3830 /* 3831 * We must validate the spares and l2cache devices after checking the 3832 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3833 */ 3834 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3835 return (spa_vdev_exit(spa, vd, txg, error)); 3836 3837 /* 3838 * Transfer each new top-level vdev from vd to rvd. 3839 */ 3840 for (int c = 0; c < vd->vdev_children; c++) { 3841 3842 /* 3843 * Set the vdev id to the first hole, if one exists. 3844 */ 3845 for (id = 0; id < rvd->vdev_children; id++) { 3846 if (rvd->vdev_child[id]->vdev_ishole) { 3847 vdev_free(rvd->vdev_child[id]); 3848 break; 3849 } 3850 } 3851 tvd = vd->vdev_child[c]; 3852 vdev_remove_child(vd, tvd); 3853 tvd->vdev_id = id; 3854 vdev_add_child(rvd, tvd); 3855 vdev_config_dirty(tvd); 3856 } 3857 3858 if (nspares != 0) { 3859 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3860 ZPOOL_CONFIG_SPARES); 3861 spa_load_spares(spa); 3862 spa->spa_spares.sav_sync = B_TRUE; 3863 } 3864 3865 if (nl2cache != 0) { 3866 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3867 ZPOOL_CONFIG_L2CACHE); 3868 spa_load_l2cache(spa); 3869 spa->spa_l2cache.sav_sync = B_TRUE; 3870 } 3871 3872 /* 3873 * We have to be careful when adding new vdevs to an existing pool. 3874 * If other threads start allocating from these vdevs before we 3875 * sync the config cache, and we lose power, then upon reboot we may 3876 * fail to open the pool because there are DVAs that the config cache 3877 * can't translate. Therefore, we first add the vdevs without 3878 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3879 * and then let spa_config_update() initialize the new metaslabs. 3880 * 3881 * spa_load() checks for added-but-not-initialized vdevs, so that 3882 * if we lose power at any point in this sequence, the remaining 3883 * steps will be completed the next time we load the pool. 
3884 */ 3885 (void) spa_vdev_exit(spa, vd, txg, 0); 3886 3887 mutex_enter(&spa_namespace_lock); 3888 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3889 mutex_exit(&spa_namespace_lock); 3890 3891 return (0); 3892} 3893 3894/* 3895 * Attach a device to a mirror. The arguments are the path to any device 3896 * in the mirror, and the nvroot for the new device. If the path specifies 3897 * a device that is not mirrored, we automatically insert the mirror vdev. 3898 * 3899 * If 'replacing' is specified, the new device is intended to replace the 3900 * existing device; in this case the two devices are made into their own 3901 * mirror using the 'replacing' vdev, which is functionally identical to 3902 * the mirror vdev (it actually reuses all the same ops) but has a few 3903 * extra rules: you can't attach to it after it's been created, and upon 3904 * completion of resilvering, the first disk (the one being replaced) 3905 * is automatically detached. 3906 */ 3907int 3908spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3909{ 3910 uint64_t txg, dtl_max_txg; 3911 vdev_t *rvd = spa->spa_root_vdev; 3912 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3913 vdev_ops_t *pvops; 3914 char *oldvdpath, *newvdpath; 3915 int newvd_isspare; 3916 int error; 3917 3918 ASSERT(spa_writeable(spa)); 3919 3920 txg = spa_vdev_enter(spa); 3921 3922 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3923 3924 if (oldvd == NULL) 3925 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3926 3927 if (!oldvd->vdev_ops->vdev_op_leaf) 3928 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3929 3930 pvd = oldvd->vdev_parent; 3931 3932 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3933 VDEV_ALLOC_ATTACH)) != 0)
3934 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3935 3936 if (newrootvd->vdev_children != 1) 3937 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3938 3939 newvd = newrootvd->vdev_child[0]; 3940 3941 if (!newvd->vdev_ops->vdev_op_leaf) 3942 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3943 3944 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3945 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3946 3947 /* 3948 * Spares can't replace logs 3949 */ 3950 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3951 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3952 3953 if (!replacing) { 3954 /* 3955 * For attach, the only allowable parent is a mirror or the root 3956 * vdev. 3957 */ 3958 if (pvd->vdev_ops != &vdev_mirror_ops && 3959 pvd->vdev_ops != &vdev_root_ops) 3960 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3961 3962 pvops = &vdev_mirror_ops; 3963 } else { 3964 /* 3965 * Active hot spares can only be replaced by inactive hot 3966 * spares. 3967 */ 3968 if (pvd->vdev_ops == &vdev_spare_ops && 3969 oldvd->vdev_isspare && 3970 !spa_has_spare(spa, newvd->vdev_guid)) 3971 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3972 3973 /* 3974 * If the source is a hot spare, and the parent isn't already a 3975 * spare, then we want to create a new hot spare. Otherwise, we 3976 * want to create a replacing vdev. The user is not allowed to 3977 * attach to a spared vdev child unless the 'isspare' state is 3978 * the same (spare replaces spare, non-spare replaces 3979 * non-spare). 3980 */ 3981 if (pvd->vdev_ops == &vdev_replacing_ops && 3982 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3983 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3984 } else if (pvd->vdev_ops == &vdev_spare_ops && 3985 newvd->vdev_isspare != oldvd->vdev_isspare) { 3986 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3987 } 3988 3989 if (newvd->vdev_isspare) 3990 pvops = &vdev_spare_ops; 3991 else 3992 pvops = &vdev_replacing_ops; 3993 } 3994 3995 /* 3996 * Make sure the new device is big enough. 3997 */ 3998 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3999 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4000 4001 /* 4002 * The new device cannot have a higher alignment requirement 4003 * than the top-level vdev. 4004 */ 4005 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4006 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4007 4008 /* 4009 * If this is an in-place replacement, update oldvd's path and devid 4010 * to make it distinguishable from newvd, and unopenable from now on. 4011 */ 4012 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4013 spa_strfree(oldvd->vdev_path); 4014 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4015 KM_SLEEP); 4016 (void) sprintf(oldvd->vdev_path, "%s/%s", 4017 newvd->vdev_path, "old"); 4018 if (oldvd->vdev_devid != NULL) { 4019 spa_strfree(oldvd->vdev_devid); 4020 oldvd->vdev_devid = NULL; 4021 } 4022 } 4023 4024 /* mark the device being resilvered */ 4025 newvd->vdev_resilvering = B_TRUE; 4026 4027 /* 4028 * If the parent is not a mirror, or if we're replacing, insert the new 4029 * mirror/replacing/spare vdev above oldvd. 4030 */ 4031 if (pvd->vdev_ops != pvops) 4032 pvd = vdev_add_parent(oldvd, pvops); 4033 4034 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4035 ASSERT(pvd->vdev_ops == pvops); 4036 ASSERT(oldvd->vdev_parent == pvd); 4037 4038 /* 4039 * Extract the new device from its root and add it to pvd. 
4040 */ 4041 vdev_remove_child(newrootvd, newvd); 4042 newvd->vdev_id = pvd->vdev_children; 4043 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4044 vdev_add_child(pvd, newvd); 4045 4046 tvd = newvd->vdev_top; 4047 ASSERT(pvd->vdev_top == tvd); 4048 ASSERT(tvd->vdev_parent == rvd); 4049 4050 vdev_config_dirty(tvd); 4051 4052 /* 4053 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4054 * for any dmu_sync-ed blocks. It will propagate upward when 4055 * spa_vdev_exit() calls vdev_dtl_reassess(). 4056 */ 4057 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4058 4059 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4060 dtl_max_txg - TXG_INITIAL); 4061 4062 if (newvd->vdev_isspare) { 4063 spa_spare_activate(newvd); 4064 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4065 } 4066 4067 oldvdpath = spa_strdup(oldvd->vdev_path); 4068 newvdpath = spa_strdup(newvd->vdev_path); 4069 newvd_isspare = newvd->vdev_isspare; 4070 4071 /* 4072 * Mark newvd's DTL dirty in this txg. 4073 */ 4074 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4075 4076 /* 4077 * Restart the resilver 4078 */ 4079 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4080 4081 /* 4082 * Commit the config 4083 */ 4084 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4085 4086 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4087 "%s vdev=%s %s vdev=%s", 4088 replacing && newvd_isspare ? "spare in" : 4089 replacing ? "replace" : "attach", newvdpath, 4090 replacing ? "for" : "to", oldvdpath); 4091 4092 spa_strfree(oldvdpath); 4093 spa_strfree(newvdpath); 4094 4095 if (spa->spa_bootfs) 4096 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4097 4098 return (0); 4099} 4100 4101/* 4102 * Detach a device from a mirror or replacing vdev. 4103 * If 'replace_done' is specified, only detach if the parent 4104 * is a replacing vdev. 4105 */ 4106int 4107spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4108{ 4109 uint64_t txg; 4110 int error; 4111 vdev_t *rvd = spa->spa_root_vdev; 4112 vdev_t *vd, *pvd, *cvd, *tvd; 4113 boolean_t unspare = B_FALSE; 4114 uint64_t unspare_guid; 4115 char *vdpath; 4116 4117 ASSERT(spa_writeable(spa)); 4118 4119 txg = spa_vdev_enter(spa); 4120 4121 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4122 4123 if (vd == NULL) 4124 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4125 4126 if (!vd->vdev_ops->vdev_op_leaf) 4127 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4128 4129 pvd = vd->vdev_parent; 4130 4131 /* 4132 * If the parent/child relationship is not as expected, don't do it. 4133 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4134 * vdev that's replacing B with C. The user's intent in replacing 4135 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4136 * the replace by detaching C, the expected behavior is to end up 4137 * M(A,B). But suppose that right after deciding to detach C, 4138 * the replacement of B completes. We would have M(A,C), and then 4139 * ask to detach C, which would leave us with just A -- not what 4140 * the user wanted. To prevent this, we make sure that the 4141 * parent/child relationship hasn't changed -- in this example, 4142 * that C's parent is still the replacing vdev R. 4143 */ 4144 if (pvd->vdev_guid != pguid && pguid != 0) 4145 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4146 4147 /* 4148 * Only 'replacing' or 'spare' vdevs can be replaced. 
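 *
 * (Editorial aside, not in the original source.)  Relatedly, a caller
 * that wants the race-free cancellation described above passes the
 * parent guid it last observed; e.g., to cancel the replacement of B
 * with C in M(A,R(B,C)), with hypothetical guid variables:
 *
 *	(void) spa_vdev_detach(spa, C_guid, R_guid, B_FALSE);
 *
 * If R has already completed and vanished by the time the locks are
 * taken, C's parent guid no longer matches R_guid and the call fails
 * with EBUSY rather than detaching C from M(A,C).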
4149 */ 4150 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4151 pvd->vdev_ops != &vdev_spare_ops) 4152 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4153 4154 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4155 spa_version(spa) >= SPA_VERSION_SPARES); 4156 4157 /* 4158 * Only mirror, replacing, and spare vdevs support detach. 4159 */ 4160 if (pvd->vdev_ops != &vdev_replacing_ops && 4161 pvd->vdev_ops != &vdev_mirror_ops && 4162 pvd->vdev_ops != &vdev_spare_ops) 4163 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4164 4165 /* 4166 * If this device has the only valid copy of some data, 4167 * we cannot safely detach it. 4168 */ 4169 if (vdev_dtl_required(vd)) 4170 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4171 4172 ASSERT(pvd->vdev_children >= 2); 4173 4174 /* 4175 * If we are detaching the second disk from a replacing vdev, then 4176 * check to see if we changed the original vdev's path to have "/old" 4177 * at the end in spa_vdev_attach(). If so, undo that change now. 4178 */ 4179 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4180 vd->vdev_path != NULL) { 4181 size_t len = strlen(vd->vdev_path); 4182 4183 for (int c = 0; c < pvd->vdev_children; c++) { 4184 cvd = pvd->vdev_child[c]; 4185 4186 if (cvd == vd || cvd->vdev_path == NULL) 4187 continue; 4188 4189 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4190 strcmp(cvd->vdev_path + len, "/old") == 0) { 4191 spa_strfree(cvd->vdev_path); 4192 cvd->vdev_path = spa_strdup(vd->vdev_path); 4193 break; 4194 } 4195 } 4196 } 4197 4198 /* 4199 * If we are detaching the original disk from a spare, then it implies 4200 * that the spare should become a real disk, and be removed from the 4201 * active spare list for the pool. 4202 */ 4203 if (pvd->vdev_ops == &vdev_spare_ops && 4204 vd->vdev_id == 0 && 4205 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4206 unspare = B_TRUE; 4207 4208 /* 4209 * Erase the disk labels so the disk can be used for other things. 4210 * This must be done after all other error cases are handled, 4211 * but before we disembowel vd (so we can still do I/O to it). 4212 * But if we can't do it, don't treat the error as fatal -- 4213 * it may be that the unwritability of the disk is the reason 4214 * it's being detached! 4215 */ 4216 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4217 4218 /* 4219 * Remove vd from its parent and compact the parent's children. 4220 */ 4221 vdev_remove_child(pvd, vd); 4222 vdev_compact_children(pvd); 4223 4224 /* 4225 * Remember one of the remaining children so we can get tvd below. 4226 */ 4227 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4228 4229 /* 4230 * If we need to remove the remaining child from the list of hot spares, 4231 * do it now, marking the vdev as no longer a spare in the process. 4232 * We must do this before vdev_remove_parent(), because that can 4233 * change the GUID if it creates a new toplevel GUID. For a similar 4234 * reason, we must remove the spare now, in the same txg as the detach; 4235 * otherwise someone could attach a new sibling, change the GUID, and 4236 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4237 */ 4238 if (unspare) { 4239 ASSERT(cvd->vdev_isspare); 4240 spa_spare_remove(cvd); 4241 unspare_guid = cvd->vdev_guid; 4242 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4243 cvd->vdev_unspare = B_TRUE; 4244 } 4245 4246 /* 4247 * If the parent mirror/replacing vdev only has one child, 4248 * the parent is no longer needed. Remove it from the tree. 
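 *
 * (Editorial sketch, not in the original source.)  In the notation
 * above, detaching B from M(A,B) leaves the one-child mirror M(A),
 * which vdev_remove_parent() collapses back to the plain leaf:
 *
 *	M(A,B)  ==detach B==>  M(A)  ==remove parent==>  A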
4249 */ 4250 if (pvd->vdev_children == 1) { 4251 if (pvd->vdev_ops == &vdev_spare_ops) 4252 cvd->vdev_unspare = B_FALSE; 4253 vdev_remove_parent(cvd); 4254 cvd->vdev_resilvering = B_FALSE; 4255 } 4256 4257 4258 /* 4259 * We don't set tvd until now because the parent we just removed 4260 * may have been the previous top-level vdev. 4261 */ 4262 tvd = cvd->vdev_top; 4263 ASSERT(tvd->vdev_parent == rvd); 4264 4265 /* 4266 * Reevaluate the parent vdev state. 4267 */ 4268 vdev_propagate_state(cvd); 4269 4270 /* 4271 * If the 'autoexpand' property is set on the pool then automatically 4272 * try to expand the size of the pool. For example if the device we 4273 * just detached was smaller than the others, it may be possible to 4274 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4275 * first so that we can obtain the updated sizes of the leaf vdevs. 4276 */ 4277 if (spa->spa_autoexpand) { 4278 vdev_reopen(tvd); 4279 vdev_expand(tvd, txg); 4280 } 4281 4282 vdev_config_dirty(tvd); 4283 4284 /* 4285 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4286 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4287 * But first make sure we're not on any *other* txg's DTL list, to 4288 * prevent vd from being accessed after it's freed. 4289 */ 4290 vdpath = spa_strdup(vd->vdev_path); 4291 for (int t = 0; t < TXG_SIZE; t++) 4292 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4293 vd->vdev_detached = B_TRUE; 4294 vdev_dirty(tvd, VDD_DTL, vd, txg); 4295 4296 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4297 4298 /* hang on to the spa before we release the lock */ 4299 spa_open_ref(spa, FTAG); 4300 4301 error = spa_vdev_exit(spa, vd, txg, 0); 4302 4303 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4304 "vdev=%s", vdpath); 4305 spa_strfree(vdpath); 4306 4307 /* 4308 * If this was the removal of the original device in a hot spare vdev, 4309 * then we want to go through and remove the device from the hot spare 4310 * list of every other pool. 4311 */ 4312 if (unspare) { 4313 spa_t *altspa = NULL; 4314 4315 mutex_enter(&spa_namespace_lock); 4316 while ((altspa = spa_next(altspa)) != NULL) { 4317 if (altspa->spa_state != POOL_STATE_ACTIVE || 4318 altspa == spa) 4319 continue; 4320 4321 spa_open_ref(altspa, FTAG); 4322 mutex_exit(&spa_namespace_lock); 4323 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4324 mutex_enter(&spa_namespace_lock); 4325 spa_close(altspa, FTAG); 4326 } 4327 mutex_exit(&spa_namespace_lock); 4328 4329 /* search the rest of the vdevs for spares to remove */ 4330 spa_vdev_resilver_done(spa); 4331 } 4332 4333 /* all done with the spa; OK to release */ 4334 mutex_enter(&spa_namespace_lock); 4335 spa_close(spa, FTAG); 4336 mutex_exit(&spa_namespace_lock); 4337 4338 return (error); 4339} 4340 4341/* 4342 * Split a set of devices from their mirrors, and create a new pool from them. 
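 *
 * (Editorial sketch, not in the original source.)  The caller's config
 * names exactly one leaf per top-level mirror; e.g., splitting the
 * second side of a two-mirror pool:
 *
 *	children = [ B, D ]
 *	M(A,B) M(C,D)  ==split==>  old pool: A C    new pool: B D
 *
 * Log and hole top-level vdevs are exempt from the child-count check
 * below.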
4343 */ 4344int 4345spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4346 nvlist_t *props, boolean_t exp) 4347{ 4348 int error = 0; 4349 uint64_t txg, *glist; 4350 spa_t *newspa; 4351 uint_t c, children, lastlog; 4352 nvlist_t **child, *nvl, *tmp; 4353 dmu_tx_t *tx; 4354 char *altroot = NULL; 4355 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4356 boolean_t activate_slog; 4357 4358 ASSERT(spa_writeable(spa)); 4359 4360 txg = spa_vdev_enter(spa); 4361 4362 /* clear the log and flush everything up to now */ 4363 activate_slog = spa_passivate_log(spa); 4364 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4365 error = spa_offline_log(spa); 4366 txg = spa_vdev_config_enter(spa); 4367 4368 if (activate_slog) 4369 spa_activate_log(spa); 4370 4371 if (error != 0) 4372 return (spa_vdev_exit(spa, NULL, txg, error)); 4373 4374 /* check new spa name before going any further */ 4375 if (spa_lookup(newname) != NULL) 4376 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4377 4378 /* 4379 * scan through all the children to ensure they're all mirrors 4380 */ 4381 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4382 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4383 &children) != 0) 4384 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4385 4386 /* first, check to ensure we've got the right child count */ 4387 rvd = spa->spa_root_vdev; 4388 lastlog = 0; 4389 for (c = 0; c < rvd->vdev_children; c++) { 4390 vdev_t *vd = rvd->vdev_child[c]; 4391 4392 /* don't count the holes & logs as children */ 4393 if (vd->vdev_islog || vd->vdev_ishole) { 4394 if (lastlog == 0) 4395 lastlog = c; 4396 continue; 4397 } 4398 4399 lastlog = 0; 4400 } 4401 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4402 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4403 4404 /* next, ensure no spare or cache devices are part of the split */ 4405 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4406 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4407 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4408 4409 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4410 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4411 4412 /* then, loop over each vdev and validate it */ 4413 for (c = 0; c < children; c++) { 4414 uint64_t is_hole = 0; 4415 4416 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4417 &is_hole); 4418 4419 if (is_hole != 0) { 4420 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4421 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4422 continue; 4423 } else { 4424 error = EINVAL; 4425 break; 4426 } 4427 } 4428 4429 /* which disk is going to be split? 
*/ 4430 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4431 &glist[c]) != 0) { 4432 error = EINVAL; 4433 break; 4434 } 4435 4436 /* look it up in the spa */ 4437 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4438 if (vml[c] == NULL) { 4439 error = ENODEV; 4440 break; 4441 } 4442 4443 /* make sure there's nothing stopping the split */ 4444 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4445 vml[c]->vdev_islog || 4446 vml[c]->vdev_ishole || 4447 vml[c]->vdev_isspare || 4448 vml[c]->vdev_isl2cache || 4449 !vdev_writeable(vml[c]) || 4450 vml[c]->vdev_children != 0 || 4451 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4452 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4453 error = EINVAL; 4454 break; 4455 } 4456 4457 if (vdev_dtl_required(vml[c])) { 4458 error = EBUSY; 4459 break; 4460 } 4461 4462 /* we need certain info from the top level */ 4463 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4464 vml[c]->vdev_top->vdev_ms_array) == 0); 4465 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4466 vml[c]->vdev_top->vdev_ms_shift) == 0); 4467 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4468 vml[c]->vdev_top->vdev_asize) == 0); 4469 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4470 vml[c]->vdev_top->vdev_ashift) == 0); 4471 } 4472 4473 if (error != 0) { 4474 kmem_free(vml, children * sizeof (vdev_t *)); 4475 kmem_free(glist, children * sizeof (uint64_t)); 4476 return (spa_vdev_exit(spa, NULL, txg, error)); 4477 } 4478 4479 /* stop writers from using the disks */ 4480 for (c = 0; c < children; c++) { 4481 if (vml[c] != NULL) 4482 vml[c]->vdev_offline = B_TRUE; 4483 } 4484 vdev_reopen(spa->spa_root_vdev); 4485 4486 /* 4487 * Temporarily record the splitting vdevs in the spa config. This 4488 * will disappear once the config is regenerated. 4489 */ 4490 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4491 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4492 glist, children) == 0); 4493 kmem_free(glist, children * sizeof (uint64_t)); 4494 4495 mutex_enter(&spa->spa_props_lock); 4496 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4497 nvl) == 0); 4498 mutex_exit(&spa->spa_props_lock); 4499 spa->spa_config_splitting = nvl; 4500 vdev_config_dirty(spa->spa_root_vdev); 4501 4502 /* configure and create the new pool */ 4503 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4504 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4505 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4506 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4507 spa_version(spa)) == 0); 4508 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4509 spa->spa_config_txg) == 0); 4510 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4511 spa_generate_guid(NULL)) == 0); 4512 (void) nvlist_lookup_string(props, 4513 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4514 4515 /* add the new pool to the namespace */ 4516 newspa = spa_add(newname, config, altroot); 4517 newspa->spa_config_txg = spa->spa_config_txg; 4518 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4519 4520 /* release the spa config lock, retaining the namespace lock */ 4521 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4522 4523 if (zio_injection_enabled) 4524 zio_handle_panic_injection(spa, FTAG, 1); 4525 4526 spa_activate(newspa, spa_mode_global); 4527 spa_async_suspend(newspa); 4528 4529#ifndef sun 4530 /* mark that we are creating new spa by splitting */ 4531 newspa->spa_splitting_newspa = B_TRUE; 4532#endif 4533 /* create the new pool from the disks of the original pool */ 4534 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4535#ifndef sun 4536 newspa->spa_splitting_newspa = B_FALSE; 4537#endif 4538 if (error) 4539 goto out; 4540 4541 /* if that worked, generate a real config for the new pool */ 4542 if (newspa->spa_root_vdev != NULL) { 4543 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4544 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4545 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4546 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4547 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4548 B_TRUE)); 4549 } 4550 4551 /* set the props */ 4552 if (props != NULL) { 4553 spa_configfile_set(newspa, props, B_FALSE); 4554 error = spa_prop_set(newspa, props); 4555 if (error) 4556 goto out; 4557 } 4558 4559 /* flush everything */ 4560 txg = spa_vdev_config_enter(newspa); 4561 vdev_config_dirty(newspa->spa_root_vdev); 4562 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4563 4564 if (zio_injection_enabled) 4565 zio_handle_panic_injection(spa, FTAG, 2); 4566 4567 spa_async_resume(newspa); 4568 4569 /* finally, update the original pool's config */ 4570 txg = spa_vdev_config_enter(spa); 4571 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4572 error = dmu_tx_assign(tx, TXG_WAIT); 4573 if (error != 0) 4574 dmu_tx_abort(tx); 4575 for (c = 0; c < children; c++) { 4576 if (vml[c] != NULL) { 4577 vdev_split(vml[c]); 4578 if (error == 0) 4579 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4580 spa, tx, "vdev=%s", 4581 vml[c]->vdev_path); 4582 vdev_free(vml[c]); 4583 } 4584 } 4585 vdev_config_dirty(spa->spa_root_vdev); 4586 spa->spa_config_splitting = NULL; 4587 nvlist_free(nvl); 4588 if (error == 0) 4589 dmu_tx_commit(tx); 4590 (void) spa_vdev_exit(spa, NULL, txg, 0); 4591 4592 if (zio_injection_enabled) 4593 zio_handle_panic_injection(spa, FTAG, 3); 4594 4595 /* split is complete; log a history record */ 4596 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4597 "split new pool %s from pool %s", newname, spa_name(spa)); 4598 4599 kmem_free(vml, children * sizeof (vdev_t *)); 4600 4601 /* if we're not going to mount the filesystems in userland, export */ 4602 if (exp) 4603 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4604 B_FALSE, B_FALSE); 4605 4606 return (error); 4607 4608out: 4609 spa_unload(newspa); 4610 spa_deactivate(newspa); 4611 spa_remove(newspa); 4612 4613 txg = spa_vdev_config_enter(spa); 
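	/*
	 * (Editorial aside, not in the original source.)  Note the
	 * locking idiom on this unwind path: spa_vdev_config_enter()
	 * retakes only the config locks, while the namespace lock
	 * taken by spa_vdev_enter() at the top of this function stays
	 * held throughout, so the error path is simply:
	 *
	 *	txg = spa_vdev_config_enter(spa);
	 *	...undo per-vdev state...
	 *	(void) spa_vdev_exit(spa, NULL, txg, error);
	 */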
4614 4615 /* re-online all offlined disks */ 4616 for (c = 0; c < children; c++) { 4617 if (vml[c] != NULL) 4618 vml[c]->vdev_offline = B_FALSE; 4619 } 4620 vdev_reopen(spa->spa_root_vdev); 4621 4622 nvlist_free(spa->spa_config_splitting); 4623 spa->spa_config_splitting = NULL; 4624 (void) spa_vdev_exit(spa, NULL, txg, error); 4625 4626 kmem_free(vml, children * sizeof (vdev_t *)); 4627 return (error); 4628} 4629 4630static nvlist_t * 4631spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4632{ 4633 for (int i = 0; i < count; i++) { 4634 uint64_t guid; 4635 4636 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4637 &guid) == 0); 4638 4639 if (guid == target_guid) 4640 return (nvpp[i]); 4641 } 4642 4643 return (NULL); 4644} 4645 4646static void 4647spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4648 nvlist_t *dev_to_remove) 4649{ 4650 nvlist_t **newdev = NULL; 4651 4652 if (count > 1) 4653 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4654 4655 for (int i = 0, j = 0; i < count; i++) { 4656 if (dev[i] == dev_to_remove) 4657 continue; 4658 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4659 } 4660 4661 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4662 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4663 4664 for (int i = 0; i < count - 1; i++) 4665 nvlist_free(newdev[i]); 4666 4667 if (count > 1) 4668 kmem_free(newdev, (count - 1) * sizeof (void *)); 4669} 4670 4671/* 4672 * Evacuate the device. 4673 */ 4674static int 4675spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4676{ 4677 uint64_t txg; 4678 int error = 0; 4679 4680 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4681 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4682 ASSERT(vd == vd->vdev_top); 4683 4684 /* 4685 * Evacuate the device. We don't hold the config lock as writer 4686 * since we need to do I/O but we do keep the 4687 * spa_namespace_lock held. Once this completes the device 4688 * should no longer have any blocks allocated on it. 4689 */ 4690 if (vd->vdev_islog) { 4691 if (vd->vdev_stat.vs_alloc != 0) 4692 error = spa_offline_log(spa); 4693 } else { 4694 error = ENOTSUP; 4695 } 4696 4697 if (error) 4698 return (error); 4699 4700 /* 4701 * The evacuation succeeded. Remove any remaining MOS metadata 4702 * associated with this vdev, and wait for these changes to sync. 4703 */ 4704 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4705 txg = spa_vdev_config_enter(spa); 4706 vd->vdev_removing = B_TRUE; 4707 vdev_dirty(vd, 0, NULL, txg); 4708 vdev_config_dirty(vd); 4709 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4710 4711 return (0); 4712} 4713 4714/* 4715 * Complete the removal by cleaning up the namespace. 4716 */ 4717static void 4718spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4719{ 4720 vdev_t *rvd = spa->spa_root_vdev; 4721 uint64_t id = vd->vdev_id; 4722 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4723 4724 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4725 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4726 ASSERT(vd == vd->vdev_top); 4727 4728 /* 4729 * Only remove any devices which are empty. 
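 *
 * (Editorial sketch, not in the original source.)  When the vdev being
 * removed is not the last top-level child, a hole vdev is allocated
 * into its slot below so the surviving children keep their vdev ids:
 *
 *	root: [ A, B, C ]  ==remove B==>  root: [ A, hole, C ]
 *
 * Only removal of the trailing child compacts the array.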
4730 */ 4731 if (vd->vdev_stat.vs_alloc != 0) 4732 return; 4733 4734 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4735 4736 if (list_link_active(&vd->vdev_state_dirty_node)) 4737 vdev_state_clean(vd); 4738 if (list_link_active(&vd->vdev_config_dirty_node)) 4739 vdev_config_clean(vd); 4740 4741 vdev_free(vd); 4742 4743 if (last_vdev) { 4744 vdev_compact_children(rvd); 4745 } else { 4746 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4747 vdev_add_child(rvd, vd); 4748 } 4749 vdev_config_dirty(rvd); 4750 4751 /* 4752 * Reassess the health of our root vdev. 4753 */ 4754 vdev_reopen(rvd); 4755} 4756 4757/* 4758 * Remove a device from the pool - 4759 * 4760 * Removing a device from the vdev namespace requires several steps 4761 * and can take a significant amount of time. As a result we use 4762 * the spa_vdev_config_[enter/exit] functions which allow us to 4763 * grab and release the spa_config_lock while still holding the namespace 4764 * lock. During each step the configuration is synced out. 4765 */ 4766 4767/* 4768 * Remove a device from the pool. Currently, this supports removing only hot 4769 * spares, slogs, and level 2 ARC devices. 4770 */ 4771int 4772spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4773{ 4774 vdev_t *vd; 4775 metaslab_group_t *mg; 4776 nvlist_t **spares, **l2cache, *nv; 4777 uint64_t txg = 0; 4778 uint_t nspares, nl2cache; 4779 int error = 0; 4780 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4781 4782 ASSERT(spa_writeable(spa)); 4783 4784 if (!locked) 4785 txg = spa_vdev_enter(spa); 4786 4787 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4788 4789 if (spa->spa_spares.sav_vdevs != NULL && 4790 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4791 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4792 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4793 /* 4794 * Only remove the hot spare if it's not currently in use 4795 * in this pool. 4796 */ 4797 if (vd == NULL || unspare) { 4798 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4799 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4800 spa_load_spares(spa); 4801 spa->spa_spares.sav_sync = B_TRUE; 4802 } else { 4803 error = EBUSY; 4804 } 4805 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4806 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4807 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4808 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4809 /* 4810 * Cache devices can always be removed. 4811 */ 4812 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4813 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4814 spa_load_l2cache(spa); 4815 spa->spa_l2cache.sav_sync = B_TRUE; 4816 } else if (vd != NULL && vd->vdev_islog) { 4817 ASSERT(!locked); 4818 ASSERT(vd == vd->vdev_top); 4819 4820 /* 4821 * XXX - Once we have bp-rewrite this should 4822 * become the common case. 4823 */ 4824 4825 mg = vd->vdev_mg; 4826 4827 /* 4828 * Stop allocating from this vdev. 4829 */ 4830 metaslab_group_passivate(mg); 4831 4832 /* 4833 * Wait for the youngest allocations and frees to sync, 4834 * and then wait for the deferral of those frees to finish. 4835 */ 4836 spa_vdev_config_exit(spa, NULL, 4837 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4838 4839 /* 4840 * Attempt to evacuate the vdev. 4841 */ 4842 error = spa_vdev_remove_evacuate(spa, vd); 4843 4844 txg = spa_vdev_config_enter(spa); 4845 4846 /* 4847 * If we couldn't evacuate the vdev, unwind. 
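 *
 * (Editorial aside, not in the original source.)  The wait above works
 * by handing spa_vdev_config_exit() a future txg; with the usual
 * TXG_CONCURRENT_STATES == 3 and TXG_DEFER_SIZE == 2 it amounts to:
 *
 *	txg_wait_synced(spa->spa_dsl_pool, txg + 3 + 2);
 *
 * i.e., every txg that could still hold allocations or deferred frees
 * for this vdev has synced before the evacuation is attempted.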
4848 	 */ 4849 	if (error) { 4850 		metaslab_group_activate(mg); 4851 		return (spa_vdev_exit(spa, NULL, txg, error)); 4852 	} 4853 4854 	/* 4855 	 * Clean up the vdev namespace. 4856 	 */ 4857 	spa_vdev_remove_from_namespace(spa, vd); 4858 4859 	} else if (vd != NULL) { 4860 		/* 4861 		 * Normal vdevs cannot be removed (yet). 4862 		 */ 4863 		error = ENOTSUP; 4864 	} else { 4865 		/* 4866 		 * There is no vdev of any kind with the specified guid. 4867 		 */ 4868 		error = ENOENT; 4869 	} 4870 4871 	if (!locked) 4872 		return (spa_vdev_exit(spa, NULL, txg, error)); 4873 4874 	return (error); 4875} 4876 4877/* 4878 * Find any device that's done replacing, or a vdev marked 'unspare' that's 4879 * currently spared, so we can detach it. 4880 */ 4881static vdev_t * 4882spa_vdev_resilver_done_hunt(vdev_t *vd) 4883{ 4884 	vdev_t *newvd, *oldvd; 4885 4886 	for (int c = 0; c < vd->vdev_children; c++) { 4887 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4888 		if (oldvd != NULL) 4889 			return (oldvd); 4890 	} 4891 4892 	/* 4893 	 * Check for a completed replacement.  We always consider the first 4894 	 * vdev in the list to be the oldest vdev, and the last one to be 4895 	 * the newest (see spa_vdev_attach() for how that works).  In 4896 	 * the case where the newest vdev is faulted, we will not automatically 4897 	 * remove it after a resilver completes.  This is OK as it will require 4898 	 * user intervention to determine which disk the admin wishes to keep. 4899 	 */ 4900 	if (vd->vdev_ops == &vdev_replacing_ops) { 4901 		ASSERT(vd->vdev_children > 1); 4902 4903 		newvd = vd->vdev_child[vd->vdev_children - 1]; 4904 		oldvd = vd->vdev_child[0]; 4905 4906 		if (vdev_dtl_empty(newvd, DTL_MISSING) && 4907 		    vdev_dtl_empty(newvd, DTL_OUTAGE) && 4908 		    !vdev_dtl_required(oldvd)) 4909 			return (oldvd); 4910 	} 4911 4912 	/* 4913 	 * Check for a completed resilver with the 'unspare' flag set. 4914 	 */ 4915 	if (vd->vdev_ops == &vdev_spare_ops) { 4916 		vdev_t *first = vd->vdev_child[0]; 4917 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 4918 4919 		if (last->vdev_unspare) { 4920 			oldvd = first; 4921 			newvd = last; 4922 		} else if (first->vdev_unspare) { 4923 			oldvd = last; 4924 			newvd = first; 4925 		} else { 4926 			oldvd = NULL; 4927 		} 4928 4929 		if (oldvd != NULL && 4930 		    vdev_dtl_empty(newvd, DTL_MISSING) && 4931 		    vdev_dtl_empty(newvd, DTL_OUTAGE) && 4932 		    !vdev_dtl_required(oldvd)) 4933 			return (oldvd); 4934 4935 		/* 4936 		 * If there are more than two spares attached to a disk, 4937 		 * and those spares are not required, then we want to 4938 		 * attempt to free them up now so that they can be used 4939 		 * by other pools.  Once we're back down to a single 4940 		 * disk+spare, we stop removing them.
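 *
 * (Editorial sketch, not in the original source.)  Children of a spare
 * vdev are ordered oldest-first, mirroring the attach order:
 *
 *	S(orig, spare0, spare1, ...)
 *
 * so child[0] is the original disk, child[children - 1] is the most
 * recently attached spare, and child[1] is the first surplus spare
 * that the check below tries to release.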
4941 */ 4942 if (vd->vdev_children > 2) { 4943 newvd = vd->vdev_child[1]; 4944 4945 if (newvd->vdev_isspare && last->vdev_isspare && 4946 vdev_dtl_empty(last, DTL_MISSING) && 4947 vdev_dtl_empty(last, DTL_OUTAGE) && 4948 !vdev_dtl_required(newvd)) 4949 return (newvd); 4950 } 4951 } 4952 4953 return (NULL); 4954} 4955 4956static void 4957spa_vdev_resilver_done(spa_t *spa) 4958{ 4959 vdev_t *vd, *pvd, *ppvd; 4960 uint64_t guid, sguid, pguid, ppguid; 4961 4962 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4963 4964 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4965 pvd = vd->vdev_parent; 4966 ppvd = pvd->vdev_parent; 4967 guid = vd->vdev_guid; 4968 pguid = pvd->vdev_guid; 4969 ppguid = ppvd->vdev_guid; 4970 sguid = 0; 4971 /* 4972 * If we have just finished replacing a hot spared device, then 4973 * we need to detach the parent's first child (the original hot 4974 * spare) as well. 4975 */ 4976 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4977 ppvd->vdev_children == 2) { 4978 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4979 sguid = ppvd->vdev_child[1]->vdev_guid; 4980 } 4981 spa_config_exit(spa, SCL_ALL, FTAG); 4982 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4983 return; 4984 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4985 return; 4986 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4987 } 4988 4989 spa_config_exit(spa, SCL_ALL, FTAG); 4990} 4991 4992/* 4993 * Update the stored path or FRU for this vdev. 4994 */ 4995int 4996spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4997 boolean_t ispath) 4998{ 4999 vdev_t *vd; 5000 boolean_t sync = B_FALSE; 5001 5002 ASSERT(spa_writeable(spa)); 5003 5004 spa_vdev_state_enter(spa, SCL_ALL); 5005 5006 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5007 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5008 5009 if (!vd->vdev_ops->vdev_op_leaf) 5010 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5011 5012 if (ispath) { 5013 if (strcmp(value, vd->vdev_path) != 0) { 5014 spa_strfree(vd->vdev_path); 5015 vd->vdev_path = spa_strdup(value); 5016 sync = B_TRUE; 5017 } 5018 } else { 5019 if (vd->vdev_fru == NULL) { 5020 vd->vdev_fru = spa_strdup(value); 5021 sync = B_TRUE; 5022 } else if (strcmp(value, vd->vdev_fru) != 0) { 5023 spa_strfree(vd->vdev_fru); 5024 vd->vdev_fru = spa_strdup(value); 5025 sync = B_TRUE; 5026 } 5027 } 5028 5029 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5030} 5031 5032int 5033spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5034{ 5035 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5036} 5037 5038int 5039spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5040{ 5041 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5042} 5043 5044/* 5045 * ========================================================================== 5046 * SPA Scanning 5047 * ========================================================================== 5048 */ 5049 5050int 5051spa_scan_stop(spa_t *spa) 5052{ 5053 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5054 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5055 return (EBUSY); 5056 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5057} 5058 5059int 5060spa_scan(spa_t *spa, pool_scan_func_t func) 5061{ 5062 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5063 5064 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5065 return (ENOTSUP); 5066 5067 /* 5068 * If a resilver was requested, but there is no DTL on a 5069 * writeable leaf device, we have nothing to do. 5070 */ 5071 if (func == POOL_SCAN_RESILVER && 5072 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5073 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5074 return (0); 5075 } 5076 5077 return (dsl_scan(spa->spa_dsl_pool, func)); 5078} 5079 5080/* 5081 * ========================================================================== 5082 * SPA async task processing 5083 * ========================================================================== 5084 */ 5085 5086static void 5087spa_async_remove(spa_t *spa, vdev_t *vd) 5088{ 5089 if (vd->vdev_remove_wanted) { 5090 vd->vdev_remove_wanted = B_FALSE; 5091 vd->vdev_delayed_close = B_FALSE; 5092 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5093 5094 /* 5095 * We want to clear the stats, but we don't want to do a full 5096 * vdev_clear() as that will cause us to throw away 5097 * degraded/faulted state as well as attempt to reopen the 5098 * device, all of which is a waste. 
5099 */ 5100 vd->vdev_stat.vs_read_errors = 0; 5101 vd->vdev_stat.vs_write_errors = 0; 5102 vd->vdev_stat.vs_checksum_errors = 0; 5103 5104 vdev_state_dirty(vd->vdev_top); 5105 } 5106 5107 for (int c = 0; c < vd->vdev_children; c++) 5108 spa_async_remove(spa, vd->vdev_child[c]); 5109} 5110 5111static void 5112spa_async_probe(spa_t *spa, vdev_t *vd) 5113{ 5114 if (vd->vdev_probe_wanted) { 5115 vd->vdev_probe_wanted = B_FALSE; 5116 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5117 } 5118 5119 for (int c = 0; c < vd->vdev_children; c++) 5120 spa_async_probe(spa, vd->vdev_child[c]); 5121} 5122 5123static void 5124spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5125{ 5126 sysevent_id_t eid; 5127 nvlist_t *attr; 5128 char *physpath; 5129 5130 if (!spa->spa_autoexpand) 5131 return; 5132 5133 for (int c = 0; c < vd->vdev_children; c++) { 5134 vdev_t *cvd = vd->vdev_child[c]; 5135 spa_async_autoexpand(spa, cvd); 5136 } 5137 5138 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5139 return; 5140 5141 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5142 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5143 5144 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5145 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5146 5147 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5148 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5149 5150 nvlist_free(attr); 5151 kmem_free(physpath, MAXPATHLEN); 5152} 5153 5154static void 5155spa_async_thread(void *arg) 5156{ 5157 spa_t *spa = arg; 5158 int tasks; 5159 5160 ASSERT(spa->spa_sync_on); 5161 5162 mutex_enter(&spa->spa_async_lock); 5163 tasks = spa->spa_async_tasks; 5164 spa->spa_async_tasks = 0; 5165 mutex_exit(&spa->spa_async_lock); 5166 5167 /* 5168 * See if the config needs to be updated. 5169 */ 5170 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5171 uint64_t old_space, new_space; 5172 5173 mutex_enter(&spa_namespace_lock); 5174 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5175 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5176 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5177 mutex_exit(&spa_namespace_lock); 5178 5179 /* 5180 * If the pool grew as a result of the config update, 5181 * then log an internal history event. 5182 */ 5183 if (new_space != old_space) { 5184 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5185 spa, NULL, 5186 "pool '%s' size: %llu(+%llu)", 5187 spa_name(spa), new_space, new_space - old_space); 5188 } 5189 } 5190 5191 /* 5192 * See if any devices need to be marked REMOVED. 5193 */ 5194 if (tasks & SPA_ASYNC_REMOVE) { 5195 spa_vdev_state_enter(spa, SCL_NONE); 5196 spa_async_remove(spa, spa->spa_root_vdev); 5197 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5198 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5199 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5200 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5201 (void) spa_vdev_state_exit(spa, NULL, 0); 5202 } 5203 5204 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5205 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5206 spa_async_autoexpand(spa, spa->spa_root_vdev); 5207 spa_config_exit(spa, SCL_CONFIG, FTAG); 5208 } 5209 5210 /* 5211 * See if any devices need to be probed. 5212 */ 5213 if (tasks & SPA_ASYNC_PROBE) { 5214 spa_vdev_state_enter(spa, SCL_NONE); 5215 spa_async_probe(spa, spa->spa_root_vdev); 5216 (void) spa_vdev_state_exit(spa, NULL, 0); 5217 } 5218 5219 /* 5220 * If any devices are done replacing, detach them. 
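 *
 * (Editorial aside, not in the original source.)  Tasks arrive as bits
 * OR-ed into spa_async_tasks by spa_async_request(); e.g., spa_scan()
 * schedules this very pass with:
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 *
 * and the request is picked up the next time spa_sync() calls
 * spa_async_dispatch().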
5221 */ 5222 if (tasks & SPA_ASYNC_RESILVER_DONE) 5223 spa_vdev_resilver_done(spa); 5224 5225 /* 5226 * Kick off a resilver. 5227 */ 5228 if (tasks & SPA_ASYNC_RESILVER) 5229 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5230 5231 /* 5232 * Let the world know that we're done. 5233 */ 5234 mutex_enter(&spa->spa_async_lock); 5235 spa->spa_async_thread = NULL; 5236 cv_broadcast(&spa->spa_async_cv); 5237 mutex_exit(&spa->spa_async_lock); 5238 thread_exit(); 5239} 5240 5241void 5242spa_async_suspend(spa_t *spa) 5243{ 5244 mutex_enter(&spa->spa_async_lock); 5245 spa->spa_async_suspended++; 5246 while (spa->spa_async_thread != NULL) 5247 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5248 mutex_exit(&spa->spa_async_lock); 5249} 5250 5251void 5252spa_async_resume(spa_t *spa) 5253{ 5254 mutex_enter(&spa->spa_async_lock); 5255 ASSERT(spa->spa_async_suspended != 0); 5256 spa->spa_async_suspended--; 5257 mutex_exit(&spa->spa_async_lock); 5258} 5259 5260static void 5261spa_async_dispatch(spa_t *spa) 5262{ 5263 mutex_enter(&spa->spa_async_lock); 5264 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5265 spa->spa_async_thread == NULL && 5266 rootdir != NULL && !vn_is_readonly(rootdir)) 5267 spa->spa_async_thread = thread_create(NULL, 0, 5268 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5269 mutex_exit(&spa->spa_async_lock); 5270} 5271 5272void 5273spa_async_request(spa_t *spa, int task) 5274{ 5275 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5276 mutex_enter(&spa->spa_async_lock); 5277 spa->spa_async_tasks |= task; 5278 mutex_exit(&spa->spa_async_lock); 5279} 5280 5281/* 5282 * ========================================================================== 5283 * SPA syncing routines 5284 * ========================================================================== 5285 */ 5286 5287static int 5288bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5289{ 5290 bpobj_t *bpo = arg; 5291 bpobj_enqueue(bpo, bp, tx); 5292 return (0); 5293} 5294 5295static int 5296spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5297{ 5298 zio_t *zio = arg; 5299 5300 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5301 zio->io_flags)); 5302 return (0); 5303} 5304 5305static void 5306spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5307{ 5308 char *packed = NULL; 5309 size_t bufsize; 5310 size_t nvsize = 0; 5311 dmu_buf_t *db; 5312 5313 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5314 5315 /* 5316 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5317 * information. This avoids the dbuf_will_dirty() path and 5318 * saves us a pre-read to get data we don't actually care about. 
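 *
 * (Editorial aside, not in the original source.)  With
 * SPA_CONFIG_BLOCKSIZE at 1 << 14 (16K), the rounding below gives,
 * for example:
 *
 *	nvsize = 20000  ==>  bufsize = P2ROUNDUP(20000, 16384) = 32768
 *
 * and the tail of the buffer is zeroed so that only whole blocks are
 * ever written.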
5319 */ 5320 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5321 packed = kmem_alloc(bufsize, KM_SLEEP); 5322 5323 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5324 KM_SLEEP) == 0); 5325 bzero(packed + nvsize, bufsize - nvsize); 5326 5327 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5328 5329 kmem_free(packed, bufsize); 5330 5331 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5332 dmu_buf_will_dirty(db, tx); 5333 *(uint64_t *)db->db_data = nvsize; 5334 dmu_buf_rele(db, FTAG); 5335} 5336 5337static void 5338spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5339 const char *config, const char *entry) 5340{ 5341 nvlist_t *nvroot; 5342 nvlist_t **list; 5343 int i; 5344 5345 if (!sav->sav_sync) 5346 return; 5347 5348 /* 5349 * Update the MOS nvlist describing the list of available devices. 5350 * spa_validate_aux() will have already made sure this nvlist is 5351 * valid and the vdevs are labeled appropriately. 5352 */ 5353 if (sav->sav_object == 0) { 5354 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5355 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5356 sizeof (uint64_t), tx); 5357 VERIFY(zap_update(spa->spa_meta_objset, 5358 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5359 &sav->sav_object, tx) == 0); 5360 } 5361 5362 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5363 if (sav->sav_count == 0) { 5364 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5365 } else { 5366 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5367 for (i = 0; i < sav->sav_count; i++) 5368 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5369 B_FALSE, VDEV_CONFIG_L2CACHE); 5370 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5371 sav->sav_count) == 0); 5372 for (i = 0; i < sav->sav_count; i++) 5373 nvlist_free(list[i]); 5374 kmem_free(list, sav->sav_count * sizeof (void *)); 5375 } 5376 5377 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5378 nvlist_free(nvroot); 5379 5380 sav->sav_sync = B_FALSE; 5381} 5382 5383static void 5384spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5385{ 5386 nvlist_t *config; 5387 5388 if (list_is_empty(&spa->spa_config_dirty_list)) 5389 return; 5390 5391 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5392 5393 config = spa_config_generate(spa, spa->spa_root_vdev, 5394 dmu_tx_get_txg(tx), B_FALSE); 5395 5396 spa_config_exit(spa, SCL_STATE, FTAG); 5397 5398 if (spa->spa_config_syncing) 5399 nvlist_free(spa->spa_config_syncing); 5400 spa->spa_config_syncing = config; 5401 5402 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5403} 5404 5405/* 5406 * Set zpool properties. 5407 */ 5408static void 5409spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5410{ 5411 spa_t *spa = arg1; 5412 objset_t *mos = spa->spa_meta_objset; 5413 nvlist_t *nvp = arg2; 5414 nvpair_t *elem; 5415 uint64_t intval; 5416 char *strval; 5417 zpool_prop_t prop; 5418 const char *propname; 5419 zprop_type_t proptype; 5420 5421 mutex_enter(&spa->spa_props_lock); 5422 5423 elem = NULL; 5424 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5425 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5426 case ZPOOL_PROP_VERSION: 5427 /* 5428 * Only set version for non-zpool-creation cases 5429 * (set/import). spa_create() needs special care 5430 * for version setting. 
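 *
 * (Editorial aside, not in the original source.)  The creation case is
 * distinguishable here because pool creation syncs its properties in
 * the very first transaction group:
 *
 *	tx->tx_txg == TXG_INITIAL	==>  called from spa_create()
 *	tx->tx_txg != TXG_INITIAL	==>  zpool set / zpool import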
5431 		 */ 5432 			if (tx->tx_txg != TXG_INITIAL) { 5433 				VERIFY(nvpair_value_uint64(elem, 5434 				    &intval) == 0); 5435 				ASSERT(intval <= SPA_VERSION); 5436 				ASSERT(intval >= spa_version(spa)); 5437 				spa->spa_uberblock.ub_version = intval; 5438 				vdev_config_dirty(spa->spa_root_vdev); 5439 			} 5440 			break; 5441 5442 		case ZPOOL_PROP_ALTROOT: 5443 			/* 5444 			 * 'altroot' is a non-persistent property. It should 5445 			 * have been set temporarily at creation or import time. 5446 			 */ 5447 			ASSERT(spa->spa_root != NULL); 5448 			break; 5449 5450 		case ZPOOL_PROP_READONLY: 5451 		case ZPOOL_PROP_CACHEFILE: 5452 			/* 5453 			 * 'readonly' and 'cachefile' are also non-persistent 5454 			 * properties. 5455 			 */ 5456 			break; 5457 		case ZPOOL_PROP_COMMENT: 5458 			VERIFY(nvpair_value_string(elem, &strval) == 0); 5459 			if (spa->spa_comment != NULL) 5460 				spa_strfree(spa->spa_comment); 5461 			spa->spa_comment = spa_strdup(strval); 5462 			/* 5463 			 * We need to dirty the configuration on all the vdevs 5464 			 * so that their labels get updated.  It's unnecessary 5465 			 * to do this for pool creation since the vdev's 5466 			 * configuration has already been dirtied. 5467 			 */ 5468 			if (tx->tx_txg != TXG_INITIAL) 5469 				vdev_config_dirty(spa->spa_root_vdev); 5470 			break; 5471 		default: 5472 			/* 5473 			 * Set pool property values in the poolprops mos object. 5474 			 */ 5475 			if (spa->spa_pool_props_object == 0) { 5476 				VERIFY((spa->spa_pool_props_object = 5477 				    zap_create(mos, DMU_OT_POOL_PROPS, 5478 				    DMU_OT_NONE, 0, tx)) > 0); 5479 5480 				VERIFY(zap_update(mos, 5481 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 5482 				    8, 1, &spa->spa_pool_props_object, tx) 5483 				    == 0); 5484 			} 5485 5486 			/* normalize the property name */ 5487 			propname = zpool_prop_to_name(prop); 5488 			proptype = zpool_prop_get_type(prop); 5489 5490 			if (nvpair_type(elem) == DATA_TYPE_STRING) { 5491 				ASSERT(proptype == PROP_TYPE_STRING); 5492 				VERIFY(nvpair_value_string(elem, &strval) == 0); 5493 				VERIFY(zap_update(mos, 5494 				    spa->spa_pool_props_object, propname, 5495 				    1, strlen(strval) + 1, strval, tx) == 0); 5496 5497 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 5498 				VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5499 5500 				if (proptype == PROP_TYPE_INDEX) { 5501 					const char *unused; 5502 					VERIFY(zpool_prop_index_to_string( 5503 					    prop, intval, &unused) == 0); 5504 				} 5505 				VERIFY(zap_update(mos, 5506 				    spa->spa_pool_props_object, propname, 5507 				    8, 1, &intval, tx) == 0); 5508 			} else { 5509 				ASSERT(0); /* not allowed */ 5510 			} 5511 5512 			switch (prop) { 5513 			case ZPOOL_PROP_DELEGATION: 5514 				spa->spa_delegation = intval; 5515 				break; 5516 			case ZPOOL_PROP_BOOTFS: 5517 				spa->spa_bootfs = intval; 5518 				break; 5519 			case ZPOOL_PROP_FAILUREMODE: 5520 				spa->spa_failmode = intval; 5521 				break; 5522 			case ZPOOL_PROP_AUTOEXPAND: 5523 				spa->spa_autoexpand = intval; 5524 				if (tx->tx_txg != TXG_INITIAL) 5525 					spa_async_request(spa, 5526 					    SPA_ASYNC_AUTOEXPAND); 5527 				break; 5528 			case ZPOOL_PROP_DEDUPDITTO: 5529 				spa->spa_dedup_ditto = intval; 5530 				break; 5531 			default: 5532 				break; 5533 			} 5534 		} 5535 5536 		/* log internal history if this is not a zpool create */ 5537 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 5538 		    tx->tx_txg != TXG_INITIAL) { 5539 			spa_history_log_internal(LOG_POOL_PROPSET, 5540 			    spa, tx, "%s %lld %s", 5541 			    nvpair_name(elem), intval, spa_name(spa)); 5542 		} 5543 	} 5544 5545 	mutex_exit(&spa->spa_props_lock); 5546} 5547 5548/* 5549 * Perform one-time upgrade on-disk changes.  spa_version() does not 5550 * reflect the new version this txg, so there must be no changes this 5551 * txg to anything that the upgrade code depends on after it executes.
5552 * Therefore this must be called after dsl_pool_sync() does the sync 5553 * tasks. 5554 */ 5555static void 5556spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5557{ 5558 dsl_pool_t *dp = spa->spa_dsl_pool; 5559 5560 ASSERT(spa->spa_sync_pass == 1); 5561 5562 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5563 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5564 dsl_pool_create_origin(dp, tx); 5565 5566 /* Keeping the origin open increases spa_minref */ 5567 spa->spa_minref += 3; 5568 } 5569 5570 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5571 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5572 dsl_pool_upgrade_clones(dp, tx); 5573 } 5574 5575 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5576 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5577 dsl_pool_upgrade_dir_clones(dp, tx); 5578 5579 /* Keeping the freedir open increases spa_minref */ 5580 spa->spa_minref += 3; 5581 } 5582} 5583 5584/* 5585 * Sync the specified transaction group. New blocks may be dirtied as 5586 * part of the process, so we iterate until it converges. 5587 */ 5588void 5589spa_sync(spa_t *spa, uint64_t txg) 5590{ 5591 dsl_pool_t *dp = spa->spa_dsl_pool; 5592 objset_t *mos = spa->spa_meta_objset; 5593 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5594 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5595 vdev_t *rvd = spa->spa_root_vdev; 5596 vdev_t *vd; 5597 dmu_tx_t *tx; 5598 int error; 5599 5600 VERIFY(spa_writeable(spa)); 5601 5602 /* 5603 * Lock out configuration changes. 5604 */ 5605 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5606 5607 spa->spa_syncing_txg = txg; 5608 spa->spa_sync_pass = 0; 5609 5610 /* 5611 * If there are any pending vdev state changes, convert them 5612 * into config changes that go out with this transaction group. 5613 */ 5614 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5615 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5616 /* 5617 * We need the write lock here because, for aux vdevs, 5618 * calling vdev_config_dirty() modifies sav_config. 5619 * This is ugly and will become unnecessary when we 5620 * eliminate the aux vdev wart by integrating all vdevs 5621 * into the root vdev tree. 5622 */ 5623 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5624 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5625 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5626 vdev_state_clean(vd); 5627 vdev_config_dirty(vd); 5628 } 5629 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5630 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5631 } 5632 spa_config_exit(spa, SCL_STATE, FTAG); 5633 5634 tx = dmu_tx_create_assigned(dp, txg); 5635 5636 /* 5637 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5638 * set spa_deflate if we have no raid-z vdevs. 
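 *
 * (Editorial aside, not in the original source.)  Concretely, the scan
 * below bails on the first top-level vdev whose vdev_deflate_ratio
 * differs from SPA_MINBLOCKSIZE (i.e. a raid-z vdev); spa_deflate is
 * set, and persisted in the MOS directory, only if the scan runs to
 * completion (i == rvd->vdev_children).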
5639 */ 5640 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5641 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5642 int i; 5643 5644 for (i = 0; i < rvd->vdev_children; i++) { 5645 vd = rvd->vdev_child[i]; 5646 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5647 break; 5648 } 5649 if (i == rvd->vdev_children) { 5650 spa->spa_deflate = TRUE; 5651 VERIFY(0 == zap_add(spa->spa_meta_objset, 5652 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5653 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5654 } 5655 } 5656 5657 /* 5658 * If anything has changed in this txg, or if someone is waiting 5659 * for this txg to sync (eg, spa_vdev_remove()), push the 5660 * deferred frees from the previous txg. If not, leave them 5661 * alone so that we don't generate work on an otherwise idle 5662 * system. 5663 */ 5664 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5665 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5666 !txg_list_empty(&dp->dp_sync_tasks, txg) || 5667 ((dsl_scan_active(dp->dp_scan) || 5668 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 5669 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5670 VERIFY3U(bpobj_iterate(defer_bpo, 5671 spa_free_sync_cb, zio, tx), ==, 0); 5672 VERIFY3U(zio_wait(zio), ==, 0); 5673 } 5674 5675 /* 5676 * Iterate to convergence. 5677 */ 5678 do { 5679 int pass = ++spa->spa_sync_pass; 5680 5681 spa_sync_config_object(spa, tx); 5682 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5683 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5684 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5685 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5686 spa_errlog_sync(spa, txg); 5687 dsl_pool_sync(dp, txg); 5688 5689 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5690 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5691 bplist_iterate(free_bpl, spa_free_sync_cb, 5692 zio, tx); 5693 VERIFY(zio_wait(zio) == 0); 5694 } else { 5695 bplist_iterate(free_bpl, bpobj_enqueue_cb, 5696 defer_bpo, tx); 5697 } 5698 5699 ddt_sync(spa, txg); 5700 dsl_scan_sync(dp, tx); 5701 5702 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5703 vdev_sync(vd, txg); 5704 5705 if (pass == 1) 5706 spa_sync_upgrades(spa, tx); 5707 5708 } while (dmu_objset_is_dirty(mos, txg)); 5709 5710 /* 5711 * Rewrite the vdev configuration (which includes the uberblock) 5712 * to commit the transaction group. 5713 * 5714 * If there are no dirty vdevs, we sync the uberblock to a few 5715 * random top-level vdevs that are known to be visible in the 5716 * config cache (see spa_vdev_add() for a complete description). 5717 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5718 */ 5719 for (;;) { 5720 /* 5721 * We hold SCL_STATE to prevent vdev open/close/etc. 5722 * while we're attempting to write the vdev labels. 
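 *
 * (Editorial sketch, not in the original source.)  When the dirty list
 * is empty, the loop below samples up to SPA_DVAS_PER_BP (3) top-level
 * vdevs starting at a random child; e.g., with children == 5 and
 * c0 == 3 the candidates are visited in the order:
 *
 *	3, 4, 0, 1, 2
 *
 * skipping log vdevs and vdevs with no metaslab array yet
 * (vdev_ms_array == 0).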
5723 */ 5724 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5725 5726 if (list_is_empty(&spa->spa_config_dirty_list)) { 5727 vdev_t *svd[SPA_DVAS_PER_BP]; 5728 int svdcount = 0; 5729 int children = rvd->vdev_children; 5730 int c0 = spa_get_random(children); 5731 5732 for (int c = 0; c < children; c++) { 5733 vd = rvd->vdev_child[(c0 + c) % children]; 5734 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5735 continue; 5736 svd[svdcount++] = vd; 5737 if (svdcount == SPA_DVAS_PER_BP) 5738 break; 5739 } 5740 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5741 if (error != 0) 5742 error = vdev_config_sync(svd, svdcount, txg, 5743 B_TRUE); 5744 } else { 5745 error = vdev_config_sync(rvd->vdev_child, 5746 rvd->vdev_children, txg, B_FALSE); 5747 if (error != 0) 5748 error = vdev_config_sync(rvd->vdev_child, 5749 rvd->vdev_children, txg, B_TRUE); 5750 } 5751 5752 spa_config_exit(spa, SCL_STATE, FTAG); 5753 5754 if (error == 0) 5755 break; 5756 zio_suspend(spa, NULL); 5757 zio_resume_wait(spa); 5758 } 5759 dmu_tx_commit(tx); 5760 5761 /* 5762 * Clear the dirty config list. 5763 */ 5764 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5765 vdev_config_clean(vd); 5766 5767 /* 5768 * Now that the new config has synced transactionally, 5769 * let it become visible to the config cache. 5770 */ 5771 if (spa->spa_config_syncing != NULL) { 5772 spa_config_set(spa, spa->spa_config_syncing); 5773 spa->spa_config_txg = txg; 5774 spa->spa_config_syncing = NULL; 5775 } 5776 5777 spa->spa_ubsync = spa->spa_uberblock; 5778 5779 dsl_pool_sync_done(dp, txg); 5780 5781 /* 5782 * Update usable space statistics. 5783 */ 5784 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5785 vdev_sync_done(vd, txg); 5786 5787 spa_update_dspace(spa); 5788 5789 /* 5790 * It had better be the case that we didn't dirty anything 5791 * since vdev_config_sync(). 5792 */ 5793 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5794 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5795 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5796 5797 spa->spa_sync_pass = 0; 5798 5799 spa_config_exit(spa, SCL_CONFIG, FTAG); 5800 5801 spa_handle_ignored_writes(spa); 5802 5803 /* 5804 * If any async tasks have been requested, kick them off. 5805 */ 5806 spa_async_dispatch(spa); 5807} 5808 5809/* 5810 * Sync all pools. We don't want to hold the namespace lock across these 5811 * operations, so we take a reference on the spa_t and drop the lock during the 5812 * sync. 5813 */ 5814void 5815spa_sync_allpools(void) 5816{ 5817 spa_t *spa = NULL; 5818 mutex_enter(&spa_namespace_lock); 5819 while ((spa = spa_next(spa)) != NULL) { 5820 if (spa_state(spa) != POOL_STATE_ACTIVE || 5821 !spa_writeable(spa) || spa_suspended(spa)) 5822 continue; 5823 spa_open_ref(spa, FTAG); 5824 mutex_exit(&spa_namespace_lock); 5825 txg_wait_synced(spa_get_dsl(spa), 0); 5826 mutex_enter(&spa_namespace_lock); 5827 spa_close(spa, FTAG); 5828 } 5829 mutex_exit(&spa_namespace_lock); 5830} 5831 5832/* 5833 * ========================================================================== 5834 * Miscellaneous routines 5835 * ========================================================================== 5836 */ 5837 5838/* 5839 * Remove all pools in the system. 5840 */ 5841void 5842spa_evict_all(void) 5843{ 5844 spa_t *spa; 5845 5846 /* 5847 * Remove all cached state. All pools should be closed now, 5848 * so every spa in the AVL tree should be unreferenced. 
5849 */ 5850 mutex_enter(&spa_namespace_lock); 5851 while ((spa = spa_next(NULL)) != NULL) { 5852 /* 5853 * Stop async tasks. The async thread may need to detach 5854 * a device that's been replaced, which requires grabbing 5855 * spa_namespace_lock, so we must drop it here. 5856 */ 5857 spa_open_ref(spa, FTAG); 5858 mutex_exit(&spa_namespace_lock); 5859 spa_async_suspend(spa); 5860 mutex_enter(&spa_namespace_lock); 5861 spa_close(spa, FTAG); 5862 5863 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5864 spa_unload(spa); 5865 spa_deactivate(spa); 5866 } 5867 spa_remove(spa); 5868 } 5869 mutex_exit(&spa_namespace_lock); 5870} 5871 5872vdev_t * 5873spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5874{ 5875 vdev_t *vd; 5876 int i; 5877 5878 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5879 return (vd); 5880 5881 if (aux) { 5882 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5883 vd = spa->spa_l2cache.sav_vdevs[i]; 5884 if (vd->vdev_guid == guid) 5885 return (vd); 5886 } 5887 5888 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5889 vd = spa->spa_spares.sav_vdevs[i]; 5890 if (vd->vdev_guid == guid) 5891 return (vd); 5892 } 5893 } 5894 5895 return (NULL); 5896} 5897 5898void 5899spa_upgrade(spa_t *spa, uint64_t version) 5900{ 5901 ASSERT(spa_writeable(spa)); 5902 5903 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5904 5905 /* 5906 * This should only be called for a non-faulted pool, and since a 5907 * future version would result in an unopenable pool, this shouldn't be 5908 * possible. 5909 */ 5910 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5911 ASSERT(version >= spa->spa_uberblock.ub_version); 5912 5913 spa->spa_uberblock.ub_version = version; 5914 vdev_config_dirty(spa->spa_root_vdev); 5915 5916 spa_config_exit(spa, SCL_ALL, FTAG); 5917 5918 txg_wait_synced(spa_get_dsl(spa), 0); 5919} 5920 5921boolean_t 5922spa_has_spare(spa_t *spa, uint64_t guid) 5923{ 5924 int i; 5925 uint64_t spareguid; 5926 spa_aux_vdev_t *sav = &spa->spa_spares; 5927 5928 for (i = 0; i < sav->sav_count; i++) 5929 if (sav->sav_vdevs[i]->vdev_guid == guid) 5930 return (B_TRUE); 5931 5932 for (i = 0; i < sav->sav_npending; i++) { 5933 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5934 &spareguid) == 0 && spareguid == guid) 5935 return (B_TRUE); 5936 } 5937 5938 return (B_FALSE); 5939} 5940 5941/* 5942 * Check if a pool has an active shared spare device. 5943 * Note: reference count of an active spare is 2, as a spare and as a replace 5944 */ 5945static boolean_t 5946spa_has_active_shared_spare(spa_t *spa) 5947{ 5948 int i, refcnt; 5949 uint64_t pool; 5950 spa_aux_vdev_t *sav = &spa->spa_spares; 5951 5952 for (i = 0; i < sav->sav_count; i++) { 5953 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 5954 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 5955 refcnt > 2) 5956 return (B_TRUE); 5957 } 5958 5959 return (B_FALSE); 5960} 5961 5962/* 5963 * Post a sysevent corresponding to the given event. The 'name' must be one of 5964 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 5965 * filled in from the spa and (optionally) the vdev. This doesn't do anything 5966 * in the userland libzpool, as we don't want consumers to misinterpret ztest 5967 * or zdb as real changes. 
5968 */ 5969void 5970spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 5971{ 5972#ifdef _KERNEL 5973 sysevent_t *ev; 5974 sysevent_attr_list_t *attr = NULL; 5975 sysevent_value_t value; 5976 sysevent_id_t eid; 5977 5978 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5979 SE_SLEEP); 5980 5981 value.value_type = SE_DATA_TYPE_STRING; 5982 value.value.sv_string = spa_name(spa); 5983 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5984 goto done; 5985 5986 value.value_type = SE_DATA_TYPE_UINT64; 5987 value.value.sv_uint64 = spa_guid(spa); 5988 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5989 goto done; 5990 5991 if (vd) { 5992 value.value_type = SE_DATA_TYPE_UINT64; 5993 value.value.sv_uint64 = vd->vdev_guid; 5994 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5995 SE_SLEEP) != 0) 5996 goto done; 5997 5998 if (vd->vdev_path) { 5999 value.value_type = SE_DATA_TYPE_STRING; 6000 value.value.sv_string = vd->vdev_path; 6001 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6002 &value, SE_SLEEP) != 0) 6003 goto done; 6004 } 6005 } 6006 6007 if (sysevent_attach_attributes(ev, attr) != 0) 6008 goto done; 6009 attr = NULL; 6010 6011 (void) log_sysevent(ev, SE_SLEEP, &eid); 6012 6013done: 6014 if (attr) 6015 sysevent_free_attr(attr); 6016 sysevent_free(ev); 6017#endif 6018}
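/*
 * (Editorial appendix, not part of the original source.)  A minimal
 * userland sketch of the "rebuild the array without one element"
 * pattern that spa_vdev_remove_aux() applies to the spare/l2cache
 * nvlist arrays above, recast in plain C so it can be compiled and run
 * standalone.  All names below are invented for illustration, and,
 * like the original, it assumes the element to remove is present.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return a newly allocated copy of dev[0..count-1] minus dev_to_remove. */
static char **
remove_aux(char **dev, int count, const char *dev_to_remove, int *newcount)
{
	char **newdev = NULL;
	int i, j;

	/* Nothing to allocate if the only element is being removed. */
	if (count > 1)
		newdev = malloc((count - 1) * sizeof (char *));

	for (i = 0, j = 0; i < count; i++) {
		if (strcmp(dev[i], dev_to_remove) == 0)
			continue;		/* skip the victim */
		newdev[j++] = strdup(dev[i]);	/* deep-copy survivors */
	}

	*newcount = count - 1;
	return (newdev);
}

int
main(void)
{
	char *spares[] = { "c0t1d0", "c0t2d0", "c0t3d0" };
	int newcount;
	char **left = remove_aux(spares, 3, "c0t2d0", &newcount);

	for (int i = 0; i < newcount; i++) {
		printf("%s\n", left[i]);	/* prints c0t1d0, c0t3d0 */
		free(left[i]);
	}
	free(left);
	return (0);
}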