spa.c revision 249195
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size;
	uint64_t alloc;
	uint64_t space;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		space = 0;
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			space += tvd->vdev_max_asize - tvd->vdev_asize;
		}
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
		    src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		dsl_dir_t *freedir = pool->dp_free_dir;

		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools from before this version, freedir will be
		 * NULL.
		 */
		if (freedir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    freedir->dd_phys->dd_used_bytes, src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable.
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver, 6);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
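	/*
	 * Wake up spa_activate(), which is blocked on spa_proc_cv waiting
	 * for the state to leave SPA_PROC_CREATED.
	 */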
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd).  Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}

/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}

static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	char *comment;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (SET_ERROR(EINVAL));

	ASSERT(spa->spa_comment == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = SET_ERROR(EEXIST);
	} else {
		spa->spa_config_guid = pool_guid;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		nvlist_free(spa->spa_load_info);
		spa->spa_load_info = fnvlist_alloc();

		gethrestime(&spa->spa_loaded_ts);
		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error) {
		if (error != EEXIST) {
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	nvlist_t *label;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t children, config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;
	uint64_t obj;
	boolean_t missing_feat_write = B_FALSE;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (SET_ERROR(EINVAL));
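
	/*
	 * A pool being assembled from a split does not yet have rewritten
	 * vdev labels (see the validation comment below), so its tree is
	 * parsed with VDEV_ALLOC_SPLIT; an existing pool is parsed with
	 * VDEV_ALLOC_LOAD.
	 */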
2063 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2064 2065 /* 2066 * Create "The Godfather" zio to hold all async IOs 2067 */ 2068 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2069 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2070 2071 /* 2072 * Parse the configuration into a vdev tree. We explicitly set the 2073 * value that will be returned by spa_version() since parsing the 2074 * configuration requires knowing the version number. 2075 */ 2076 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2077 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2078 spa_config_exit(spa, SCL_ALL, FTAG); 2079 2080 if (error != 0) 2081 return (error); 2082 2083 ASSERT(spa->spa_root_vdev == rvd); 2084 2085 if (type != SPA_IMPORT_ASSEMBLE) { 2086 ASSERT(spa_guid(spa) == pool_guid); 2087 } 2088 2089 /* 2090 * Try to open all vdevs, loading each label in the process. 2091 */ 2092 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2093 error = vdev_open(rvd); 2094 spa_config_exit(spa, SCL_ALL, FTAG); 2095 if (error != 0) 2096 return (error); 2097 2098 /* 2099 * We need to validate the vdev labels against the configuration that 2100 * we have in hand, which is dependent on the setting of mosconfig. If 2101 * mosconfig is true then we're validating the vdev labels based on 2102 * that config. Otherwise, we're validating against the cached config 2103 * (zpool.cache) that was read when we loaded the zfs module, and then 2104 * later we will recursively call spa_load() and validate against 2105 * the vdev config. 2106 * 2107 * If we're assembling a new pool that's been split off from an 2108 * existing pool, the labels haven't yet been updated so we skip 2109 * validation for now. 2110 */ 2111 if (type != SPA_IMPORT_ASSEMBLE) { 2112 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2113 error = vdev_validate(rvd, mosconfig); 2114 spa_config_exit(spa, SCL_ALL, FTAG); 2115 2116 if (error != 0) 2117 return (error); 2118 2119 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2120 return (SET_ERROR(ENXIO)); 2121 } 2122 2123 /* 2124 * Find the best uberblock. 2125 */ 2126 vdev_uberblock_load(rvd, ub, &label); 2127 2128 /* 2129 * If we weren't able to find a single valid uberblock, return failure. 2130 */ 2131 if (ub->ub_txg == 0) { 2132 nvlist_free(label); 2133 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2134 } 2135 2136 /* 2137 * If the pool has an unsupported version we can't open it. 2138 */ 2139 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2140 nvlist_free(label); 2141 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2142 } 2143 2144 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2145 nvlist_t *features; 2146 2147 /* 2148 * If we weren't able to find what's necessary for reading the 2149 * MOS in the label, return failure. 2150 */ 2151 if (label == NULL || nvlist_lookup_nvlist(label, 2152 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2153 nvlist_free(label); 2154 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2155 ENXIO)); 2156 } 2157 2158 /* 2159 * Update our in-core representation with the definitive values 2160 * from the label. 2161 */ 2162 nvlist_free(spa->spa_label_features); 2163 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2164 } 2165 2166 nvlist_free(label); 2167 2168 /* 2169 * Look through entries in the label nvlist's features_for_read. If 2170 * there is a feature listed there which we don't understand then we 2171 * cannot open a pool. 
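 * Any feature we do not recognize is collected below and handed back
 * to userland through spa_load_info (ZPOOL_CONFIG_UNSUP_FEAT) so the
 * error report can name the offending features.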
2172 */
2173 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2174 nvlist_t *unsup_feat;
2175
2176 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2177 0);
2178
2179 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2180 NULL); nvp != NULL;
2181 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2182 if (!zfeature_is_supported(nvpair_name(nvp))) {
2183 VERIFY(nvlist_add_string(unsup_feat,
2184 nvpair_name(nvp), "") == 0);
2185 }
2186 }
2187
2188 if (!nvlist_empty(unsup_feat)) {
2189 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2190 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2191 nvlist_free(unsup_feat);
2192 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2193 ENOTSUP));
2194 }
2195
2196 nvlist_free(unsup_feat);
2197 }
2198
2199 /*
2200 * If the vdev guid sum doesn't match the uberblock, we have an
2201 * incomplete configuration. We first check to see if the pool
2202 * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
2203 * If it is, defer the vdev_guid_sum check till later so we
2204 * can handle missing vdevs.
2205 */
2206 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2207 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2208 rvd->vdev_guid_sum != ub->ub_guid_sum)
2209 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2210
2211 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2212 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2213 spa_try_repair(spa, config);
2214 spa_config_exit(spa, SCL_ALL, FTAG);
2215 nvlist_free(spa->spa_config_splitting);
2216 spa->spa_config_splitting = NULL;
2217 }
2218
2219 /*
2220 * Initialize internal SPA structures.
2221 */
2222 spa->spa_state = POOL_STATE_ACTIVE;
2223 spa->spa_ubsync = spa->spa_uberblock;
2224 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2225 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2226 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2227 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2228 spa->spa_claim_max_txg = spa->spa_first_txg; 2229 spa->spa_prev_software_version = ub->ub_software_version; 2230 2231 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2232 if (error) 2233 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2234 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2235 2236 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2237 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2238 2239 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2240 boolean_t missing_feat_read = B_FALSE; 2241 nvlist_t *unsup_feat, *enabled_feat; 2242 2243 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2244 &spa->spa_feat_for_read_obj) != 0) { 2245 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2246 } 2247 2248 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2249 &spa->spa_feat_for_write_obj) != 0) { 2250 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2251 } 2252 2253 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2254 &spa->spa_feat_desc_obj) != 0) { 2255 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2256 } 2257 2258 enabled_feat = fnvlist_alloc(); 2259 unsup_feat = fnvlist_alloc(); 2260 2261 if (!feature_is_supported(spa->spa_meta_objset, 2262 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2263 unsup_feat, enabled_feat)) 2264 missing_feat_read = B_TRUE; 2265 2266 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2267 if (!feature_is_supported(spa->spa_meta_objset, 2268 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, 2269 unsup_feat, enabled_feat)) { 2270 missing_feat_write = B_TRUE; 2271 } 2272 } 2273 2274 fnvlist_add_nvlist(spa->spa_load_info, 2275 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2276 2277 if (!nvlist_empty(unsup_feat)) { 2278 fnvlist_add_nvlist(spa->spa_load_info, 2279 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2280 } 2281 2282 fnvlist_free(enabled_feat); 2283 fnvlist_free(unsup_feat); 2284 2285 if (!missing_feat_read) { 2286 fnvlist_add_boolean(spa->spa_load_info, 2287 ZPOOL_CONFIG_CAN_RDONLY); 2288 } 2289 2290 /* 2291 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2292 * twofold: to determine whether the pool is available for 2293 * import in read-write mode and (if it is not) whether the 2294 * pool is available for import in read-only mode. If the pool 2295 * is available for import in read-write mode, it is displayed 2296 * as available in userland; if it is not available for import 2297 * in read-only mode, it is displayed as unavailable in 2298 * userland. If the pool is available for import in read-only 2299 * mode but not read-write mode, it is displayed as unavailable 2300 * in userland with a special note that the pool is actually 2301 * available for open in read-only mode. 2302 * 2303 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2304 * missing a feature for write, we must first determine whether 2305 * the pool can be opened read-only before returning to 2306 * userland in order to know whether to display the 2307 * abovementioned note. 
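 * That read-only determination is made by the missing_feat_write
 * check near the end of this function, after the rest of the load
 * has succeeded.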
2308 */ 2309 if (missing_feat_read || (missing_feat_write && 2310 spa_writeable(spa))) { 2311 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2312 ENOTSUP)); 2313 } 2314 } 2315 2316 spa->spa_is_initializing = B_TRUE; 2317 error = dsl_pool_open(spa->spa_dsl_pool); 2318 spa->spa_is_initializing = B_FALSE; 2319 if (error != 0) 2320 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2321 2322 if (!mosconfig) { 2323 uint64_t hostid; 2324 nvlist_t *policy = NULL, *nvconfig; 2325 2326 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2327 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2328 2329 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2330 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2331 char *hostname; 2332 unsigned long myhostid = 0; 2333 2334 VERIFY(nvlist_lookup_string(nvconfig, 2335 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2336 2337#ifdef _KERNEL 2338 myhostid = zone_get_hostid(NULL); 2339#else /* _KERNEL */ 2340 /* 2341 * We're emulating the system's hostid in userland, so 2342 * we can't use zone_get_hostid(). 2343 */ 2344 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2345#endif /* _KERNEL */ 2346 if (check_hostid && hostid != 0 && myhostid != 0 && 2347 hostid != myhostid) { 2348 nvlist_free(nvconfig); 2349 cmn_err(CE_WARN, "pool '%s' could not be " 2350 "loaded as it was last accessed by " 2351 "another system (host: %s hostid: 0x%lx). " 2352 "See: http://illumos.org/msg/ZFS-8000-EY", 2353 spa_name(spa), hostname, 2354 (unsigned long)hostid); 2355 return (SET_ERROR(EBADF)); 2356 } 2357 } 2358 if (nvlist_lookup_nvlist(spa->spa_config, 2359 ZPOOL_REWIND_POLICY, &policy) == 0) 2360 VERIFY(nvlist_add_nvlist(nvconfig, 2361 ZPOOL_REWIND_POLICY, policy) == 0); 2362 2363 spa_config_set(spa, nvconfig); 2364 spa_unload(spa); 2365 spa_deactivate(spa); 2366 spa_activate(spa, orig_mode); 2367 2368 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2369 } 2370 2371 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2372 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2373 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2374 if (error != 0) 2375 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2376 2377 /* 2378 * Load the bit that tells us to use the new accounting function 2379 * (raid-z deflation). If we have an older pool, this will not 2380 * be present. 2381 */ 2382 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2383 if (error != 0 && error != ENOENT) 2384 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2385 2386 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2387 &spa->spa_creation_version); 2388 if (error != 0 && error != ENOENT) 2389 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2390 2391 /* 2392 * Load the persistent error log. If we have an older pool, this will 2393 * not be present. 2394 */ 2395 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2396 if (error != 0 && error != ENOENT) 2397 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2398 2399 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2400 &spa->spa_errlog_scrub); 2401 if (error != 0 && error != ENOENT) 2402 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2403 2404 /* 2405 * Load the history object. If we have an older pool, this 2406 * will not be present. 
2407 */ 2408 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2409 if (error != 0 && error != ENOENT) 2410 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2411 2412 /* 2413 * If we're assembling the pool from the split-off vdevs of 2414 * an existing pool, we don't want to attach the spares & cache 2415 * devices. 2416 */ 2417 2418 /* 2419 * Load any hot spares for this pool. 2420 */ 2421 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2422 if (error != 0 && error != ENOENT) 2423 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2424 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2425 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2426 if (load_nvlist(spa, spa->spa_spares.sav_object, 2427 &spa->spa_spares.sav_config) != 0) 2428 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2429 2430 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2431 spa_load_spares(spa); 2432 spa_config_exit(spa, SCL_ALL, FTAG); 2433 } else if (error == 0) { 2434 spa->spa_spares.sav_sync = B_TRUE; 2435 } 2436 2437 /* 2438 * Load any level 2 ARC devices for this pool. 2439 */ 2440 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2441 &spa->spa_l2cache.sav_object); 2442 if (error != 0 && error != ENOENT) 2443 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2444 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2445 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2446 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2447 &spa->spa_l2cache.sav_config) != 0) 2448 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2449 2450 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2451 spa_load_l2cache(spa); 2452 spa_config_exit(spa, SCL_ALL, FTAG); 2453 } else if (error == 0) { 2454 spa->spa_l2cache.sav_sync = B_TRUE; 2455 } 2456 2457 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2458 2459 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2460 if (error && error != ENOENT) 2461 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2462 2463 if (error == 0) { 2464 uint64_t autoreplace; 2465 2466 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2467 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2468 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2469 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2470 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2471 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2472 &spa->spa_dedup_ditto); 2473 2474 spa->spa_autoreplace = (autoreplace != 0); 2475 } 2476 2477 /* 2478 * If the 'autoreplace' property is set, then post a resource notifying 2479 * the ZFS DE that it should not issue any faults for unopenable 2480 * devices. We also iterate over the vdevs, and post a sysevent for any 2481 * unopenable vdevs so that the normal autoreplace handler can take 2482 * over. 2483 */ 2484 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2485 spa_check_removed(spa->spa_root_vdev); 2486 /* 2487 * For the import case, this is done in spa_import(), because 2488 * at this point we're using the spare definitions from 2489 * the MOS config, not necessarily from the userland config. 2490 */ 2491 if (state != SPA_LOAD_IMPORT) { 2492 spa_aux_check_removed(&spa->spa_spares); 2493 spa_aux_check_removed(&spa->spa_l2cache); 2494 } 2495 } 2496 2497 /* 2498 * Load the vdev state for all toplevel vdevs. 2499 */ 2500 vdev_load(rvd); 2501 2502 /* 2503 * Propagate the leaf DTLs we just loaded all the way up the tree. 
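 * (A DTL, or dirty time log, records the txgs for which a vdev has
 * missing or potentially stale data.)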
2504 */ 2505 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2506 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2507 spa_config_exit(spa, SCL_ALL, FTAG); 2508 2509 /* 2510 * Load the DDTs (dedup tables). 2511 */ 2512 error = ddt_load(spa); 2513 if (error != 0) 2514 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2515 2516 spa_update_dspace(spa); 2517 2518 /* 2519 * Validate the config, using the MOS config to fill in any 2520 * information which might be missing. If we fail to validate 2521 * the config then declare the pool unfit for use. If we're 2522 * assembling a pool from a split, the log is not transferred 2523 * over. 2524 */ 2525 if (type != SPA_IMPORT_ASSEMBLE) { 2526 nvlist_t *nvconfig; 2527 2528 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2529 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2530 2531 if (!spa_config_valid(spa, nvconfig)) { 2532 nvlist_free(nvconfig); 2533 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2534 ENXIO)); 2535 } 2536 nvlist_free(nvconfig); 2537 2538 /* 2539 * Now that we've validated the config, check the state of the 2540 * root vdev. If it can't be opened, it indicates one or 2541 * more toplevel vdevs are faulted. 2542 */ 2543 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2544 return (SET_ERROR(ENXIO)); 2545 2546 if (spa_check_logs(spa)) { 2547 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2548 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2549 } 2550 } 2551 2552 if (missing_feat_write) { 2553 ASSERT(state == SPA_LOAD_TRYIMPORT); 2554 2555 /* 2556 * At this point, we know that we can open the pool in 2557 * read-only mode but not read-write mode. We now have enough 2558 * information and can return to userland. 2559 */ 2560 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2561 } 2562 2563 /* 2564 * We've successfully opened the pool, verify that we're ready 2565 * to start pushing transactions. 2566 */ 2567 if (state != SPA_LOAD_TRYIMPORT) { 2568 if (error = spa_load_verify(spa)) 2569 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2570 error)); 2571 } 2572 2573 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2574 spa->spa_load_max_txg == UINT64_MAX)) { 2575 dmu_tx_t *tx; 2576 int need_update = B_FALSE; 2577 2578 ASSERT(state != SPA_LOAD_TRYIMPORT); 2579 2580 /* 2581 * Claim log blocks that haven't been committed yet. 2582 * This must all happen in a single txg. 2583 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2584 * invoked from zil_claim_log_block()'s i/o done callback. 2585 * Price of rollback is that we abandon the log. 2586 */ 2587 spa->spa_claiming = B_TRUE; 2588 2589 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2590 spa_first_txg(spa)); 2591 (void) dmu_objset_find(spa_name(spa), 2592 zil_claim, tx, DS_FIND_CHILDREN); 2593 dmu_tx_commit(tx); 2594 2595 spa->spa_claiming = B_FALSE; 2596 2597 spa_set_log_state(spa, SPA_LOG_GOOD); 2598 spa->spa_sync_on = B_TRUE; 2599 txg_sync_start(spa->spa_dsl_pool); 2600 2601 /* 2602 * Wait for all claims to sync. We sync up to the highest 2603 * claimed log block birth time so that claimed log blocks 2604 * don't appear to be from the future. spa_claim_max_txg 2605 * will have been set for us by either zil_check_log_chain() 2606 * (invoked from spa_check_logs()) or zil_claim() above. 2607 */ 2608 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2609 2610 /* 2611 * If the config cache is stale, or we have uninitialized 2612 * metaslabs (see spa_vdev_add()), then update the config. 
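 * (An uninitialized top-level vdev is recognized below by its
 * vdev_ms_array still being zero.)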
2613 *
2614 * If this is a verbatim import, trust the current
2615 * in-core spa_config and update the disk labels.
2616 */
2617 if (config_cache_txg != spa->spa_config_txg ||
2618 state == SPA_LOAD_IMPORT ||
2619 state == SPA_LOAD_RECOVER ||
2620 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2621 need_update = B_TRUE;
2622
2623 for (int c = 0; c < rvd->vdev_children; c++)
2624 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2625 need_update = B_TRUE;
2626
2627 /*
2628 * Update the config cache asynchronously in case we're the
2629 * root pool, in which case the config cache isn't writable yet.
2630 */
2631 if (need_update)
2632 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2633
2634 /*
2635 * Check all DTLs to see if anything needs resilvering.
2636 */
2637 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2638 vdev_resilver_needed(rvd, NULL, NULL))
2639 spa_async_request(spa, SPA_ASYNC_RESILVER);
2640
2641 /*
2642 * Log the fact that we booted up (so that we can detect if
2643 * we rebooted in the middle of an operation).
2644 */
2645 spa_history_log_version(spa, "open");
2646
2647 /*
2648 * Delete any inconsistent datasets.
2649 */
2650 (void) dmu_objset_find(spa_name(spa),
2651 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2652
2653 /*
2654 * Clean up any stale temporary dataset userrefs.
2655 */
2656 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2657 }
2658
2659 return (0);
2660}
2661
2662static int
2663spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2664{
2665 int mode = spa->spa_mode;
2666
2667 spa_unload(spa);
2668 spa_deactivate(spa);
2669
2670 spa->spa_load_max_txg--;
2671
2672 spa_activate(spa, mode);
2673 spa_async_suspend(spa);
2674
2675 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2676}
2677
2678/*
2679 * If spa_load() fails this function will try loading prior txgs. If
2680 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2681 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2682 * function will not rewind the pool and will return the same error as
2683 * spa_load().
2684 */
2685static int
2686spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2687 uint64_t max_request, int rewind_flags)
2688{
2689 nvlist_t *loadinfo = NULL;
2690 nvlist_t *config = NULL;
2691 int load_error, rewind_error;
2692 uint64_t safe_rewind_txg;
2693 uint64_t min_txg;
2694
2695 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2696 spa->spa_load_max_txg = spa->spa_load_txg;
2697 spa_set_log_state(spa, SPA_LOG_CLEAR);
2698 } else {
2699 spa->spa_load_max_txg = max_request;
2700 }
2701
2702 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2703 mosconfig);
2704 if (load_error == 0)
2705 return (0);
2706
2707 if (spa->spa_root_vdev != NULL)
2708 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2709
2710 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2711 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2712
2713 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2714 nvlist_free(config);
2715 return (load_error);
2716 }
2717
2718 if (state == SPA_LOAD_RECOVER) {
2719 /* Price of rolling back is discarding txgs, including log */
2720 spa_set_log_state(spa, SPA_LOG_CLEAR);
2721 } else {
2722 /*
2723 * If we aren't rolling back, save the load info from our first
2724 * import attempt so that we can restore it after attempting
2725 * to rewind.
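 * The rewind attempts' own load info is attached beneath
 * ZPOOL_CONFIG_REWIND_INFO before the saved copy is put back.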
2726 */
2727 loadinfo = spa->spa_load_info;
2728 spa->spa_load_info = fnvlist_alloc();
2729 }
2730
2731 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2732 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2733 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2734 TXG_INITIAL : safe_rewind_txg;
2735
2736 /*
2737 * Continue as long as we're finding errors, we're still within
2738 * the acceptable rewind range, and we're still finding uberblocks.
2739 */
2740 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2741 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2742 if (spa->spa_load_max_txg < safe_rewind_txg)
2743 spa->spa_extreme_rewind = B_TRUE;
2744 rewind_error = spa_load_retry(spa, state, mosconfig);
2745 }
2746
2747 spa->spa_extreme_rewind = B_FALSE;
2748 spa->spa_load_max_txg = UINT64_MAX;
2749
2750 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2751 spa_config_set(spa, config);
2752
2753 if (state == SPA_LOAD_RECOVER) {
2754 ASSERT3P(loadinfo, ==, NULL);
2755 return (rewind_error);
2756 } else {
2757 /* Store the rewind info as part of the initial load info */
2758 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2759 spa->spa_load_info);
2760
2761 /* Restore the initial load info */
2762 fnvlist_free(spa->spa_load_info);
2763 spa->spa_load_info = loadinfo;
2764
2765 return (load_error);
2766 }
2767}
2768
2769/*
2770 * Pool Open/Import
2771 *
2772 * The import case is identical to an open except that the configuration is sent
2773 * down from userland, instead of grabbed from the configuration cache. For the
2774 * case of an open, the pool configuration will exist in the
2775 * POOL_STATE_UNINITIALIZED state.
2776 *
2777 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2778 * the same time we open the pool, without having to keep around the spa_t in
2779 * some ambiguous state.
2780 */
2781static int
2782spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2783 nvlist_t **config)
2784{
2785 spa_t *spa;
2786 spa_load_state_t state = SPA_LOAD_OPEN;
2787 int error;
2788 int locked = B_FALSE;
2789 int firstopen = B_FALSE;
2790
2791 *spapp = NULL;
2792
2793 /*
2794 * As disgusting as this is, we need to support recursive calls to this
2795 * function because dsl_dir_open() is called during spa_load(), and ends
2796 * up calling spa_open() again. The real fix is to figure out how to
2797 * avoid dsl_dir_open() calling this in the first place.
2798 */
2799 if (mutex_owner(&spa_namespace_lock) != curthread) {
2800 mutex_enter(&spa_namespace_lock);
2801 locked = B_TRUE;
2802 }
2803
2804 if ((spa = spa_lookup(pool)) == NULL) {
2805 if (locked)
2806 mutex_exit(&spa_namespace_lock);
2807 return (SET_ERROR(ENOENT));
2808 }
2809
2810 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2811 zpool_rewind_policy_t policy;
2812
2813 firstopen = B_TRUE;
2814
2815 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2816 &policy);
2817 if (policy.zrp_request & ZPOOL_DO_REWIND)
2818 state = SPA_LOAD_RECOVER;
2819
2820 spa_activate(spa, spa_mode_global);
2821
2822 if (state != SPA_LOAD_RECOVER)
2823 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2824
2825 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2826 policy.zrp_request);
2827
2828 if (error == EBADF) {
2829 /*
2830 * If vdev_validate() returns failure (indicated by
2831 * EBADF), it means that one of the vdev labels indicates
2832 * that the pool has been exported or destroyed.
If 2833 * this is the case, the config cache is out of sync and 2834 * we should remove the pool from the namespace. 2835 */ 2836 spa_unload(spa); 2837 spa_deactivate(spa); 2838 spa_config_sync(spa, B_TRUE, B_TRUE); 2839 spa_remove(spa); 2840 if (locked) 2841 mutex_exit(&spa_namespace_lock); 2842 return (SET_ERROR(ENOENT)); 2843 } 2844 2845 if (error) { 2846 /* 2847 * We can't open the pool, but we still have useful 2848 * information: the state of each vdev after the 2849 * attempted vdev_open(). Return this to the user. 2850 */ 2851 if (config != NULL && spa->spa_config) { 2852 VERIFY(nvlist_dup(spa->spa_config, config, 2853 KM_SLEEP) == 0); 2854 VERIFY(nvlist_add_nvlist(*config, 2855 ZPOOL_CONFIG_LOAD_INFO, 2856 spa->spa_load_info) == 0); 2857 } 2858 spa_unload(spa); 2859 spa_deactivate(spa); 2860 spa->spa_last_open_failed = error; 2861 if (locked) 2862 mutex_exit(&spa_namespace_lock); 2863 *spapp = NULL; 2864 return (error); 2865 } 2866 } 2867 2868 spa_open_ref(spa, tag); 2869 2870 if (config != NULL) 2871 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2872 2873 /* 2874 * If we've recovered the pool, pass back any information we 2875 * gathered while doing the load. 2876 */ 2877 if (state == SPA_LOAD_RECOVER) { 2878 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2879 spa->spa_load_info) == 0); 2880 } 2881 2882 if (locked) { 2883 spa->spa_last_open_failed = 0; 2884 spa->spa_last_ubsync_txg = 0; 2885 spa->spa_load_txg = 0; 2886 mutex_exit(&spa_namespace_lock); 2887#ifdef __FreeBSD__ 2888#ifdef _KERNEL 2889 if (firstopen) 2890 zvol_create_minors(spa->spa_name); 2891#endif 2892#endif 2893 } 2894 2895 *spapp = spa; 2896 2897 return (0); 2898} 2899 2900int 2901spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2902 nvlist_t **config) 2903{ 2904 return (spa_open_common(name, spapp, tag, policy, config)); 2905} 2906 2907int 2908spa_open(const char *name, spa_t **spapp, void *tag) 2909{ 2910 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2911} 2912 2913/* 2914 * Lookup the given spa_t, incrementing the inject count in the process, 2915 * preventing it from being exported or destroyed. 2916 */ 2917spa_t * 2918spa_inject_addref(char *name) 2919{ 2920 spa_t *spa; 2921 2922 mutex_enter(&spa_namespace_lock); 2923 if ((spa = spa_lookup(name)) == NULL) { 2924 mutex_exit(&spa_namespace_lock); 2925 return (NULL); 2926 } 2927 spa->spa_inject_ref++; 2928 mutex_exit(&spa_namespace_lock); 2929 2930 return (spa); 2931} 2932 2933void 2934spa_inject_delref(spa_t *spa) 2935{ 2936 mutex_enter(&spa_namespace_lock); 2937 spa->spa_inject_ref--; 2938 mutex_exit(&spa_namespace_lock); 2939} 2940 2941/* 2942 * Add spares device information to the nvlist. 
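 * A spare that is actively in use by some pool is reported below as
 * VDEV_STATE_CANT_OPEN with VDEV_AUX_SPARED, so userland can tell it
 * apart from an idle spare.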
2943 */ 2944static void 2945spa_add_spares(spa_t *spa, nvlist_t *config) 2946{ 2947 nvlist_t **spares; 2948 uint_t i, nspares; 2949 nvlist_t *nvroot; 2950 uint64_t guid; 2951 vdev_stat_t *vs; 2952 uint_t vsc; 2953 uint64_t pool; 2954 2955 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2956 2957 if (spa->spa_spares.sav_count == 0) 2958 return; 2959 2960 VERIFY(nvlist_lookup_nvlist(config, 2961 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2962 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2963 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2964 if (nspares != 0) { 2965 VERIFY(nvlist_add_nvlist_array(nvroot, 2966 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2967 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2968 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2969 2970 /* 2971 * Go through and find any spares which have since been 2972 * repurposed as an active spare. If this is the case, update 2973 * their status appropriately. 2974 */ 2975 for (i = 0; i < nspares; i++) { 2976 VERIFY(nvlist_lookup_uint64(spares[i], 2977 ZPOOL_CONFIG_GUID, &guid) == 0); 2978 if (spa_spare_exists(guid, &pool, NULL) && 2979 pool != 0ULL) { 2980 VERIFY(nvlist_lookup_uint64_array( 2981 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2982 (uint64_t **)&vs, &vsc) == 0); 2983 vs->vs_state = VDEV_STATE_CANT_OPEN; 2984 vs->vs_aux = VDEV_AUX_SPARED; 2985 } 2986 } 2987 } 2988} 2989 2990/* 2991 * Add l2cache device information to the nvlist, including vdev stats. 2992 */ 2993static void 2994spa_add_l2cache(spa_t *spa, nvlist_t *config) 2995{ 2996 nvlist_t **l2cache; 2997 uint_t i, j, nl2cache; 2998 nvlist_t *nvroot; 2999 uint64_t guid; 3000 vdev_t *vd; 3001 vdev_stat_t *vs; 3002 uint_t vsc; 3003 3004 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3005 3006 if (spa->spa_l2cache.sav_count == 0) 3007 return; 3008 3009 VERIFY(nvlist_lookup_nvlist(config, 3010 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3011 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3012 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3013 if (nl2cache != 0) { 3014 VERIFY(nvlist_add_nvlist_array(nvroot, 3015 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3016 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3017 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3018 3019 /* 3020 * Update level 2 cache device stats. 
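 * Each config entry is matched to its in-core vdev by guid before
 * the stats are copied out.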
3021 */ 3022 3023 for (i = 0; i < nl2cache; i++) { 3024 VERIFY(nvlist_lookup_uint64(l2cache[i], 3025 ZPOOL_CONFIG_GUID, &guid) == 0); 3026 3027 vd = NULL; 3028 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3029 if (guid == 3030 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3031 vd = spa->spa_l2cache.sav_vdevs[j]; 3032 break; 3033 } 3034 } 3035 ASSERT(vd != NULL); 3036 3037 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3038 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3039 == 0); 3040 vdev_get_stats(vd, vs); 3041 } 3042 } 3043} 3044 3045static void 3046spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3047{ 3048 nvlist_t *features; 3049 zap_cursor_t zc; 3050 zap_attribute_t za; 3051 3052 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3053 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3054 3055 if (spa->spa_feat_for_read_obj != 0) { 3056 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3057 spa->spa_feat_for_read_obj); 3058 zap_cursor_retrieve(&zc, &za) == 0; 3059 zap_cursor_advance(&zc)) { 3060 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3061 za.za_num_integers == 1); 3062 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3063 za.za_first_integer)); 3064 } 3065 zap_cursor_fini(&zc); 3066 } 3067 3068 if (spa->spa_feat_for_write_obj != 0) { 3069 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3070 spa->spa_feat_for_write_obj); 3071 zap_cursor_retrieve(&zc, &za) == 0; 3072 zap_cursor_advance(&zc)) { 3073 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3074 za.za_num_integers == 1); 3075 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3076 za.za_first_integer)); 3077 } 3078 zap_cursor_fini(&zc); 3079 } 3080 3081 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3082 features) == 0); 3083 nvlist_free(features); 3084} 3085 3086int 3087spa_get_stats(const char *name, nvlist_t **config, 3088 char *altroot, size_t buflen) 3089{ 3090 int error; 3091 spa_t *spa; 3092 3093 *config = NULL; 3094 error = spa_open_common(name, &spa, FTAG, NULL, config); 3095 3096 if (spa != NULL) { 3097 /* 3098 * This still leaves a window of inconsistency where the spares 3099 * or l2cache devices could change and the config would be 3100 * self-inconsistent. 3101 */ 3102 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3103 3104 if (*config != NULL) { 3105 uint64_t loadtimes[2]; 3106 3107 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3108 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3109 VERIFY(nvlist_add_uint64_array(*config, 3110 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3111 3112 VERIFY(nvlist_add_uint64(*config, 3113 ZPOOL_CONFIG_ERRCOUNT, 3114 spa_get_errlog_size(spa)) == 0); 3115 3116 if (spa_suspended(spa)) 3117 VERIFY(nvlist_add_uint64(*config, 3118 ZPOOL_CONFIG_SUSPENDED, 3119 spa->spa_failmode) == 0); 3120 3121 spa_add_spares(spa, *config); 3122 spa_add_l2cache(spa, *config); 3123 spa_add_feature_stats(spa, *config); 3124 } 3125 } 3126 3127 /* 3128 * We want to get the alternate root even for faulted pools, so we cheat 3129 * and call spa_lookup() directly. 
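 * (On failure, spa_open_common() has already set our spa pointer to
 * NULL, which is why the namespace lookup is redone here.)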
3130 */
3131 if (altroot) {
3132 if (spa == NULL) {
3133 mutex_enter(&spa_namespace_lock);
3134 spa = spa_lookup(name);
3135 if (spa)
3136 spa_altroot(spa, altroot, buflen);
3137 else
3138 altroot[0] = '\0';
3139 spa = NULL;
3140 mutex_exit(&spa_namespace_lock);
3141 } else {
3142 spa_altroot(spa, altroot, buflen);
3143 }
3144 }
3145
3146 if (spa != NULL) {
3147 spa_config_exit(spa, SCL_CONFIG, FTAG);
3148 spa_close(spa, FTAG);
3149 }
3150
3151 return (error);
3152}
3153
3154/*
3155 * Validate that the auxiliary device array is well formed. We must have an
3156 * array of nvlists, each of which describes a valid leaf vdev. If this is an
3157 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3158 * specified, as long as they are well-formed.
3159 */
3160static int
3161spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3162 spa_aux_vdev_t *sav, const char *config, uint64_t version,
3163 vdev_labeltype_t label)
3164{
3165 nvlist_t **dev;
3166 uint_t i, ndev;
3167 vdev_t *vd;
3168 int error;
3169
3170 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3171
3172 /*
3173 * It's acceptable to have no devs specified.
3174 */
3175 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3176 return (0);
3177
3178 if (ndev == 0)
3179 return (SET_ERROR(EINVAL));
3180
3181 /*
3182 * Make sure the pool is formatted with a version that supports this
3183 * device type.
3184 */
3185 if (spa_version(spa) < version)
3186 return (SET_ERROR(ENOTSUP));
3187
3188 /*
3189 * Set the pending device list so we correctly handle device in-use
3190 * checking.
3191 */
3192 sav->sav_pending = dev;
3193 sav->sav_npending = ndev;
3194
3195 for (i = 0; i < ndev; i++) {
3196 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3197 mode)) != 0)
3198 goto out;
3199
3200 if (!vd->vdev_ops->vdev_op_leaf) {
3201 vdev_free(vd);
3202 error = SET_ERROR(EINVAL);
3203 goto out;
3204 }
3205
3206 /*
3207 * The L2ARC currently only supports disk devices in
3208 * kernel context. For user-level testing, we allow it.
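 * (Hence the ENOTBLK check below is compiled only under _KERNEL.)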
3209 */
3210#ifdef _KERNEL
3211 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3212 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3213 error = SET_ERROR(ENOTBLK);
3214 vdev_free(vd);
3215 goto out;
3216 }
3217#endif
3218 vd->vdev_top = vd;
3219
3220 if ((error = vdev_open(vd)) == 0 &&
3221 (error = vdev_label_init(vd, crtxg, label)) == 0) {
3222 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3223 vd->vdev_guid) == 0);
3224 }
3225
3226 vdev_free(vd);
3227
3228 if (error &&
3229 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3230 goto out;
3231 else
3232 error = 0;
3233 }
3234
3235out:
3236 sav->sav_pending = NULL;
3237 sav->sav_npending = 0;
3238 return (error);
3239}
3240
3241static int
3242spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3243{
3244 int error;
3245
3246 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3247
3248 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3249 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3250 VDEV_LABEL_SPARE)) != 0) {
3251 return (error);
3252 }
3253
3254 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3255 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3256 VDEV_LABEL_L2CACHE));
3257}
3258
3259static void
3260spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3261 const char *config)
3262{
3263 int i;
3264
3265 if (sav->sav_config != NULL) {
3266 nvlist_t **olddevs;
3267 uint_t oldndevs;
3268 nvlist_t **newdevs;
3269
3270 /*
3271 * Generate new dev list by concatenating with the
3272 * current dev list.
3273 */
3274 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3275 &olddevs, &oldndevs) == 0);
3276
3277 newdevs = kmem_alloc(sizeof (void *) *
3278 (ndevs + oldndevs), KM_SLEEP);
3279 for (i = 0; i < oldndevs; i++)
3280 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3281 KM_SLEEP) == 0);
3282 for (i = 0; i < ndevs; i++)
3283 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3284 KM_SLEEP) == 0);
3285
3286 VERIFY(nvlist_remove(sav->sav_config, config,
3287 DATA_TYPE_NVLIST_ARRAY) == 0);
3288
3289 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3290 config, newdevs, ndevs + oldndevs) == 0);
3291 for (i = 0; i < oldndevs + ndevs; i++)
3292 nvlist_free(newdevs[i]);
3293 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3294 } else {
3295 /*
3296 * Generate a new dev list.
3297 */
3298 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
3299 KM_SLEEP) == 0);
3300 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
3301 devs, ndevs) == 0);
3302 }
3303}
3304
3305/*
3306 * Stop and drop level 2 ARC devices.
3307 */
3308void
3309spa_l2cache_drop(spa_t *spa)
3310{
3311 vdev_t *vd;
3312 int i;
3313 spa_aux_vdev_t *sav = &spa->spa_l2cache;
3314
3315 for (i = 0; i < sav->sav_count; i++) {
3316 uint64_t pool;
3317
3318 vd = sav->sav_vdevs[i];
3319 ASSERT(vd != NULL);
3320
3321 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
3322 pool != 0ULL && l2arc_vdev_present(vd))
3323 l2arc_remove_vdev(vd);
3324 }
3325}
3326
3327/*
3328 * Pool Creation
3329 */
3330int
3331spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3332 nvlist_t *zplprops)
3333{
3334 spa_t *spa;
3335 char *altroot = NULL;
3336 vdev_t *rvd;
3337 dsl_pool_t *dp;
3338 dmu_tx_t *tx;
3339 int error = 0;
3340 uint64_t txg = TXG_INITIAL;
3341 nvlist_t **spares, **l2cache;
3342 uint_t nspares, nl2cache;
3343 uint64_t version, obj;
3344 boolean_t has_features;
3345
3346 /*
3347 * If this pool already exists, return failure.
3348 */ 3349 mutex_enter(&spa_namespace_lock); 3350 if (spa_lookup(pool) != NULL) { 3351 mutex_exit(&spa_namespace_lock); 3352 return (SET_ERROR(EEXIST)); 3353 } 3354 3355 /* 3356 * Allocate a new spa_t structure. 3357 */ 3358 (void) nvlist_lookup_string(props, 3359 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3360 spa = spa_add(pool, NULL, altroot); 3361 spa_activate(spa, spa_mode_global); 3362 3363 if (props && (error = spa_prop_validate(spa, props))) { 3364 spa_deactivate(spa); 3365 spa_remove(spa); 3366 mutex_exit(&spa_namespace_lock); 3367 return (error); 3368 } 3369 3370 has_features = B_FALSE; 3371 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3372 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3373 if (zpool_prop_feature(nvpair_name(elem))) 3374 has_features = B_TRUE; 3375 } 3376 3377 if (has_features || nvlist_lookup_uint64(props, 3378 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3379 version = SPA_VERSION; 3380 } 3381 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3382 3383 spa->spa_first_txg = txg; 3384 spa->spa_uberblock.ub_txg = txg - 1; 3385 spa->spa_uberblock.ub_version = version; 3386 spa->spa_ubsync = spa->spa_uberblock; 3387 3388 /* 3389 * Create "The Godfather" zio to hold all async IOs 3390 */ 3391 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3392 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3393 3394 /* 3395 * Create the root vdev. 3396 */ 3397 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3398 3399 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3400 3401 ASSERT(error != 0 || rvd != NULL); 3402 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3403 3404 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3405 error = SET_ERROR(EINVAL); 3406 3407 if (error == 0 && 3408 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3409 (error = spa_validate_aux(spa, nvroot, txg, 3410 VDEV_ALLOC_ADD)) == 0) { 3411 for (int c = 0; c < rvd->vdev_children; c++) { 3412 vdev_metaslab_set_size(rvd->vdev_child[c]); 3413 vdev_expand(rvd->vdev_child[c], txg); 3414 } 3415 } 3416 3417 spa_config_exit(spa, SCL_ALL, FTAG); 3418 3419 if (error != 0) { 3420 spa_unload(spa); 3421 spa_deactivate(spa); 3422 spa_remove(spa); 3423 mutex_exit(&spa_namespace_lock); 3424 return (error); 3425 } 3426 3427 /* 3428 * Get the list of spares, if specified. 3429 */ 3430 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3431 &spares, &nspares) == 0) { 3432 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3433 KM_SLEEP) == 0); 3434 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3435 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3436 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3437 spa_load_spares(spa); 3438 spa_config_exit(spa, SCL_ALL, FTAG); 3439 spa->spa_spares.sav_sync = B_TRUE; 3440 } 3441 3442 /* 3443 * Get the list of level 2 cache devices, if specified. 
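 * As with the spares above, the list is stored in sav_config and
 * flagged for syncing to disk.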
3444 */ 3445 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3446 &l2cache, &nl2cache) == 0) { 3447 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3448 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3449 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3450 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3451 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3452 spa_load_l2cache(spa); 3453 spa_config_exit(spa, SCL_ALL, FTAG); 3454 spa->spa_l2cache.sav_sync = B_TRUE; 3455 } 3456 3457 spa->spa_is_initializing = B_TRUE; 3458 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3459 spa->spa_meta_objset = dp->dp_meta_objset; 3460 spa->spa_is_initializing = B_FALSE; 3461 3462 /* 3463 * Create DDTs (dedup tables). 3464 */ 3465 ddt_create(spa); 3466 3467 spa_update_dspace(spa); 3468 3469 tx = dmu_tx_create_assigned(dp, txg); 3470 3471 /* 3472 * Create the pool config object. 3473 */ 3474 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3475 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3476 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3477 3478 if (zap_add(spa->spa_meta_objset, 3479 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3480 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3481 cmn_err(CE_PANIC, "failed to add pool config"); 3482 } 3483 3484 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3485 spa_feature_create_zap_objects(spa, tx); 3486 3487 if (zap_add(spa->spa_meta_objset, 3488 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3489 sizeof (uint64_t), 1, &version, tx) != 0) { 3490 cmn_err(CE_PANIC, "failed to add pool version"); 3491 } 3492 3493 /* Newly created pools with the right version are always deflated. */ 3494 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3495 spa->spa_deflate = TRUE; 3496 if (zap_add(spa->spa_meta_objset, 3497 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3498 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3499 cmn_err(CE_PANIC, "failed to add deflate"); 3500 } 3501 } 3502 3503 /* 3504 * Create the deferred-free bpobj. Turn off compression 3505 * because sync-to-convergence takes longer if the blocksize 3506 * keeps changing. 3507 */ 3508 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3509 dmu_object_set_compress(spa->spa_meta_objset, obj, 3510 ZIO_COMPRESS_OFF, tx); 3511 if (zap_add(spa->spa_meta_objset, 3512 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3513 sizeof (uint64_t), 1, &obj, tx) != 0) { 3514 cmn_err(CE_PANIC, "failed to add bpobj"); 3515 } 3516 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3517 spa->spa_meta_objset, obj)); 3518 3519 /* 3520 * Create the pool's history object. 3521 */ 3522 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3523 spa_history_create_obj(spa, tx); 3524 3525 /* 3526 * Set pool properties. 3527 */ 3528 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3529 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3530 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3531 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3532 3533 if (props != NULL) { 3534 spa_configfile_set(spa, props, B_FALSE); 3535 spa_sync_props(props, tx); 3536 } 3537 3538 dmu_tx_commit(tx); 3539 3540 spa->spa_sync_on = B_TRUE; 3541 txg_sync_start(spa->spa_dsl_pool); 3542 3543 /* 3544 * We explicitly wait for the first transaction to complete so that our 3545 * bean counters are appropriately updated. 
3546 */
3547 txg_wait_synced(spa->spa_dsl_pool, txg);
3548
3549 spa_config_sync(spa, B_FALSE, B_TRUE);
3550
3551 spa_history_log_version(spa, "create");
3552
3553 spa->spa_minref = refcount_count(&spa->spa_refcount);
3554
3555 mutex_exit(&spa_namespace_lock);
3556
3557 return (0);
3558}
3559
3560#ifdef _KERNEL
3561#if defined(sun)
3562/*
3563 * Get the root pool information from the root disk, then import the root pool
3564 * during the system boot up time.
3565 */
3566extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3567
3568static nvlist_t *
3569spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3570{
3571 nvlist_t *config;
3572 nvlist_t *nvtop, *nvroot;
3573 uint64_t pgid;
3574
3575 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3576 return (NULL);
3577
3578 /*
3579 * Add this top-level vdev to the child array.
3580 */
3581 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3582 &nvtop) == 0);
3583 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3584 &pgid) == 0);
3585 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3586
3587 /*
3588 * Put this pool's top-level vdevs into a root vdev.
3589 */
3590 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3591 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3592 VDEV_TYPE_ROOT) == 0);
3593 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3594 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3595 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3596 &nvtop, 1) == 0);
3597
3598 /*
3599 * Replace the existing vdev_tree with the new root vdev in
3600 * this pool's configuration (remove the old, add the new).
3601 */
3602 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3603 nvlist_free(nvroot);
3604 return (config);
3605}
3606
3607/*
3608 * Walk the vdev tree and see if we can find a device with "better"
3609 * configuration. A configuration is "better" if the label on that
3610 * device has a more recent txg.
3611 */
3612static void
3613spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3614{
3615 for (int c = 0; c < vd->vdev_children; c++)
3616 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3617
3618 if (vd->vdev_ops->vdev_op_leaf) {
3619 nvlist_t *label;
3620 uint64_t label_txg;
3621
3622 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3623 &label) != 0)
3624 return;
3625
3626 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3627 &label_txg) == 0);
3628
3629 /*
3630 * Do we have a better boot device?
3631 */
3632 if (label_txg > *txg) {
3633 *txg = label_txg;
3634 *avd = vd;
3635 }
3636 nvlist_free(label);
3637 }
3638}
3639
3640/*
3641 * Import a root pool.
3642 *
3643 * For x86, devpath_list will consist of devid and/or physpath name of
3644 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3645 * The GRUB "findroot" command will return the vdev we should boot.
3646 *
3647 * For SPARC, devpath_list consists of the physpath name of the booting device
3648 * no matter whether the root pool is a single-device pool or a mirrored pool.
3649 * e.g.
3650 * "/pci@1f,0/ide@d/disk@0,0:a"
3651 */
3652int
3653spa_import_rootpool(char *devpath, char *devid)
3654{
3655 spa_t *spa;
3656 vdev_t *rvd, *bvd, *avd = NULL;
3657 nvlist_t *config, *nvtop;
3658 uint64_t guid, txg;
3659 char *pname;
3660 int error;
3661
3662 /*
3663 * Read the label from the boot device and generate a configuration.
3664 */ 3665 config = spa_generate_rootconf(devpath, devid, &guid); 3666#if defined(_OBP) && defined(_KERNEL) 3667 if (config == NULL) { 3668 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3669 /* iscsi boot */ 3670 get_iscsi_bootpath_phy(devpath); 3671 config = spa_generate_rootconf(devpath, devid, &guid); 3672 } 3673 } 3674#endif 3675 if (config == NULL) { 3676 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3677 devpath); 3678 return (SET_ERROR(EIO)); 3679 } 3680 3681 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3682 &pname) == 0); 3683 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3684 3685 mutex_enter(&spa_namespace_lock); 3686 if ((spa = spa_lookup(pname)) != NULL) { 3687 /* 3688 * Remove the existing root pool from the namespace so that we 3689 * can replace it with the correct config we just read in. 3690 */ 3691 spa_remove(spa); 3692 } 3693 3694 spa = spa_add(pname, config, NULL); 3695 spa->spa_is_root = B_TRUE; 3696 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3697 3698 /* 3699 * Build up a vdev tree based on the boot device's label config. 3700 */ 3701 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3702 &nvtop) == 0); 3703 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3704 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3705 VDEV_ALLOC_ROOTPOOL); 3706 spa_config_exit(spa, SCL_ALL, FTAG); 3707 if (error) { 3708 mutex_exit(&spa_namespace_lock); 3709 nvlist_free(config); 3710 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3711 pname); 3712 return (error); 3713 } 3714 3715 /* 3716 * Get the boot vdev. 3717 */ 3718 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3719 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3720 (u_longlong_t)guid); 3721 error = SET_ERROR(ENOENT); 3722 goto out; 3723 } 3724 3725 /* 3726 * Determine if there is a better boot device. 3727 */ 3728 avd = bvd; 3729 spa_alt_rootvdev(rvd, &avd, &txg); 3730 if (avd != bvd) { 3731 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3732 "try booting from '%s'", avd->vdev_path); 3733 error = SET_ERROR(EINVAL); 3734 goto out; 3735 } 3736 3737 /* 3738 * If the boot device is part of a spare vdev then ensure that 3739 * we're booting off the active spare. 3740 */ 3741 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3742 !bvd->vdev_isspare) { 3743 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3744 "try booting from '%s'", 3745 bvd->vdev_parent-> 3746 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3747 error = SET_ERROR(EINVAL); 3748 goto out; 3749 } 3750 3751 error = 0; 3752out: 3753 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3754 vdev_free(rvd); 3755 spa_config_exit(spa, SCL_ALL, FTAG); 3756 mutex_exit(&spa_namespace_lock); 3757 3758 nvlist_free(config); 3759 return (error); 3760} 3761 3762#else 3763 3764extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3765 uint64_t *count); 3766 3767static nvlist_t * 3768spa_generate_rootconf(const char *name) 3769{ 3770 nvlist_t **configs, **tops; 3771 nvlist_t *config; 3772 nvlist_t *best_cfg, *nvtop, *nvroot; 3773 uint64_t *holes; 3774 uint64_t best_txg; 3775 uint64_t nchildren; 3776 uint64_t pgid; 3777 uint64_t count; 3778 uint64_t i; 3779 uint_t nholes; 3780 3781 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3782 return (NULL); 3783 3784 ASSERT3U(count, !=, 0); 3785 best_txg = 0; 3786 for (i = 0; i < count; i++) { 3787 uint64_t txg; 3788 3789 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3790 &txg) == 0); 3791 if (txg > best_txg) { 3792 best_txg = txg; 3793 best_cfg = configs[i]; 3794 } 3795 } 3796 3797 /* 3798 * Multi-vdev root pool configuration discovery is not supported yet. 3799 */ 3800 nchildren = 1; 3801 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3802 holes = NULL; 3803 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3804 &holes, &nholes); 3805 3806 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3807 for (i = 0; i < nchildren; i++) { 3808 if (i >= count) 3809 break; 3810 if (configs[i] == NULL) 3811 continue; 3812 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3813 &nvtop) == 0); 3814 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3815 } 3816 for (i = 0; holes != NULL && i < nholes; i++) { 3817 if (i >= nchildren) 3818 continue; 3819 if (tops[holes[i]] != NULL) 3820 continue; 3821 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3822 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3823 VDEV_TYPE_HOLE) == 0); 3824 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3825 holes[i]) == 0); 3826 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3827 0) == 0); 3828 } 3829 for (i = 0; i < nchildren; i++) { 3830 if (tops[i] != NULL) 3831 continue; 3832 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3833 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3834 VDEV_TYPE_MISSING) == 0); 3835 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3836 i) == 0); 3837 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3838 0) == 0); 3839 } 3840 3841 /* 3842 * Create pool config based on the best vdev config. 3843 */ 3844 nvlist_dup(best_cfg, &config, KM_SLEEP); 3845 3846 /* 3847 * Put this pool's top-level vdevs into a root vdev. 3848 */ 3849 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3850 &pgid) == 0); 3851 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3852 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3853 VDEV_TYPE_ROOT) == 0); 3854 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3855 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3856 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3857 tops, nchildren) == 0); 3858 3859 /* 3860 * Replace the existing vdev_tree with the new root vdev in 3861 * this pool's configuration (remove the old, add the new). 
3862 */ 3863 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3864 3865 /* 3866 * Drop vdev config elements that should not be present at pool level. 3867 */ 3868 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3869 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3870 3871 for (i = 0; i < count; i++) 3872 nvlist_free(configs[i]); 3873 kmem_free(configs, count * sizeof(void *)); 3874 for (i = 0; i < nchildren; i++) 3875 nvlist_free(tops[i]); 3876 kmem_free(tops, nchildren * sizeof(void *)); 3877 nvlist_free(nvroot); 3878 return (config); 3879} 3880 3881int 3882spa_import_rootpool(const char *name) 3883{ 3884 spa_t *spa; 3885 vdev_t *rvd, *bvd, *avd = NULL; 3886 nvlist_t *config, *nvtop; 3887 uint64_t txg; 3888 char *pname; 3889 int error; 3890 3891 /* 3892 * Read the label from the boot device and generate a configuration. 3893 */ 3894 config = spa_generate_rootconf(name); 3895 3896 mutex_enter(&spa_namespace_lock); 3897 if (config != NULL) { 3898 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3899 &pname) == 0 && strcmp(name, pname) == 0); 3900 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3901 == 0); 3902 3903 if ((spa = spa_lookup(pname)) != NULL) { 3904 /* 3905 * Remove the existing root pool from the namespace so 3906 * that we can replace it with the correct config 3907 * we just read in. 3908 */ 3909 spa_remove(spa); 3910 } 3911 spa = spa_add(pname, config, NULL); 3912 3913 /* 3914 * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 3915 * via spa_version(). 3916 */ 3917 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3918 &spa->spa_ubsync.ub_version) != 0) 3919 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3920 } else if ((spa = spa_lookup(name)) == NULL) { 3921 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 3922 name); 3923 return (EIO); 3924 } else { 3925 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 3926 } 3927 spa->spa_is_root = B_TRUE; 3928 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3929 3930 /* 3931 * Build up a vdev tree based on the boot device's label config. 3932 */ 3933 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3934 &nvtop) == 0); 3935 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3936 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3937 VDEV_ALLOC_ROOTPOOL); 3938 spa_config_exit(spa, SCL_ALL, FTAG); 3939 if (error) { 3940 mutex_exit(&spa_namespace_lock); 3941 nvlist_free(config); 3942 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3943 pname); 3944 return (error); 3945 } 3946 3947 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3948 vdev_free(rvd); 3949 spa_config_exit(spa, SCL_ALL, FTAG); 3950 mutex_exit(&spa_namespace_lock); 3951 3952 nvlist_free(config); 3953 return (0); 3954} 3955 3956#endif /* sun */ 3957#endif 3958 3959/* 3960 * Import a non-root pool into the system. 3961 */ 3962int 3963spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3964{ 3965 spa_t *spa; 3966 char *altroot = NULL; 3967 spa_load_state_t state = SPA_LOAD_IMPORT; 3968 zpool_rewind_policy_t policy; 3969 uint64_t mode = spa_mode_global; 3970 uint64_t readonly = B_FALSE; 3971 int error; 3972 nvlist_t *nvroot; 3973 nvlist_t **spares, **l2cache; 3974 uint_t nspares, nl2cache; 3975 3976 /* 3977 * If a pool with this name exists, return failure. 
3978 */ 3979 mutex_enter(&spa_namespace_lock); 3980 if (spa_lookup(pool) != NULL) { 3981 mutex_exit(&spa_namespace_lock); 3982 return (SET_ERROR(EEXIST)); 3983 } 3984 3985 /* 3986 * Create and initialize the spa structure. 3987 */ 3988 (void) nvlist_lookup_string(props, 3989 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3990 (void) nvlist_lookup_uint64(props, 3991 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3992 if (readonly) 3993 mode = FREAD; 3994 spa = spa_add(pool, config, altroot); 3995 spa->spa_import_flags = flags; 3996 3997 /* 3998 * Verbatim import - Take a pool and insert it into the namespace 3999 * as if it had been loaded at boot. 4000 */ 4001 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4002 if (props != NULL) 4003 spa_configfile_set(spa, props, B_FALSE); 4004 4005 spa_config_sync(spa, B_FALSE, B_TRUE); 4006 4007 mutex_exit(&spa_namespace_lock); 4008 spa_history_log_version(spa, "import"); 4009 4010 return (0); 4011 } 4012 4013 spa_activate(spa, mode); 4014 4015 /* 4016 * Don't start async tasks until we know everything is healthy. 4017 */ 4018 spa_async_suspend(spa); 4019 4020 zpool_get_rewind_policy(config, &policy); 4021 if (policy.zrp_request & ZPOOL_DO_REWIND) 4022 state = SPA_LOAD_RECOVER; 4023 4024 /* 4025 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4026 * because the user-supplied config is actually the one to trust when 4027 * doing an import. 4028 */ 4029 if (state != SPA_LOAD_RECOVER) 4030 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4031 4032 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4033 policy.zrp_request); 4034 4035 /* 4036 * Propagate anything learned while loading the pool and pass it 4037 * back to caller (i.e. rewind info, missing devices, etc). 4038 */ 4039 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4040 spa->spa_load_info) == 0); 4041 4042 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4043 /* 4044 * Toss any existing sparelist, as it doesn't have any validity 4045 * anymore, and conflicts with spa_has_spare(). 4046 */ 4047 if (spa->spa_spares.sav_config) { 4048 nvlist_free(spa->spa_spares.sav_config); 4049 spa->spa_spares.sav_config = NULL; 4050 spa_load_spares(spa); 4051 } 4052 if (spa->spa_l2cache.sav_config) { 4053 nvlist_free(spa->spa_l2cache.sav_config); 4054 spa->spa_l2cache.sav_config = NULL; 4055 spa_load_l2cache(spa); 4056 } 4057 4058 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4059 &nvroot) == 0); 4060 if (error == 0) 4061 error = spa_validate_aux(spa, nvroot, -1ULL, 4062 VDEV_ALLOC_SPARE); 4063 if (error == 0) 4064 error = spa_validate_aux(spa, nvroot, -1ULL, 4065 VDEV_ALLOC_L2CACHE); 4066 spa_config_exit(spa, SCL_ALL, FTAG); 4067 4068 if (props != NULL) 4069 spa_configfile_set(spa, props, B_FALSE); 4070 4071 if (error != 0 || (props && spa_writeable(spa) && 4072 (error = spa_prop_set(spa, props)))) { 4073 spa_unload(spa); 4074 spa_deactivate(spa); 4075 spa_remove(spa); 4076 mutex_exit(&spa_namespace_lock); 4077 return (error); 4078 } 4079 4080 spa_async_resume(spa); 4081 4082 /* 4083 * Override any spares and level 2 cache devices as specified by 4084 * the user, as these may have correct device names/devids, etc. 
4085 */ 4086 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4087 &spares, &nspares) == 0) { 4088 if (spa->spa_spares.sav_config) 4089 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4090 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4091 else 4092 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4093 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4094 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4095 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4096 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4097 spa_load_spares(spa); 4098 spa_config_exit(spa, SCL_ALL, FTAG); 4099 spa->spa_spares.sav_sync = B_TRUE; 4100 } 4101 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4102 &l2cache, &nl2cache) == 0) { 4103 if (spa->spa_l2cache.sav_config) 4104 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4105 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4106 else 4107 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4108 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4109 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4110 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4111 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4112 spa_load_l2cache(spa); 4113 spa_config_exit(spa, SCL_ALL, FTAG); 4114 spa->spa_l2cache.sav_sync = B_TRUE; 4115 } 4116 4117 /* 4118 * Check for any removed devices. 4119 */ 4120 if (spa->spa_autoreplace) { 4121 spa_aux_check_removed(&spa->spa_spares); 4122 spa_aux_check_removed(&spa->spa_l2cache); 4123 } 4124 4125 if (spa_writeable(spa)) { 4126 /* 4127 * Update the config cache to include the newly-imported pool. 4128 */ 4129 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4130 } 4131 4132 /* 4133 * It's possible that the pool was expanded while it was exported. 4134 * We kick off an async task to handle this for us. 4135 */ 4136 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4137 4138 mutex_exit(&spa_namespace_lock); 4139 spa_history_log_version(spa, "import"); 4140 4141#ifdef __FreeBSD__ 4142#ifdef _KERNEL 4143 zvol_create_minors(pool); 4144#endif 4145#endif 4146 return (0); 4147} 4148 4149nvlist_t * 4150spa_tryimport(nvlist_t *tryconfig) 4151{ 4152 nvlist_t *config = NULL; 4153 char *poolname; 4154 spa_t *spa; 4155 uint64_t state; 4156 int error; 4157 4158 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4159 return (NULL); 4160 4161 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4162 return (NULL); 4163 4164 /* 4165 * Create and initialize the spa structure. 4166 */ 4167 mutex_enter(&spa_namespace_lock); 4168 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4169 spa_activate(spa, FREAD); 4170 4171 /* 4172 * Pass off the heavy lifting to spa_load(). 4173 * Pass TRUE for mosconfig because the user-supplied config 4174 * is actually the one to trust when doing an import. 4175 */ 4176 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4177 4178 /* 4179 * If 'tryconfig' was at least parsable, return the current config. 
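 * The returned nvlist is owned by the caller, who must eventually
 * nvlist_free() it; the temporary "$import" spa itself is unloaded
 * and removed from the namespace before we return.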
4180 */ 4181 if (spa->spa_root_vdev != NULL) { 4182 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4183 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4184 poolname) == 0); 4185 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4186 state) == 0); 4187 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4188 spa->spa_uberblock.ub_timestamp) == 0); 4189 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4190 spa->spa_load_info) == 0); 4191 4192 /* 4193 * If the bootfs property exists on this pool then we 4194 * copy it out so that external consumers can tell which 4195 * pools are bootable. 4196 */ 4197 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4198 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4199 4200 /* 4201 * We have to play games with the name since the 4202 * pool was opened as TRYIMPORT_NAME. 4203 */ 4204 if (dsl_dsobj_to_dsname(spa_name(spa), 4205 spa->spa_bootfs, tmpname) == 0) { 4206 char *cp; 4207 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4208 4209 cp = strchr(tmpname, '/'); 4210 if (cp == NULL) { 4211 (void) strlcpy(dsname, tmpname, 4212 MAXPATHLEN); 4213 } else { 4214 (void) snprintf(dsname, MAXPATHLEN, 4215 "%s/%s", poolname, ++cp); 4216 } 4217 VERIFY(nvlist_add_string(config, 4218 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4219 kmem_free(dsname, MAXPATHLEN); 4220 } 4221 kmem_free(tmpname, MAXPATHLEN); 4222 } 4223 4224 /* 4225 * Add the list of hot spares and level 2 cache devices. 4226 */ 4227 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4228 spa_add_spares(spa, config); 4229 spa_add_l2cache(spa, config); 4230 spa_config_exit(spa, SCL_CONFIG, FTAG); 4231 } 4232 4233 spa_unload(spa); 4234 spa_deactivate(spa); 4235 spa_remove(spa); 4236 mutex_exit(&spa_namespace_lock); 4237 4238 return (config); 4239} 4240 4241/* 4242 * Pool export/destroy 4243 * 4244 * The act of destroying or exporting a pool is very simple. We make sure there 4245 * is no more pending I/O and any references to the pool are gone. Then, we 4246 * update the pool state and sync all the labels to disk, removing the 4247 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4248 * we don't sync the labels or remove the configuration cache. 4249 */ 4250static int 4251spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4252 boolean_t force, boolean_t hardforce) 4253{ 4254 spa_t *spa; 4255 4256 if (oldconfig) 4257 *oldconfig = NULL; 4258 4259 if (!(spa_mode_global & FWRITE)) 4260 return (SET_ERROR(EROFS)); 4261 4262 mutex_enter(&spa_namespace_lock); 4263 if ((spa = spa_lookup(pool)) == NULL) { 4264 mutex_exit(&spa_namespace_lock); 4265 return (SET_ERROR(ENOENT)); 4266 } 4267 4268 /* 4269 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4270 * reacquire the namespace lock, and see if we can export. 4271 */ 4272 spa_open_ref(spa, FTAG); 4273 mutex_exit(&spa_namespace_lock); 4274 spa_async_suspend(spa); 4275 mutex_enter(&spa_namespace_lock); 4276 spa_close(spa, FTAG); 4277 4278 /* 4279 * The pool will be in core if it's openable, 4280 * in which case we can modify its state. 4281 */ 4282 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4283 /* 4284 * Objsets may be open only because they're dirty, so we 4285 * have to force it to sync before checking spa_refcnt. 4286 */ 4287 txg_wait_synced(spa->spa_dsl_pool, 0); 4288 4289 /* 4290 * A pool cannot be exported or destroyed if there are active 4291 * references. 
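 * (Ordinary references come from spa_open_ref() holds; fault
 * injection holds are tracked separately in spa_inject_ref.)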
If we are resetting a pool, allow references by 4292 * fault injection handlers. 4293 */ 4294 if (!spa_refcount_zero(spa) || 4295 (spa->spa_inject_ref != 0 && 4296 new_state != POOL_STATE_UNINITIALIZED)) { 4297 spa_async_resume(spa); 4298 mutex_exit(&spa_namespace_lock); 4299 return (SET_ERROR(EBUSY)); 4300 } 4301 4302 /* 4303 * A pool cannot be exported if it has an active shared spare. 4304 * This prevents other pools from stealing the active spare 4305 * from an exported pool. If desired, such a pool can still 4306 * be forcibly exported. 4307 */ 4308 if (!force && new_state == POOL_STATE_EXPORTED && 4309 spa_has_active_shared_spare(spa)) { 4310 spa_async_resume(spa); 4311 mutex_exit(&spa_namespace_lock); 4312 return (SET_ERROR(EXDEV)); 4313 } 4314 4315 /* 4316 * We want this to be reflected on every label, 4317 * so mark them all dirty. spa_unload() will do the 4318 * final sync that pushes these changes out. 4319 */ 4320 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4321 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4322 spa->spa_state = new_state; 4323 spa->spa_final_txg = spa_last_synced_txg(spa) + 4324 TXG_DEFER_SIZE + 1; 4325 vdev_config_dirty(spa->spa_root_vdev); 4326 spa_config_exit(spa, SCL_ALL, FTAG); 4327 } 4328 } 4329 4330 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4331 4332 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4333 spa_unload(spa); 4334 spa_deactivate(spa); 4335 } 4336 4337 if (oldconfig && spa->spa_config) 4338 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4339 4340 if (new_state != POOL_STATE_UNINITIALIZED) { 4341 if (!hardforce) 4342 spa_config_sync(spa, B_TRUE, B_TRUE); 4343 spa_remove(spa); 4344 } 4345 mutex_exit(&spa_namespace_lock); 4346 4347 return (0); 4348} 4349 4350/* 4351 * Destroy a storage pool. 4352 */ 4353int 4354spa_destroy(char *pool) 4355{ 4356 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4357 B_FALSE, B_FALSE)); 4358} 4359 4360/* 4361 * Export a storage pool. 4362 */ 4363int 4364spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4365 boolean_t hardforce) 4366{ 4367 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4368 force, hardforce)); 4369} 4370 4371/* 4372 * Similar to spa_export(), this unloads the spa_t without actually removing it 4373 * from the namespace in any way. 4374 */ 4375int 4376spa_reset(char *pool) 4377{ 4378 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4379 B_FALSE, B_FALSE)); 4380} 4381 4382/* 4383 * ========================================================================== 4384 * Device manipulation 4385 * ========================================================================== 4386 */ 4387 4388/* 4389 * Add a device to a storage pool.
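 * The caller passes an nvroot describing the new top-level vdevs,
 * in the same shape used at pool creation time. A sketch of the
 * expected calling sequence (nvroot construction elided):
 *
 *	error = spa_vdev_add(spa, nvroot);
 *
 * No external locking is required; spa_vdev_enter()/spa_vdev_exit()
 * below manage the config lock and the wait for the txg to sync.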
4390 */ 4391int 4392spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4393{ 4394 uint64_t txg, id; 4395 int error; 4396 vdev_t *rvd = spa->spa_root_vdev; 4397 vdev_t *vd, *tvd; 4398 nvlist_t **spares, **l2cache; 4399 uint_t nspares, nl2cache; 4400 4401 ASSERT(spa_writeable(spa)); 4402 4403 txg = spa_vdev_enter(spa); 4404 4405 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4406 VDEV_ALLOC_ADD)) != 0) 4407 return (spa_vdev_exit(spa, NULL, txg, error)); 4408 4409 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4410 4411 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4412 &nspares) != 0) 4413 nspares = 0; 4414 4415 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4416 &nl2cache) != 0) 4417 nl2cache = 0; 4418 4419 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4420 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4421 4422 if (vd->vdev_children != 0 && 4423 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4424 return (spa_vdev_exit(spa, vd, txg, error)); 4425 4426 /* 4427 * We must validate the spares and l2cache devices after checking the 4428 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4429 */ 4430 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4431 return (spa_vdev_exit(spa, vd, txg, error)); 4432 4433 /* 4434 * Transfer each new top-level vdev from vd to rvd. 4435 */ 4436 for (int c = 0; c < vd->vdev_children; c++) { 4437 4438 /* 4439 * Set the vdev id to the first hole, if one exists. 4440 */ 4441 for (id = 0; id < rvd->vdev_children; id++) { 4442 if (rvd->vdev_child[id]->vdev_ishole) { 4443 vdev_free(rvd->vdev_child[id]); 4444 break; 4445 } 4446 } 4447 tvd = vd->vdev_child[c]; 4448 vdev_remove_child(vd, tvd); 4449 tvd->vdev_id = id; 4450 vdev_add_child(rvd, tvd); 4451 vdev_config_dirty(tvd); 4452 } 4453 4454 if (nspares != 0) { 4455 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4456 ZPOOL_CONFIG_SPARES); 4457 spa_load_spares(spa); 4458 spa->spa_spares.sav_sync = B_TRUE; 4459 } 4460 4461 if (nl2cache != 0) { 4462 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4463 ZPOOL_CONFIG_L2CACHE); 4464 spa_load_l2cache(spa); 4465 spa->spa_l2cache.sav_sync = B_TRUE; 4466 } 4467 4468 /* 4469 * We have to be careful when adding new vdevs to an existing pool. 4470 * If other threads start allocating from these vdevs before we 4471 * sync the config cache, and we lose power, then upon reboot we may 4472 * fail to open the pool because there are DVAs that the config cache 4473 * can't translate. Therefore, we first add the vdevs without 4474 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4475 * and then let spa_config_update() initialize the new metaslabs. 4476 * 4477 * spa_load() checks for added-but-not-initialized vdevs, so that 4478 * if we lose power at any point in this sequence, the remaining 4479 * steps will be completed the next time we load the pool. 4480 */ 4481 (void) spa_vdev_exit(spa, vd, txg, 0); 4482 4483 mutex_enter(&spa_namespace_lock); 4484 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4485 mutex_exit(&spa_namespace_lock); 4486 4487 return (0); 4488} 4489 4490/* 4491 * Attach a device to a mirror. The arguments are the path to any device 4492 * in the mirror, and the nvroot for the new device. If the path specifies 4493 * a device that is not mirrored, we automatically insert the mirror vdev. 
4494 * 4495 * If 'replacing' is specified, the new device is intended to replace the 4496 * existing device; in this case the two devices are made into their own 4497 * mirror using the 'replacing' vdev, which is functionally identical to 4498 * the mirror vdev (it actually reuses all the same ops) but has a few 4499 * extra rules: you can't attach to it after it's been created, and upon 4500 * completion of resilvering, the first disk (the one being replaced) 4501 * is automatically detached. 4502 */ 4503int 4504spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4505{ 4506 uint64_t txg, dtl_max_txg; 4507 vdev_t *rvd = spa->spa_root_vdev; 4508 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4509 vdev_ops_t *pvops; 4510 char *oldvdpath, *newvdpath; 4511 int newvd_isspare; 4512 int error; 4513 4514 ASSERT(spa_writeable(spa)); 4515 4516 txg = spa_vdev_enter(spa); 4517 4518 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4519 4520 if (oldvd == NULL) 4521 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4522 4523 if (!oldvd->vdev_ops->vdev_op_leaf) 4524 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4525 4526 pvd = oldvd->vdev_parent; 4527 4528 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4529 VDEV_ALLOC_ATTACH)) != 0) 4530 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4531 4532 if (newrootvd->vdev_children != 1) 4533 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4534 4535 newvd = newrootvd->vdev_child[0]; 4536 4537 if (!newvd->vdev_ops->vdev_op_leaf) 4538 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4539 4540 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4541 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4542 4543 /* 4544 * Spares can't replace logs 4545 */ 4546 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4547 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4548 4549 if (!replacing) { 4550 /* 4551 * For attach, the only allowable parent is a mirror or the root 4552 * vdev. 4553 */ 4554 if (pvd->vdev_ops != &vdev_mirror_ops && 4555 pvd->vdev_ops != &vdev_root_ops) 4556 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4557 4558 pvops = &vdev_mirror_ops; 4559 } else { 4560 /* 4561 * Active hot spares can only be replaced by inactive hot 4562 * spares. 4563 */ 4564 if (pvd->vdev_ops == &vdev_spare_ops && 4565 oldvd->vdev_isspare && 4566 !spa_has_spare(spa, newvd->vdev_guid)) 4567 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4568 4569 /* 4570 * If the source is a hot spare, and the parent isn't already a 4571 * spare, then we want to create a new hot spare. Otherwise, we 4572 * want to create a replacing vdev. The user is not allowed to 4573 * attach to a spared vdev child unless the 'isspare' state is 4574 * the same (spare replaces spare, non-spare replaces 4575 * non-spare). 4576 */ 4577 if (pvd->vdev_ops == &vdev_replacing_ops && 4578 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4579 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4580 } else if (pvd->vdev_ops == &vdev_spare_ops && 4581 newvd->vdev_isspare != oldvd->vdev_isspare) { 4582 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4583 } 4584 4585 if (newvd->vdev_isspare) 4586 pvops = &vdev_spare_ops; 4587 else 4588 pvops = &vdev_replacing_ops; 4589 } 4590 4591 /* 4592 * Make sure the new device is big enough. 
4593 */ 4594 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4595 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4596 4597 /* 4598 * The new device cannot have a higher alignment requirement 4599 * than the top-level vdev. 4600 */ 4601 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4602 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4603 4604 /* 4605 * If this is an in-place replacement, update oldvd's path and devid 4606 * to make it distinguishable from newvd, and unopenable from now on. 4607 */ 4608 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4609 spa_strfree(oldvd->vdev_path); 4610 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4611 KM_SLEEP); 4612 (void) sprintf(oldvd->vdev_path, "%s/%s", 4613 newvd->vdev_path, "old"); 4614 if (oldvd->vdev_devid != NULL) { 4615 spa_strfree(oldvd->vdev_devid); 4616 oldvd->vdev_devid = NULL; 4617 } 4618 } 4619 4620 /* mark the device being resilvered */ 4621 newvd->vdev_resilvering = B_TRUE; 4622 4623 /* 4624 * If the parent is not a mirror, or if we're replacing, insert the new 4625 * mirror/replacing/spare vdev above oldvd. 4626 */ 4627 if (pvd->vdev_ops != pvops) 4628 pvd = vdev_add_parent(oldvd, pvops); 4629 4630 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4631 ASSERT(pvd->vdev_ops == pvops); 4632 ASSERT(oldvd->vdev_parent == pvd); 4633 4634 /* 4635 * Extract the new device from its root and add it to pvd. 4636 */ 4637 vdev_remove_child(newrootvd, newvd); 4638 newvd->vdev_id = pvd->vdev_children; 4639 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4640 vdev_add_child(pvd, newvd); 4641 4642 tvd = newvd->vdev_top; 4643 ASSERT(pvd->vdev_top == tvd); 4644 ASSERT(tvd->vdev_parent == rvd); 4645 4646 vdev_config_dirty(tvd); 4647 4648 /* 4649 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4650 * for any dmu_sync-ed blocks. It will propagate upward when 4651 * spa_vdev_exit() calls vdev_dtl_reassess(). 4652 */ 4653 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4654 4655 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4656 dtl_max_txg - TXG_INITIAL); 4657 4658 if (newvd->vdev_isspare) { 4659 spa_spare_activate(newvd); 4660 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4661 } 4662 4663 oldvdpath = spa_strdup(oldvd->vdev_path); 4664 newvdpath = spa_strdup(newvd->vdev_path); 4665 newvd_isspare = newvd->vdev_isspare; 4666 4667 /* 4668 * Mark newvd's DTL dirty in this txg. 4669 */ 4670 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4671 4672 /* 4673 * Restart the resilver 4674 */ 4675 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4676 4677 /* 4678 * Commit the config 4679 */ 4680 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4681 4682 spa_history_log_internal(spa, "vdev attach", NULL, 4683 "%s vdev=%s %s vdev=%s", 4684 replacing && newvd_isspare ? "spare in" : 4685 replacing ? "replace" : "attach", newvdpath, 4686 replacing ? "for" : "to", oldvdpath); 4687 4688 spa_strfree(oldvdpath); 4689 spa_strfree(newvdpath); 4690 4691 if (spa->spa_bootfs) 4692 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4693 4694 return (0); 4695} 4696 4697/* 4698 * Detach a device from a mirror or replacing vdev. 4699 * If 'replace_done' is specified, only detach if the parent 4700 * is a replacing vdev. 
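 * (The resilver-completion path, spa_vdev_resilver_done(), calls
 * this with replace_done set so that only completed replacing/spare
 * parents are collapsed.)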
4701 */ 4702int 4703spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4704{ 4705 uint64_t txg; 4706 int error; 4707 vdev_t *rvd = spa->spa_root_vdev; 4708 vdev_t *vd, *pvd, *cvd, *tvd; 4709 boolean_t unspare = B_FALSE; 4710 uint64_t unspare_guid = 0; 4711 char *vdpath; 4712 4713 ASSERT(spa_writeable(spa)); 4714 4715 txg = spa_vdev_enter(spa); 4716 4717 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4718 4719 if (vd == NULL) 4720 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4721 4722 if (!vd->vdev_ops->vdev_op_leaf) 4723 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4724 4725 pvd = vd->vdev_parent; 4726 4727 /* 4728 * If the parent/child relationship is not as expected, don't do it. 4729 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4730 * vdev that's replacing B with C. The user's intent in replacing 4731 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4732 * the replace by detaching C, the expected behavior is to end up 4733 * M(A,B). But suppose that right after deciding to detach C, 4734 * the replacement of B completes. We would have M(A,C), and then 4735 * ask to detach C, which would leave us with just A -- not what 4736 * the user wanted. To prevent this, we make sure that the 4737 * parent/child relationship hasn't changed -- in this example, 4738 * that C's parent is still the replacing vdev R. 4739 */ 4740 if (pvd->vdev_guid != pguid && pguid != 0) 4741 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4742 4743 /* 4744 * Only 'replacing' or 'spare' vdevs can be replaced. 4745 */ 4746 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4747 pvd->vdev_ops != &vdev_spare_ops) 4748 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4749 4750 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4751 spa_version(spa) >= SPA_VERSION_SPARES); 4752 4753 /* 4754 * Only mirror, replacing, and spare vdevs support detach. 4755 */ 4756 if (pvd->vdev_ops != &vdev_replacing_ops && 4757 pvd->vdev_ops != &vdev_mirror_ops && 4758 pvd->vdev_ops != &vdev_spare_ops) 4759 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4760 4761 /* 4762 * If this device has the only valid copy of some data, 4763 * we cannot safely detach it. 4764 */ 4765 if (vdev_dtl_required(vd)) 4766 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4767 4768 ASSERT(pvd->vdev_children >= 2); 4769 4770 /* 4771 * If we are detaching the second disk from a replacing vdev, then 4772 * check to see if we changed the original vdev's path to have "/old" 4773 * at the end in spa_vdev_attach(). If so, undo that change now. 4774 */ 4775 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4776 vd->vdev_path != NULL) { 4777 size_t len = strlen(vd->vdev_path); 4778 4779 for (int c = 0; c < pvd->vdev_children; c++) { 4780 cvd = pvd->vdev_child[c]; 4781 4782 if (cvd == vd || cvd->vdev_path == NULL) 4783 continue; 4784 4785 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4786 strcmp(cvd->vdev_path + len, "/old") == 0) { 4787 spa_strfree(cvd->vdev_path); 4788 cvd->vdev_path = spa_strdup(vd->vdev_path); 4789 break; 4790 } 4791 } 4792 } 4793 4794 /* 4795 * If we are detaching the original disk from a spare, then it implies 4796 * that the spare should become a real disk, and be removed from the 4797 * active spare list for the pool. 4798 */ 4799 if (pvd->vdev_ops == &vdev_spare_ops && 4800 vd->vdev_id == 0 && 4801 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4802 unspare = B_TRUE; 4803 4804 /* 4805 * Erase the disk labels so the disk can be used for other things. 
4806 * This must be done after all other error cases are handled, 4807 * but before we disembowel vd (so we can still do I/O to it). 4808 * But if we can't do it, don't treat the error as fatal -- 4809 * it may be that the unwritability of the disk is the reason 4810 * it's being detached! 4811 */ 4812 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4813 4814 /* 4815 * Remove vd from its parent and compact the parent's children. 4816 */ 4817 vdev_remove_child(pvd, vd); 4818 vdev_compact_children(pvd); 4819 4820 /* 4821 * Remember one of the remaining children so we can get tvd below. 4822 */ 4823 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4824 4825 /* 4826 * If we need to remove the remaining child from the list of hot spares, 4827 * do it now, marking the vdev as no longer a spare in the process. 4828 * We must do this before vdev_remove_parent(), because that can 4829 * change the GUID if it creates a new toplevel GUID. For a similar 4830 * reason, we must remove the spare now, in the same txg as the detach; 4831 * otherwise someone could attach a new sibling, change the GUID, and 4832 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4833 */ 4834 if (unspare) { 4835 ASSERT(cvd->vdev_isspare); 4836 spa_spare_remove(cvd); 4837 unspare_guid = cvd->vdev_guid; 4838 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4839 cvd->vdev_unspare = B_TRUE; 4840 } 4841 4842 /* 4843 * If the parent mirror/replacing vdev only has one child, 4844 * the parent is no longer needed. Remove it from the tree. 4845 */ 4846 if (pvd->vdev_children == 1) { 4847 if (pvd->vdev_ops == &vdev_spare_ops) 4848 cvd->vdev_unspare = B_FALSE; 4849 vdev_remove_parent(cvd); 4850 cvd->vdev_resilvering = B_FALSE; 4851 } 4852 4853 4854 /* 4855 * We don't set tvd until now because the parent we just removed 4856 * may have been the previous top-level vdev. 4857 */ 4858 tvd = cvd->vdev_top; 4859 ASSERT(tvd->vdev_parent == rvd); 4860 4861 /* 4862 * Reevaluate the parent vdev state. 4863 */ 4864 vdev_propagate_state(cvd); 4865 4866 /* 4867 * If the 'autoexpand' property is set on the pool then automatically 4868 * try to expand the size of the pool. For example if the device we 4869 * just detached was smaller than the others, it may be possible to 4870 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4871 * first so that we can obtain the updated sizes of the leaf vdevs. 4872 */ 4873 if (spa->spa_autoexpand) { 4874 vdev_reopen(tvd); 4875 vdev_expand(tvd, txg); 4876 } 4877 4878 vdev_config_dirty(tvd); 4879 4880 /* 4881 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4882 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4883 * But first make sure we're not on any *other* txg's DTL list, to 4884 * prevent vd from being accessed after it's freed. 
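 * (The loop below walks all TXG_SIZE DTL lists, so an entry pending
 * in any open txg is removed before vd goes away.)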
4885 */ 4886 vdpath = spa_strdup(vd->vdev_path); 4887 for (int t = 0; t < TXG_SIZE; t++) 4888 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4889 vd->vdev_detached = B_TRUE; 4890 vdev_dirty(tvd, VDD_DTL, vd, txg); 4891 4892 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4893 4894 /* hang on to the spa before we release the lock */ 4895 spa_open_ref(spa, FTAG); 4896 4897 error = spa_vdev_exit(spa, vd, txg, 0); 4898 4899 spa_history_log_internal(spa, "detach", NULL, 4900 "vdev=%s", vdpath); 4901 spa_strfree(vdpath); 4902 4903 /* 4904 * If this was the removal of the original device in a hot spare vdev, 4905 * then we want to go through and remove the device from the hot spare 4906 * list of every other pool. 4907 */ 4908 if (unspare) { 4909 spa_t *altspa = NULL; 4910 4911 mutex_enter(&spa_namespace_lock); 4912 while ((altspa = spa_next(altspa)) != NULL) { 4913 if (altspa->spa_state != POOL_STATE_ACTIVE || 4914 altspa == spa) 4915 continue; 4916 4917 spa_open_ref(altspa, FTAG); 4918 mutex_exit(&spa_namespace_lock); 4919 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4920 mutex_enter(&spa_namespace_lock); 4921 spa_close(altspa, FTAG); 4922 } 4923 mutex_exit(&spa_namespace_lock); 4924 4925 /* search the rest of the vdevs for spares to remove */ 4926 spa_vdev_resilver_done(spa); 4927 } 4928 4929 /* all done with the spa; OK to release */ 4930 mutex_enter(&spa_namespace_lock); 4931 spa_close(spa, FTAG); 4932 mutex_exit(&spa_namespace_lock); 4933 4934 return (error); 4935} 4936 4937/* 4938 * Split a set of devices from their mirrors, and create a new pool from them. 4939 */ 4940int 4941spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4942 nvlist_t *props, boolean_t exp) 4943{ 4944 int error = 0; 4945 uint64_t txg, *glist; 4946 spa_t *newspa; 4947 uint_t c, children, lastlog; 4948 nvlist_t **child, *nvl, *tmp; 4949 dmu_tx_t *tx; 4950 char *altroot = NULL; 4951 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4952 boolean_t activate_slog; 4953 4954 ASSERT(spa_writeable(spa)); 4955 4956 txg = spa_vdev_enter(spa); 4957 4958 /* clear the log and flush everything up to now */ 4959 activate_slog = spa_passivate_log(spa); 4960 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4961 error = spa_offline_log(spa); 4962 txg = spa_vdev_config_enter(spa); 4963 4964 if (activate_slog) 4965 spa_activate_log(spa); 4966 4967 if (error != 0) 4968 return (spa_vdev_exit(spa, NULL, txg, error)); 4969 4970 /* check new spa name before going any further */ 4971 if (spa_lookup(newname) != NULL) 4972 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4973 4974 /* 4975 * scan through all the children to ensure they're all mirrors 4976 */ 4977 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4978 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4979 &children) != 0) 4980 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4981 4982 /* first, check to ensure we've got the right child count */ 4983 rvd = spa->spa_root_vdev; 4984 lastlog = 0; 4985 for (c = 0; c < rvd->vdev_children; c++) { 4986 vdev_t *vd = rvd->vdev_child[c]; 4987 4988 /* don't count the holes & logs as children */ 4989 if (vd->vdev_islog || vd->vdev_ishole) { 4990 if (lastlog == 0) 4991 lastlog = c; 4992 continue; 4993 } 4994 4995 lastlog = 0; 4996 } 4997 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 4998 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4999 5000 /* next, ensure no spare or cache devices are part of the split */ 5001 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5002 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5003 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5004 5005 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5006 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5007 5008 /* then, loop over each vdev and validate it */ 5009 for (c = 0; c < children; c++) { 5010 uint64_t is_hole = 0; 5011 5012 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5013 &is_hole); 5014 5015 if (is_hole != 0) { 5016 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5017 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5018 continue; 5019 } else { 5020 error = SET_ERROR(EINVAL); 5021 break; 5022 } 5023 } 5024 5025 /* which disk is going to be split? */ 5026 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5027 &glist[c]) != 0) { 5028 error = SET_ERROR(EINVAL); 5029 break; 5030 } 5031 5032 /* look it up in the spa */ 5033 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5034 if (vml[c] == NULL) { 5035 error = SET_ERROR(ENODEV); 5036 break; 5037 } 5038 5039 /* make sure there's nothing stopping the split */ 5040 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5041 vml[c]->vdev_islog || 5042 vml[c]->vdev_ishole || 5043 vml[c]->vdev_isspare || 5044 vml[c]->vdev_isl2cache || 5045 !vdev_writeable(vml[c]) || 5046 vml[c]->vdev_children != 0 || 5047 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5048 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5049 error = SET_ERROR(EINVAL); 5050 break; 5051 } 5052 5053 if (vdev_dtl_required(vml[c])) { 5054 error = SET_ERROR(EBUSY); 5055 break; 5056 } 5057 5058 /* we need certain info from the top level */ 5059 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5060 vml[c]->vdev_top->vdev_ms_array) == 0); 5061 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5062 vml[c]->vdev_top->vdev_ms_shift) == 0); 5063 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5064 vml[c]->vdev_top->vdev_asize) == 0); 5065 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5066 vml[c]->vdev_top->vdev_ashift) == 0); 5067 } 5068 5069 if (error != 0) { 5070 kmem_free(vml, children * sizeof (vdev_t *)); 5071 kmem_free(glist, children * sizeof (uint64_t)); 5072 return (spa_vdev_exit(spa, NULL, txg, error)); 5073 } 5074 5075 /* stop writers from using the disks */ 5076 for (c = 0; c < children; c++) { 5077 if (vml[c] != NULL) 5078 vml[c]->vdev_offline = B_TRUE; 5079 } 5080 vdev_reopen(spa->spa_root_vdev); 5081 5082 /* 5083 * Temporarily record the splitting vdevs in the spa config. This 5084 * will disappear once the config is regenerated. 5085 */ 5086 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5087 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5088 glist, children) == 0); 5089 kmem_free(glist, children * sizeof (uint64_t)); 5090 5091 mutex_enter(&spa->spa_props_lock); 5092 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5093 nvl) == 0); 5094 mutex_exit(&spa->spa_props_lock); 5095 spa->spa_config_splitting = nvl; 5096 vdev_config_dirty(spa->spa_root_vdev); 5097 5098 /* configure and create the new pool */ 5099 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5100 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5101 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5102 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5103 spa_version(spa)) == 0); 5104 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5105 spa->spa_config_txg) == 0); 5106 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5107 spa_generate_guid(NULL)) == 0); 5108 (void) nvlist_lookup_string(props, 5109 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5110 5111 /* add the new pool to the namespace */ 5112 newspa = spa_add(newname, config, altroot); 5113 newspa->spa_config_txg = spa->spa_config_txg; 5114 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5115 5116 /* release the spa config lock, retaining the namespace lock */ 5117 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5118 5119 if (zio_injection_enabled) 5120 zio_handle_panic_injection(spa, FTAG, 1); 5121 5122 spa_activate(newspa, spa_mode_global); 5123 spa_async_suspend(newspa); 5124 5125#ifndef sun 5126 /* mark that we are creating new spa by splitting */ 5127 newspa->spa_splitting_newspa = B_TRUE; 5128#endif 5129 /* create the new pool from the disks of the original pool */ 5130 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5131#ifndef sun 5132 newspa->spa_splitting_newspa = B_FALSE; 5133#endif 5134 if (error) 5135 goto out; 5136 5137 /* if that worked, generate a real config for the new pool */ 5138 if (newspa->spa_root_vdev != NULL) { 5139 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5140 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5141 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5142 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5143 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5144 B_TRUE)); 5145 } 5146 5147 /* set the props */ 5148 if (props != NULL) { 5149 spa_configfile_set(newspa, props, B_FALSE); 5150 error = spa_prop_set(newspa, props); 5151 if (error) 5152 goto out; 5153 } 5154 5155 /* flush everything */ 5156 txg = spa_vdev_config_enter(newspa); 5157 vdev_config_dirty(newspa->spa_root_vdev); 5158 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5159 5160 if (zio_injection_enabled) 5161 zio_handle_panic_injection(spa, FTAG, 2); 5162 5163 spa_async_resume(newspa); 5164 5165 /* finally, update the original pool's config */ 5166 txg = spa_vdev_config_enter(spa); 5167 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5168 error = dmu_tx_assign(tx, TXG_WAIT); 5169 if (error != 0) 5170 dmu_tx_abort(tx); 5171 for (c = 0; c < children; c++) { 5172 if (vml[c] != NULL) { 5173 vdev_split(vml[c]); 5174 if (error == 0) 5175 spa_history_log_internal(spa, "detach", tx, 5176 "vdev=%s", vml[c]->vdev_path); 5177 vdev_free(vml[c]); 5178 } 5179 } 5180 vdev_config_dirty(spa->spa_root_vdev); 5181 spa->spa_config_splitting = NULL; 5182 nvlist_free(nvl); 5183 if (error == 0) 5184 dmu_tx_commit(tx); 5185 (void) spa_vdev_exit(spa, NULL, txg, 0); 5186 5187 if (zio_injection_enabled) 5188 zio_handle_panic_injection(spa, FTAG, 3); 5189 5190 /* split is complete; log a history record */ 5191 spa_history_log_internal(newspa, "split", NULL, 5192 "from pool %s", spa_name(spa)); 5193 5194 kmem_free(vml, children * sizeof (vdev_t *)); 5195 5196 /* if we're not going to mount the filesystems in userland, export */ 5197 if (exp) 5198 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5199 B_FALSE, B_FALSE); 5200 5201 return (error); 5202 5203out: 5204 spa_unload(newspa); 5205 spa_deactivate(newspa); 5206 spa_remove(newspa); 5207 5208 txg = spa_vdev_config_enter(spa); 5209 5210 /* re-online all offlined disks */ 5211 
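	/*
	 * This is the unwind path for a failed split: the new spa has
	 * already been torn down above, so all that remains is to clear
	 * the vdev_offline markings, reopen the root vdev, and drop the
	 * temporary ZPOOL_CONFIG_SPLIT_LIST state from the config.
	 */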
for (c = 0; c < children; c++) { 5212 if (vml[c] != NULL) 5213 vml[c]->vdev_offline = B_FALSE; 5214 } 5215 vdev_reopen(spa->spa_root_vdev); 5216 5217 nvlist_free(spa->spa_config_splitting); 5218 spa->spa_config_splitting = NULL; 5219 (void) spa_vdev_exit(spa, NULL, txg, error); 5220 5221 kmem_free(vml, children * sizeof (vdev_t *)); 5222 return (error); 5223} 5224 5225static nvlist_t * 5226spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5227{ 5228 for (int i = 0; i < count; i++) { 5229 uint64_t guid; 5230 5231 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5232 &guid) == 0); 5233 5234 if (guid == target_guid) 5235 return (nvpp[i]); 5236 } 5237 5238 return (NULL); 5239} 5240 5241static void 5242spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5243 nvlist_t *dev_to_remove) 5244{ 5245 nvlist_t **newdev = NULL; 5246 5247 if (count > 1) 5248 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5249 5250 for (int i = 0, j = 0; i < count; i++) { 5251 if (dev[i] == dev_to_remove) 5252 continue; 5253 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5254 } 5255 5256 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5257 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5258 5259 for (int i = 0; i < count - 1; i++) 5260 nvlist_free(newdev[i]); 5261 5262 if (count > 1) 5263 kmem_free(newdev, (count - 1) * sizeof (void *)); 5264} 5265 5266/* 5267 * Evacuate the device. 5268 */ 5269static int 5270spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5271{ 5272 uint64_t txg; 5273 int error = 0; 5274 5275 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5276 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5277 ASSERT(vd == vd->vdev_top); 5278 5279 /* 5280 * Evacuate the device. We don't hold the config lock as writer 5281 * since we need to do I/O but we do keep the 5282 * spa_namespace_lock held. Once this completes the device 5283 * should no longer have any blocks allocated on it. 5284 */ 5285 if (vd->vdev_islog) { 5286 if (vd->vdev_stat.vs_alloc != 0) 5287 error = spa_offline_log(spa); 5288 } else { 5289 error = SET_ERROR(ENOTSUP); 5290 } 5291 5292 if (error) 5293 return (error); 5294 5295 /* 5296 * The evacuation succeeded. Remove any remaining MOS metadata 5297 * associated with this vdev, and wait for these changes to sync. 5298 */ 5299 ASSERT0(vd->vdev_stat.vs_alloc); 5300 txg = spa_vdev_config_enter(spa); 5301 vd->vdev_removing = B_TRUE; 5302 vdev_dirty(vd, 0, NULL, txg); 5303 vdev_config_dirty(vd); 5304 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5305 5306 return (0); 5307} 5308 5309/* 5310 * Complete the removal by cleaning up the namespace. 5311 */ 5312static void 5313spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5314{ 5315 vdev_t *rvd = spa->spa_root_vdev; 5316 uint64_t id = vd->vdev_id; 5317 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5318 5319 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5320 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5321 ASSERT(vd == vd->vdev_top); 5322 5323 /* 5324 * Only remove any devices which are empty. 
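 * When the removed vdev is not the last child, a hole vdev is
 * grafted into its slot below so that the vdev ids (and hence the
 * DVA vdev numbering) of the remaining children are preserved.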
5325 */ 5326 if (vd->vdev_stat.vs_alloc != 0) 5327 return; 5328 5329 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5330 5331 if (list_link_active(&vd->vdev_state_dirty_node)) 5332 vdev_state_clean(vd); 5333 if (list_link_active(&vd->vdev_config_dirty_node)) 5334 vdev_config_clean(vd); 5335 5336 vdev_free(vd); 5337 5338 if (last_vdev) { 5339 vdev_compact_children(rvd); 5340 } else { 5341 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5342 vdev_add_child(rvd, vd); 5343 } 5344 vdev_config_dirty(rvd); 5345 5346 /* 5347 * Reassess the health of our root vdev. 5348 */ 5349 vdev_reopen(rvd); 5350} 5351 5352/* 5353 * Remove a device from the pool - 5354 * 5355 * Removing a device from the vdev namespace requires several steps 5356 * and can take a significant amount of time. As a result we use 5357 * the spa_vdev_config_[enter/exit] functions which allow us to 5358 * grab and release the spa_config_lock while still holding the namespace 5359 * lock. During each step the configuration is synced out. 5360 */ 5361 5362/* 5363 * Remove a device from the pool. Currently, this supports removing only hot 5364 * spares, slogs, and level 2 ARC devices. 5365 */ 5366int 5367spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5368{ 5369 vdev_t *vd; 5370 metaslab_group_t *mg; 5371 nvlist_t **spares, **l2cache, *nv; 5372 uint64_t txg = 0; 5373 uint_t nspares, nl2cache; 5374 int error = 0; 5375 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5376 5377 ASSERT(spa_writeable(spa)); 5378 5379 if (!locked) 5380 txg = spa_vdev_enter(spa); 5381 5382 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5383 5384 if (spa->spa_spares.sav_vdevs != NULL && 5385 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5386 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5387 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5388 /* 5389 * Only remove the hot spare if it's not currently in use 5390 * in this pool. 5391 */ 5392 if (vd == NULL || unspare) { 5393 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5394 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5395 spa_load_spares(spa); 5396 spa->spa_spares.sav_sync = B_TRUE; 5397 } else { 5398 error = SET_ERROR(EBUSY); 5399 } 5400 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5401 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5402 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5403 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5404 /* 5405 * Cache devices can always be removed. 5406 */ 5407 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5408 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5409 spa_load_l2cache(spa); 5410 spa->spa_l2cache.sav_sync = B_TRUE; 5411 } else if (vd != NULL && vd->vdev_islog) { 5412 ASSERT(!locked); 5413 ASSERT(vd == vd->vdev_top); 5414 5415 /* 5416 * XXX - Once we have bp-rewrite this should 5417 * become the common case. 5418 */ 5419 5420 mg = vd->vdev_mg; 5421 5422 /* 5423 * Stop allocating from this vdev. 5424 */ 5425 metaslab_group_passivate(mg); 5426 5427 /* 5428 * Wait for the youngest allocations and frees to sync, 5429 * and then wait for the deferral of those frees to finish. 5430 */ 5431 spa_vdev_config_exit(spa, NULL, 5432 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5433 5434 /* 5435 * Attempt to evacuate the vdev. 5436 */ 5437 error = spa_vdev_remove_evacuate(spa, vd); 5438 5439 txg = spa_vdev_config_enter(spa); 5440 5441 /* 5442 * If we couldn't evacuate the vdev, unwind. 
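 * (The only evacuation supported today is emptying a log device via
 * spa_offline_log(); if that failed we reactivate the metaslab
 * group below and bail out.)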
5443 */ 5444 if (error) { 5445 metaslab_group_activate(mg); 5446 return (spa_vdev_exit(spa, NULL, txg, error)); 5447 } 5448 5449 /* 5450 * Clean up the vdev namespace. 5451 */ 5452 spa_vdev_remove_from_namespace(spa, vd); 5453 5454 } else if (vd != NULL) { 5455 /* 5456 * Normal vdevs cannot be removed (yet). 5457 */ 5458 error = SET_ERROR(ENOTSUP); 5459 } else { 5460 /* 5461 * There is no vdev of any kind with the specified guid. 5462 */ 5463 error = SET_ERROR(ENOENT); 5464 } 5465 5466 if (!locked) 5467 return (spa_vdev_exit(spa, NULL, txg, error)); 5468 5469 return (error); 5470} 5471 5472/* 5473 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5474 * current spared, so we can detach it. 5475 */ 5476static vdev_t * 5477spa_vdev_resilver_done_hunt(vdev_t *vd) 5478{ 5479 vdev_t *newvd, *oldvd; 5480 5481 for (int c = 0; c < vd->vdev_children; c++) { 5482 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5483 if (oldvd != NULL) 5484 return (oldvd); 5485 } 5486 5487 /* 5488 * Check for a completed replacement. We always consider the first 5489 * vdev in the list to be the oldest vdev, and the last one to be 5490 * the newest (see spa_vdev_attach() for how that works). In 5491 * the case where the newest vdev is faulted, we will not automatically 5492 * remove it after a resilver completes. This is OK as it will require 5493 * user intervention to determine which disk the admin wishes to keep. 5494 */ 5495 if (vd->vdev_ops == &vdev_replacing_ops) { 5496 ASSERT(vd->vdev_children > 1); 5497 5498 newvd = vd->vdev_child[vd->vdev_children - 1]; 5499 oldvd = vd->vdev_child[0]; 5500 5501 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5502 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5503 !vdev_dtl_required(oldvd)) 5504 return (oldvd); 5505 } 5506 5507 /* 5508 * Check for a completed resilver with the 'unspare' flag set. 5509 */ 5510 if (vd->vdev_ops == &vdev_spare_ops) { 5511 vdev_t *first = vd->vdev_child[0]; 5512 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5513 5514 if (last->vdev_unspare) { 5515 oldvd = first; 5516 newvd = last; 5517 } else if (first->vdev_unspare) { 5518 oldvd = last; 5519 newvd = first; 5520 } else { 5521 oldvd = NULL; 5522 } 5523 5524 if (oldvd != NULL && 5525 vdev_dtl_empty(newvd, DTL_MISSING) && 5526 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5527 !vdev_dtl_required(oldvd)) 5528 return (oldvd); 5529 5530 /* 5531 * If there are more than two spares attached to a disk, 5532 * and those spares are not required, then we want to 5533 * attempt to free them up now so that they can be used 5534 * by other pools. Once we're back down to a single 5535 * disk+spare, we stop removing them. 
5536 */ 5537 if (vd->vdev_children > 2) { 5538 newvd = vd->vdev_child[1]; 5539 5540 if (newvd->vdev_isspare && last->vdev_isspare && 5541 vdev_dtl_empty(last, DTL_MISSING) && 5542 vdev_dtl_empty(last, DTL_OUTAGE) && 5543 !vdev_dtl_required(newvd)) 5544 return (newvd); 5545 } 5546 } 5547 5548 return (NULL); 5549} 5550 5551static void 5552spa_vdev_resilver_done(spa_t *spa) 5553{ 5554 vdev_t *vd, *pvd, *ppvd; 5555 uint64_t guid, sguid, pguid, ppguid; 5556 5557 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5558 5559 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5560 pvd = vd->vdev_parent; 5561 ppvd = pvd->vdev_parent; 5562 guid = vd->vdev_guid; 5563 pguid = pvd->vdev_guid; 5564 ppguid = ppvd->vdev_guid; 5565 sguid = 0; 5566 /* 5567 * If we have just finished replacing a hot spared device, then 5568 * we need to detach the parent's first child (the original hot 5569 * spare) as well. 5570 */ 5571 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5572 ppvd->vdev_children == 2) { 5573 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5574 sguid = ppvd->vdev_child[1]->vdev_guid; 5575 } 5576 spa_config_exit(spa, SCL_ALL, FTAG); 5577 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5578 return; 5579 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5580 return; 5581 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5582 } 5583 5584 spa_config_exit(spa, SCL_ALL, FTAG); 5585} 5586 5587/* 5588 * Update the stored path or FRU for this vdev. 5589 */ 5590int 5591spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5592 boolean_t ispath) 5593{ 5594 vdev_t *vd; 5595 boolean_t sync = B_FALSE; 5596 5597 ASSERT(spa_writeable(spa)); 5598 5599 spa_vdev_state_enter(spa, SCL_ALL); 5600 5601 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5602 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5603 5604 if (!vd->vdev_ops->vdev_op_leaf) 5605 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5606 5607 if (ispath) { 5608 if (strcmp(value, vd->vdev_path) != 0) { 5609 spa_strfree(vd->vdev_path); 5610 vd->vdev_path = spa_strdup(value); 5611 sync = B_TRUE; 5612 } 5613 } else { 5614 if (vd->vdev_fru == NULL) { 5615 vd->vdev_fru = spa_strdup(value); 5616 sync = B_TRUE; 5617 } else if (strcmp(value, vd->vdev_fru) != 0) { 5618 spa_strfree(vd->vdev_fru); 5619 vd->vdev_fru = spa_strdup(value); 5620 sync = B_TRUE; 5621 } 5622 } 5623 5624 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 5625} 5626 5627int 5628spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5629{ 5630 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5631} 5632 5633int 5634spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5635{ 5636 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5637} 5638 5639/* 5640 * ========================================================================== 5641 * SPA Scanning 5642 * ========================================================================== 5643 */ 5644 5645int 5646spa_scan_stop(spa_t *spa) 5647{ 5648 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5649 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5650 return (SET_ERROR(EBUSY)); 5651 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5652} 5653 5654int 5655spa_scan(spa_t *spa, pool_scan_func_t func) 5656{ 5657 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5658 5659 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5660 return (SET_ERROR(ENOTSUP)); 5661 5662 /* 5663 * If a resilver was requested, but there is no DTL on a 5664 * writeable leaf device, we have nothing to do. 5665 */ 5666 if (func == POOL_SCAN_RESILVER && 5667 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5668 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5669 return (0); 5670 } 5671 5672 return (dsl_scan(spa->spa_dsl_pool, func)); 5673} 5674 5675/* 5676 * ========================================================================== 5677 * SPA async task processing 5678 * ========================================================================== 5679 */ 5680 5681static void 5682spa_async_remove(spa_t *spa, vdev_t *vd) 5683{ 5684 if (vd->vdev_remove_wanted) { 5685 vd->vdev_remove_wanted = B_FALSE; 5686 vd->vdev_delayed_close = B_FALSE; 5687 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5688 5689 /* 5690 * We want to clear the stats, but we don't want to do a full 5691 * vdev_clear() as that will cause us to throw away 5692 * degraded/faulted state as well as attempt to reopen the 5693 * device, all of which is a waste. 
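 * Instead, just zero the error counters below and mark the
 * top-level vdev state dirty so the change is noticed.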
5694 */ 5695 vd->vdev_stat.vs_read_errors = 0; 5696 vd->vdev_stat.vs_write_errors = 0; 5697 vd->vdev_stat.vs_checksum_errors = 0; 5698 5699 vdev_state_dirty(vd->vdev_top); 5700 } 5701 5702 for (int c = 0; c < vd->vdev_children; c++) 5703 spa_async_remove(spa, vd->vdev_child[c]); 5704} 5705 5706static void 5707spa_async_probe(spa_t *spa, vdev_t *vd) 5708{ 5709 if (vd->vdev_probe_wanted) { 5710 vd->vdev_probe_wanted = B_FALSE; 5711 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5712 } 5713 5714 for (int c = 0; c < vd->vdev_children; c++) 5715 spa_async_probe(spa, vd->vdev_child[c]); 5716} 5717 5718static void 5719spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5720{ 5721 sysevent_id_t eid; 5722 nvlist_t *attr; 5723 char *physpath; 5724 5725 if (!spa->spa_autoexpand) 5726 return; 5727 5728 for (int c = 0; c < vd->vdev_children; c++) { 5729 vdev_t *cvd = vd->vdev_child[c]; 5730 spa_async_autoexpand(spa, cvd); 5731 } 5732 5733 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5734 return; 5735 5736 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5737 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5738 5739 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5740 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5741 5742 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5743 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5744 5745 nvlist_free(attr); 5746 kmem_free(physpath, MAXPATHLEN); 5747} 5748 5749static void 5750spa_async_thread(void *arg) 5751{ 5752 spa_t *spa = arg; 5753 int tasks; 5754 5755 ASSERT(spa->spa_sync_on); 5756 5757 mutex_enter(&spa->spa_async_lock); 5758 tasks = spa->spa_async_tasks; 5759 spa->spa_async_tasks = 0; 5760 mutex_exit(&spa->spa_async_lock); 5761 5762 /* 5763 * See if the config needs to be updated. 5764 */ 5765 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5766 uint64_t old_space, new_space; 5767 5768 mutex_enter(&spa_namespace_lock); 5769 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5770 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5771 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5772 mutex_exit(&spa_namespace_lock); 5773 5774 /* 5775 * If the pool grew as a result of the config update, 5776 * then log an internal history event. 5777 */ 5778 if (new_space != old_space) { 5779 spa_history_log_internal(spa, "vdev online", NULL, 5780 "pool '%s' size: %llu(+%llu)", 5781 spa_name(spa), new_space, new_space - old_space); 5782 } 5783 } 5784 5785 /* 5786 * See if any devices need to be marked REMOVED. 5787 */ 5788 if (tasks & SPA_ASYNC_REMOVE) { 5789 spa_vdev_state_enter(spa, SCL_NONE); 5790 spa_async_remove(spa, spa->spa_root_vdev); 5791 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5792 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5793 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5794 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5795 (void) spa_vdev_state_exit(spa, NULL, 0); 5796 } 5797 5798 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5799 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5800 spa_async_autoexpand(spa, spa->spa_root_vdev); 5801 spa_config_exit(spa, SCL_CONFIG, FTAG); 5802 } 5803 5804 /* 5805 * See if any devices need to be probed. 5806 */ 5807 if (tasks & SPA_ASYNC_PROBE) { 5808 spa_vdev_state_enter(spa, SCL_NONE); 5809 spa_async_probe(spa, spa->spa_root_vdev); 5810 (void) spa_vdev_state_exit(spa, NULL, 0); 5811 } 5812 5813 /* 5814 * If any devices are done replacing, detach them. 
5815 */ 5816 if (tasks & SPA_ASYNC_RESILVER_DONE) 5817 spa_vdev_resilver_done(spa); 5818 5819 /* 5820 * Kick off a resilver. 5821 */ 5822 if (tasks & SPA_ASYNC_RESILVER) 5823 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5824 5825 /* 5826 * Let the world know that we're done. 5827 */ 5828 mutex_enter(&spa->spa_async_lock); 5829 spa->spa_async_thread = NULL; 5830 cv_broadcast(&spa->spa_async_cv); 5831 mutex_exit(&spa->spa_async_lock); 5832 thread_exit(); 5833} 5834 5835void 5836spa_async_suspend(spa_t *spa) 5837{ 5838 mutex_enter(&spa->spa_async_lock); 5839 spa->spa_async_suspended++; 5840 while (spa->spa_async_thread != NULL) 5841 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5842 mutex_exit(&spa->spa_async_lock); 5843} 5844 5845void 5846spa_async_resume(spa_t *spa) 5847{ 5848 mutex_enter(&spa->spa_async_lock); 5849 ASSERT(spa->spa_async_suspended != 0); 5850 spa->spa_async_suspended--; 5851 mutex_exit(&spa->spa_async_lock); 5852} 5853 5854static void 5855spa_async_dispatch(spa_t *spa) 5856{ 5857 mutex_enter(&spa->spa_async_lock); 5858 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5859 spa->spa_async_thread == NULL && 5860 rootdir != NULL && !vn_is_readonly(rootdir)) 5861 spa->spa_async_thread = thread_create(NULL, 0, 5862 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5863 mutex_exit(&spa->spa_async_lock); 5864} 5865 5866void 5867spa_async_request(spa_t *spa, int task) 5868{ 5869 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5870 mutex_enter(&spa->spa_async_lock); 5871 spa->spa_async_tasks |= task; 5872 mutex_exit(&spa->spa_async_lock); 5873} 5874 5875/* 5876 * ========================================================================== 5877 * SPA syncing routines 5878 * ========================================================================== 5879 */ 5880 5881static int 5882bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5883{ 5884 bpobj_t *bpo = arg; 5885 bpobj_enqueue(bpo, bp, tx); 5886 return (0); 5887} 5888 5889static int 5890spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5891{ 5892 zio_t *zio = arg; 5893 5894 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5895 BP_GET_PSIZE(bp), zio->io_flags)); 5896 return (0); 5897} 5898 5899static void 5900spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5901{ 5902 char *packed = NULL; 5903 size_t bufsize; 5904 size_t nvsize = 0; 5905 dmu_buf_t *db; 5906 5907 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5908 5909 /* 5910 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5911 * information. This avoids the dbuf_will_dirty() path and 5912 * saves us a pre-read to get data we don't actually care about. 
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
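	 * At creation the version is written with the rest of the initial
	 * pool state rather than through this sync task, hence the
	 * assertion below that we are not in the initial txg.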
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		zfeature_info_t *feature;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPROP_INVAL:
			/*
			 * We checked this earlier in spa_prop_validate().
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));

			spa_feature_enable(spa, feature, tx);
			spa_history_log_internal(spa, "set", tx,
			    "%s=enabled", nvpair_name(elem));
			break;

		case ZPOOL_PROP_VERSION:
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			VERIFY(nvpair_value_string(elem, &strval) == 0);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
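			 *
			 * String properties are stored in the ZAP as strings;
			 * numeric and index properties are stored as uint64
			 * values (for example, setting autoexpand=on stores
			 * the value 1 under the key "autoexpand").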
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", nvpair_name(elem), strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", nvpair_name(elem), intval);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}
	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
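 * Each pass may dirty new MOS blocks (space maps, the config object,
 * and so on), so we keep making passes until dmu_objset_is_dirty()
 * reports the MOS clean for this txg.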
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_reset(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
#endif
#endif

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (e.g., spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}

	/*
	 * Iterate to convergence.
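	 * Each pass syncs the config and aux-device nvlists, the error log,
	 * and the DSL pool.  Frees are issued directly while the sync pass
	 * is below zfs_sync_pass_deferred_free; later passes enqueue them
	 * on the deferred bpobj instead, so that late passes stop dirtying
	 * space maps and the loop can converge.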
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL)
			vdev_sync(vd, txg);

		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
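	 * (Each vdev dirtied in this txg is pulled off the TXG_CLEAN list
	 * and has its space accounting settled by vdev_sync_done().)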
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
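	 * ('zpool upgrade' is the usual caller; both the current and the
	 * requested version are expected to be supported, as asserted
	 * below.)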
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, once as a spare and
 * once as a replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
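/*
 * Illustrative usage (not called from here): a caller that has detected
 * a vdev removal could post the matching event, assuming the
 * ESC_ZFS_VDEV_REMOVE definition from sys/sysevent/eventdefs.h:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 */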