spa.c revision 287667

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");
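
/*
 * Editor's note (not in the original source): check_hostid gates the
 * verification of the hostid recorded in the pool label against the local
 * system's hostid at import time; setting vfs.zfs.check_hostid=0 skips
 * that check. The sysctl plumbing above is FreeBSD-specific.
 */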

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};
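
/*
 * Editor's illustration (not in the original source): given the macros
 * above, the FREE/ISSUE entry ZTI_P(12, 8) expands to
 * { ZTI_MODE_FIXED, 12, 8 }, i.e. eight taskqs of twelve threads each,
 * while ZTI_BATCH expands to { ZTI_MODE_BATCH, 0, 1 }: a single taskq
 * whose thread count is derived from zio_taskq_batch_pct in
 * spa_taskqs_init() below.
 */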

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
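
/*
 * Editor's illustration (not in the original source): the resulting nvlist
 * nests one nvlist per property, keyed by property name, e.g.
 *	"size" -> { ZPROP_SOURCE = <zprop_source_t>, ZPROP_VALUE = <uint64> }
 * with ZPROP_VALUE stored as a string instead when strval is non-NULL.
 */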

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir
		 * will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
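
/*
 * Editor's note (not in the original source): in spa_prop_validate() below,
 * feature activation requests arrive as properties named "feature@<name>"
 * (which zpool_name_to_prop() maps to ZPROP_INVAL) and must carry a uint64
 * value of 0, meaning "enabled"; anything else is rejected.
 */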

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    &propval)) == 0 &&
				    propval > SPA_OLD_MAXBLOCKSIZE) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = SET_ERROR(E2BIG);
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
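
/*
 * Editor's note (not in the original source): spa_configfile_set() maps the
 * three legal cachefile settings onto scd_path as follows: "" (empty) means
 * the default spa_config_path, "none" means no cache file at all (NULL),
 * and any other value is taken as an absolute path to a custom cache file.
 */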

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq to use at random
 * by using the low bits of gethrtime() (cpu_ticks() in the kernel).
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}
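
/*
 * Editor's note (not in the original source): spa_thread() below is the
 * entry point for the optional per-pool "zpool-<poolname>" process; it
 * creates the pool's zio taskqs and then parks on spa_proc_cv until
 * spa_deactivate() asks it to tear down.
 */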

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
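
/*
 * Editor's illustration (not in the original source): for a two-way mirror,
 * the ZPOOL_CONFIG_VDEV_TREE nvlist parsed above looks roughly like
 *	root -> children[0] = mirror -> children[0..1] = disk
 * and spa_config_parse() recursively builds the matching vdev_t tree,
 * leaving every vdev in the CLOSED state.
 */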

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
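
/*
 * Editor's note (not in the original source): nvlists stored in the MOS
 * are kept in packed form; the object's bonus buffer records the packed
 * size and the object data holds the packed bytes, which nvlist_unpack()
 * above re-inflates into an in-core nvlist.
 */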

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev, then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool while importing it.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
    &spa_load_verify_maxinflight, 0,
    "Maximum number of concurrent scrub I/Os to create while verifying a "
    "pool while importing it");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
    &spa_load_verify_metadata, 0,
    "Check metadata on import?");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
    &spa_load_verify_data, 0,
    "Check user data on import?");
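
/*
 * Editor's note (not in the original source): being CTLFLAG_RWTUN, these
 * knobs may be set either at runtime, e.g.
 *	sysctl vfs.zfs.spa_load_verify_data=0
 * or as loader tunables, to trade import-time verification coverage
 * against import speed.
 */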

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);
	void *data = zio_data_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, data, size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
0 : EIO); 2019} 2020 2021/* 2022 * Find a value in the pool props object. 2023 */ 2024static void 2025spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2026{ 2027 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2028 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2029} 2030 2031/* 2032 * Find a value in the pool directory object. 2033 */ 2034static int 2035spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2036{ 2037 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2038 name, sizeof (uint64_t), 1, val)); 2039} 2040 2041static int 2042spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2043{ 2044 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2045 return (err); 2046} 2047 2048/* 2049 * Fix up config after a partly-completed split. This is done with the 2050 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2051 * pool have that entry in their config, but only the splitting one contains 2052 * a list of all the guids of the vdevs that are being split off. 2053 * 2054 * This function determines what to do with that list: either rejoin 2055 * all the disks to the pool, or complete the splitting process. To attempt 2056 * the rejoin, each disk that is offlined is marked online again, and 2057 * we do a reopen() call. If the vdev label for every disk that was 2058 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2059 * then we call vdev_split() on each disk, and complete the split. 2060 * 2061 * Otherwise we leave the config alone, with all the vdevs in place in 2062 * the original pool. 2063 */ 2064static void 2065spa_try_repair(spa_t *spa, nvlist_t *config) 2066{ 2067 uint_t extracted; 2068 uint64_t *glist; 2069 uint_t i, gcount; 2070 nvlist_t *nvl; 2071 vdev_t **vd; 2072 boolean_t attempt_reopen; 2073 2074 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2075 return; 2076 2077 /* check that the config is complete */ 2078 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2079 &glist, &gcount) != 0) 2080 return; 2081 2082 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2083 2084 /* attempt to online all the vdevs & validate */ 2085 attempt_reopen = B_TRUE; 2086 for (i = 0; i < gcount; i++) { 2087 if (glist[i] == 0) /* vdev is hole */ 2088 continue; 2089 2090 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2091 if (vd[i] == NULL) { 2092 /* 2093 * Don't bother attempting to reopen the disks; 2094 * just do the split. 2095 */ 2096 attempt_reopen = B_FALSE; 2097 } else { 2098 /* attempt to re-online it */ 2099 vd[i]->vdev_offline = B_FALSE; 2100 } 2101 } 2102 2103 if (attempt_reopen) { 2104 vdev_reopen(spa->spa_root_vdev); 2105 2106 /* check each device to see what state it's in */ 2107 for (extracted = 0, i = 0; i < gcount; i++) { 2108 if (vd[i] != NULL && 2109 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2110 break; 2111 ++extracted; 2112 } 2113 } 2114 2115 /* 2116 * If every disk has been moved to the new pool, or if we never 2117 * even attempted to look at them, then we split them off for 2118 * good. 
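 *
 * For reference, the nvlist shape this function consults is roughly
 * (a sketch inferred from the lookups above, not a formal definition):
 *
 *	config
 *	    ZPOOL_CONFIG_SPLIT (nvlist)
 *		ZPOOL_CONFIG_SPLIT_LIST (uint64 array of the guids of the
 *		    vdevs being split off; 0 marks a hole)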
2119 */ 2120 if (!attempt_reopen || gcount == extracted) { 2121 for (i = 0; i < gcount; i++) 2122 if (vd[i] != NULL) 2123 vdev_split(vd[i]); 2124 vdev_reopen(spa->spa_root_vdev); 2125 } 2126 2127 kmem_free(vd, gcount * sizeof (vdev_t *)); 2128} 2129 2130static int 2131spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2132 boolean_t mosconfig) 2133{ 2134 nvlist_t *config = spa->spa_config; 2135 char *ereport = FM_EREPORT_ZFS_POOL; 2136 char *comment; 2137 int error; 2138 uint64_t pool_guid; 2139 nvlist_t *nvl; 2140 2141 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2142 return (SET_ERROR(EINVAL)); 2143 2144 ASSERT(spa->spa_comment == NULL); 2145 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2146 spa->spa_comment = spa_strdup(comment); 2147 2148 /* 2149 * Versioning wasn't explicitly added to the label until later, so if 2150 * it's not present treat it as the initial version. 2151 */ 2152 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2153 &spa->spa_ubsync.ub_version) != 0) 2154 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2155 2156 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2157 &spa->spa_config_txg); 2158 2159 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2160 spa_guid_exists(pool_guid, 0)) { 2161 error = SET_ERROR(EEXIST); 2162 } else { 2163 spa->spa_config_guid = pool_guid; 2164 2165 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2166 &nvl) == 0) { 2167 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2168 KM_SLEEP) == 0); 2169 } 2170 2171 nvlist_free(spa->spa_load_info); 2172 spa->spa_load_info = fnvlist_alloc(); 2173 2174 gethrestime(&spa->spa_loaded_ts); 2175 error = spa_load_impl(spa, pool_guid, config, state, type, 2176 mosconfig, &ereport); 2177 } 2178 2179 spa->spa_minref = refcount_count(&spa->spa_refcount); 2180 if (error) { 2181 if (error != EEXIST) { 2182 spa->spa_loaded_ts.tv_sec = 0; 2183 spa->spa_loaded_ts.tv_nsec = 0; 2184 } 2185 if (error != EBADF) { 2186 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2187 } 2188 } 2189 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2190 spa->spa_ena = 0; 2191 2192 return (error); 2193} 2194 2195/* 2196 * Load an existing storage pool, using the pool's builtin spa_config as a 2197 * source of configuration information. 2198 */ 2199static int 2200spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2201 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2202 char **ereport) 2203{ 2204 int error = 0; 2205 nvlist_t *nvroot = NULL; 2206 nvlist_t *label; 2207 vdev_t *rvd; 2208 uberblock_t *ub = &spa->spa_uberblock; 2209 uint64_t children, config_cache_txg = spa->spa_config_txg; 2210 int orig_mode = spa->spa_mode; 2211 int parse; 2212 uint64_t obj; 2213 boolean_t missing_feat_write = B_FALSE; 2214 2215 /* 2216 * If this is an untrusted config, access the pool in read-only mode. 2217 * This prevents things like resilvering recently removed devices. 2218 */ 2219 if (!mosconfig) 2220 spa->spa_mode = FREAD; 2221 2222 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2223 2224 spa->spa_load_state = state; 2225 2226 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2227 return (SET_ERROR(EINVAL)); 2228 2229 parse = (type == SPA_IMPORT_EXISTING ? 
2230 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2231 2232 /* 2233 * Create "The Godfather" zio to hold all async IOs 2234 */ 2235 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2236 KM_SLEEP); 2237 for (int i = 0; i < max_ncpus; i++) { 2238 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2239 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2240 ZIO_FLAG_GODFATHER); 2241 } 2242 2243 /* 2244 * Parse the configuration into a vdev tree. We explicitly set the 2245 * value that will be returned by spa_version() since parsing the 2246 * configuration requires knowing the version number. 2247 */ 2248 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2249 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2250 spa_config_exit(spa, SCL_ALL, FTAG); 2251 2252 if (error != 0) 2253 return (error); 2254 2255 ASSERT(spa->spa_root_vdev == rvd); 2256 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2257 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2258 2259 if (type != SPA_IMPORT_ASSEMBLE) { 2260 ASSERT(spa_guid(spa) == pool_guid); 2261 } 2262 2263 /* 2264 * Try to open all vdevs, loading each label in the process. 2265 */ 2266 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2267 error = vdev_open(rvd); 2268 spa_config_exit(spa, SCL_ALL, FTAG); 2269 if (error != 0) 2270 return (error); 2271 2272 /* 2273 * We need to validate the vdev labels against the configuration that 2274 * we have in hand, which is dependent on the setting of mosconfig. If 2275 * mosconfig is true then we're validating the vdev labels based on 2276 * that config. Otherwise, we're validating against the cached config 2277 * (zpool.cache) that was read when we loaded the zfs module, and then 2278 * later we will recursively call spa_load() and validate against 2279 * the vdev config. 2280 * 2281 * If we're assembling a new pool that's been split off from an 2282 * existing pool, the labels haven't yet been updated so we skip 2283 * validation for now. 2284 */ 2285 if (type != SPA_IMPORT_ASSEMBLE) { 2286 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2287 error = vdev_validate(rvd, mosconfig); 2288 spa_config_exit(spa, SCL_ALL, FTAG); 2289 2290 if (error != 0) 2291 return (error); 2292 2293 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2294 return (SET_ERROR(ENXIO)); 2295 } 2296 2297 /* 2298 * Find the best uberblock. 2299 */ 2300 vdev_uberblock_load(rvd, ub, &label); 2301 2302 /* 2303 * If we weren't able to find a single valid uberblock, return failure. 2304 */ 2305 if (ub->ub_txg == 0) { 2306 nvlist_free(label); 2307 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2308 } 2309 2310 /* 2311 * If the pool has an unsupported version we can't open it. 2312 */ 2313 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2314 nvlist_free(label); 2315 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2316 } 2317 2318 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2319 nvlist_t *features; 2320 2321 /* 2322 * If we weren't able to find what's necessary for reading the 2323 * MOS in the label, return failure. 2324 */ 2325 if (label == NULL || nvlist_lookup_nvlist(label, 2326 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2327 nvlist_free(label); 2328 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2329 ENXIO)); 2330 } 2331 2332 /* 2333 * Update our in-core representation with the definitive values 2334 * from the label. 
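 *
 * Roughly, the portion of the label consulted here looks like this
 * (a sketch; the feature name shown is hypothetical):
 *
 *	label
 *	    ZPOOL_CONFIG_FEATURES_FOR_READ (nvlist)
 *		"com.example:some_feature" -> ""	(presence means the
 *		    feature must be understood to read the pool)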
2335 */ 2336 nvlist_free(spa->spa_label_features); 2337 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2338 } 2339 2340 nvlist_free(label); 2341 2342 /* 2343 * Look through entries in the label nvlist's features_for_read. If 2344 * there is a feature listed there which we don't understand then we 2345 * cannot open a pool. 2346 */ 2347 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2348 nvlist_t *unsup_feat; 2349 2350 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2351 0); 2352 2353 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2354 NULL); nvp != NULL; 2355 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2356 if (!zfeature_is_supported(nvpair_name(nvp))) { 2357 VERIFY(nvlist_add_string(unsup_feat, 2358 nvpair_name(nvp), "") == 0); 2359 } 2360 } 2361 2362 if (!nvlist_empty(unsup_feat)) { 2363 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2364 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2365 nvlist_free(unsup_feat); 2366 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2367 ENOTSUP)); 2368 } 2369 2370 nvlist_free(unsup_feat); 2371 } 2372 2373 /* 2374 * If the vdev guid sum doesn't match the uberblock, we have an 2375 * incomplete configuration. We first check to see if the pool 2376 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2377 * If it is, defer the vdev_guid_sum check till later so we 2378 * can handle missing vdevs. 2379 */ 2380 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2381 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2382 rvd->vdev_guid_sum != ub->ub_guid_sum) 2383 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2384 2385 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2386 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2387 spa_try_repair(spa, config); 2388 spa_config_exit(spa, SCL_ALL, FTAG); 2389 nvlist_free(spa->spa_config_splitting); 2390 spa->spa_config_splitting = NULL; 2391 } 2392 2393 /* 2394 * Initialize internal SPA structures. 2395 */ 2396 spa->spa_state = POOL_STATE_ACTIVE; 2397 spa->spa_ubsync = spa->spa_uberblock; 2398 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2399 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2400 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2401 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2402 spa->spa_claim_max_txg = spa->spa_first_txg; 2403 spa->spa_prev_software_version = ub->ub_software_version; 2404 2405 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2406 if (error) 2407 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2408 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2409 2410 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2411 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2412 2413 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2414 boolean_t missing_feat_read = B_FALSE; 2415 nvlist_t *unsup_feat, *enabled_feat; 2416 2417 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2418 &spa->spa_feat_for_read_obj) != 0) { 2419 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2420 } 2421 2422 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2423 &spa->spa_feat_for_write_obj) != 0) { 2424 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2425 } 2426 2427 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2428 &spa->spa_feat_desc_obj) != 0) { 2429 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2430 } 2431 2432 enabled_feat = fnvlist_alloc(); 2433 unsup_feat = fnvlist_alloc(); 2434 2435 if (!spa_features_check(spa, B_FALSE, 2436 unsup_feat, enabled_feat)) 2437 missing_feat_read = B_TRUE; 2438 2439 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2440 if (!spa_features_check(spa, B_TRUE, 2441 unsup_feat, enabled_feat)) { 2442 missing_feat_write = B_TRUE; 2443 } 2444 } 2445 2446 fnvlist_add_nvlist(spa->spa_load_info, 2447 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2448 2449 if (!nvlist_empty(unsup_feat)) { 2450 fnvlist_add_nvlist(spa->spa_load_info, 2451 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2452 } 2453 2454 fnvlist_free(enabled_feat); 2455 fnvlist_free(unsup_feat); 2456 2457 if (!missing_feat_read) { 2458 fnvlist_add_boolean(spa->spa_load_info, 2459 ZPOOL_CONFIG_CAN_RDONLY); 2460 } 2461 2462 /* 2463 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2464 * twofold: to determine whether the pool is available for 2465 * import in read-write mode and (if it is not) whether the 2466 * pool is available for import in read-only mode. If the pool 2467 * is available for import in read-write mode, it is displayed 2468 * as available in userland; if it is not available for import 2469 * in read-only mode, it is displayed as unavailable in 2470 * userland. If the pool is available for import in read-only 2471 * mode but not read-write mode, it is displayed as unavailable 2472 * in userland with a special note that the pool is actually 2473 * available for open in read-only mode. 2474 * 2475 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2476 * missing a feature for write, we must first determine whether 2477 * the pool can be opened read-only before returning to 2478 * userland in order to know whether to display the 2479 * abovementioned note. 2480 */ 2481 if (missing_feat_read || (missing_feat_write && 2482 spa_writeable(spa))) { 2483 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2484 ENOTSUP)); 2485 } 2486 2487 /* 2488 * Load refcounts for ZFS features from disk into an in-memory 2489 * cache during SPA initialization. 
2490 */ 2491 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2492 uint64_t refcount; 2493 2494 error = feature_get_refcount_from_disk(spa, 2495 &spa_feature_table[i], &refcount); 2496 if (error == 0) { 2497 spa->spa_feat_refcount_cache[i] = refcount; 2498 } else if (error == ENOTSUP) { 2499 spa->spa_feat_refcount_cache[i] = 2500 SPA_FEATURE_DISABLED; 2501 } else { 2502 return (spa_vdev_err(rvd, 2503 VDEV_AUX_CORRUPT_DATA, EIO)); 2504 } 2505 } 2506 } 2507 2508 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2509 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2510 &spa->spa_feat_enabled_txg_obj) != 0) 2511 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2512 } 2513 2514 spa->spa_is_initializing = B_TRUE; 2515 error = dsl_pool_open(spa->spa_dsl_pool); 2516 spa->spa_is_initializing = B_FALSE; 2517 if (error != 0) 2518 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2519 2520 if (!mosconfig) { 2521 uint64_t hostid; 2522 nvlist_t *policy = NULL, *nvconfig; 2523 2524 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2525 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2526 2527 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2528 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2529 char *hostname; 2530 unsigned long myhostid = 0; 2531 2532 VERIFY(nvlist_lookup_string(nvconfig, 2533 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2534 2535#ifdef _KERNEL 2536 myhostid = zone_get_hostid(NULL); 2537#else /* _KERNEL */ 2538 /* 2539 * We're emulating the system's hostid in userland, so 2540 * we can't use zone_get_hostid(). 2541 */ 2542 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2543#endif /* _KERNEL */ 2544 if (check_hostid && hostid != 0 && myhostid != 0 && 2545 hostid != myhostid) { 2546 nvlist_free(nvconfig); 2547 cmn_err(CE_WARN, "pool '%s' could not be " 2548 "loaded as it was last accessed by " 2549 "another system (host: %s hostid: 0x%lx). " 2550 "See: http://illumos.org/msg/ZFS-8000-EY", 2551 spa_name(spa), hostname, 2552 (unsigned long)hostid); 2553 return (SET_ERROR(EBADF)); 2554 } 2555 } 2556 if (nvlist_lookup_nvlist(spa->spa_config, 2557 ZPOOL_REWIND_POLICY, &policy) == 0) 2558 VERIFY(nvlist_add_nvlist(nvconfig, 2559 ZPOOL_REWIND_POLICY, policy) == 0); 2560 2561 spa_config_set(spa, nvconfig); 2562 spa_unload(spa); 2563 spa_deactivate(spa); 2564 spa_activate(spa, orig_mode); 2565 2566 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2567 } 2568 2569 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2570 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2571 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2572 if (error != 0) 2573 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2574 2575 /* 2576 * Load the bit that tells us to use the new accounting function 2577 * (raid-z deflation). If we have an older pool, this will not 2578 * be present. 2579 */ 2580 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2581 if (error != 0 && error != ENOENT) 2582 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2583 2584 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2585 &spa->spa_creation_version); 2586 if (error != 0 && error != ENOENT) 2587 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2588 2589 /* 2590 * Load the persistent error log. If we have an older pool, this will 2591 * not be present. 
2592 */ 2593 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2594 if (error != 0 && error != ENOENT) 2595 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2596 2597 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2598 &spa->spa_errlog_scrub); 2599 if (error != 0 && error != ENOENT) 2600 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2601 2602 /* 2603 * Load the history object. If we have an older pool, this 2604 * will not be present. 2605 */ 2606 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2607 if (error != 0 && error != ENOENT) 2608 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2609 2610 /* 2611 * If we're assembling the pool from the split-off vdevs of 2612 * an existing pool, we don't want to attach the spares & cache 2613 * devices. 2614 */ 2615 2616 /* 2617 * Load any hot spares for this pool. 2618 */ 2619 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2620 if (error != 0 && error != ENOENT) 2621 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2622 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2623 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2624 if (load_nvlist(spa, spa->spa_spares.sav_object, 2625 &spa->spa_spares.sav_config) != 0) 2626 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2627 2628 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2629 spa_load_spares(spa); 2630 spa_config_exit(spa, SCL_ALL, FTAG); 2631 } else if (error == 0) { 2632 spa->spa_spares.sav_sync = B_TRUE; 2633 } 2634 2635 /* 2636 * Load any level 2 ARC devices for this pool. 2637 */ 2638 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2639 &spa->spa_l2cache.sav_object); 2640 if (error != 0 && error != ENOENT) 2641 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2642 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2643 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2644 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2645 &spa->spa_l2cache.sav_config) != 0) 2646 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2647 2648 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2649 spa_load_l2cache(spa); 2650 spa_config_exit(spa, SCL_ALL, FTAG); 2651 } else if (error == 0) { 2652 spa->spa_l2cache.sav_sync = B_TRUE; 2653 } 2654 2655 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2656 2657 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2658 if (error && error != ENOENT) 2659 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2660 2661 if (error == 0) { 2662 uint64_t autoreplace; 2663 2664 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2665 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2666 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2667 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2668 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2669 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2670 &spa->spa_dedup_ditto); 2671 2672 spa->spa_autoreplace = (autoreplace != 0); 2673 } 2674 2675 /* 2676 * If the 'autoreplace' property is set, then post a resource notifying 2677 * the ZFS DE that it should not issue any faults for unopenable 2678 * devices. We also iterate over the vdevs, and post a sysevent for any 2679 * unopenable vdevs so that the normal autoreplace handler can take 2680 * over. 
2681 */ 2682 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2683 spa_check_removed(spa->spa_root_vdev); 2684 /* 2685 * For the import case, this is done in spa_import(), because 2686 * at this point we're using the spare definitions from 2687 * the MOS config, not necessarily from the userland config. 2688 */ 2689 if (state != SPA_LOAD_IMPORT) { 2690 spa_aux_check_removed(&spa->spa_spares); 2691 spa_aux_check_removed(&spa->spa_l2cache); 2692 } 2693 } 2694 2695 /* 2696 * Load the vdev state for all toplevel vdevs. 2697 */ 2698 vdev_load(rvd); 2699 2700 /* 2701 * Propagate the leaf DTLs we just loaded all the way up the tree. 2702 */ 2703 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2704 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2705 spa_config_exit(spa, SCL_ALL, FTAG); 2706 2707 /* 2708 * Load the DDTs (dedup tables). 2709 */ 2710 error = ddt_load(spa); 2711 if (error != 0) 2712 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2713 2714 spa_update_dspace(spa); 2715 2716 /* 2717 * Validate the config, using the MOS config to fill in any 2718 * information which might be missing. If we fail to validate 2719 * the config then declare the pool unfit for use. If we're 2720 * assembling a pool from a split, the log is not transferred 2721 * over. 2722 */ 2723 if (type != SPA_IMPORT_ASSEMBLE) { 2724 nvlist_t *nvconfig; 2725 2726 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2727 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2728 2729 if (!spa_config_valid(spa, nvconfig)) { 2730 nvlist_free(nvconfig); 2731 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2732 ENXIO)); 2733 } 2734 nvlist_free(nvconfig); 2735 2736 /* 2737 * Now that we've validated the config, check the state of the 2738 * root vdev. If it can't be opened, it indicates one or 2739 * more toplevel vdevs are faulted. 2740 */ 2741 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2742 return (SET_ERROR(ENXIO)); 2743 2744 if (spa_check_logs(spa)) { 2745 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2746 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2747 } 2748 } 2749 2750 if (missing_feat_write) { 2751 ASSERT(state == SPA_LOAD_TRYIMPORT); 2752 2753 /* 2754 * At this point, we know that we can open the pool in 2755 * read-only mode but not read-write mode. We now have enough 2756 * information and can return to userland. 2757 */ 2758 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2759 } 2760 2761 /* 2762 * We've successfully opened the pool, verify that we're ready 2763 * to start pushing transactions. 2764 */ 2765 if (state != SPA_LOAD_TRYIMPORT) { 2766 if (error = spa_load_verify(spa)) 2767 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2768 error)); 2769 } 2770 2771 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2772 spa->spa_load_max_txg == UINT64_MAX)) { 2773 dmu_tx_t *tx; 2774 int need_update = B_FALSE; 2775 2776 ASSERT(state != SPA_LOAD_TRYIMPORT); 2777 2778 /* 2779 * Claim log blocks that haven't been committed yet. 2780 * This must all happen in a single txg. 2781 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2782 * invoked from zil_claim_log_block()'s i/o done callback. 2783 * Price of rollback is that we abandon the log. 
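 *
 * A sketch of the flow described above:
 *
 *	zil_claim() issues claim i/os; each i/o done callback runs
 *	    spa_claim_notify(zio), which does (under spa_props_lock):
 *		spa->spa_claim_max_txg = MAX(spa->spa_claim_max_txg,
 *		    zio->io_bp->blk_birth);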
2784 */ 2785 spa->spa_claiming = B_TRUE; 2786 2787 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2788 spa_first_txg(spa)); 2789 (void) dmu_objset_find(spa_name(spa), 2790 zil_claim, tx, DS_FIND_CHILDREN); 2791 dmu_tx_commit(tx); 2792 2793 spa->spa_claiming = B_FALSE; 2794 2795 spa_set_log_state(spa, SPA_LOG_GOOD); 2796 spa->spa_sync_on = B_TRUE; 2797 txg_sync_start(spa->spa_dsl_pool); 2798 2799 /* 2800 * Wait for all claims to sync. We sync up to the highest 2801 * claimed log block birth time so that claimed log blocks 2802 * don't appear to be from the future. spa_claim_max_txg 2803 * will have been set for us by either zil_check_log_chain() 2804 * (invoked from spa_check_logs()) or zil_claim() above. 2805 */ 2806 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2807 2808 /* 2809 * If the config cache is stale, or we have uninitialized 2810 * metaslabs (see spa_vdev_add()), then update the config. 2811 * 2812 * If this is a verbatim import, trust the current 2813 * in-core spa_config and update the disk labels. 2814 */ 2815 if (config_cache_txg != spa->spa_config_txg || 2816 state == SPA_LOAD_IMPORT || 2817 state == SPA_LOAD_RECOVER || 2818 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2819 need_update = B_TRUE; 2820 2821 for (int c = 0; c < rvd->vdev_children; c++) 2822 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2823 need_update = B_TRUE; 2824 2825 /* 2826 * Update the config cache asynchronously in case we're the 2827 * root pool, in which case the config cache isn't writable yet. 2828 */ 2829 if (need_update) 2830 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2831 2832 /* 2833 * Check all DTLs to see if anything needs resilvering. 2834 */ 2835 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2836 vdev_resilver_needed(rvd, NULL, NULL)) 2837 spa_async_request(spa, SPA_ASYNC_RESILVER); 2838 2839 /* 2840 * Log the fact that we booted up (so that we can detect if 2841 * we rebooted in the middle of an operation). 2842 */ 2843 spa_history_log_version(spa, "open"); 2844 2845 /* 2846 * Delete any inconsistent datasets. 2847 */ 2848 (void) dmu_objset_find(spa_name(spa), 2849 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2850 2851 /* 2852 * Clean up any stale temporary dataset userrefs. 2853 */ 2854 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2855 } 2856 2857 return (0); 2858} 2859 2860static int 2861spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2862{ 2863 int mode = spa->spa_mode; 2864 2865 spa_unload(spa); 2866 spa_deactivate(spa); 2867 2868 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2869 2870 spa_activate(spa, mode); 2871 spa_async_suspend(spa); 2872 2873 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2874} 2875 2876/* 2877 * If spa_load() fails, this function will try loading prior txgs. If 2878 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2879 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2880 * function will not rewind the pool and will return the same error as 2881 * spa_load().
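 *
 * As an illustrative sketch of how a recovery request reaches this path
 * (the inner nvlist key macros shown are assumptions, not defined in
 * this file), userland attaches a rewind policy to the config before
 * import:
 *
 *	nvlist_t *policy = fnvlist_alloc();
 *	fnvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, ZPOOL_DO_REWIND);
 *	fnvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg);
 *	fnvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, policy);
 *
 * zpool_get_rewind_policy() then decodes this into the zrp_request and
 * zrp_txg fields consumed here and in spa_open_common().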
2882 */ 2883static int 2884spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2885 uint64_t max_request, int rewind_flags) 2886{ 2887 nvlist_t *loadinfo = NULL; 2888 nvlist_t *config = NULL; 2889 int load_error, rewind_error; 2890 uint64_t safe_rewind_txg; 2891 uint64_t min_txg; 2892 2893 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2894 spa->spa_load_max_txg = spa->spa_load_txg; 2895 spa_set_log_state(spa, SPA_LOG_CLEAR); 2896 } else { 2897 spa->spa_load_max_txg = max_request; 2898 if (max_request != UINT64_MAX) 2899 spa->spa_extreme_rewind = B_TRUE; 2900 } 2901 2902 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2903 mosconfig); 2904 if (load_error == 0) 2905 return (0); 2906 2907 if (spa->spa_root_vdev != NULL) 2908 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2909 2910 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2911 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2912 2913 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2914 nvlist_free(config); 2915 return (load_error); 2916 } 2917 2918 if (state == SPA_LOAD_RECOVER) { 2919 /* Price of rolling back is discarding txgs, including log */ 2920 spa_set_log_state(spa, SPA_LOG_CLEAR); 2921 } else { 2922 /* 2923 * If we aren't rolling back, save the load info from our first 2924 * import attempt so that we can restore it after attempting 2925 * to rewind. 2926 */ 2927 loadinfo = spa->spa_load_info; 2928 spa->spa_load_info = fnvlist_alloc(); 2929 } 2930 2931 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2932 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2933 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2934 TXG_INITIAL : safe_rewind_txg; 2935 2936 /* 2937 * Continue as long as we're finding errors, we're still within 2938 * the acceptable rewind range, and we're still finding uberblocks. 2939 */ 2940 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2941 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2942 if (spa->spa_load_max_txg < safe_rewind_txg) 2943 spa->spa_extreme_rewind = B_TRUE; 2944 rewind_error = spa_load_retry(spa, state, mosconfig); 2945 } 2946 2947 spa->spa_extreme_rewind = B_FALSE; 2948 spa->spa_load_max_txg = UINT64_MAX; 2949 2950 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2951 spa_config_set(spa, config); 2952 2953 if (state == SPA_LOAD_RECOVER) { 2954 ASSERT3P(loadinfo, ==, NULL); 2955 return (rewind_error); 2956 } else { 2957 /* Store the rewind info as part of the initial load info */ 2958 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2959 spa->spa_load_info); 2960 2961 /* Restore the initial load info */ 2962 fnvlist_free(spa->spa_load_info); 2963 spa->spa_load_info = loadinfo; 2964 2965 return (load_error); 2966 } 2967} 2968 2969/* 2970 * Pool Open/Import 2971 * 2972 * The import case is identical to an open except that the configuration is sent 2973 * down from userland, instead of grabbed from the configuration cache. For the 2974 * case of an open, the pool configuration will exist in the 2975 * POOL_STATE_UNINITIALIZED state. 2976 * 2977 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2978 * the same time we open the pool, without having to keep around the spa_t in some 2979 * ambiguous state.
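 *
 * A minimal caller-side sketch (error handling elided; FTAG is the
 * usual per-file reference tag used throughout this codebase):
 *
 *	spa_t *spa;
 *	int error = spa_open("tank", &spa, FTAG);
 *	if (error == 0) {
 *		... examine or operate on the pool ...
 *		spa_close(spa, FTAG);
 *	}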
2980 */ 2981static int 2982spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2983 nvlist_t **config) 2984{ 2985 spa_t *spa; 2986 spa_load_state_t state = SPA_LOAD_OPEN; 2987 int error; 2988 int locked = B_FALSE; 2989 int firstopen = B_FALSE; 2990 2991 *spapp = NULL; 2992 2993 /* 2994 * As disgusting as this is, we need to support recursive calls to this 2995 * function because dsl_dir_open() is called during spa_load(), and ends 2996 * up calling spa_open() again. The real fix is to figure out how to 2997 * avoid dsl_dir_open() calling this in the first place. 2998 */ 2999 if (mutex_owner(&spa_namespace_lock) != curthread) { 3000 mutex_enter(&spa_namespace_lock); 3001 locked = B_TRUE; 3002 } 3003 3004 if ((spa = spa_lookup(pool)) == NULL) { 3005 if (locked) 3006 mutex_exit(&spa_namespace_lock); 3007 return (SET_ERROR(ENOENT)); 3008 } 3009 3010 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3011 zpool_rewind_policy_t policy; 3012 3013 firstopen = B_TRUE; 3014 3015 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3016 &policy); 3017 if (policy.zrp_request & ZPOOL_DO_REWIND) 3018 state = SPA_LOAD_RECOVER; 3019 3020 spa_activate(spa, spa_mode_global); 3021 3022 if (state != SPA_LOAD_RECOVER) 3023 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3024 3025 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3026 policy.zrp_request); 3027 3028 if (error == EBADF) { 3029 /* 3030 * If vdev_validate() returns failure (indicated by 3031 * EBADF), it means that one of the vdevs indicates 3032 * that the pool has been exported or destroyed. If 3033 * this is the case, the config cache is out of sync and 3034 * we should remove the pool from the namespace. 3035 */ 3036 spa_unload(spa); 3037 spa_deactivate(spa); 3038 spa_config_sync(spa, B_TRUE, B_TRUE); 3039 spa_remove(spa); 3040 if (locked) 3041 mutex_exit(&spa_namespace_lock); 3042 return (SET_ERROR(ENOENT)); 3043 } 3044 3045 if (error) { 3046 /* 3047 * We can't open the pool, but we still have useful 3048 * information: the state of each vdev after the 3049 * attempted vdev_open(). Return this to the user. 3050 */ 3051 if (config != NULL && spa->spa_config) { 3052 VERIFY(nvlist_dup(spa->spa_config, config, 3053 KM_SLEEP) == 0); 3054 VERIFY(nvlist_add_nvlist(*config, 3055 ZPOOL_CONFIG_LOAD_INFO, 3056 spa->spa_load_info) == 0); 3057 } 3058 spa_unload(spa); 3059 spa_deactivate(spa); 3060 spa->spa_last_open_failed = error; 3061 if (locked) 3062 mutex_exit(&spa_namespace_lock); 3063 *spapp = NULL; 3064 return (error); 3065 } 3066 } 3067 3068 spa_open_ref(spa, tag); 3069 3070 if (config != NULL) 3071 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3072 3073 /* 3074 * If we've recovered the pool, pass back any information we 3075 * gathered while doing the load.
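 *
 * For illustration (a sketch, not code from this file), a consumer of
 * the returned config can pick that information back out with:
 *
 *	nvlist_t *loadinfo;
 *	if (nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &loadinfo) == 0) {
 *		... inspect rewind txg/time and error counts ...
 *	}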
3076 */ 3077 if (state == SPA_LOAD_RECOVER) { 3078 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3079 spa->spa_load_info) == 0); 3080 } 3081 3082 if (locked) { 3083 spa->spa_last_open_failed = 0; 3084 spa->spa_last_ubsync_txg = 0; 3085 spa->spa_load_txg = 0; 3086 mutex_exit(&spa_namespace_lock); 3087#ifdef __FreeBSD__ 3088#ifdef _KERNEL 3089 if (firstopen) 3090 zvol_create_minors(spa->spa_name); 3091#endif 3092#endif 3093 } 3094 3095 *spapp = spa; 3096 3097 return (0); 3098} 3099 3100int 3101spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3102 nvlist_t **config) 3103{ 3104 return (spa_open_common(name, spapp, tag, policy, config)); 3105} 3106 3107int 3108spa_open(const char *name, spa_t **spapp, void *tag) 3109{ 3110 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3111} 3112 3113/* 3114 * Lookup the given spa_t, incrementing the inject count in the process, 3115 * preventing it from being exported or destroyed. 3116 */ 3117spa_t * 3118spa_inject_addref(char *name) 3119{ 3120 spa_t *spa; 3121 3122 mutex_enter(&spa_namespace_lock); 3123 if ((spa = spa_lookup(name)) == NULL) { 3124 mutex_exit(&spa_namespace_lock); 3125 return (NULL); 3126 } 3127 spa->spa_inject_ref++; 3128 mutex_exit(&spa_namespace_lock); 3129 3130 return (spa); 3131} 3132 3133void 3134spa_inject_delref(spa_t *spa) 3135{ 3136 mutex_enter(&spa_namespace_lock); 3137 spa->spa_inject_ref--; 3138 mutex_exit(&spa_namespace_lock); 3139} 3140 3141/* 3142 * Add spares device information to the nvlist. 3143 */ 3144static void 3145spa_add_spares(spa_t *spa, nvlist_t *config) 3146{ 3147 nvlist_t **spares; 3148 uint_t i, nspares; 3149 nvlist_t *nvroot; 3150 uint64_t guid; 3151 vdev_stat_t *vs; 3152 uint_t vsc; 3153 uint64_t pool; 3154 3155 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3156 3157 if (spa->spa_spares.sav_count == 0) 3158 return; 3159 3160 VERIFY(nvlist_lookup_nvlist(config, 3161 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3162 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3163 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3164 if (nspares != 0) { 3165 VERIFY(nvlist_add_nvlist_array(nvroot, 3166 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3167 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3168 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3169 3170 /* 3171 * Go through and find any spares which have since been 3172 * repurposed as an active spare. If this is the case, update 3173 * their status appropriately. 3174 */ 3175 for (i = 0; i < nspares; i++) { 3176 VERIFY(nvlist_lookup_uint64(spares[i], 3177 ZPOOL_CONFIG_GUID, &guid) == 0); 3178 if (spa_spare_exists(guid, &pool, NULL) && 3179 pool != 0ULL) { 3180 VERIFY(nvlist_lookup_uint64_array( 3181 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3182 (uint64_t **)&vs, &vsc) == 0); 3183 vs->vs_state = VDEV_STATE_CANT_OPEN; 3184 vs->vs_aux = VDEV_AUX_SPARED; 3185 } 3186 } 3187 } 3188} 3189 3190/* 3191 * Add l2cache device information to the nvlist, including vdev stats. 
3192 */ 3193static void 3194spa_add_l2cache(spa_t *spa, nvlist_t *config) 3195{ 3196 nvlist_t **l2cache; 3197 uint_t i, j, nl2cache; 3198 nvlist_t *nvroot; 3199 uint64_t guid; 3200 vdev_t *vd; 3201 vdev_stat_t *vs; 3202 uint_t vsc; 3203 3204 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3205 3206 if (spa->spa_l2cache.sav_count == 0) 3207 return; 3208 3209 VERIFY(nvlist_lookup_nvlist(config, 3210 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3211 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3212 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3213 if (nl2cache != 0) { 3214 VERIFY(nvlist_add_nvlist_array(nvroot, 3215 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3216 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3217 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3218 3219 /* 3220 * Update level 2 cache device stats. 3221 */ 3222 3223 for (i = 0; i < nl2cache; i++) { 3224 VERIFY(nvlist_lookup_uint64(l2cache[i], 3225 ZPOOL_CONFIG_GUID, &guid) == 0); 3226 3227 vd = NULL; 3228 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3229 if (guid == 3230 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3231 vd = spa->spa_l2cache.sav_vdevs[j]; 3232 break; 3233 } 3234 } 3235 ASSERT(vd != NULL); 3236 3237 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3238 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3239 == 0); 3240 vdev_get_stats(vd, vs); 3241 } 3242 } 3243} 3244 3245static void 3246spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3247{ 3248 nvlist_t *features; 3249 zap_cursor_t zc; 3250 zap_attribute_t za; 3251 3252 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3253 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3254 3255 /* We may be unable to read features if pool is suspended. */ 3256 if (spa_suspended(spa)) 3257 goto out; 3258 3259 if (spa->spa_feat_for_read_obj != 0) { 3260 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3261 spa->spa_feat_for_read_obj); 3262 zap_cursor_retrieve(&zc, &za) == 0; 3263 zap_cursor_advance(&zc)) { 3264 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3265 za.za_num_integers == 1); 3266 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3267 za.za_first_integer)); 3268 } 3269 zap_cursor_fini(&zc); 3270 } 3271 3272 if (spa->spa_feat_for_write_obj != 0) { 3273 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3274 spa->spa_feat_for_write_obj); 3275 zap_cursor_retrieve(&zc, &za) == 0; 3276 zap_cursor_advance(&zc)) { 3277 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3278 za.za_num_integers == 1); 3279 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3280 za.za_first_integer)); 3281 } 3282 zap_cursor_fini(&zc); 3283 } 3284 3285out: 3286 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3287 features) == 0); 3288 nvlist_free(features); 3289} 3290 3291int 3292spa_get_stats(const char *name, nvlist_t **config, 3293 char *altroot, size_t buflen) 3294{ 3295 int error; 3296 spa_t *spa; 3297 3298 *config = NULL; 3299 error = spa_open_common(name, &spa, FTAG, NULL, config); 3300 3301 if (spa != NULL) { 3302 /* 3303 * This still leaves a window of inconsistency where the spares 3304 * or l2cache devices could change and the config would be 3305 * self-inconsistent. 
3306 */ 3307 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3308 3309 if (*config != NULL) { 3310 uint64_t loadtimes[2]; 3311 3312 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3313 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3314 VERIFY(nvlist_add_uint64_array(*config, 3315 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3316 3317 VERIFY(nvlist_add_uint64(*config, 3318 ZPOOL_CONFIG_ERRCOUNT, 3319 spa_get_errlog_size(spa)) == 0); 3320 3321 if (spa_suspended(spa)) 3322 VERIFY(nvlist_add_uint64(*config, 3323 ZPOOL_CONFIG_SUSPENDED, 3324 spa->spa_failmode) == 0); 3325 3326 spa_add_spares(spa, *config); 3327 spa_add_l2cache(spa, *config); 3328 spa_add_feature_stats(spa, *config); 3329 } 3330 } 3331 3332 /* 3333 * We want to get the alternate root even for faulted pools, so we cheat 3334 * and call spa_lookup() directly. 3335 */ 3336 if (altroot) { 3337 if (spa == NULL) { 3338 mutex_enter(&spa_namespace_lock); 3339 spa = spa_lookup(name); 3340 if (spa) 3341 spa_altroot(spa, altroot, buflen); 3342 else 3343 altroot[0] = '\0'; 3344 spa = NULL; 3345 mutex_exit(&spa_namespace_lock); 3346 } else { 3347 spa_altroot(spa, altroot, buflen); 3348 } 3349 } 3350 3351 if (spa != NULL) { 3352 spa_config_exit(spa, SCL_CONFIG, FTAG); 3353 spa_close(spa, FTAG); 3354 } 3355 3356 return (error); 3357} 3358 3359/* 3360 * Validate that the auxiliary device array is well formed. We must have an 3361 * array of nvlists, each of which describes a valid leaf vdev. If this is an 3362 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3363 * specified, as long as they are well-formed. 3364 */ 3365static int 3366spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3367 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3368 vdev_labeltype_t label) 3369{ 3370 nvlist_t **dev; 3371 uint_t i, ndev; 3372 vdev_t *vd; 3373 int error; 3374 3375 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3376 3377 /* 3378 * It's acceptable to have no devs specified. 3379 */ 3380 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3381 return (0); 3382 3383 if (ndev == 0) 3384 return (SET_ERROR(EINVAL)); 3385 3386 /* 3387 * Make sure the pool is formatted with a version that supports this 3388 * device type. 3389 */ 3390 if (spa_version(spa) < version) 3391 return (SET_ERROR(ENOTSUP)); 3392 3393 /* 3394 * Set the pending device list so we correctly handle device in-use 3395 * checking. 3396 */ 3397 sav->sav_pending = dev; 3398 sav->sav_npending = ndev; 3399 3400 for (i = 0; i < ndev; i++) { 3401 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3402 mode)) != 0) 3403 goto out; 3404 3405 if (!vd->vdev_ops->vdev_op_leaf) { 3406 vdev_free(vd); 3407 error = SET_ERROR(EINVAL); 3408 goto out; 3409 } 3410 3411 /* 3412 * The L2ARC currently only supports disk devices in 3413 * kernel context. For user-level testing, we allow it.
3414 */ 3415#ifdef _KERNEL 3416 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3417 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3418 error = SET_ERROR(ENOTBLK); 3419 vdev_free(vd); 3420 goto out; 3421 } 3422#endif 3423 vd->vdev_top = vd; 3424 3425 if ((error = vdev_open(vd)) == 0 && 3426 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3427 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3428 vd->vdev_guid) == 0); 3429 } 3430 3431 vdev_free(vd); 3432 3433 if (error && 3434 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3435 goto out; 3436 else 3437 error = 0; 3438 } 3439 3440out: 3441 sav->sav_pending = NULL; 3442 sav->sav_npending = 0; 3443 return (error); 3444} 3445 3446static int 3447spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3448{ 3449 int error; 3450 3451 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3452 3453 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3454 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3455 VDEV_LABEL_SPARE)) != 0) { 3456 return (error); 3457 } 3458 3459 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3460 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3461 VDEV_LABEL_L2CACHE)); 3462} 3463 3464static void 3465spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3466 const char *config) 3467{ 3468 int i; 3469 3470 if (sav->sav_config != NULL) { 3471 nvlist_t **olddevs; 3472 uint_t oldndevs; 3473 nvlist_t **newdevs; 3474 3475 /* 3476 * Generate new dev list by concatenating with the 3477 * current dev list. 3478 */ 3479 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3480 &olddevs, &oldndevs) == 0); 3481 3482 newdevs = kmem_alloc(sizeof (void *) * 3483 (ndevs + oldndevs), KM_SLEEP); 3484 for (i = 0; i < oldndevs; i++) 3485 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3486 KM_SLEEP) == 0); 3487 for (i = 0; i < ndevs; i++) 3488 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3489 KM_SLEEP) == 0); 3490 3491 VERIFY(nvlist_remove(sav->sav_config, config, 3492 DATA_TYPE_NVLIST_ARRAY) == 0); 3493 3494 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3495 config, newdevs, ndevs + oldndevs) == 0); 3496 for (i = 0; i < oldndevs + ndevs; i++) 3497 nvlist_free(newdevs[i]); 3498 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3499 } else { 3500 /* 3501 * Generate a new dev list. 3502 */ 3503 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3504 KM_SLEEP) == 0); 3505 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3506 devs, ndevs) == 0); 3507 } 3508} 3509 3510/* 3511 * Stop and drop level 2 ARC devices 3512 */ 3513void 3514spa_l2cache_drop(spa_t *spa) 3515{ 3516 vdev_t *vd; 3517 int i; 3518 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3519 3520 for (i = 0; i < sav->sav_count; i++) { 3521 uint64_t pool; 3522 3523 vd = sav->sav_vdevs[i]; 3524 ASSERT(vd != NULL); 3525 3526 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3527 pool != 0ULL && l2arc_vdev_present(vd)) 3528 l2arc_remove_vdev(vd); 3529 } 3530} 3531 3532/* 3533 * Pool Creation 3534 */ 3535int 3536spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3537 nvlist_t *zplprops) 3538{ 3539 spa_t *spa; 3540 char *altroot = NULL; 3541 vdev_t *rvd; 3542 dsl_pool_t *dp; 3543 dmu_tx_t *tx; 3544 int error = 0; 3545 uint64_t txg = TXG_INITIAL; 3546 nvlist_t **spares, **l2cache; 3547 uint_t nspares, nl2cache; 3548 uint64_t version, obj; 3549 boolean_t has_features; 3550 3551 /* 3552 * If this pool already exists, return failure.
3553 */ 3554 mutex_enter(&spa_namespace_lock); 3555 if (spa_lookup(pool) != NULL) { 3556 mutex_exit(&spa_namespace_lock); 3557 return (SET_ERROR(EEXIST)); 3558 } 3559 3560 /* 3561 * Allocate a new spa_t structure. 3562 */ 3563 (void) nvlist_lookup_string(props, 3564 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3565 spa = spa_add(pool, NULL, altroot); 3566 spa_activate(spa, spa_mode_global); 3567 3568 if (props && (error = spa_prop_validate(spa, props))) { 3569 spa_deactivate(spa); 3570 spa_remove(spa); 3571 mutex_exit(&spa_namespace_lock); 3572 return (error); 3573 } 3574 3575 has_features = B_FALSE; 3576 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3577 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3578 if (zpool_prop_feature(nvpair_name(elem))) 3579 has_features = B_TRUE; 3580 } 3581 3582 if (has_features || nvlist_lookup_uint64(props, 3583 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3584 version = SPA_VERSION; 3585 } 3586 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3587 3588 spa->spa_first_txg = txg; 3589 spa->spa_uberblock.ub_txg = txg - 1; 3590 spa->spa_uberblock.ub_version = version; 3591 spa->spa_ubsync = spa->spa_uberblock; 3592 3593 /* 3594 * Create "The Godfather" zio to hold all async IOs 3595 */ 3596 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3597 KM_SLEEP); 3598 for (int i = 0; i < max_ncpus; i++) { 3599 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3600 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3601 ZIO_FLAG_GODFATHER); 3602 } 3603 3604 /* 3605 * Create the root vdev. 3606 */ 3607 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3608 3609 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3610 3611 ASSERT(error != 0 || rvd != NULL); 3612 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3613 3614 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3615 error = SET_ERROR(EINVAL); 3616 3617 if (error == 0 && 3618 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3619 (error = spa_validate_aux(spa, nvroot, txg, 3620 VDEV_ALLOC_ADD)) == 0) { 3621 for (int c = 0; c < rvd->vdev_children; c++) { 3622 vdev_ashift_optimize(rvd->vdev_child[c]); 3623 vdev_metaslab_set_size(rvd->vdev_child[c]); 3624 vdev_expand(rvd->vdev_child[c], txg); 3625 } 3626 } 3627 3628 spa_config_exit(spa, SCL_ALL, FTAG); 3629 3630 if (error != 0) { 3631 spa_unload(spa); 3632 spa_deactivate(spa); 3633 spa_remove(spa); 3634 mutex_exit(&spa_namespace_lock); 3635 return (error); 3636 } 3637 3638 /* 3639 * Get the list of spares, if specified. 3640 */ 3641 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3642 &spares, &nspares) == 0) { 3643 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3644 KM_SLEEP) == 0); 3645 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3646 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3647 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3648 spa_load_spares(spa); 3649 spa_config_exit(spa, SCL_ALL, FTAG); 3650 spa->spa_spares.sav_sync = B_TRUE; 3651 } 3652 3653 /* 3654 * Get the list of level 2 cache devices, if specified. 
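 *
 * As with the spares above, this arrives as an array of leaf-vdev
 * nvlists; roughly (a sketch of the shape consumed below):
 *
 *	nvroot
 *	    ZPOOL_CONFIG_L2CACHE -> [ leaf vdev nvlist, ... ]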
3655 */ 3656 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3657 &l2cache, &nl2cache) == 0) { 3658 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3659 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3660 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3661 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3662 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3663 spa_load_l2cache(spa); 3664 spa_config_exit(spa, SCL_ALL, FTAG); 3665 spa->spa_l2cache.sav_sync = B_TRUE; 3666 } 3667 3668 spa->spa_is_initializing = B_TRUE; 3669 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3670 spa->spa_meta_objset = dp->dp_meta_objset; 3671 spa->spa_is_initializing = B_FALSE; 3672 3673 /* 3674 * Create DDTs (dedup tables). 3675 */ 3676 ddt_create(spa); 3677 3678 spa_update_dspace(spa); 3679 3680 tx = dmu_tx_create_assigned(dp, txg); 3681 3682 /* 3683 * Create the pool config object. 3684 */ 3685 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3686 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3687 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3688 3689 if (zap_add(spa->spa_meta_objset, 3690 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3691 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3692 cmn_err(CE_PANIC, "failed to add pool config"); 3693 } 3694 3695 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3696 spa_feature_create_zap_objects(spa, tx); 3697 3698 if (zap_add(spa->spa_meta_objset, 3699 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3700 sizeof (uint64_t), 1, &version, tx) != 0) { 3701 cmn_err(CE_PANIC, "failed to add pool version"); 3702 } 3703 3704 /* Newly created pools with the right version are always deflated. */ 3705 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3706 spa->spa_deflate = TRUE; 3707 if (zap_add(spa->spa_meta_objset, 3708 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3709 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3710 cmn_err(CE_PANIC, "failed to add deflate"); 3711 } 3712 } 3713 3714 /* 3715 * Create the deferred-free bpobj. Turn off compression 3716 * because sync-to-convergence takes longer if the blocksize 3717 * keeps changing. 3718 */ 3719 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3720 dmu_object_set_compress(spa->spa_meta_objset, obj, 3721 ZIO_COMPRESS_OFF, tx); 3722 if (zap_add(spa->spa_meta_objset, 3723 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3724 sizeof (uint64_t), 1, &obj, tx) != 0) { 3725 cmn_err(CE_PANIC, "failed to add bpobj"); 3726 } 3727 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3728 spa->spa_meta_objset, obj)); 3729 3730 /* 3731 * Create the pool's history object. 3732 */ 3733 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3734 spa_history_create_obj(spa, tx); 3735 3736 /* 3737 * Set pool properties. 3738 */ 3739 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3740 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3741 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3742 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3743 3744 if (props != NULL) { 3745 spa_configfile_set(spa, props, B_FALSE); 3746 spa_sync_props(props, tx); 3747 } 3748 3749 dmu_tx_commit(tx); 3750 3751 spa->spa_sync_on = B_TRUE; 3752 txg_sync_start(spa->spa_dsl_pool); 3753 3754 /* 3755 * We explicitly wait for the first transaction to complete so that our 3756 * bean counters are appropriately updated. 
3757 */ 3758 txg_wait_synced(spa->spa_dsl_pool, txg); 3759 3760 spa_config_sync(spa, B_FALSE, B_TRUE); 3761 3762 spa_history_log_version(spa, "create"); 3763 3764 spa->spa_minref = refcount_count(&spa->spa_refcount); 3765 3766 mutex_exit(&spa_namespace_lock); 3767 3768 return (0); 3769} 3770 3771#ifdef _KERNEL 3772#if defined(sun) 3773/* 3774 * Get the root pool information from the root disk, then import the root pool 3775 * during system boot. 3776 */ 3777extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3778 3779static nvlist_t * 3780spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3781{ 3782 nvlist_t *config; 3783 nvlist_t *nvtop, *nvroot; 3784 uint64_t pgid; 3785 3786 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3787 return (NULL); 3788 3789 /* 3790 * Add this top-level vdev to the child array. 3791 */ 3792 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3793 &nvtop) == 0); 3794 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3795 &pgid) == 0); 3796 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3797 3798 /* 3799 * Put this pool's top-level vdevs into a root vdev. 3800 */ 3801 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3802 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3803 VDEV_TYPE_ROOT) == 0); 3804 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3805 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3806 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3807 &nvtop, 1) == 0); 3808 3809 /* 3810 * Replace the existing vdev_tree with the new root vdev in 3811 * this pool's configuration (remove the old, add the new). 3812 */ 3813 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3814 nvlist_free(nvroot); 3815 return (config); 3816} 3817 3818/* 3819 * Walk the vdev tree and see if we can find a device with "better" 3820 * configuration. A configuration is "better" if the label on that 3821 * device has a more recent txg. 3822 */ 3823static void 3824spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3825{ 3826 for (int c = 0; c < vd->vdev_children; c++) 3827 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3828 3829 if (vd->vdev_ops->vdev_op_leaf) { 3830 nvlist_t *label; 3831 uint64_t label_txg; 3832 3833 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3834 &label) != 0) 3835 return; 3836 3837 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3838 &label_txg) == 0); 3839 3840 /* 3841 * Do we have a better boot device? 3842 */ 3843 if (label_txg > *txg) { 3844 *txg = label_txg; 3845 *avd = vd; 3846 } 3847 nvlist_free(label); 3848 } 3849} 3850 3851/* 3852 * Import a root pool. 3853 * 3854 * For x86, devpath_list will consist of the devid and/or physpath name of 3855 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3856 * The GRUB "findroot" command will return the vdev we should boot. 3857 * 3858 * For SPARC, devpath_list consists of the physpath name of the booting device, 3859 * whether the root pool is a single-device pool or a mirrored pool. 3860 * e.g. 3861 * "/pci@1f,0/ide@d/disk@0,0:a" 3862 */ 3863int 3864spa_import_rootpool(char *devpath, char *devid) 3865{ 3866 spa_t *spa; 3867 vdev_t *rvd, *bvd, *avd = NULL; 3868 nvlist_t *config, *nvtop; 3869 uint64_t guid, txg; 3870 char *pname; 3871 int error; 3872 3873 /* 3874 * Read the label from the boot device and generate a configuration.
3875 */ 3876 config = spa_generate_rootconf(devpath, devid, &guid); 3877#if defined(_OBP) && defined(_KERNEL) 3878 if (config == NULL) { 3879 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3880 /* iscsi boot */ 3881 get_iscsi_bootpath_phy(devpath); 3882 config = spa_generate_rootconf(devpath, devid, &guid); 3883 } 3884 } 3885#endif 3886 if (config == NULL) { 3887 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3888 devpath); 3889 return (SET_ERROR(EIO)); 3890 } 3891 3892 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3893 &pname) == 0); 3894 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3895 3896 mutex_enter(&spa_namespace_lock); 3897 if ((spa = spa_lookup(pname)) != NULL) { 3898 /* 3899 * Remove the existing root pool from the namespace so that we 3900 * can replace it with the correct config we just read in. 3901 */ 3902 spa_remove(spa); 3903 } 3904 3905 spa = spa_add(pname, config, NULL); 3906 spa->spa_is_root = B_TRUE; 3907 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3908 3909 /* 3910 * Build up a vdev tree based on the boot device's label config. 3911 */ 3912 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3913 &nvtop) == 0); 3914 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3915 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3916 VDEV_ALLOC_ROOTPOOL); 3917 spa_config_exit(spa, SCL_ALL, FTAG); 3918 if (error) { 3919 mutex_exit(&spa_namespace_lock); 3920 nvlist_free(config); 3921 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3922 pname); 3923 return (error); 3924 } 3925 3926 /* 3927 * Get the boot vdev. 3928 */ 3929 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3930 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3931 (u_longlong_t)guid); 3932 error = SET_ERROR(ENOENT); 3933 goto out; 3934 } 3935 3936 /* 3937 * Determine if there is a better boot device. 3938 */ 3939 avd = bvd; 3940 spa_alt_rootvdev(rvd, &avd, &txg); 3941 if (avd != bvd) { 3942 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3943 "try booting from '%s'", avd->vdev_path); 3944 error = SET_ERROR(EINVAL); 3945 goto out; 3946 } 3947 3948 /* 3949 * If the boot device is part of a spare vdev then ensure that 3950 * we're booting off the active spare. 3951 */ 3952 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3953 !bvd->vdev_isspare) { 3954 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3955 "try booting from '%s'", 3956 bvd->vdev_parent-> 3957 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3958 error = SET_ERROR(EINVAL); 3959 goto out; 3960 } 3961 3962 error = 0; 3963out: 3964 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3965 vdev_free(rvd); 3966 spa_config_exit(spa, SCL_ALL, FTAG); 3967 mutex_exit(&spa_namespace_lock); 3968 3969 nvlist_free(config); 3970 return (error); 3971} 3972 3973#else 3974 3975extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3976 uint64_t *count); 3977 3978static nvlist_t * 3979spa_generate_rootconf(const char *name) 3980{ 3981 nvlist_t **configs, **tops; 3982 nvlist_t *config; 3983 nvlist_t *best_cfg, *nvtop, *nvroot; 3984 uint64_t *holes; 3985 uint64_t best_txg; 3986 uint64_t nchildren; 3987 uint64_t pgid; 3988 uint64_t count; 3989 uint64_t i; 3990 uint_t nholes; 3991 3992 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3993 return (NULL); 3994 3995 ASSERT3U(count, !=, 0); 3996 best_txg = 0; 3997 for (i = 0; i < count; i++) { 3998 uint64_t txg; 3999 4000 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4001 &txg) == 0); 4002 if (txg > best_txg) { 4003 best_txg = txg; 4004 best_cfg = configs[i]; 4005 } 4006 } 4007 4008 /* 4009 * Multi-vdev root pool configuration discovery is not supported yet. 4010 */ 4011 nchildren = 1; 4012 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4013 holes = NULL; 4014 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4015 &holes, &nholes); 4016 4017 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4018 for (i = 0; i < nchildren; i++) { 4019 if (i >= count) 4020 break; 4021 if (configs[i] == NULL) 4022 continue; 4023 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4024 &nvtop) == 0); 4025 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4026 } 4027 for (i = 0; holes != NULL && i < nholes; i++) { 4028 if (i >= nchildren) 4029 continue; 4030 if (tops[holes[i]] != NULL) 4031 continue; 4032 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4033 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4034 VDEV_TYPE_HOLE) == 0); 4035 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4036 holes[i]) == 0); 4037 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4038 0) == 0); 4039 } 4040 for (i = 0; i < nchildren; i++) { 4041 if (tops[i] != NULL) 4042 continue; 4043 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4044 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4045 VDEV_TYPE_MISSING) == 0); 4046 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4047 i) == 0); 4048 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4049 0) == 0); 4050 } 4051 4052 /* 4053 * Create pool config based on the best vdev config. 4054 */ 4055 nvlist_dup(best_cfg, &config, KM_SLEEP); 4056 4057 /* 4058 * Put this pool's top-level vdevs into a root vdev. 4059 */ 4060 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4061 &pgid) == 0); 4062 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4063 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4064 VDEV_TYPE_ROOT) == 0); 4065 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4066 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4067 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4068 tops, nchildren) == 0); 4069 4070 /* 4071 * Replace the existing vdev_tree with the new root vdev in 4072 * this pool's configuration (remove the old, add the new). 
4073 */ 4074 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4075 4076 /* 4077 * Drop vdev config elements that should not be present at pool level. 4078 */ 4079 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4080 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4081 4082 for (i = 0; i < count; i++) 4083 nvlist_free(configs[i]); 4084 kmem_free(configs, count * sizeof(void *)); 4085 for (i = 0; i < nchildren; i++) 4086 nvlist_free(tops[i]); 4087 kmem_free(tops, nchildren * sizeof(void *)); 4088 nvlist_free(nvroot); 4089 return (config); 4090} 4091 4092int 4093spa_import_rootpool(const char *name) 4094{ 4095 spa_t *spa; 4096 vdev_t *rvd, *bvd, *avd = NULL; 4097 nvlist_t *config, *nvtop; 4098 uint64_t txg; 4099 char *pname; 4100 int error; 4101 4102 /* 4103 * Read the label from the boot device and generate a configuration. 4104 */ 4105 config = spa_generate_rootconf(name); 4106 4107 mutex_enter(&spa_namespace_lock); 4108 if (config != NULL) { 4109 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4110 &pname) == 0 && strcmp(name, pname) == 0); 4111 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4112 == 0); 4113 4114 if ((spa = spa_lookup(pname)) != NULL) { 4115 /* 4116 * Remove the existing root pool from the namespace so 4117 * that we can replace it with the correct config 4118 * we just read in. 4119 */ 4120 spa_remove(spa); 4121 } 4122 spa = spa_add(pname, config, NULL); 4123 4124 /* 4125 * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4126 * via spa_version(). 4127 */ 4128 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4129 &spa->spa_ubsync.ub_version) != 0) 4130 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4131 } else if ((spa = spa_lookup(name)) == NULL) { 4132 mutex_exit(&spa_namespace_lock); 4133 nvlist_free(config); 4134 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4135 name); 4136 return (EIO); 4137 } else { 4138 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4139 } 4140 spa->spa_is_root = B_TRUE; 4141 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4142 4143 /* 4144 * Build up a vdev tree based on the boot device's label config. 4145 */ 4146 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4147 &nvtop) == 0); 4148 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4149 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4150 VDEV_ALLOC_ROOTPOOL); 4151 spa_config_exit(spa, SCL_ALL, FTAG); 4152 if (error) { 4153 mutex_exit(&spa_namespace_lock); 4154 nvlist_free(config); 4155 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4156 pname); 4157 return (error); 4158 } 4159 4160 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4161 vdev_free(rvd); 4162 spa_config_exit(spa, SCL_ALL, FTAG); 4163 mutex_exit(&spa_namespace_lock); 4164 4165 nvlist_free(config); 4166 return (0); 4167} 4168 4169#endif /* sun */ 4170#endif 4171 4172/* 4173 * Import a non-root pool into the system. 4174 */ 4175int 4176spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4177{ 4178 spa_t *spa; 4179 char *altroot = NULL; 4180 spa_load_state_t state = SPA_LOAD_IMPORT; 4181 zpool_rewind_policy_t policy; 4182 uint64_t mode = spa_mode_global; 4183 uint64_t readonly = B_FALSE; 4184 int error; 4185 nvlist_t *nvroot; 4186 nvlist_t **spares, **l2cache; 4187 uint_t nspares, nl2cache; 4188 4189 /* 4190 * If a pool with this name exists, return failure. 
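 *
 * (Overall flow, as implemented below: create the spa_t; short-circuit
 * if this is a verbatim import; otherwise load with spa_load_best(),
 * reconcile the spare and l2cache lists against the user-supplied
 * config, and finally sync the config cache.)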
4191 */ 4192 mutex_enter(&spa_namespace_lock); 4193 if (spa_lookup(pool) != NULL) { 4194 mutex_exit(&spa_namespace_lock); 4195 return (SET_ERROR(EEXIST)); 4196 } 4197 4198 /* 4199 * Create and initialize the spa structure. 4200 */ 4201 (void) nvlist_lookup_string(props, 4202 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4203 (void) nvlist_lookup_uint64(props, 4204 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4205 if (readonly) 4206 mode = FREAD; 4207 spa = spa_add(pool, config, altroot); 4208 spa->spa_import_flags = flags; 4209 4210 /* 4211 * Verbatim import - Take a pool and insert it into the namespace 4212 * as if it had been loaded at boot. 4213 */ 4214 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4215 if (props != NULL) 4216 spa_configfile_set(spa, props, B_FALSE); 4217 4218 spa_config_sync(spa, B_FALSE, B_TRUE); 4219 4220 mutex_exit(&spa_namespace_lock); 4221 return (0); 4222 } 4223 4224 spa_activate(spa, mode); 4225 4226 /* 4227 * Don't start async tasks until we know everything is healthy. 4228 */ 4229 spa_async_suspend(spa); 4230 4231 zpool_get_rewind_policy(config, &policy); 4232 if (policy.zrp_request & ZPOOL_DO_REWIND) 4233 state = SPA_LOAD_RECOVER; 4234 4235 /* 4236 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4237 * because the user-supplied config is actually the one to trust when 4238 * doing an import. 4239 */ 4240 if (state != SPA_LOAD_RECOVER) 4241 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4242 4243 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4244 policy.zrp_request); 4245 4246 /* 4247 * Propagate anything learned while loading the pool and pass it 4248 * back to caller (i.e. rewind info, missing devices, etc). 4249 */ 4250 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4251 spa->spa_load_info) == 0); 4252 4253 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4254 /* 4255 * Toss any existing sparelist, as it doesn't have any validity 4256 * anymore, and conflicts with spa_has_spare(). 4257 */ 4258 if (spa->spa_spares.sav_config) { 4259 nvlist_free(spa->spa_spares.sav_config); 4260 spa->spa_spares.sav_config = NULL; 4261 spa_load_spares(spa); 4262 } 4263 if (spa->spa_l2cache.sav_config) { 4264 nvlist_free(spa->spa_l2cache.sav_config); 4265 spa->spa_l2cache.sav_config = NULL; 4266 spa_load_l2cache(spa); 4267 } 4268 4269 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4270 &nvroot) == 0); 4271 if (error == 0) 4272 error = spa_validate_aux(spa, nvroot, -1ULL, 4273 VDEV_ALLOC_SPARE); 4274 if (error == 0) 4275 error = spa_validate_aux(spa, nvroot, -1ULL, 4276 VDEV_ALLOC_L2CACHE); 4277 spa_config_exit(spa, SCL_ALL, FTAG); 4278 4279 if (props != NULL) 4280 spa_configfile_set(spa, props, B_FALSE); 4281 4282 if (error != 0 || (props && spa_writeable(spa) && 4283 (error = spa_prop_set(spa, props)))) { 4284 spa_unload(spa); 4285 spa_deactivate(spa); 4286 spa_remove(spa); 4287 mutex_exit(&spa_namespace_lock); 4288 return (error); 4289 } 4290 4291 spa_async_resume(spa); 4292 4293 /* 4294 * Override any spares and level 2 cache devices as specified by 4295 * the user, as these may have correct device names/devids, etc. 
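 *
 * The remove-then-add pattern below keeps sav_config authoritative;
 * schematically:
 *
 *    sav_config.spares: [ stale entries ] --> [ user-supplied entries ]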
4296 */ 4297 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4298 &spares, &nspares) == 0) { 4299 if (spa->spa_spares.sav_config) 4300 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4301 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4302 else 4303 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4304 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4305 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4306 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4307 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4308 spa_load_spares(spa); 4309 spa_config_exit(spa, SCL_ALL, FTAG); 4310 spa->spa_spares.sav_sync = B_TRUE; 4311 } 4312 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4313 &l2cache, &nl2cache) == 0) { 4314 if (spa->spa_l2cache.sav_config) 4315 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4316 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4317 else 4318 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4319 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4320 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4321 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4322 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4323 spa_load_l2cache(spa); 4324 spa_config_exit(spa, SCL_ALL, FTAG); 4325 spa->spa_l2cache.sav_sync = B_TRUE; 4326 } 4327 4328 /* 4329 * Check for any removed devices. 4330 */ 4331 if (spa->spa_autoreplace) { 4332 spa_aux_check_removed(&spa->spa_spares); 4333 spa_aux_check_removed(&spa->spa_l2cache); 4334 } 4335 4336 if (spa_writeable(spa)) { 4337 /* 4338 * Update the config cache to include the newly-imported pool. 4339 */ 4340 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4341 } 4342 4343 /* 4344 * It's possible that the pool was expanded while it was exported. 4345 * We kick off an async task to handle this for us. 4346 */ 4347 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4348 4349 mutex_exit(&spa_namespace_lock); 4350 spa_history_log_version(spa, "import"); 4351 4352#ifdef __FreeBSD__ 4353#ifdef _KERNEL 4354 zvol_create_minors(pool); 4355#endif 4356#endif 4357 return (0); 4358} 4359 4360nvlist_t * 4361spa_tryimport(nvlist_t *tryconfig) 4362{ 4363 nvlist_t *config = NULL; 4364 char *poolname; 4365 spa_t *spa; 4366 uint64_t state; 4367 int error; 4368 4369 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4370 return (NULL); 4371 4372 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4373 return (NULL); 4374 4375 /* 4376 * Create and initialize the spa structure. 4377 */ 4378 mutex_enter(&spa_namespace_lock); 4379 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4380 spa_activate(spa, FREAD); 4381 4382 /* 4383 * Pass off the heavy lifting to spa_load(). 4384 * Pass TRUE for mosconfig because the user-supplied config 4385 * is actually the one to trust when doing an import. 4386 */ 4387 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4388 4389 /* 4390 * If 'tryconfig' was at least parsable, return the current config. 
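 *
 * A sketch of typical use by a hypothetical consumer (the names below
 * are illustrative, not part of this interface):
 *
 *    nvlist_t *out = spa_tryimport(tryconfig);
 *    if (out != NULL) {
 *        ...inspect pool name/state, ZPOOL_CONFIG_LOAD_INFO,
 *           bootfs, spares, l2cache...
 *        nvlist_free(out);
 *    }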
4391 */ 4392 if (spa->spa_root_vdev != NULL) { 4393 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4394 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4395 poolname) == 0); 4396 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4397 state) == 0); 4398 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4399 spa->spa_uberblock.ub_timestamp) == 0); 4400 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4401 spa->spa_load_info) == 0); 4402 4403 /* 4404 * If the bootfs property exists on this pool then we 4405 * copy it out so that external consumers can tell which 4406 * pools are bootable. 4407 */ 4408 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4409 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4410 4411 /* 4412 * We have to play games with the name since the 4413 * pool was opened as TRYIMPORT_NAME. 4414 */ 4415 if (dsl_dsobj_to_dsname(spa_name(spa), 4416 spa->spa_bootfs, tmpname) == 0) { 4417 char *cp; 4418 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4419 4420 cp = strchr(tmpname, '/'); 4421 if (cp == NULL) { 4422 (void) strlcpy(dsname, tmpname, 4423 MAXPATHLEN); 4424 } else { 4425 (void) snprintf(dsname, MAXPATHLEN, 4426 "%s/%s", poolname, ++cp); 4427 } 4428 VERIFY(nvlist_add_string(config, 4429 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4430 kmem_free(dsname, MAXPATHLEN); 4431 } 4432 kmem_free(tmpname, MAXPATHLEN); 4433 } 4434 4435 /* 4436 * Add the list of hot spares and level 2 cache devices. 4437 */ 4438 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4439 spa_add_spares(spa, config); 4440 spa_add_l2cache(spa, config); 4441 spa_config_exit(spa, SCL_CONFIG, FTAG); 4442 } 4443 4444 spa_unload(spa); 4445 spa_deactivate(spa); 4446 spa_remove(spa); 4447 mutex_exit(&spa_namespace_lock); 4448 4449 return (config); 4450} 4451 4452/* 4453 * Pool export/destroy 4454 * 4455 * The act of destroying or exporting a pool is very simple. We make sure there 4456 * is no more pending I/O and any references to the pool are gone. Then, we 4457 * update the pool state and sync all the labels to disk, removing the 4458 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4459 * we don't sync the labels or remove the configuration cache. 4460 */ 4461static int 4462spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4463 boolean_t force, boolean_t hardforce) 4464{ 4465 spa_t *spa; 4466 4467 if (oldconfig) 4468 *oldconfig = NULL; 4469 4470 if (!(spa_mode_global & FWRITE)) 4471 return (SET_ERROR(EROFS)); 4472 4473 mutex_enter(&spa_namespace_lock); 4474 if ((spa = spa_lookup(pool)) == NULL) { 4475 mutex_exit(&spa_namespace_lock); 4476 return (SET_ERROR(ENOENT)); 4477 } 4478 4479 /* 4480 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4481 * reacquire the namespace lock, and see if we can export. 4482 */ 4483 spa_open_ref(spa, FTAG); 4484 mutex_exit(&spa_namespace_lock); 4485 spa_async_suspend(spa); 4486 mutex_enter(&spa_namespace_lock); 4487 spa_close(spa, FTAG); 4488 4489 /* 4490 * The pool will be in core if it's openable, 4491 * in which case we can modify its state. 4492 */ 4493 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4494 /* 4495 * Objsets may be open only because they're dirty, so we 4496 * have to force it to sync before checking spa_refcnt. 4497 */ 4498 txg_wait_synced(spa->spa_dsl_pool, 0); 4499 4500 /* 4501 * A pool cannot be exported or destroyed if there are active 4502 * references. 
If we are resetting a pool, allow references by 4503 * fault injection handlers. 4504 */ 4505 if (!spa_refcount_zero(spa) || 4506 (spa->spa_inject_ref != 0 && 4507 new_state != POOL_STATE_UNINITIALIZED)) { 4508 spa_async_resume(spa); 4509 mutex_exit(&spa_namespace_lock); 4510 return (SET_ERROR(EBUSY)); 4511 } 4512 4513 /* 4514 * A pool cannot be exported if it has an active shared spare. 4515 * This is to prevent other pools from stealing the active spare 4516 * from an exported pool. If the user insists, such a pool can 4517 * still be forcibly exported. 4518 */ 4519 if (!force && new_state == POOL_STATE_EXPORTED && 4520 spa_has_active_shared_spare(spa)) { 4521 spa_async_resume(spa); 4522 mutex_exit(&spa_namespace_lock); 4523 return (SET_ERROR(EXDEV)); 4524 } 4525 4526 /* 4527 * We want this to be reflected on every label, 4528 * so mark them all dirty. spa_unload() will do the 4529 * final sync that pushes these changes out. 4530 */ 4531 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4532 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4533 spa->spa_state = new_state; 4534 spa->spa_final_txg = spa_last_synced_txg(spa) + 4535 TXG_DEFER_SIZE + 1; 4536 vdev_config_dirty(spa->spa_root_vdev); 4537 spa_config_exit(spa, SCL_ALL, FTAG); 4538 } 4539 } 4540 4541 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4542 4543 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4544 spa_unload(spa); 4545 spa_deactivate(spa); 4546 } 4547 4548 if (oldconfig && spa->spa_config) 4549 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4550 4551 if (new_state != POOL_STATE_UNINITIALIZED) { 4552 if (!hardforce) 4553 spa_config_sync(spa, B_TRUE, B_TRUE); 4554 spa_remove(spa); 4555 } 4556 mutex_exit(&spa_namespace_lock); 4557 4558 return (0); 4559} 4560 4561/* 4562 * Destroy a storage pool. 4563 */ 4564int 4565spa_destroy(char *pool) 4566{ 4567 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4568 B_FALSE, B_FALSE)); 4569} 4570 4571/* 4572 * Export a storage pool. 4573 */ 4574int 4575spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4576 boolean_t hardforce) 4577{ 4578 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4579 force, hardforce)); 4580} 4581 4582/* 4583 * Similar to spa_export(), this unloads the spa_t without actually removing it 4584 * from the namespace in any way. 4585 */ 4586int 4587spa_reset(char *pool) 4588{ 4589 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4590 B_FALSE, B_FALSE)); 4591} 4592 4593/* 4594 * ========================================================================== 4595 * Device manipulation 4596 * ========================================================================== 4597 */ 4598 4599/* 4600 * Add a device to a storage pool. 
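 *
 * For illustration only, a hypothetical nvroot adding a single log disk
 * (key names follow the ZPOOL_CONFIG_* conventions used below):
 *
 *    { type="root", children=[ { type="disk", path="/dev/da2",
 *          is_log=1 } ] }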
4601 */ 4602int 4603spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4604{ 4605 uint64_t txg, id; 4606 int error; 4607 vdev_t *rvd = spa->spa_root_vdev; 4608 vdev_t *vd, *tvd; 4609 nvlist_t **spares, **l2cache; 4610 uint_t nspares, nl2cache; 4611 4612 ASSERT(spa_writeable(spa)); 4613 4614 txg = spa_vdev_enter(spa); 4615 4616 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4617 VDEV_ALLOC_ADD)) != 0) 4618 return (spa_vdev_exit(spa, NULL, txg, error)); 4619 4620 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4621 4622 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4623 &nspares) != 0) 4624 nspares = 0; 4625 4626 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4627 &nl2cache) != 0) 4628 nl2cache = 0; 4629 4630 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4631 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4632 4633 if (vd->vdev_children != 0 && 4634 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4635 return (spa_vdev_exit(spa, vd, txg, error)); 4636 4637 /* 4638 * We must validate the spares and l2cache devices after checking the 4639 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4640 */ 4641 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4642 return (spa_vdev_exit(spa, vd, txg, error)); 4643 4644 /* 4645 * Transfer each new top-level vdev from vd to rvd. 4646 */ 4647 for (int c = 0; c < vd->vdev_children; c++) { 4648 4649 /* 4650 * Set the vdev id to the first hole, if one exists. 4651 */ 4652 for (id = 0; id < rvd->vdev_children; id++) { 4653 if (rvd->vdev_child[id]->vdev_ishole) { 4654 vdev_free(rvd->vdev_child[id]); 4655 break; 4656 } 4657 } 4658 tvd = vd->vdev_child[c]; 4659 vdev_remove_child(vd, tvd); 4660 tvd->vdev_id = id; 4661 vdev_add_child(rvd, tvd); 4662 vdev_config_dirty(tvd); 4663 } 4664 4665 if (nspares != 0) { 4666 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4667 ZPOOL_CONFIG_SPARES); 4668 spa_load_spares(spa); 4669 spa->spa_spares.sav_sync = B_TRUE; 4670 } 4671 4672 if (nl2cache != 0) { 4673 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4674 ZPOOL_CONFIG_L2CACHE); 4675 spa_load_l2cache(spa); 4676 spa->spa_l2cache.sav_sync = B_TRUE; 4677 } 4678 4679 /* 4680 * We have to be careful when adding new vdevs to an existing pool. 4681 * If other threads start allocating from these vdevs before we 4682 * sync the config cache, and we lose power, then upon reboot we may 4683 * fail to open the pool because there are DVAs that the config cache 4684 * can't translate. Therefore, we first add the vdevs without 4685 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4686 * and then let spa_config_update() initialize the new metaslabs. 4687 * 4688 * spa_load() checks for added-but-not-initialized vdevs, so that 4689 * if we lose power at any point in this sequence, the remaining 4690 * steps will be completed the next time we load the pool. 4691 */ 4692 (void) spa_vdev_exit(spa, vd, txg, 0); 4693 4694 mutex_enter(&spa_namespace_lock); 4695 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4696 mutex_exit(&spa_namespace_lock); 4697 4698 return (0); 4699} 4700 4701/* 4702 * Attach a device to a mirror. The arguments are the path to any device 4703 * in the mirror, and the nvroot for the new device. If the path specifies 4704 * a device that is not mirrored, we automatically insert the mirror vdev. 
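 *
 * Schematically, with hypothetical disks: attaching da1 to the plain
 * disk da0 yields the mirror M(da0,da1), in the M()/R() notation used
 * by spa_vdev_detach() below.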
4705 * 4706 * If 'replacing' is specified, the new device is intended to replace the 4707 * existing device; in this case the two devices are made into their own 4708 * mirror using the 'replacing' vdev, which is functionally identical to 4709 * the mirror vdev (it actually reuses all the same ops) but has a few 4710 * extra rules: you can't attach to it after it's been created, and upon 4711 * completion of resilvering, the first disk (the one being replaced) 4712 * is automatically detached. 4713 */ 4714int 4715spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4716{ 4717 uint64_t txg, dtl_max_txg; 4718 vdev_t *rvd = spa->spa_root_vdev; 4719 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4720 vdev_ops_t *pvops; 4721 char *oldvdpath, *newvdpath; 4722 int newvd_isspare; 4723 int error; 4724 4725 ASSERT(spa_writeable(spa)); 4726 4727 txg = spa_vdev_enter(spa); 4728 4729 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4730 4731 if (oldvd == NULL) 4732 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4733 4734 if (!oldvd->vdev_ops->vdev_op_leaf) 4735 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4736 4737 pvd = oldvd->vdev_parent; 4738 4739 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4740 VDEV_ALLOC_ATTACH)) != 0) 4741 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4742 4743 if (newrootvd->vdev_children != 1) 4744 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4745 4746 newvd = newrootvd->vdev_child[0]; 4747 4748 if (!newvd->vdev_ops->vdev_op_leaf) 4749 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4750 4751 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4752 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4753 4754 /* 4755 * Spares can't replace logs 4756 */ 4757 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4758 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4759 4760 if (!replacing) { 4761 /* 4762 * For attach, the only allowable parent is a mirror or the root 4763 * vdev. 4764 */ 4765 if (pvd->vdev_ops != &vdev_mirror_ops && 4766 pvd->vdev_ops != &vdev_root_ops) 4767 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4768 4769 pvops = &vdev_mirror_ops; 4770 } else { 4771 /* 4772 * Active hot spares can only be replaced by inactive hot 4773 * spares. 4774 */ 4775 if (pvd->vdev_ops == &vdev_spare_ops && 4776 oldvd->vdev_isspare && 4777 !spa_has_spare(spa, newvd->vdev_guid)) 4778 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4779 4780 /* 4781 * If the source is a hot spare, and the parent isn't already a 4782 * spare, then we want to create a new hot spare. Otherwise, we 4783 * want to create a replacing vdev. The user is not allowed to 4784 * attach to a spared vdev child unless the 'isspare' state is 4785 * the same (spare replaces spare, non-spare replaces 4786 * non-spare). 4787 */ 4788 if (pvd->vdev_ops == &vdev_replacing_ops && 4789 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4790 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4791 } else if (pvd->vdev_ops == &vdev_spare_ops && 4792 newvd->vdev_isspare != oldvd->vdev_isspare) { 4793 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4794 } 4795 4796 if (newvd->vdev_isspare) 4797 pvops = &vdev_spare_ops; 4798 else 4799 pvops = &vdev_replacing_ops; 4800 } 4801 4802 /* 4803 * Make sure the new device is big enough. 
4804 */ 4805 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4806 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4807 4808 /* 4809 * The new device cannot have a higher alignment requirement 4810 * than the top-level vdev. 4811 */ 4812 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4813 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4814 4815 /* 4816 * If this is an in-place replacement, update oldvd's path and devid 4817 * to make it distinguishable from newvd, and unopenable from now on. 4818 */ 4819 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4820 spa_strfree(oldvd->vdev_path); 4821 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4822 KM_SLEEP); 4823 (void) sprintf(oldvd->vdev_path, "%s/%s", 4824 newvd->vdev_path, "old"); 4825 if (oldvd->vdev_devid != NULL) { 4826 spa_strfree(oldvd->vdev_devid); 4827 oldvd->vdev_devid = NULL; 4828 } 4829 } 4830 4831 /* mark the device being resilvered */ 4832 newvd->vdev_resilver_txg = txg; 4833 4834 /* 4835 * If the parent is not a mirror, or if we're replacing, insert the new 4836 * mirror/replacing/spare vdev above oldvd. 4837 */ 4838 if (pvd->vdev_ops != pvops) 4839 pvd = vdev_add_parent(oldvd, pvops); 4840 4841 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4842 ASSERT(pvd->vdev_ops == pvops); 4843 ASSERT(oldvd->vdev_parent == pvd); 4844 4845 /* 4846 * Extract the new device from its root and add it to pvd. 4847 */ 4848 vdev_remove_child(newrootvd, newvd); 4849 newvd->vdev_id = pvd->vdev_children; 4850 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4851 vdev_add_child(pvd, newvd); 4852 4853 tvd = newvd->vdev_top; 4854 ASSERT(pvd->vdev_top == tvd); 4855 ASSERT(tvd->vdev_parent == rvd); 4856 4857 vdev_config_dirty(tvd); 4858 4859 /* 4860 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4861 * for any dmu_sync-ed blocks. It will propagate upward when 4862 * spa_vdev_exit() calls vdev_dtl_reassess(). 4863 */ 4864 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4865 4866 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4867 dtl_max_txg - TXG_INITIAL); 4868 4869 if (newvd->vdev_isspare) { 4870 spa_spare_activate(newvd); 4871 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4872 } 4873 4874 oldvdpath = spa_strdup(oldvd->vdev_path); 4875 newvdpath = spa_strdup(newvd->vdev_path); 4876 newvd_isspare = newvd->vdev_isspare; 4877 4878 /* 4879 * Mark newvd's DTL dirty in this txg. 4880 */ 4881 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4882 4883 /* 4884 * Schedule the resilver to restart in the future. We do this to 4885 * ensure that dmu_sync-ed blocks have been stitched into the 4886 * respective datasets. 4887 */ 4888 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4889 4890 /* 4891 * Commit the config 4892 */ 4893 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4894 4895 spa_history_log_internal(spa, "vdev attach", NULL, 4896 "%s vdev=%s %s vdev=%s", 4897 replacing && newvd_isspare ? "spare in" : 4898 replacing ? "replace" : "attach", newvdpath, 4899 replacing ? "for" : "to", oldvdpath); 4900 4901 spa_strfree(oldvdpath); 4902 spa_strfree(newvdpath); 4903 4904 if (spa->spa_bootfs) 4905 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4906 4907 return (0); 4908} 4909 4910/* 4911 * Detach a device from a mirror or replacing vdev. 4912 * 4913 * If 'replace_done' is specified, only detach if the parent 4914 * is a replacing vdev. 
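 *
 * Hypothetical use, cancelling an in-progress replacement R(B,C) by
 * detaching C (the guids here are illustrative):
 *
 *    (void) spa_vdev_detach(spa, guid_of_C, guid_of_R, B_FALSE);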
4915 */ 4916int 4917spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4918{ 4919 uint64_t txg; 4920 int error; 4921 vdev_t *rvd = spa->spa_root_vdev; 4922 vdev_t *vd, *pvd, *cvd, *tvd; 4923 boolean_t unspare = B_FALSE; 4924 uint64_t unspare_guid = 0; 4925 char *vdpath; 4926 4927 ASSERT(spa_writeable(spa)); 4928 4929 txg = spa_vdev_enter(spa); 4930 4931 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4932 4933 if (vd == NULL) 4934 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4935 4936 if (!vd->vdev_ops->vdev_op_leaf) 4937 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4938 4939 pvd = vd->vdev_parent; 4940 4941 /* 4942 * If the parent/child relationship is not as expected, don't do it. 4943 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4944 * vdev that's replacing B with C. The user's intent in replacing 4945 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4946 * the replace by detaching C, the expected behavior is to end up 4947 * M(A,B). But suppose that right after deciding to detach C, 4948 * the replacement of B completes. We would have M(A,C), and then 4949 * ask to detach C, which would leave us with just A -- not what 4950 * the user wanted. To prevent this, we make sure that the 4951 * parent/child relationship hasn't changed -- in this example, 4952 * that C's parent is still the replacing vdev R. 4953 */ 4954 if (pvd->vdev_guid != pguid && pguid != 0) 4955 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4956 4957 /* 4958 * Only 'replacing' or 'spare' vdevs can be replaced. 4959 */ 4960 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4961 pvd->vdev_ops != &vdev_spare_ops) 4962 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4963 4964 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4965 spa_version(spa) >= SPA_VERSION_SPARES); 4966 4967 /* 4968 * Only mirror, replacing, and spare vdevs support detach. 4969 */ 4970 if (pvd->vdev_ops != &vdev_replacing_ops && 4971 pvd->vdev_ops != &vdev_mirror_ops && 4972 pvd->vdev_ops != &vdev_spare_ops) 4973 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4974 4975 /* 4976 * If this device has the only valid copy of some data, 4977 * we cannot safely detach it. 4978 */ 4979 if (vdev_dtl_required(vd)) 4980 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4981 4982 ASSERT(pvd->vdev_children >= 2); 4983 4984 /* 4985 * If we are detaching the second disk from a replacing vdev, then 4986 * check to see if we changed the original vdev's path to have "/old" 4987 * at the end in spa_vdev_attach(). If so, undo that change now. 4988 */ 4989 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4990 vd->vdev_path != NULL) { 4991 size_t len = strlen(vd->vdev_path); 4992 4993 for (int c = 0; c < pvd->vdev_children; c++) { 4994 cvd = pvd->vdev_child[c]; 4995 4996 if (cvd == vd || cvd->vdev_path == NULL) 4997 continue; 4998 4999 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5000 strcmp(cvd->vdev_path + len, "/old") == 0) { 5001 spa_strfree(cvd->vdev_path); 5002 cvd->vdev_path = spa_strdup(vd->vdev_path); 5003 break; 5004 } 5005 } 5006 } 5007 5008 /* 5009 * If we are detaching the original disk from a spare, then it implies 5010 * that the spare should become a real disk, and be removed from the 5011 * active spare list for the pool. 5012 */ 5013 if (pvd->vdev_ops == &vdev_spare_ops && 5014 vd->vdev_id == 0 && 5015 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5016 unspare = B_TRUE; 5017 5018 /* 5019 * Erase the disk labels so the disk can be used for other things. 
5020 * This must be done after all other error cases are handled, 5021 * but before we disembowel vd (so we can still do I/O to it). 5022 * But if we can't do it, don't treat the error as fatal -- 5023 * it may be that the unwritability of the disk is the reason 5024 * it's being detached! 5025 */ 5026 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5027 5028 /* 5029 * Remove vd from its parent and compact the parent's children. 5030 */ 5031 vdev_remove_child(pvd, vd); 5032 vdev_compact_children(pvd); 5033 5034 /* 5035 * Remember one of the remaining children so we can get tvd below. 5036 */ 5037 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5038 5039 /* 5040 * If we need to remove the remaining child from the list of hot spares, 5041 * do it now, marking the vdev as no longer a spare in the process. 5042 * We must do this before vdev_remove_parent(), because that can 5043 * change the GUID if it creates a new toplevel GUID. For a similar 5044 * reason, we must remove the spare now, in the same txg as the detach; 5045 * otherwise someone could attach a new sibling, change the GUID, and 5046 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5047 */ 5048 if (unspare) { 5049 ASSERT(cvd->vdev_isspare); 5050 spa_spare_remove(cvd); 5051 unspare_guid = cvd->vdev_guid; 5052 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5053 cvd->vdev_unspare = B_TRUE; 5054 } 5055 5056 /* 5057 * If the parent mirror/replacing vdev only has one child, 5058 * the parent is no longer needed. Remove it from the tree. 5059 */ 5060 if (pvd->vdev_children == 1) { 5061 if (pvd->vdev_ops == &vdev_spare_ops) 5062 cvd->vdev_unspare = B_FALSE; 5063 vdev_remove_parent(cvd); 5064 } 5065 5066 5067 /* 5068 * We don't set tvd until now because the parent we just removed 5069 * may have been the previous top-level vdev. 5070 */ 5071 tvd = cvd->vdev_top; 5072 ASSERT(tvd->vdev_parent == rvd); 5073 5074 /* 5075 * Reevaluate the parent vdev state. 5076 */ 5077 vdev_propagate_state(cvd); 5078 5079 /* 5080 * If the 'autoexpand' property is set on the pool then automatically 5081 * try to expand the size of the pool. For example if the device we 5082 * just detached was smaller than the others, it may be possible to 5083 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5084 * first so that we can obtain the updated sizes of the leaf vdevs. 5085 */ 5086 if (spa->spa_autoexpand) { 5087 vdev_reopen(tvd); 5088 vdev_expand(tvd, txg); 5089 } 5090 5091 vdev_config_dirty(tvd); 5092 5093 /* 5094 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5095 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5096 * But first make sure we're not on any *other* txg's DTL list, to 5097 * prevent vd from being accessed after it's freed. 5098 */ 5099 vdpath = spa_strdup(vd->vdev_path); 5100 for (int t = 0; t < TXG_SIZE; t++) 5101 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5102 vd->vdev_detached = B_TRUE; 5103 vdev_dirty(tvd, VDD_DTL, vd, txg); 5104 5105 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5106 5107 /* hang on to the spa before we release the lock */ 5108 spa_open_ref(spa, FTAG); 5109 5110 error = spa_vdev_exit(spa, vd, txg, 0); 5111 5112 spa_history_log_internal(spa, "detach", NULL, 5113 "vdev=%s", vdpath); 5114 spa_strfree(vdpath); 5115 5116 /* 5117 * If this was the removal of the original device in a hot spare vdev, 5118 * then we want to go through and remove the device from the hot spare 5119 * list of every other pool. 
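 *
 * (Spares may be shared: the same physical spare can sit on the spare
 * list of several pools, so the loop below walks every other active
 * pool and removes the guid there as well.)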
5120 */ 5121 if (unspare) { 5122 spa_t *altspa = NULL; 5123 5124 mutex_enter(&spa_namespace_lock); 5125 while ((altspa = spa_next(altspa)) != NULL) { 5126 if (altspa->spa_state != POOL_STATE_ACTIVE || 5127 altspa == spa) 5128 continue; 5129 5130 spa_open_ref(altspa, FTAG); 5131 mutex_exit(&spa_namespace_lock); 5132 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5133 mutex_enter(&spa_namespace_lock); 5134 spa_close(altspa, FTAG); 5135 } 5136 mutex_exit(&spa_namespace_lock); 5137 5138 /* search the rest of the vdevs for spares to remove */ 5139 spa_vdev_resilver_done(spa); 5140 } 5141 5142 /* all done with the spa; OK to release */ 5143 mutex_enter(&spa_namespace_lock); 5144 spa_close(spa, FTAG); 5145 mutex_exit(&spa_namespace_lock); 5146 5147 return (error); 5148} 5149 5150/* 5151 * Split a set of devices from their mirrors, and create a new pool from them. 5152 */ 5153int 5154spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5155 nvlist_t *props, boolean_t exp) 5156{ 5157 int error = 0; 5158 uint64_t txg, *glist; 5159 spa_t *newspa; 5160 uint_t c, children, lastlog; 5161 nvlist_t **child, *nvl, *tmp; 5162 dmu_tx_t *tx; 5163 char *altroot = NULL; 5164 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5165 boolean_t activate_slog; 5166 5167 ASSERT(spa_writeable(spa)); 5168 5169 txg = spa_vdev_enter(spa); 5170 5171 /* clear the log and flush everything up to now */ 5172 activate_slog = spa_passivate_log(spa); 5173 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5174 error = spa_offline_log(spa); 5175 txg = spa_vdev_config_enter(spa); 5176 5177 if (activate_slog) 5178 spa_activate_log(spa); 5179 5180 if (error != 0) 5181 return (spa_vdev_exit(spa, NULL, txg, error)); 5182 5183 /* check new spa name before going any further */ 5184 if (spa_lookup(newname) != NULL) 5185 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5186 5187 /* 5188 * scan through all the children to ensure they're all mirrors 5189 */ 5190 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5191 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5192 &children) != 0) 5193 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5194 5195 /* first, check to ensure we've got the right child count */ 5196 rvd = spa->spa_root_vdev; 5197 lastlog = 0; 5198 for (c = 0; c < rvd->vdev_children; c++) { 5199 vdev_t *vd = rvd->vdev_child[c]; 5200 5201 /* don't count the holes & logs as children */ 5202 if (vd->vdev_islog || vd->vdev_ishole) { 5203 if (lastlog == 0) 5204 lastlog = c; 5205 continue; 5206 } 5207 5208 lastlog = 0; 5209 } 5210 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5211 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5212 5213 /* next, ensure no spare or cache devices are part of the split */ 5214 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5215 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5216 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5217 5218 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5219 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5220 5221 /* then, loop over each vdev and validate it */ 5222 for (c = 0; c < children; c++) { 5223 uint64_t is_hole = 0; 5224 5225 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5226 &is_hole); 5227 5228 if (is_hole != 0) { 5229 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5230 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5231 continue; 5232 } else { 5233 error = SET_ERROR(EINVAL); 5234 break; 5235 } 5236 } 5237 5238 /* which disk is going to be split? */ 5239 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5240 &glist[c]) != 0) { 5241 error = SET_ERROR(EINVAL); 5242 break; 5243 } 5244 5245 /* look it up in the spa */ 5246 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5247 if (vml[c] == NULL) { 5248 error = SET_ERROR(ENODEV); 5249 break; 5250 } 5251 5252 /* make sure there's nothing stopping the split */ 5253 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5254 vml[c]->vdev_islog || 5255 vml[c]->vdev_ishole || 5256 vml[c]->vdev_isspare || 5257 vml[c]->vdev_isl2cache || 5258 !vdev_writeable(vml[c]) || 5259 vml[c]->vdev_children != 0 || 5260 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5261 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5262 error = SET_ERROR(EINVAL); 5263 break; 5264 } 5265 5266 if (vdev_dtl_required(vml[c])) { 5267 error = SET_ERROR(EBUSY); 5268 break; 5269 } 5270 5271 /* we need certain info from the top level */ 5272 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5273 vml[c]->vdev_top->vdev_ms_array) == 0); 5274 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5275 vml[c]->vdev_top->vdev_ms_shift) == 0); 5276 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5277 vml[c]->vdev_top->vdev_asize) == 0); 5278 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5279 vml[c]->vdev_top->vdev_ashift) == 0); 5280 } 5281 5282 if (error != 0) { 5283 kmem_free(vml, children * sizeof (vdev_t *)); 5284 kmem_free(glist, children * sizeof (uint64_t)); 5285 return (spa_vdev_exit(spa, NULL, txg, error)); 5286 } 5287 5288 /* stop writers from using the disks */ 5289 for (c = 0; c < children; c++) { 5290 if (vml[c] != NULL) 5291 vml[c]->vdev_offline = B_TRUE; 5292 } 5293 vdev_reopen(spa->spa_root_vdev); 5294 5295 /* 5296 * Temporarily record the splitting vdevs in the spa config. This 5297 * will disappear once the config is regenerated. 5298 */ 5299 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5300 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5301 glist, children) == 0); 5302 kmem_free(glist, children * sizeof (uint64_t)); 5303 5304 mutex_enter(&spa->spa_props_lock); 5305 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5306 nvl) == 0); 5307 mutex_exit(&spa->spa_props_lock); 5308 spa->spa_config_splitting = nvl; 5309 vdev_config_dirty(spa->spa_root_vdev); 5310 5311 /* configure and create the new pool */ 5312 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5313 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5314 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5315 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5316 spa_version(spa)) == 0); 5317 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5318 spa->spa_config_txg) == 0); 5319 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5320 spa_generate_guid(NULL)) == 0); 5321 (void) nvlist_lookup_string(props, 5322 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5323 5324 /* add the new pool to the namespace */ 5325 newspa = spa_add(newname, config, altroot); 5326 newspa->spa_config_txg = spa->spa_config_txg; 5327 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5328 5329 /* release the spa config lock, retaining the namespace lock */ 5330 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5331 5332 if (zio_injection_enabled) 5333 zio_handle_panic_injection(spa, FTAG, 1); 5334 5335 spa_activate(newspa, spa_mode_global); 5336 spa_async_suspend(newspa); 5337 5338#ifndef sun 5339 /* mark that we are creating new spa by splitting */ 5340 newspa->spa_splitting_newspa = B_TRUE; 5341#endif 5342 /* create the new pool from the disks of the original pool */ 5343 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5344#ifndef sun 5345 newspa->spa_splitting_newspa = B_FALSE; 5346#endif 5347 if (error) 5348 goto out; 5349 5350 /* if that worked, generate a real config for the new pool */ 5351 if (newspa->spa_root_vdev != NULL) { 5352 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5353 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5354 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5355 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5356 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5357 B_TRUE)); 5358 } 5359 5360 /* set the props */ 5361 if (props != NULL) { 5362 spa_configfile_set(newspa, props, B_FALSE); 5363 error = spa_prop_set(newspa, props); 5364 if (error) 5365 goto out; 5366 } 5367 5368 /* flush everything */ 5369 txg = spa_vdev_config_enter(newspa); 5370 vdev_config_dirty(newspa->spa_root_vdev); 5371 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5372 5373 if (zio_injection_enabled) 5374 zio_handle_panic_injection(spa, FTAG, 2); 5375 5376 spa_async_resume(newspa); 5377 5378 /* finally, update the original pool's config */ 5379 txg = spa_vdev_config_enter(spa); 5380 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5381 error = dmu_tx_assign(tx, TXG_WAIT); 5382 if (error != 0) 5383 dmu_tx_abort(tx); 5384 for (c = 0; c < children; c++) { 5385 if (vml[c] != NULL) { 5386 vdev_split(vml[c]); 5387 if (error == 0) 5388 spa_history_log_internal(spa, "detach", tx, 5389 "vdev=%s", vml[c]->vdev_path); 5390 vdev_free(vml[c]); 5391 } 5392 } 5393 vdev_config_dirty(spa->spa_root_vdev); 5394 spa->spa_config_splitting = NULL; 5395 nvlist_free(nvl); 5396 if (error == 0) 5397 dmu_tx_commit(tx); 5398 (void) spa_vdev_exit(spa, NULL, txg, 0); 5399 5400 if (zio_injection_enabled) 5401 zio_handle_panic_injection(spa, FTAG, 3); 5402 5403 /* split is complete; log a history record */ 5404 spa_history_log_internal(newspa, "split", NULL, 5405 "from pool %s", spa_name(spa)); 5406 5407 kmem_free(vml, children * sizeof (vdev_t *)); 5408 5409 /* if we're not going to mount the filesystems in userland, export */ 5410 if (exp) 5411 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5412 B_FALSE, B_FALSE); 5413 5414 return (error); 5415 5416out: 5417 spa_unload(newspa); 5418 spa_deactivate(newspa); 5419 spa_remove(newspa); 5420 5421 txg = spa_vdev_config_enter(spa); 5422 5423 /* re-online all offlined disks */ 5424 
for (c = 0; c < children; c++) { 5425 if (vml[c] != NULL) 5426 vml[c]->vdev_offline = B_FALSE; 5427 } 5428 vdev_reopen(spa->spa_root_vdev); 5429 5430 nvlist_free(spa->spa_config_splitting); 5431 spa->spa_config_splitting = NULL; 5432 (void) spa_vdev_exit(spa, NULL, txg, error); 5433 5434 kmem_free(vml, children * sizeof (vdev_t *)); 5435 return (error); 5436} 5437 5438static nvlist_t * 5439spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5440{ 5441 for (int i = 0; i < count; i++) { 5442 uint64_t guid; 5443 5444 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5445 &guid) == 0); 5446 5447 if (guid == target_guid) 5448 return (nvpp[i]); 5449 } 5450 5451 return (NULL); 5452} 5453 5454static void 5455spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5456 nvlist_t *dev_to_remove) 5457{ 5458 nvlist_t **newdev = NULL; 5459 5460 if (count > 1) 5461 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5462 5463 for (int i = 0, j = 0; i < count; i++) { 5464 if (dev[i] == dev_to_remove) 5465 continue; 5466 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5467 } 5468 5469 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5470 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5471 5472 for (int i = 0; i < count - 1; i++) 5473 nvlist_free(newdev[i]); 5474 5475 if (count > 1) 5476 kmem_free(newdev, (count - 1) * sizeof (void *)); 5477} 5478 5479/* 5480 * Evacuate the device. 5481 */ 5482static int 5483spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5484{ 5485 uint64_t txg; 5486 int error = 0; 5487 5488 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5489 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5490 ASSERT(vd == vd->vdev_top); 5491 5492 /* 5493 * Evacuate the device. We don't hold the config lock as writer 5494 * since we need to do I/O but we do keep the 5495 * spa_namespace_lock held. Once this completes the device 5496 * should no longer have any blocks allocated on it. 5497 */ 5498 if (vd->vdev_islog) { 5499 if (vd->vdev_stat.vs_alloc != 0) 5500 error = spa_offline_log(spa); 5501 } else { 5502 error = SET_ERROR(ENOTSUP); 5503 } 5504 5505 if (error) 5506 return (error); 5507 5508 /* 5509 * The evacuation succeeded. Remove any remaining MOS metadata 5510 * associated with this vdev, and wait for these changes to sync. 5511 */ 5512 ASSERT0(vd->vdev_stat.vs_alloc); 5513 txg = spa_vdev_config_enter(spa); 5514 vd->vdev_removing = B_TRUE; 5515 vdev_dirty_leaves(vd, VDD_DTL, txg); 5516 vdev_config_dirty(vd); 5517 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5518 5519 return (0); 5520} 5521 5522/* 5523 * Complete the removal by cleaning up the namespace. 5524 */ 5525static void 5526spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5527{ 5528 vdev_t *rvd = spa->spa_root_vdev; 5529 uint64_t id = vd->vdev_id; 5530 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5531 5532 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5533 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5534 ASSERT(vd == vd->vdev_top); 5535 5536 /* 5537 * Only remove any devices which are empty. 
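 *
 * (vs_alloc is the space still allocated on this top-level vdev; a
 * non-zero value means evacuation did not complete, so the vdev must
 * not be freed yet.)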
5538 */ 5539 if (vd->vdev_stat.vs_alloc != 0) 5540 return; 5541 5542 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5543 5544 if (list_link_active(&vd->vdev_state_dirty_node)) 5545 vdev_state_clean(vd); 5546 if (list_link_active(&vd->vdev_config_dirty_node)) 5547 vdev_config_clean(vd); 5548 5549 vdev_free(vd); 5550 5551 if (last_vdev) { 5552 vdev_compact_children(rvd); 5553 } else { 5554 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5555 vdev_add_child(rvd, vd); 5556 } 5557 vdev_config_dirty(rvd); 5558 5559 /* 5560 * Reassess the health of our root vdev. 5561 */ 5562 vdev_reopen(rvd); 5563} 5564 5565/* 5566 * Remove a device from the pool - 5567 * 5568 * Removing a device from the vdev namespace requires several steps 5569 * and can take a significant amount of time. As a result we use 5570 * the spa_vdev_config_[enter/exit] functions which allow us to 5571 * grab and release the spa_config_lock while still holding the namespace 5572 * lock. During each step the configuration is synced out. 5573 * 5574 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5575 * devices. 5576 */ 5577int 5578spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5579{ 5580 vdev_t *vd; 5581 metaslab_group_t *mg; 5582 nvlist_t **spares, **l2cache, *nv; 5583 uint64_t txg = 0; 5584 uint_t nspares, nl2cache; 5585 int error = 0; 5586 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5587 5588 ASSERT(spa_writeable(spa)); 5589 5590 if (!locked) 5591 txg = spa_vdev_enter(spa); 5592 5593 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5594 5595 if (spa->spa_spares.sav_vdevs != NULL && 5596 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5597 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5598 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5599 /* 5600 * Only remove the hot spare if it's not currently in use 5601 * in this pool. 5602 */ 5603 if (vd == NULL || unspare) { 5604 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5605 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5606 spa_load_spares(spa); 5607 spa->spa_spares.sav_sync = B_TRUE; 5608 } else { 5609 error = SET_ERROR(EBUSY); 5610 } 5611 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5612 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5613 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5614 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5615 /* 5616 * Cache devices can always be removed. 5617 */ 5618 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5619 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5620 spa_load_l2cache(spa); 5621 spa->spa_l2cache.sav_sync = B_TRUE; 5622 } else if (vd != NULL && vd->vdev_islog) { 5623 ASSERT(!locked); 5624 ASSERT(vd == vd->vdev_top); 5625 5626 mg = vd->vdev_mg; 5627 5628 /* 5629 * Stop allocating from this vdev. 5630 */ 5631 metaslab_group_passivate(mg); 5632 5633 /* 5634 * Wait for the youngest allocations and frees to sync, 5635 * and then wait for the deferral of those frees to finish. 5636 */ 5637 spa_vdev_config_exit(spa, NULL, 5638 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5639 5640 /* 5641 * Attempt to evacuate the vdev. 5642 */ 5643 error = spa_vdev_remove_evacuate(spa, vd); 5644 5645 txg = spa_vdev_config_enter(spa); 5646 5647 /* 5648 * If we couldn't evacuate the vdev, unwind. 5649 */ 5650 if (error) { 5651 metaslab_group_activate(mg); 5652 return (spa_vdev_exit(spa, NULL, txg, error)); 5653 } 5654 5655 /* 5656 * Clean up the vdev namespace. 
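 *
 * (Unless the removed vdev was the last top-level child, a hole vdev
 * is put in its slot -- see spa_vdev_remove_from_namespace() above --
 * so the ids of the remaining top-level vdevs stay stable.)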
5657 */ 5658 spa_vdev_remove_from_namespace(spa, vd); 5659 5660 } else if (vd != NULL) { 5661 /* 5662 * Normal vdevs cannot be removed (yet). 5663 */ 5664 error = SET_ERROR(ENOTSUP); 5665 } else { 5666 /* 5667 * There is no vdev of any kind with the specified guid. 5668 */ 5669 error = SET_ERROR(ENOENT); 5670 } 5671 5672 if (!locked) 5673 return (spa_vdev_exit(spa, NULL, txg, error)); 5674 5675 return (error); 5676} 5677 5678/* 5679 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5680 * currently spared, so we can detach it. 5681 */ 5682static vdev_t * 5683spa_vdev_resilver_done_hunt(vdev_t *vd) 5684{ 5685 vdev_t *newvd, *oldvd; 5686 5687 for (int c = 0; c < vd->vdev_children; c++) { 5688 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5689 if (oldvd != NULL) 5690 return (oldvd); 5691 } 5692 5693 /* 5694 * Check for a completed replacement. We always consider the first 5695 * vdev in the list to be the oldest vdev, and the last one to be 5696 * the newest (see spa_vdev_attach() for how that works). In 5697 * the case where the newest vdev is faulted, we will not automatically 5698 * remove it after a resilver completes. This is OK as it will require 5699 * user intervention to determine which disk the admin wishes to keep. 5700 */ 5701 if (vd->vdev_ops == &vdev_replacing_ops) { 5702 ASSERT(vd->vdev_children > 1); 5703 5704 newvd = vd->vdev_child[vd->vdev_children - 1]; 5705 oldvd = vd->vdev_child[0]; 5706 5707 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5708 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5709 !vdev_dtl_required(oldvd)) 5710 return (oldvd); 5711 } 5712 5713 /* 5714 * Check for a completed resilver with the 'unspare' flag set. 5715 */ 5716 if (vd->vdev_ops == &vdev_spare_ops) { 5717 vdev_t *first = vd->vdev_child[0]; 5718 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5719 5720 if (last->vdev_unspare) { 5721 oldvd = first; 5722 newvd = last; 5723 } else if (first->vdev_unspare) { 5724 oldvd = last; 5725 newvd = first; 5726 } else { 5727 oldvd = NULL; 5728 } 5729 5730 if (oldvd != NULL && 5731 vdev_dtl_empty(newvd, DTL_MISSING) && 5732 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5733 !vdev_dtl_required(oldvd)) 5734 return (oldvd); 5735 5736 /* 5737 * If there are more than two spares attached to a disk, 5738 * and those spares are not required, then we want to 5739 * attempt to free them up now so that they can be used 5740 * by other pools. Once we're back down to a single 5741 * disk+spare, we stop removing them. 5742 */ 5743 if (vd->vdev_children > 2) { 5744 newvd = vd->vdev_child[1]; 5745 5746 if (newvd->vdev_isspare && last->vdev_isspare && 5747 vdev_dtl_empty(last, DTL_MISSING) && 5748 vdev_dtl_empty(last, DTL_OUTAGE) && 5749 !vdev_dtl_required(newvd)) 5750 return (newvd); 5751 } 5752 } 5753 5754 return (NULL); 5755} 5756 5757static void 5758spa_vdev_resilver_done(spa_t *spa) 5759{ 5760 vdev_t *vd, *pvd, *ppvd; 5761 uint64_t guid, sguid, pguid, ppguid; 5762 5763 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5764 5765 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5766 pvd = vd->vdev_parent; 5767 ppvd = pvd->vdev_parent; 5768 guid = vd->vdev_guid; 5769 pguid = pvd->vdev_guid; 5770 ppguid = ppvd->vdev_guid; 5771 sguid = 0; 5772 /* 5773 * If we have just finished replacing a hot spared device, then 5774 * we need to detach the parent's first child (the original hot 5775 * spare) as well. 
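 *
 * Schematically: S(R(old, new), spare).  Once 'new' finishes
 * resilvering, 'old' is detached from R and the original 'spare' is
 * detached from S, leaving just 'new'.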
5776 */ 5777 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5778 ppvd->vdev_children == 2) { 5779 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5780 sguid = ppvd->vdev_child[1]->vdev_guid; 5781 } 5782 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5783 5784 spa_config_exit(spa, SCL_ALL, FTAG); 5785 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5786 return; 5787 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5788 return; 5789 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5790 } 5791 5792 spa_config_exit(spa, SCL_ALL, FTAG); 5793} 5794 5795/* 5796 * Update the stored path or FRU for this vdev. 5797 */ 5798int 5799spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5800 boolean_t ispath) 5801{ 5802 vdev_t *vd; 5803 boolean_t sync = B_FALSE; 5804 5805 ASSERT(spa_writeable(spa)); 5806 5807 spa_vdev_state_enter(spa, SCL_ALL); 5808 5809 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5810 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5811 5812 if (!vd->vdev_ops->vdev_op_leaf) 5813 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5814 5815 if (ispath) { 5816 if (strcmp(value, vd->vdev_path) != 0) { 5817 spa_strfree(vd->vdev_path); 5818 vd->vdev_path = spa_strdup(value); 5819 sync = B_TRUE; 5820 } 5821 } else { 5822 if (vd->vdev_fru == NULL) { 5823 vd->vdev_fru = spa_strdup(value); 5824 sync = B_TRUE; 5825 } else if (strcmp(value, vd->vdev_fru) != 0) { 5826 spa_strfree(vd->vdev_fru); 5827 vd->vdev_fru = spa_strdup(value); 5828 sync = B_TRUE; 5829 } 5830 } 5831 5832 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5833} 5834 5835int 5836spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5837{ 5838 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5839} 5840 5841int 5842spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5843{ 5844 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5845} 5846 5847/* 5848 * ========================================================================== 5849 * SPA Scanning 5850 * ========================================================================== 5851 */ 5852 5853int 5854spa_scan_stop(spa_t *spa) 5855{ 5856 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5857 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5858 return (SET_ERROR(EBUSY)); 5859 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5860} 5861 5862int 5863spa_scan(spa_t *spa, pool_scan_func_t func) 5864{ 5865 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5866 5867 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5868 return (SET_ERROR(ENOTSUP)); 5869 5870 /* 5871 * If a resilver was requested, but there is no DTL on a 5872 * writeable leaf device, we have nothing to do. 
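 * (A DTL, or dirty time log, records the txgs for which a leaf vdev may
 * be missing writes; if every writeable leaf has an empty DTL there is
 * nothing to rebuild, so we simply signal completion.)  A caller would
 * request and cancel a scan roughly like this (illustrative sketch; the
 * in-tree consumer is the zfs ioctl layer, which is not in this file):
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *	...
 *	error = spa_scan_stop(spa);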
5873 */ 5874 if (func == POOL_SCAN_RESILVER && 5875 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5876 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5877 return (0); 5878 } 5879 5880 return (dsl_scan(spa->spa_dsl_pool, func)); 5881} 5882 5883/* 5884 * ========================================================================== 5885 * SPA async task processing 5886 * ========================================================================== 5887 */ 5888 5889static void 5890spa_async_remove(spa_t *spa, vdev_t *vd) 5891{ 5892 if (vd->vdev_remove_wanted) { 5893 vd->vdev_remove_wanted = B_FALSE; 5894 vd->vdev_delayed_close = B_FALSE; 5895 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5896 5897 /* 5898 * We want to clear the stats, but we don't want to do a full 5899 * vdev_clear() as that will cause us to throw away 5900 * degraded/faulted state as well as attempt to reopen the 5901 * device, all of which is a waste. 5902 */ 5903 vd->vdev_stat.vs_read_errors = 0; 5904 vd->vdev_stat.vs_write_errors = 0; 5905 vd->vdev_stat.vs_checksum_errors = 0; 5906 5907 vdev_state_dirty(vd->vdev_top); 5908 } 5909 5910 for (int c = 0; c < vd->vdev_children; c++) 5911 spa_async_remove(spa, vd->vdev_child[c]); 5912} 5913 5914static void 5915spa_async_probe(spa_t *spa, vdev_t *vd) 5916{ 5917 if (vd->vdev_probe_wanted) { 5918 vd->vdev_probe_wanted = B_FALSE; 5919 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5920 } 5921 5922 for (int c = 0; c < vd->vdev_children; c++) 5923 spa_async_probe(spa, vd->vdev_child[c]); 5924} 5925 5926static void 5927spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5928{ 5929 sysevent_id_t eid; 5930 nvlist_t *attr; 5931 char *physpath; 5932 5933 if (!spa->spa_autoexpand) 5934 return; 5935 5936 for (int c = 0; c < vd->vdev_children; c++) { 5937 vdev_t *cvd = vd->vdev_child[c]; 5938 spa_async_autoexpand(spa, cvd); 5939 } 5940 5941 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5942 return; 5943 5944 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5945 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5946 5947 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5948 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5949 5950 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5951 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5952 5953 nvlist_free(attr); 5954 kmem_free(physpath, MAXPATHLEN); 5955} 5956 5957static void 5958spa_async_thread(void *arg) 5959{ 5960 spa_t *spa = arg; 5961 int tasks; 5962 5963 ASSERT(spa->spa_sync_on); 5964 5965 mutex_enter(&spa->spa_async_lock); 5966 tasks = spa->spa_async_tasks; 5967 spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5968 mutex_exit(&spa->spa_async_lock); 5969 5970 /* 5971 * See if the config needs to be updated. 5972 */ 5973 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5974 uint64_t old_space, new_space; 5975 5976 mutex_enter(&spa_namespace_lock); 5977 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5978 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5979 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5980 mutex_exit(&spa_namespace_lock); 5981 5982 /* 5983 * If the pool grew as a result of the config update, 5984 * then log an internal history event. 
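 * The resulting internal history entry reads roughly like this
 * (hypothetical sizes, formatted per the "pool '%s' size: %llu(+%llu)"
 * string below):
 *
 *	vdev online pool 'tank' size: 21474836480(+10737418240)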
5985 */ 5986 if (new_space != old_space) { 5987 spa_history_log_internal(spa, "vdev online", NULL, 5988 "pool '%s' size: %llu(+%llu)", 5989 spa_name(spa), new_space, new_space - old_space); 5990 } 5991 } 5992 5993 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5994 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5995 spa_async_autoexpand(spa, spa->spa_root_vdev); 5996 spa_config_exit(spa, SCL_CONFIG, FTAG); 5997 } 5998 5999 /* 6000 * See if any devices need to be probed. 6001 */ 6002 if (tasks & SPA_ASYNC_PROBE) { 6003 spa_vdev_state_enter(spa, SCL_NONE); 6004 spa_async_probe(spa, spa->spa_root_vdev); 6005 (void) spa_vdev_state_exit(spa, NULL, 0); 6006 } 6007 6008 /* 6009 * If any devices are done replacing, detach them. 6010 */ 6011 if (tasks & SPA_ASYNC_RESILVER_DONE) 6012 spa_vdev_resilver_done(spa); 6013 6014 /* 6015 * Kick off a resilver. 6016 */ 6017 if (tasks & SPA_ASYNC_RESILVER) 6018 dsl_resilver_restart(spa->spa_dsl_pool, 0); 6019 6020 /* 6021 * Let the world know that we're done. 6022 */ 6023 mutex_enter(&spa->spa_async_lock); 6024 spa->spa_async_thread = NULL; 6025 cv_broadcast(&spa->spa_async_cv); 6026 mutex_exit(&spa->spa_async_lock); 6027 thread_exit(); 6028} 6029 6030static void 6031spa_async_thread_vd(void *arg) 6032{ 6033 spa_t *spa = arg; 6034 int tasks; 6035 6036 ASSERT(spa->spa_sync_on); 6037 6038 mutex_enter(&spa->spa_async_lock); 6039 tasks = spa->spa_async_tasks; 6040retry: 6041 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6042 mutex_exit(&spa->spa_async_lock); 6043 6044 /* 6045 * See if any devices need to be marked REMOVED. 6046 */ 6047 if (tasks & SPA_ASYNC_REMOVE) { 6048 spa_vdev_state_enter(spa, SCL_NONE); 6049 spa_async_remove(spa, spa->spa_root_vdev); 6050 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6051 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6052 for (int i = 0; i < spa->spa_spares.sav_count; i++) 6053 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6054 (void) spa_vdev_state_exit(spa, NULL, 0); 6055 } 6056 6057 /* 6058 * Let the world know that we're done. 
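 * Before exiting, re-read spa_async_tasks under spa_async_lock: if a
 * new SPA_ASYNC_REMOVE request raced in after the bit was cleared at
 * the top of this function, loop back to 'retry' rather than lose it.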
6059 */
6060 mutex_enter(&spa->spa_async_lock);
6061 tasks = spa->spa_async_tasks;
6062 if ((tasks & SPA_ASYNC_REMOVE) != 0)
6063 goto retry;
6064 spa->spa_async_thread_vd = NULL;
6065 cv_broadcast(&spa->spa_async_cv);
6066 mutex_exit(&spa->spa_async_lock);
6067 thread_exit();
6068}
6069
6070void
6071spa_async_suspend(spa_t *spa)
6072{
6073 mutex_enter(&spa->spa_async_lock);
6074 spa->spa_async_suspended++;
6075 while (spa->spa_async_thread != NULL ||
6076 spa->spa_async_thread_vd != NULL)
6077 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6078 mutex_exit(&spa->spa_async_lock);
6079}
6080
6081void
6082spa_async_resume(spa_t *spa)
6083{
6084 mutex_enter(&spa->spa_async_lock);
6085 ASSERT(spa->spa_async_suspended != 0);
6086 spa->spa_async_suspended--;
6087 mutex_exit(&spa->spa_async_lock);
6088}
6089
6090static boolean_t
6091spa_async_tasks_pending(spa_t *spa)
6092{
6093 uint_t non_config_tasks;
6094 uint_t config_task;
6095 boolean_t config_task_suspended;
6096
6097 non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
6098 SPA_ASYNC_REMOVE);
6099 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6100 if (spa->spa_ccw_fail_time == 0) {
6101 config_task_suspended = B_FALSE;
6102 } else {
6103 config_task_suspended =
6104 (gethrtime() - spa->spa_ccw_fail_time) <
6105 (zfs_ccw_retry_interval * NANOSEC);
6106 }
6107
6108 return (non_config_tasks || (config_task && !config_task_suspended));
6109}
6110
6111static void
6112spa_async_dispatch(spa_t *spa)
6113{
6114 mutex_enter(&spa->spa_async_lock);
6115 if (spa_async_tasks_pending(spa) &&
6116 !spa->spa_async_suspended &&
6117 spa->spa_async_thread == NULL &&
6118 rootdir != NULL)
6119 spa->spa_async_thread = thread_create(NULL, 0,
6120 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6121 mutex_exit(&spa->spa_async_lock);
6122}
6123
6124static void
6125spa_async_dispatch_vd(spa_t *spa)
6126{
6127 mutex_enter(&spa->spa_async_lock);
6128 if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
6129 !spa->spa_async_suspended &&
6130 spa->spa_async_thread_vd == NULL &&
6131 rootdir != NULL)
6132 spa->spa_async_thread_vd = thread_create(NULL, 0,
6133 spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
6134 mutex_exit(&spa->spa_async_lock);
6135}
6136
6137void
6138spa_async_request(spa_t *spa, int task)
6139{
6140 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6141 mutex_enter(&spa->spa_async_lock);
6142 spa->spa_async_tasks |= task;
6143 mutex_exit(&spa->spa_async_lock);
6144 spa_async_dispatch_vd(spa);
6145}
6146
6147/*
6148 * ==========================================================================
6149 * SPA syncing routines
6150 * ==========================================================================
6151 */
6152
6153static int
6154bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6155{
6156 bpobj_t *bpo = arg;
6157 bpobj_enqueue(bpo, bp, tx);
6158 return (0);
6159}
6160
6161static int
6162spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6163{
6164 zio_t *zio = arg;
6165
6166 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6167 BP_GET_PSIZE(bp), zio->io_flags));
6168 return (0);
6169}
6170
6171/*
6172 * Note: this simple function is not inlined to make it easier to dtrace the
6173 * amount of time spent syncing frees.
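 * For example, from userland (an illustrative DTrace one-liner using
 * the fbt provider; exact probe availability varies by platform):
 *
 *	dtrace -n 'fbt::spa_sync_frees:entry { self->ts = timestamp; }
 *	    fbt::spa_sync_frees:return /self->ts/ {
 *		@["sync frees (ns)"] = quantize(timestamp - self->ts);
 *		self->ts = 0; }'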
6174 */ 6175static void 6176spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6177{ 6178 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6179 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6180 VERIFY(zio_wait(zio) == 0); 6181} 6182 6183/* 6184 * Note: this simple function is not inlined to make it easier to dtrace the 6185 * amount of time spent syncing deferred frees. 6186 */ 6187static void 6188spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6189{ 6190 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6191 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6192 spa_free_sync_cb, zio, tx), ==, 0); 6193 VERIFY0(zio_wait(zio)); 6194} 6195 6196 6197static void 6198spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6199{ 6200 char *packed = NULL; 6201 size_t bufsize; 6202 size_t nvsize = 0; 6203 dmu_buf_t *db; 6204 6205 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6206 6207 /* 6208 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6209 * information. This avoids the dmu_buf_will_dirty() path and 6210 * saves us a pre-read to get data we don't actually care about. 6211 */ 6212 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6213 packed = kmem_alloc(bufsize, KM_SLEEP); 6214 6215 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6216 KM_SLEEP) == 0); 6217 bzero(packed + nvsize, bufsize - nvsize); 6218 6219 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6220 6221 kmem_free(packed, bufsize); 6222 6223 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6224 dmu_buf_will_dirty(db, tx); 6225 *(uint64_t *)db->db_data = nvsize; 6226 dmu_buf_rele(db, FTAG); 6227} 6228 6229static void 6230spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6231 const char *config, const char *entry) 6232{ 6233 nvlist_t *nvroot; 6234 nvlist_t **list; 6235 int i; 6236 6237 if (!sav->sav_sync) 6238 return; 6239 6240 /* 6241 * Update the MOS nvlist describing the list of available devices. 6242 * spa_validate_aux() will have already made sure this nvlist is 6243 * valid and the vdevs are labeled appropriately. 
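 * The packed nvlist written out below has, roughly, this shape (sketch;
 * 'config' is ZPOOL_CONFIG_SPARES or ZPOOL_CONFIG_L2CACHE):
 *
 *	<config>: [ { "type", "guid", "path", ... }, ... ]
 *
 * and its object number is recorded under 'entry' in the MOS pool
 * directory so the device list can be recovered at import time.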
6244 */ 6245 if (sav->sav_object == 0) { 6246 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6247 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6248 sizeof (uint64_t), tx); 6249 VERIFY(zap_update(spa->spa_meta_objset, 6250 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6251 &sav->sav_object, tx) == 0); 6252 } 6253 6254 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6255 if (sav->sav_count == 0) { 6256 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6257 } else { 6258 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6259 for (i = 0; i < sav->sav_count; i++) 6260 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6261 B_FALSE, VDEV_CONFIG_L2CACHE); 6262 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6263 sav->sav_count) == 0); 6264 for (i = 0; i < sav->sav_count; i++) 6265 nvlist_free(list[i]); 6266 kmem_free(list, sav->sav_count * sizeof (void *)); 6267 } 6268 6269 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6270 nvlist_free(nvroot); 6271 6272 sav->sav_sync = B_FALSE; 6273} 6274 6275static void 6276spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6277{ 6278 nvlist_t *config; 6279 6280 if (list_is_empty(&spa->spa_config_dirty_list)) 6281 return; 6282 6283 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6284 6285 config = spa_config_generate(spa, spa->spa_root_vdev, 6286 dmu_tx_get_txg(tx), B_FALSE); 6287 6288 /* 6289 * If we're upgrading the spa version then make sure that 6290 * the config object gets updated with the correct version. 6291 */ 6292 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6293 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6294 spa->spa_uberblock.ub_version); 6295 6296 spa_config_exit(spa, SCL_STATE, FTAG); 6297 6298 if (spa->spa_config_syncing) 6299 nvlist_free(spa->spa_config_syncing); 6300 spa->spa_config_syncing = config; 6301 6302 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6303} 6304 6305static void 6306spa_sync_version(void *arg, dmu_tx_t *tx) 6307{ 6308 uint64_t *versionp = arg; 6309 uint64_t version = *versionp; 6310 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6311 6312 /* 6313 * Setting the version is special cased when first creating the pool. 6314 */ 6315 ASSERT(tx->tx_txg != TXG_INITIAL); 6316 6317 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6318 ASSERT(version >= spa_version(spa)); 6319 6320 spa->spa_uberblock.ub_version = version; 6321 vdev_config_dirty(spa->spa_root_vdev); 6322 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6323} 6324 6325/* 6326 * Set zpool properties. 6327 */ 6328static void 6329spa_sync_props(void *arg, dmu_tx_t *tx) 6330{ 6331 nvlist_t *nvp = arg; 6332 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6333 objset_t *mos = spa->spa_meta_objset; 6334 nvpair_t *elem = NULL; 6335 6336 mutex_enter(&spa->spa_props_lock); 6337 6338 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6339 uint64_t intval; 6340 char *strval, *fname; 6341 zpool_prop_t prop; 6342 const char *propname; 6343 zprop_type_t proptype; 6344 spa_feature_t fid; 6345 6346 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6347 case ZPROP_INVAL: 6348 /* 6349 * We checked this earlier in spa_prop_validate(). 
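 * Feature properties arrive here as "feature@<name>"; for example,
 * setting "feature@lz4_compress=enabled" makes fname point at
 * "lz4_compress" below, which zfeature_lookup_name() resolves to
 * SPA_FEATURE_LZ4_COMPRESS before the feature is enabled in this txg.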
6350 */
6351 ASSERT(zpool_prop_feature(nvpair_name(elem)));
6352
6353 fname = strchr(nvpair_name(elem), '@') + 1;
6354 VERIFY0(zfeature_lookup_name(fname, &fid));
6355
6356 spa_feature_enable(spa, fid, tx);
6357 spa_history_log_internal(spa, "set", tx,
6358 "%s=enabled", nvpair_name(elem));
6359 break;
6360
6361 case ZPOOL_PROP_VERSION:
6362 intval = fnvpair_value_uint64(elem);
6363 /*
6364 * The version is synced separately before other
6365 * properties and should be correct by now.
6366 */
6367 ASSERT3U(spa_version(spa), >=, intval);
6368 break;
6369
6370 case ZPOOL_PROP_ALTROOT:
6371 /*
6372 * 'altroot' is a non-persistent property. It should
6373 * have been set temporarily at creation or import time.
6374 */
6375 ASSERT(spa->spa_root != NULL);
6376 break;
6377
6378 case ZPOOL_PROP_READONLY:
6379 case ZPOOL_PROP_CACHEFILE:
6380 /*
6381 * 'readonly' and 'cachefile' are also non-persistent
6382 * properties.
6383 */
6384 break;
6385 case ZPOOL_PROP_COMMENT:
6386 strval = fnvpair_value_string(elem);
6387 if (spa->spa_comment != NULL)
6388 spa_strfree(spa->spa_comment);
6389 spa->spa_comment = spa_strdup(strval);
6390 /*
6391 * We need to dirty the configuration on all the vdevs
6392 * so that their labels get updated. It's unnecessary
6393 * to do this for pool creation since the vdev's
6394 * configuration has already been dirtied.
6395 */
6396 if (tx->tx_txg != TXG_INITIAL)
6397 vdev_config_dirty(spa->spa_root_vdev);
6398 spa_history_log_internal(spa, "set", tx,
6399 "%s=%s", nvpair_name(elem), strval);
6400 break;
6401 default:
6402 /*
6403 * Set pool property values in the poolprops mos object.
6404 */
6405 if (spa->spa_pool_props_object == 0) {
6406 spa->spa_pool_props_object =
6407 zap_create_link(mos, DMU_OT_POOL_PROPS,
6408 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6409 tx);
6410 }
6411
6412 /* normalize the property name */
6413 propname = zpool_prop_to_name(prop);
6414 proptype = zpool_prop_get_type(prop);
6415
6416 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6417 ASSERT(proptype == PROP_TYPE_STRING);
6418 strval = fnvpair_value_string(elem);
6419 VERIFY0(zap_update(mos,
6420 spa->spa_pool_props_object, propname,
6421 1, strlen(strval) + 1, strval, tx));
6422 spa_history_log_internal(spa, "set", tx,
6423 "%s=%s", nvpair_name(elem), strval);
6424 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6425 intval = fnvpair_value_uint64(elem);
6426
6427 if (proptype == PROP_TYPE_INDEX) {
6428 const char *unused;
6429 VERIFY0(zpool_prop_index_to_string(
6430 prop, intval, &unused));
6431 }
6432 VERIFY0(zap_update(mos,
6433 spa->spa_pool_props_object, propname,
6434 8, 1, &intval, tx));
6435 spa_history_log_internal(spa, "set", tx,
6436 "%s=%lld", nvpair_name(elem), intval);
6437 } else {
6438 ASSERT(0); /* not allowed */
6439 }
6440
6441 switch (prop) {
6442 case ZPOOL_PROP_DELEGATION:
6443 spa->spa_delegation = intval;
6444 break;
6445 case ZPOOL_PROP_BOOTFS:
6446 spa->spa_bootfs = intval;
6447 break;
6448 case ZPOOL_PROP_FAILUREMODE:
6449 spa->spa_failmode = intval;
6450 break;
6451 case ZPOOL_PROP_AUTOEXPAND:
6452 spa->spa_autoexpand = intval;
6453 if (tx->tx_txg != TXG_INITIAL)
6454 spa_async_request(spa,
6455 SPA_ASYNC_AUTOEXPAND);
6456 break;
6457 case ZPOOL_PROP_DEDUPDITTO:
6458 spa->spa_dedup_ditto = intval;
6459 break;
6460 default:
6461 break;
6462 }
6463 }
6464
6465 }
6466
6467 mutex_exit(&spa->spa_props_lock);
6468}
6469
6470/*
6471 * Perform one-time upgrade on-disk changes.
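 * Each upgrade hook below is gated on the version already synced to
 * disk (spa_ubsync.ub_version) versus the version now being written
 * (spa_uberblock.ub_version), so each one runs exactly once.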
spa_version() does not
6472 * reflect the new version this txg, so there must be no changes this
6473 * txg to anything that the upgrade code depends on after it executes.
6474 * Therefore this must be called after dsl_pool_sync() does the sync
6475 * tasks.
6476 */
6477static void
6478spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6479{
6480 dsl_pool_t *dp = spa->spa_dsl_pool;
6481
6482 ASSERT(spa->spa_sync_pass == 1);
6483
6484 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6485
6486 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6487 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6488 dsl_pool_create_origin(dp, tx);
6489
6490 /* Keeping the origin open increases spa_minref */
6491 spa->spa_minref += 3;
6492 }
6493
6494 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6495 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6496 dsl_pool_upgrade_clones(dp, tx);
6497 }
6498
6499 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6500 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6501 dsl_pool_upgrade_dir_clones(dp, tx);
6502
6503 /* Keeping the freedir open increases spa_minref */
6504 spa->spa_minref += 3;
6505 }
6506
6507 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6508 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6509 spa_feature_create_zap_objects(spa, tx);
6510 }
6511
6512 /*
6513 * The LZ4_COMPRESS feature's behaviour was changed to
6514 * activate_on_enable when the ability to use lz4 compression for
6515 * metadata was added. Old pools that have this feature enabled
6516 * must be upgraded to have it active as well.
6517 */
6518 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6519 boolean_t lz4_en = spa_feature_is_enabled(spa,
6520 SPA_FEATURE_LZ4_COMPRESS);
6521 boolean_t lz4_ac = spa_feature_is_active(spa,
6522 SPA_FEATURE_LZ4_COMPRESS);
6523
6524 if (lz4_en && !lz4_ac)
6525 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6526 }
6527 rrw_exit(&dp->dp_config_rwlock, FTAG);
6528}
6529
6530/*
6531 * Sync the specified transaction group. New blocks may be dirtied as
6532 * part of the process, so we iterate until it converges.
6533 */
6534void
6535spa_sync(spa_t *spa, uint64_t txg)
6536{
6537 dsl_pool_t *dp = spa->spa_dsl_pool;
6538 objset_t *mos = spa->spa_meta_objset;
6539 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6540 vdev_t *rvd = spa->spa_root_vdev;
6541 vdev_t *vd;
6542 dmu_tx_t *tx;
6543 int error;
6544
6545 VERIFY(spa_writeable(spa));
6546
6547 /*
6548 * Lock out configuration changes.
6549 */
6550 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6551
6552 spa->spa_syncing_txg = txg;
6553 spa->spa_sync_pass = 0;
6554
6555 /*
6556 * If there are any pending vdev state changes, convert them
6557 * into config changes that go out with this transaction group.
6558 */
6559 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6560 while (list_head(&spa->spa_state_dirty_list) != NULL) {
6561 /*
6562 * We need the write lock here because, for aux vdevs,
6563 * calling vdev_config_dirty() modifies sav_config.
6564 * This is ugly and will become unnecessary when we
6565 * eliminate the aux vdev wart by integrating all vdevs
6566 * into the root vdev tree.
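 * There is no upgrade primitive for spa_config_lock, so the code
 * below drops the reader hold, reacquires SCL_CONFIG | SCL_STATE as
 * writer, drains the dirty list, and then drops back to reader,
 * rechecking the list each time around the outer loop.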
6567 */ 6568 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6569 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6570 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6571 vdev_state_clean(vd); 6572 vdev_config_dirty(vd); 6573 } 6574 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6575 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6576 } 6577 spa_config_exit(spa, SCL_STATE, FTAG); 6578 6579 tx = dmu_tx_create_assigned(dp, txg); 6580 6581 spa->spa_sync_starttime = gethrtime(); 6582#ifdef illumos 6583 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6584 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6585#else /* FreeBSD */ 6586#ifdef _KERNEL 6587 callout_reset(&spa->spa_deadman_cycid, 6588 hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6589#endif 6590#endif 6591 6592 /* 6593 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6594 * set spa_deflate if we have no raid-z vdevs. 6595 */ 6596 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6597 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6598 int i; 6599 6600 for (i = 0; i < rvd->vdev_children; i++) { 6601 vd = rvd->vdev_child[i]; 6602 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6603 break; 6604 } 6605 if (i == rvd->vdev_children) { 6606 spa->spa_deflate = TRUE; 6607 VERIFY(0 == zap_add(spa->spa_meta_objset, 6608 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6609 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6610 } 6611 } 6612 6613 /* 6614 * Iterate to convergence. 6615 */ 6616 do { 6617 int pass = ++spa->spa_sync_pass; 6618 6619 spa_sync_config_object(spa, tx); 6620 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6621 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6622 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6623 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6624 spa_errlog_sync(spa, txg); 6625 dsl_pool_sync(dp, txg); 6626 6627 if (pass < zfs_sync_pass_deferred_free) { 6628 spa_sync_frees(spa, free_bpl, tx); 6629 } else { 6630 /* 6631 * We can not defer frees in pass 1, because 6632 * we sync the deferred frees later in pass 1. 6633 */ 6634 ASSERT3U(pass, >, 1); 6635 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6636 &spa->spa_deferred_bpobj, tx); 6637 } 6638 6639 ddt_sync(spa, txg); 6640 dsl_scan_sync(dp, tx); 6641 6642 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6643 vdev_sync(vd, txg); 6644 6645 if (pass == 1) { 6646 spa_sync_upgrades(spa, tx); 6647 ASSERT3U(txg, >=, 6648 spa->spa_uberblock.ub_rootbp.blk_birth); 6649 /* 6650 * Note: We need to check if the MOS is dirty 6651 * because we could have marked the MOS dirty 6652 * without updating the uberblock (e.g. if we 6653 * have sync tasks but no dirty user data). We 6654 * need to check the uberblock's rootbp because 6655 * it is updated if we have synced out dirty 6656 * data (though in this case the MOS will most 6657 * likely also be dirty due to second order 6658 * effects, we don't want to rely on that here). 6659 */ 6660 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6661 !dmu_objset_is_dirty(mos, txg)) { 6662 /* 6663 * Nothing changed on the first pass, 6664 * therefore this TXG is a no-op. Avoid 6665 * syncing deferred frees, so that we 6666 * can keep this TXG as a no-op. 
6667 */ 6668 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6669 txg)); 6670 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6671 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6672 break; 6673 } 6674 spa_sync_deferred_frees(spa, tx); 6675 } 6676 6677 } while (dmu_objset_is_dirty(mos, txg)); 6678 6679 /* 6680 * Rewrite the vdev configuration (which includes the uberblock) 6681 * to commit the transaction group. 6682 * 6683 * If there are no dirty vdevs, we sync the uberblock to a few 6684 * random top-level vdevs that are known to be visible in the 6685 * config cache (see spa_vdev_add() for a complete description). 6686 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6687 */ 6688 for (;;) { 6689 /* 6690 * We hold SCL_STATE to prevent vdev open/close/etc. 6691 * while we're attempting to write the vdev labels. 6692 */ 6693 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6694 6695 if (list_is_empty(&spa->spa_config_dirty_list)) { 6696 vdev_t *svd[SPA_DVAS_PER_BP]; 6697 int svdcount = 0; 6698 int children = rvd->vdev_children; 6699 int c0 = spa_get_random(children); 6700 6701 for (int c = 0; c < children; c++) { 6702 vd = rvd->vdev_child[(c0 + c) % children]; 6703 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6704 continue; 6705 svd[svdcount++] = vd; 6706 if (svdcount == SPA_DVAS_PER_BP) 6707 break; 6708 } 6709 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6710 if (error != 0) 6711 error = vdev_config_sync(svd, svdcount, txg, 6712 B_TRUE); 6713 } else { 6714 error = vdev_config_sync(rvd->vdev_child, 6715 rvd->vdev_children, txg, B_FALSE); 6716 if (error != 0) 6717 error = vdev_config_sync(rvd->vdev_child, 6718 rvd->vdev_children, txg, B_TRUE); 6719 } 6720 6721 if (error == 0) 6722 spa->spa_last_synced_guid = rvd->vdev_guid; 6723 6724 spa_config_exit(spa, SCL_STATE, FTAG); 6725 6726 if (error == 0) 6727 break; 6728 zio_suspend(spa, NULL); 6729 zio_resume_wait(spa); 6730 } 6731 dmu_tx_commit(tx); 6732 6733#ifdef illumos 6734 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6735#else /* FreeBSD */ 6736#ifdef _KERNEL 6737 callout_drain(&spa->spa_deadman_cycid); 6738#endif 6739#endif 6740 6741 /* 6742 * Clear the dirty config list. 6743 */ 6744 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6745 vdev_config_clean(vd); 6746 6747 /* 6748 * Now that the new config has synced transactionally, 6749 * let it become visible to the config cache. 6750 */ 6751 if (spa->spa_config_syncing != NULL) { 6752 spa_config_set(spa, spa->spa_config_syncing); 6753 spa->spa_config_txg = txg; 6754 spa->spa_config_syncing = NULL; 6755 } 6756 6757 spa->spa_ubsync = spa->spa_uberblock; 6758 6759 dsl_pool_sync_done(dp, txg); 6760 6761 /* 6762 * Update usable space statistics. 6763 */ 6764 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6765 vdev_sync_done(vd, txg); 6766 6767 spa_update_dspace(spa); 6768 6769 /* 6770 * It had better be the case that we didn't dirty anything 6771 * since vdev_config_sync(). 6772 */ 6773 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6774 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6775 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6776 6777 spa->spa_sync_pass = 0; 6778 6779 spa_config_exit(spa, SCL_CONFIG, FTAG); 6780 6781 spa_handle_ignored_writes(spa); 6782 6783 /* 6784 * If any async tasks have been requested, kick them off. 6785 */ 6786 spa_async_dispatch(spa); 6787 spa_async_dispatch_vd(spa); 6788} 6789 6790/* 6791 * Sync all pools. 
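 * (Presumably reached from the VFS sync path when the whole system is
 * being flushed; the caller lives outside this file.)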
We don't want to hold the namespace lock across these 6792 * operations, so we take a reference on the spa_t and drop the lock during the 6793 * sync. 6794 */ 6795void 6796spa_sync_allpools(void) 6797{ 6798 spa_t *spa = NULL; 6799 mutex_enter(&spa_namespace_lock); 6800 while ((spa = spa_next(spa)) != NULL) { 6801 if (spa_state(spa) != POOL_STATE_ACTIVE || 6802 !spa_writeable(spa) || spa_suspended(spa)) 6803 continue; 6804 spa_open_ref(spa, FTAG); 6805 mutex_exit(&spa_namespace_lock); 6806 txg_wait_synced(spa_get_dsl(spa), 0); 6807 mutex_enter(&spa_namespace_lock); 6808 spa_close(spa, FTAG); 6809 } 6810 mutex_exit(&spa_namespace_lock); 6811} 6812 6813/* 6814 * ========================================================================== 6815 * Miscellaneous routines 6816 * ========================================================================== 6817 */ 6818 6819/* 6820 * Remove all pools in the system. 6821 */ 6822void 6823spa_evict_all(void) 6824{ 6825 spa_t *spa; 6826 6827 /* 6828 * Remove all cached state. All pools should be closed now, 6829 * so every spa in the AVL tree should be unreferenced. 6830 */ 6831 mutex_enter(&spa_namespace_lock); 6832 while ((spa = spa_next(NULL)) != NULL) { 6833 /* 6834 * Stop async tasks. The async thread may need to detach 6835 * a device that's been replaced, which requires grabbing 6836 * spa_namespace_lock, so we must drop it here. 6837 */ 6838 spa_open_ref(spa, FTAG); 6839 mutex_exit(&spa_namespace_lock); 6840 spa_async_suspend(spa); 6841 mutex_enter(&spa_namespace_lock); 6842 spa_close(spa, FTAG); 6843 6844 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6845 spa_unload(spa); 6846 spa_deactivate(spa); 6847 } 6848 spa_remove(spa); 6849 } 6850 mutex_exit(&spa_namespace_lock); 6851} 6852 6853vdev_t * 6854spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6855{ 6856 vdev_t *vd; 6857 int i; 6858 6859 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6860 return (vd); 6861 6862 if (aux) { 6863 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6864 vd = spa->spa_l2cache.sav_vdevs[i]; 6865 if (vd->vdev_guid == guid) 6866 return (vd); 6867 } 6868 6869 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6870 vd = spa->spa_spares.sav_vdevs[i]; 6871 if (vd->vdev_guid == guid) 6872 return (vd); 6873 } 6874 } 6875 6876 return (NULL); 6877} 6878 6879void 6880spa_upgrade(spa_t *spa, uint64_t version) 6881{ 6882 ASSERT(spa_writeable(spa)); 6883 6884 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6885 6886 /* 6887 * This should only be called for a non-faulted pool, and since a 6888 * future version would result in an unopenable pool, this shouldn't be 6889 * possible. 
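 *
 * A caller's pattern, in outline (illustrative; 'zpool upgrade'
 * reaches this through the zfs ioctl layer, and 'newversion' is a
 * hypothetical name):
 *
 *	if (newversion < spa_version(spa))
 *		error = SET_ERROR(EINVAL);
 *	else
 *		spa_upgrade(spa, newversion);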
6890 */
6891 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
6892 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
6893
6894 spa->spa_uberblock.ub_version = version;
6895 vdev_config_dirty(spa->spa_root_vdev);
6896
6897 spa_config_exit(spa, SCL_ALL, FTAG);
6898
6899 txg_wait_synced(spa_get_dsl(spa), 0);
6900}
6901
6902boolean_t
6903spa_has_spare(spa_t *spa, uint64_t guid)
6904{
6905 int i;
6906 uint64_t spareguid;
6907 spa_aux_vdev_t *sav = &spa->spa_spares;
6908
6909 for (i = 0; i < sav->sav_count; i++)
6910 if (sav->sav_vdevs[i]->vdev_guid == guid)
6911 return (B_TRUE);
6912
6913 for (i = 0; i < sav->sav_npending; i++) {
6914 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
6915 &spareguid) == 0 && spareguid == guid)
6916 return (B_TRUE);
6917 }
6918
6919 return (B_FALSE);
6920}
6921
6922/*
6923 * Check if a pool has an active shared spare device.
6924 * Note: an active spare's reference count is 2: once as a spare and once as a replacing vdev.
6925 */
6926static boolean_t
6927spa_has_active_shared_spare(spa_t *spa)
6928{
6929 int i, refcnt;
6930 uint64_t pool;
6931 spa_aux_vdev_t *sav = &spa->spa_spares;
6932
6933 for (i = 0; i < sav->sav_count; i++) {
6934 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6935 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6936 refcnt > 2)
6937 return (B_TRUE);
6938 }
6939
6940 return (B_FALSE);
6941}
6942
6943/*
6944 * Post a sysevent corresponding to the given event. The 'name' must be one of
6945 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
6946 * filled in from the spa and (optionally) the vdev. This doesn't do anything
6947 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6948 * or zdb as real changes.
6949 */
6950void
6951spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6952{
6953#ifdef _KERNEL
6954 sysevent_t *ev;
6955 sysevent_attr_list_t *attr = NULL;
6956 sysevent_value_t value;
6957 sysevent_id_t eid;
6958
6959 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6960 SE_SLEEP);
6961
6962 value.value_type = SE_DATA_TYPE_STRING;
6963 value.value.sv_string = spa_name(spa);
6964 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6965 goto done;
6966
6967 value.value_type = SE_DATA_TYPE_UINT64;
6968 value.value.sv_uint64 = spa_guid(spa);
6969 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6970 goto done;
6971
6972 if (vd) {
6973 value.value_type = SE_DATA_TYPE_UINT64;
6974 value.value.sv_uint64 = vd->vdev_guid;
6975 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6976 SE_SLEEP) != 0)
6977 goto done;
6978
6979 if (vd->vdev_path) {
6980 value.value_type = SE_DATA_TYPE_STRING;
6981 value.value.sv_string = vd->vdev_path;
6982 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6983 &value, SE_SLEEP) != 0)
6984 goto done;
6985 }
6986 }
6987
6988 if (sysevent_attach_attributes(ev, attr) != 0)
6989 goto done;
6990 attr = NULL;
6991
6992 (void) log_sysevent(ev, SE_SLEEP, &eid);
6993
6994done:
6995 if (attr)
6996 sysevent_free_attr(attr);
6997 sysevent_free(ev);
6998#endif
6999}
7000
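/*
 * Example (illustrative only, not part of the original file): posting a
 * device-removal event from kernel context attaches the pool name, pool
 * guid, vdev guid and, when set, the vdev path to the payload:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * where ESC_ZFS_VDEV_REMOVE is one of the event definitions in
 * sys/sysevent/eventdefs.h, as required by the function above.
 */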