spa.c revision 236884
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24236155Smm * Copyright (c) 2012 by Delphix. All rights reserved. 25168404Spjd */ 26168404Spjd 27168404Spjd/* 28168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 29168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 30168404Spjd * pool. 
31168404Spjd */ 32168404Spjd 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/fm/fs/zfs.h> 35168404Spjd#include <sys/spa_impl.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/dmu.h> 39168404Spjd#include <sys/dmu_tx.h> 40168404Spjd#include <sys/zap.h> 41168404Spjd#include <sys/zil.h> 42219089Spjd#include <sys/ddt.h> 43168404Spjd#include <sys/vdev_impl.h> 44168404Spjd#include <sys/metaslab.h> 45219089Spjd#include <sys/metaslab_impl.h> 46168404Spjd#include <sys/uberblock_impl.h> 47168404Spjd#include <sys/txg.h> 48168404Spjd#include <sys/avl.h> 49168404Spjd#include <sys/dmu_traverse.h> 50168404Spjd#include <sys/dmu_objset.h> 51168404Spjd#include <sys/unique.h> 52168404Spjd#include <sys/dsl_pool.h> 53168404Spjd#include <sys/dsl_dataset.h> 54168404Spjd#include <sys/dsl_dir.h> 55168404Spjd#include <sys/dsl_prop.h> 56168404Spjd#include <sys/dsl_synctask.h> 57168404Spjd#include <sys/fs/zfs.h> 58185029Spjd#include <sys/arc.h> 59168404Spjd#include <sys/callb.h> 60185029Spjd#include <sys/spa_boot.h> 61219089Spjd#include <sys/zfs_ioctl.h> 62219089Spjd#include <sys/dsl_scan.h> 63236884Smm#include <sys/zfeature.h> 64219089Spjd#include <sys/zvol.h> 65168404Spjd 66219089Spjd#ifdef _KERNEL 67219089Spjd#include <sys/callb.h> 68219089Spjd#include <sys/cpupart.h> 69219089Spjd#include <sys/zone.h> 70219089Spjd#endif /* _KERNEL */ 71219089Spjd 72185029Spjd#include "zfs_prop.h" 73185029Spjd#include "zfs_comutil.h" 74168404Spjd 75204073Spjd/* Check hostid on import? 
*/
static int check_hostid = 1;

/* Expose check_hostid as the vfs.zfs.check_hostid loader tunable / sysctl. */
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * How the zti_value of a zio_taskq_info_t is interpreted when the
 * corresponding taskq is created (see spa_taskq_create()).
 */
typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

/* Brace initializers for one zio_taskq_info_t table entry. */
#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

/* A single-threaded, fixed-size taskq. */
#define	ZTI_ONE		ZTI_FIX(1)

/* One cell of the zio_taskqs table: a sizing mode plus its argument. */
typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

/* Name suffix per taskq type; combined with the zio type name for taskqs. */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 *
 * NOTE(review): the rows below carry no labels; they presumably follow the
 * order of the list above (i.e. the zio type enumeration) -- confirm.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

/* Forward declarations for routines defined later in this file. */
static dsl_syncfunc_t spa_sync_version;
static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
154185029Spjd */ 155185029Spjdstatic void 156185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 157185029Spjd uint64_t intval, zprop_source_t src) 158185029Spjd{ 159185029Spjd const char *propname = zpool_prop_to_name(prop); 160185029Spjd nvlist_t *propval; 161185029Spjd 162185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 163185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 164185029Spjd 165185029Spjd if (strval != NULL) 166185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 167185029Spjd else 168185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 169185029Spjd 170185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 171185029Spjd nvlist_free(propval); 172185029Spjd} 173185029Spjd 174185029Spjd/* 175185029Spjd * Get property values from the spa configuration. 176185029Spjd */ 177185029Spjdstatic void 178185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 179185029Spjd{ 180236155Smm vdev_t *rvd = spa->spa_root_vdev; 181236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 182209962Smm uint64_t size; 183219089Spjd uint64_t alloc; 184236155Smm uint64_t space; 185185029Spjd uint64_t cap, version; 186185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 187185029Spjd spa_config_dirent_t *dp; 188185029Spjd 189185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 190185029Spjd 191236155Smm if (rvd != NULL) { 192219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 193219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 194209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 195209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 196219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 197219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 198219089Spjd size - alloc, src); 199236155Smm 200236155Smm space = 0; 201236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 202236155Smm vdev_t *tvd = rvd->vdev_child[c]; 203236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 204236155Smm } 205236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 206236155Smm src); 207236155Smm 208219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 209219089Spjd (spa_mode(spa) == FREAD), src); 210185029Spjd 211219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 212209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 213185029Spjd 214219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 215219089Spjd ddt_get_pool_dedup_ratio(spa), src); 216219089Spjd 217209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 218236155Smm rvd->vdev_state, src); 219209962Smm 220209962Smm version = spa_version(spa); 221209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 222209962Smm src = ZPROP_SRC_DEFAULT; 223209962Smm else 224209962Smm src = ZPROP_SRC_LOCAL; 225209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 226209962Smm } 227209962Smm 228236884Smm if (pool != NULL) { 229236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 230236884Smm 231236884Smm /* 232236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 233236884Smm * when opening pools before this version freedir will be NULL. 
234236884Smm */ 235236884Smm if (freedir != NULL) { 236236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 237236884Smm freedir->dd_phys->dd_used_bytes, src); 238236884Smm } else { 239236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 240236884Smm NULL, 0, src); 241236884Smm } 242236884Smm } 243236884Smm 244185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 245185029Spjd 246228103Smm if (spa->spa_comment != NULL) { 247228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 248228103Smm 0, ZPROP_SRC_LOCAL); 249228103Smm } 250228103Smm 251185029Spjd if (spa->spa_root != NULL) 252185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 253185029Spjd 0, ZPROP_SRC_LOCAL); 254185029Spjd 255185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 256185029Spjd if (dp->scd_path == NULL) { 257185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 258185029Spjd "none", 0, ZPROP_SRC_LOCAL); 259185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 260185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 261185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 262185029Spjd } 263185029Spjd } 264185029Spjd} 265185029Spjd 266185029Spjd/* 267185029Spjd * Get zpool property values. 268185029Spjd */ 269185029Spjdint 270185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 271185029Spjd{ 272219089Spjd objset_t *mos = spa->spa_meta_objset; 273185029Spjd zap_cursor_t zc; 274185029Spjd zap_attribute_t za; 275185029Spjd int err; 276185029Spjd 277185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 278185029Spjd 279185029Spjd mutex_enter(&spa->spa_props_lock); 280185029Spjd 281185029Spjd /* 282185029Spjd * Get properties from the spa config. 283185029Spjd */ 284185029Spjd spa_prop_get_config(spa, nvp); 285185029Spjd 286185029Spjd /* If no pool property object, no more prop to get. 
*/ 287219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 288185029Spjd mutex_exit(&spa->spa_props_lock); 289185029Spjd return (0); 290185029Spjd } 291185029Spjd 292185029Spjd /* 293185029Spjd * Get properties from the MOS pool property object. 294185029Spjd */ 295185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 296185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 297185029Spjd zap_cursor_advance(&zc)) { 298185029Spjd uint64_t intval = 0; 299185029Spjd char *strval = NULL; 300185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 301185029Spjd zpool_prop_t prop; 302185029Spjd 303185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 304185029Spjd continue; 305185029Spjd 306185029Spjd switch (za.za_integer_length) { 307185029Spjd case 8: 308185029Spjd /* integer property */ 309185029Spjd if (za.za_first_integer != 310185029Spjd zpool_prop_default_numeric(prop)) 311185029Spjd src = ZPROP_SRC_LOCAL; 312185029Spjd 313185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 314185029Spjd dsl_pool_t *dp; 315185029Spjd dsl_dataset_t *ds = NULL; 316185029Spjd 317185029Spjd dp = spa_get_dsl(spa); 318185029Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 319185029Spjd if (err = dsl_dataset_hold_obj(dp, 320185029Spjd za.za_first_integer, FTAG, &ds)) { 321185029Spjd rw_exit(&dp->dp_config_rwlock); 322185029Spjd break; 323185029Spjd } 324185029Spjd 325185029Spjd strval = kmem_alloc( 326185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 327185029Spjd KM_SLEEP); 328185029Spjd dsl_dataset_name(ds, strval); 329185029Spjd dsl_dataset_rele(ds, FTAG); 330185029Spjd rw_exit(&dp->dp_config_rwlock); 331185029Spjd } else { 332185029Spjd strval = NULL; 333185029Spjd intval = za.za_first_integer; 334185029Spjd } 335185029Spjd 336185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 337185029Spjd 338185029Spjd if (strval != NULL) 339185029Spjd kmem_free(strval, 340185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 341185029Spjd 342185029Spjd 
break; 343185029Spjd 344185029Spjd case 1: 345185029Spjd /* string property */ 346185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 347185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 348185029Spjd za.za_name, 1, za.za_num_integers, strval); 349185029Spjd if (err) { 350185029Spjd kmem_free(strval, za.za_num_integers); 351185029Spjd break; 352185029Spjd } 353185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 354185029Spjd kmem_free(strval, za.za_num_integers); 355185029Spjd break; 356185029Spjd 357185029Spjd default: 358185029Spjd break; 359185029Spjd } 360185029Spjd } 361185029Spjd zap_cursor_fini(&zc); 362185029Spjd mutex_exit(&spa->spa_props_lock); 363185029Spjdout: 364185029Spjd if (err && err != ENOENT) { 365185029Spjd nvlist_free(*nvp); 366185029Spjd *nvp = NULL; 367185029Spjd return (err); 368185029Spjd } 369185029Spjd 370185029Spjd return (0); 371185029Spjd} 372185029Spjd 373185029Spjd/* 374185029Spjd * Validate the given pool properties nvlist and modify the list 375185029Spjd * for the property values to be set. 376185029Spjd */ 377185029Spjdstatic int 378185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 379185029Spjd{ 380185029Spjd nvpair_t *elem; 381185029Spjd int error = 0, reset_bootfs = 0; 382185029Spjd uint64_t objnum; 383236884Smm boolean_t has_feature = B_FALSE; 384185029Spjd 385185029Spjd elem = NULL; 386185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 387185029Spjd uint64_t intval; 388236884Smm char *strval, *slash, *check, *fname; 389236884Smm const char *propname = nvpair_name(elem); 390236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 391185029Spjd 392236884Smm switch (prop) { 393236884Smm case ZPROP_INVAL: 394236884Smm if (!zpool_prop_feature(propname)) { 395236884Smm error = EINVAL; 396236884Smm break; 397236884Smm } 398185029Spjd 399236884Smm /* 400236884Smm * Sanitize the input. 
401236884Smm */ 402236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 403236884Smm error = EINVAL; 404236884Smm break; 405236884Smm } 406185029Spjd 407236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 408236884Smm error = EINVAL; 409236884Smm break; 410236884Smm } 411236884Smm 412236884Smm if (intval != 0) { 413236884Smm error = EINVAL; 414236884Smm break; 415236884Smm } 416236884Smm 417236884Smm fname = strchr(propname, '@') + 1; 418236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 419236884Smm error = EINVAL; 420236884Smm break; 421236884Smm } 422236884Smm 423236884Smm has_feature = B_TRUE; 424236884Smm break; 425236884Smm 426185029Spjd case ZPOOL_PROP_VERSION: 427185029Spjd error = nvpair_value_uint64(elem, &intval); 428185029Spjd if (!error && 429236884Smm (intval < spa_version(spa) || 430236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 431236884Smm has_feature)) 432185029Spjd error = EINVAL; 433185029Spjd break; 434185029Spjd 435185029Spjd case ZPOOL_PROP_DELEGATION: 436185029Spjd case ZPOOL_PROP_AUTOREPLACE: 437185029Spjd case ZPOOL_PROP_LISTSNAPS: 438219089Spjd case ZPOOL_PROP_AUTOEXPAND: 439185029Spjd error = nvpair_value_uint64(elem, &intval); 440185029Spjd if (!error && intval > 1) 441185029Spjd error = EINVAL; 442185029Spjd break; 443185029Spjd 444185029Spjd case ZPOOL_PROP_BOOTFS: 445209962Smm /* 446209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 447209962Smm * or the pool is still being created (version == 0), 448209962Smm * the bootfs property cannot be set. 
449209962Smm */ 450185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 451185029Spjd error = ENOTSUP; 452185029Spjd break; 453185029Spjd } 454185029Spjd 455185029Spjd /* 456185029Spjd * Make sure the vdev config is bootable 457185029Spjd */ 458185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 459185029Spjd error = ENOTSUP; 460185029Spjd break; 461185029Spjd } 462185029Spjd 463185029Spjd reset_bootfs = 1; 464185029Spjd 465185029Spjd error = nvpair_value_string(elem, &strval); 466185029Spjd 467185029Spjd if (!error) { 468236884Smm objset_t *os; 469185029Spjd uint64_t compress; 470185029Spjd 471185029Spjd if (strval == NULL || strval[0] == '\0') { 472185029Spjd objnum = zpool_prop_default_numeric( 473185029Spjd ZPOOL_PROP_BOOTFS); 474185029Spjd break; 475185029Spjd } 476185029Spjd 477219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 478185029Spjd break; 479185029Spjd 480219089Spjd /* Must be ZPL and not gzip compressed. */ 481219089Spjd 482219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 483219089Spjd error = ENOTSUP; 484219089Spjd } else if ((error = dsl_prop_get_integer(strval, 485185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 486185029Spjd &compress, NULL)) == 0 && 487185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 488185029Spjd error = ENOTSUP; 489185029Spjd } else { 490185029Spjd objnum = dmu_objset_id(os); 491185029Spjd } 492219089Spjd dmu_objset_rele(os, FTAG); 493185029Spjd } 494185029Spjd break; 495185029Spjd 496185029Spjd case ZPOOL_PROP_FAILUREMODE: 497185029Spjd error = nvpair_value_uint64(elem, &intval); 498185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 499185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 500185029Spjd error = EINVAL; 501185029Spjd 502185029Spjd /* 503185029Spjd * This is a special case which only occurs when 504185029Spjd * the pool has completely failed. 
This allows 505185029Spjd * the user to change the in-core failmode property 506185029Spjd * without syncing it out to disk (I/Os might 507185029Spjd * currently be blocked). We do this by returning 508185029Spjd * EIO to the caller (spa_prop_set) to trick it 509185029Spjd * into thinking we encountered a property validation 510185029Spjd * error. 511185029Spjd */ 512185029Spjd if (!error && spa_suspended(spa)) { 513185029Spjd spa->spa_failmode = intval; 514185029Spjd error = EIO; 515185029Spjd } 516185029Spjd break; 517185029Spjd 518185029Spjd case ZPOOL_PROP_CACHEFILE: 519185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 520185029Spjd break; 521185029Spjd 522185029Spjd if (strval[0] == '\0') 523185029Spjd break; 524185029Spjd 525185029Spjd if (strcmp(strval, "none") == 0) 526185029Spjd break; 527185029Spjd 528185029Spjd if (strval[0] != '/') { 529185029Spjd error = EINVAL; 530185029Spjd break; 531185029Spjd } 532185029Spjd 533185029Spjd slash = strrchr(strval, '/'); 534185029Spjd ASSERT(slash != NULL); 535185029Spjd 536185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 537185029Spjd strcmp(slash, "/..") == 0) 538185029Spjd error = EINVAL; 539185029Spjd break; 540219089Spjd 541228103Smm case ZPOOL_PROP_COMMENT: 542228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 543228103Smm break; 544228103Smm for (check = strval; *check != '\0'; check++) { 545228103Smm /* 546228103Smm * The kernel doesn't have an easy isprint() 547228103Smm * check. For this kernel check, we merely 548228103Smm * check ASCII apart from DEL. Fix this if 549228103Smm * there is an easy-to-use kernel isprint(). 
550228103Smm */ 551228103Smm if (*check >= 0x7f) { 552228103Smm error = EINVAL; 553228103Smm break; 554228103Smm } 555228103Smm check++; 556228103Smm } 557228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 558228103Smm error = E2BIG; 559228103Smm break; 560228103Smm 561219089Spjd case ZPOOL_PROP_DEDUPDITTO: 562219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 563219089Spjd error = ENOTSUP; 564219089Spjd else 565219089Spjd error = nvpair_value_uint64(elem, &intval); 566219089Spjd if (error == 0 && 567219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 568219089Spjd error = EINVAL; 569219089Spjd break; 570185029Spjd } 571185029Spjd 572185029Spjd if (error) 573185029Spjd break; 574185029Spjd } 575185029Spjd 576185029Spjd if (!error && reset_bootfs) { 577185029Spjd error = nvlist_remove(props, 578185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 579185029Spjd 580185029Spjd if (!error) { 581185029Spjd error = nvlist_add_uint64(props, 582185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 583185029Spjd } 584185029Spjd } 585185029Spjd 586185029Spjd return (error); 587185029Spjd} 588185029Spjd 589209962Smmvoid 590209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 591209962Smm{ 592209962Smm char *cachefile; 593209962Smm spa_config_dirent_t *dp; 594209962Smm 595209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 596209962Smm &cachefile) != 0) 597209962Smm return; 598209962Smm 599209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 600209962Smm KM_SLEEP); 601209962Smm 602209962Smm if (cachefile[0] == '\0') 603209962Smm dp->scd_path = spa_strdup(spa_config_path); 604209962Smm else if (strcmp(cachefile, "none") == 0) 605209962Smm dp->scd_path = NULL; 606209962Smm else 607209962Smm dp->scd_path = spa_strdup(cachefile); 608209962Smm 609209962Smm list_insert_head(&spa->spa_config_list, dp); 610209962Smm if (need_sync) 611209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 612209962Smm} 
613209962Smm 614185029Spjdint 615185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 616185029Spjd{ 617185029Spjd int error; 618236884Smm nvpair_t *elem = NULL; 619209962Smm boolean_t need_sync = B_FALSE; 620185029Spjd 621185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 622185029Spjd return (error); 623185029Spjd 624209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 625236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 626209962Smm 627219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 628219089Spjd prop == ZPOOL_PROP_ALTROOT || 629219089Spjd prop == ZPOOL_PROP_READONLY) 630209962Smm continue; 631209962Smm 632236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 633236884Smm uint64_t ver; 634236884Smm 635236884Smm if (prop == ZPOOL_PROP_VERSION) { 636236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 637236884Smm } else { 638236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 639236884Smm ver = SPA_VERSION_FEATURES; 640236884Smm need_sync = B_TRUE; 641236884Smm } 642236884Smm 643236884Smm /* Save time if the version is already set. */ 644236884Smm if (ver == spa_version(spa)) 645236884Smm continue; 646236884Smm 647236884Smm /* 648236884Smm * In addition to the pool directory object, we might 649236884Smm * create the pool properties object, the features for 650236884Smm * read object, the features for write object, or the 651236884Smm * feature descriptions object. 
652236884Smm */ 653236884Smm error = dsl_sync_task_do(spa_get_dsl(spa), NULL, 654236884Smm spa_sync_version, spa, &ver, 6); 655236884Smm if (error) 656236884Smm return (error); 657236884Smm continue; 658236884Smm } 659236884Smm 660209962Smm need_sync = B_TRUE; 661209962Smm break; 662209962Smm } 663209962Smm 664236884Smm if (need_sync) { 665209962Smm return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 666236884Smm spa, nvp, 6)); 667236884Smm } 668236884Smm 669236884Smm return (0); 670185029Spjd} 671185029Spjd 672185029Spjd/* 673185029Spjd * If the bootfs property value is dsobj, clear it. 674185029Spjd */ 675185029Spjdvoid 676185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 677185029Spjd{ 678185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 679185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 680185029Spjd spa->spa_pool_props_object, 681185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 682185029Spjd spa->spa_bootfs = 0; 683185029Spjd } 684185029Spjd} 685185029Spjd 686185029Spjd/* 687228103Smm * Change the GUID for the pool. This is done so that we can later 688228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 689228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 690228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 691228103Smm * online when we do this, or else any vdevs that weren't present 692228103Smm * would be orphaned from our pool. We are also going to issue a 693228103Smm * sysevent to update any watchers. 
694228103Smm */ 695228103Smmint 696228103Smmspa_change_guid(spa_t *spa) 697228103Smm{ 698228103Smm uint64_t oldguid, newguid; 699228103Smm uint64_t txg; 700228103Smm 701228103Smm if (!(spa_mode_global & FWRITE)) 702228103Smm return (EROFS); 703228103Smm 704228103Smm txg = spa_vdev_enter(spa); 705228103Smm 706228103Smm if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY) 707228103Smm return (spa_vdev_exit(spa, NULL, txg, ENXIO)); 708228103Smm 709228103Smm oldguid = spa_guid(spa); 710228103Smm newguid = spa_generate_guid(NULL); 711228103Smm ASSERT3U(oldguid, !=, newguid); 712228103Smm 713228103Smm spa->spa_root_vdev->vdev_guid = newguid; 714228103Smm spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid); 715228103Smm 716228103Smm vdev_config_dirty(spa->spa_root_vdev); 717228103Smm 718228103Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 719228103Smm 720228103Smm return (spa_vdev_exit(spa, NULL, txg, 0)); 721228103Smm} 722228103Smm 723228103Smm/* 724185029Spjd * ========================================================================== 725168404Spjd * SPA state manipulation (open/create/destroy/import/export) 726168404Spjd * ========================================================================== 727168404Spjd */ 728168404Spjd 729168404Spjdstatic int 730168404Spjdspa_error_entry_compare(const void *a, const void *b) 731168404Spjd{ 732168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 733168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 734168404Spjd int ret; 735168404Spjd 736168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 737168404Spjd sizeof (zbookmark_t)); 738168404Spjd 739168404Spjd if (ret < 0) 740168404Spjd return (-1); 741168404Spjd else if (ret > 0) 742168404Spjd return (1); 743168404Spjd else 744168404Spjd return (0); 745168404Spjd} 746168404Spjd 747168404Spjd/* 748168404Spjd * Utility function which retrieves copies of the current logs and 749168404Spjd * re-initializes them in the process. 
750168404Spjd */ 751168404Spjdvoid 752168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 753168404Spjd{ 754168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 755168404Spjd 756168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 757168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 758168404Spjd 759168404Spjd avl_create(&spa->spa_errlist_scrub, 760168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 761168404Spjd offsetof(spa_error_entry_t, se_avl)); 762168404Spjd avl_create(&spa->spa_errlist_last, 763168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 764168404Spjd offsetof(spa_error_entry_t, se_avl)); 765168404Spjd} 766168404Spjd 767219089Spjdstatic taskq_t * 768219089Spjdspa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 769219089Spjd uint_t value) 770168404Spjd{ 771219089Spjd uint_t flags = TASKQ_PREPOPULATE; 772219089Spjd boolean_t batch = B_FALSE; 773168404Spjd 774219089Spjd switch (mode) { 775219089Spjd case zti_mode_null: 776219089Spjd return (NULL); /* no taskq needed */ 777168404Spjd 778219089Spjd case zti_mode_fixed: 779219089Spjd ASSERT3U(value, >=, 1); 780219089Spjd value = MAX(value, 1); 781219089Spjd break; 782168404Spjd 783219089Spjd case zti_mode_batch: 784219089Spjd batch = B_TRUE; 785219089Spjd flags |= TASKQ_THREADS_CPU_PCT; 786219089Spjd value = zio_taskq_batch_pct; 787219089Spjd break; 788219089Spjd 789219089Spjd case zti_mode_online_percent: 790219089Spjd flags |= TASKQ_THREADS_CPU_PCT; 791219089Spjd break; 792219089Spjd 793219089Spjd default: 794219089Spjd panic("unrecognized mode for %s taskq (%u:%u) in " 795219089Spjd "spa_activate()", 796219089Spjd name, mode, value); 797219089Spjd break; 798219089Spjd } 799219089Spjd 800219089Spjd#ifdef SYSDC 801219089Spjd if (zio_taskq_sysdc && spa->spa_proc != &p0) { 802219089Spjd if (batch) 803219089Spjd flags |= TASKQ_DC_BATCH; 804219089Spjd 805219089Spjd return (taskq_create_sysdc(name, value, 50, 
INT_MAX, 806219089Spjd spa->spa_proc, zio_taskq_basedc, flags)); 807219089Spjd } 808219089Spjd#endif 809219089Spjd return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 810219089Spjd spa->spa_proc, flags)); 811219089Spjd} 812219089Spjd 813219089Spjdstatic void 814219089Spjdspa_create_zio_taskqs(spa_t *spa) 815219089Spjd{ 816185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 817185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 818211931Smm const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 819211931Smm enum zti_modes mode = ztip->zti_mode; 820211931Smm uint_t value = ztip->zti_value; 821209962Smm char name[32]; 822209962Smm 823209962Smm (void) snprintf(name, sizeof (name), 824211931Smm "%s_%s", zio_type_name[t], zio_taskq_types[q]); 825209962Smm 826219089Spjd spa->spa_zio_taskq[t][q] = 827219089Spjd spa_taskq_create(spa, name, mode, value); 828219089Spjd } 829219089Spjd } 830219089Spjd} 831209962Smm 832219089Spjd#ifdef _KERNEL 833219089Spjd#ifdef SPA_PROCESS 834219089Spjdstatic void 835219089Spjdspa_thread(void *arg) 836219089Spjd{ 837219089Spjd callb_cpr_t cprinfo; 838209962Smm 839219089Spjd spa_t *spa = arg; 840219089Spjd user_t *pu = PTOU(curproc); 841209962Smm 842219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 843219089Spjd spa->spa_name); 844209962Smm 845219089Spjd ASSERT(curproc != &p0); 846219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 847219089Spjd "zpool-%s", spa->spa_name); 848219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 849211931Smm 850219089Spjd#ifdef PSRSET_BIND 851219089Spjd /* bind this thread to the requested psrset */ 852219089Spjd if (zio_taskq_psrset_bind != PS_NONE) { 853219089Spjd pool_lock(); 854219089Spjd mutex_enter(&cpu_lock); 855219089Spjd mutex_enter(&pidlock); 856219089Spjd mutex_enter(&curproc->p_lock); 857219089Spjd 858219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 859219089Spjd 0, NULL, NULL) == 0) { 860219089Spjd 
curthread->t_bind_pset = zio_taskq_psrset_bind; 861219089Spjd } else { 862219089Spjd cmn_err(CE_WARN, 863219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 864219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 865219089Spjd } 866219089Spjd 867219089Spjd mutex_exit(&curproc->p_lock); 868219089Spjd mutex_exit(&pidlock); 869219089Spjd mutex_exit(&cpu_lock); 870219089Spjd pool_unlock(); 871219089Spjd } 872219089Spjd#endif 873219089Spjd 874219089Spjd#ifdef SYSDC 875219089Spjd if (zio_taskq_sysdc) { 876219089Spjd sysdc_thread_enter(curthread, 100, 0); 877219089Spjd } 878219089Spjd#endif 879219089Spjd 880219089Spjd spa->spa_proc = curproc; 881219089Spjd spa->spa_did = curthread->t_did; 882219089Spjd 883219089Spjd spa_create_zio_taskqs(spa); 884219089Spjd 885219089Spjd mutex_enter(&spa->spa_proc_lock); 886219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 887219089Spjd 888219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 889219089Spjd cv_broadcast(&spa->spa_proc_cv); 890219089Spjd 891219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 892219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 893219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 894219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 895219089Spjd 896219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 897219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 898219089Spjd spa->spa_proc = &p0; 899219089Spjd cv_broadcast(&spa->spa_proc_cv); 900219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 901219089Spjd 902219089Spjd mutex_enter(&curproc->p_lock); 903219089Spjd lwp_exit(); 904219089Spjd} 905219089Spjd#endif /* SPA_PROCESS */ 906219089Spjd#endif 907219089Spjd 908219089Spjd/* 909219089Spjd * Activate an uninitialized pool. 
910219089Spjd */ 911219089Spjdstatic void 912219089Spjdspa_activate(spa_t *spa, int mode) 913219089Spjd{ 914219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 915219089Spjd 916219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 917219089Spjd spa->spa_mode = mode; 918219089Spjd 919219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 920219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 921219089Spjd 922219089Spjd /* Try to create a covering process */ 923219089Spjd mutex_enter(&spa->spa_proc_lock); 924219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 925219089Spjd ASSERT(spa->spa_proc == &p0); 926219089Spjd spa->spa_did = 0; 927219089Spjd 928219089Spjd#ifdef SPA_PROCESS 929219089Spjd /* Only create a process if we're going to be around a while. */ 930219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 931219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 932219089Spjd NULL, 0) == 0) { 933219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 934219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 935219089Spjd cv_wait(&spa->spa_proc_cv, 936219089Spjd &spa->spa_proc_lock); 937209962Smm } 938219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 939219089Spjd ASSERT(spa->spa_proc != &p0); 940219089Spjd ASSERT(spa->spa_did != 0); 941219089Spjd } else { 942219089Spjd#ifdef _KERNEL 943219089Spjd cmn_err(CE_WARN, 944219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 945219089Spjd spa->spa_name); 946219089Spjd#endif 947185029Spjd } 948168404Spjd } 949219089Spjd#endif /* SPA_PROCESS */ 950219089Spjd mutex_exit(&spa->spa_proc_lock); 951168404Spjd 952219089Spjd /* If we didn't create a process, we need to create our taskqs. 
*/ 953219089Spjd ASSERT(spa->spa_proc == &p0); 954219089Spjd if (spa->spa_proc == &p0) { 955219089Spjd spa_create_zio_taskqs(spa); 956219089Spjd } 957219089Spjd 958185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 959185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 960185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 961185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 962168404Spjd 963168404Spjd txg_list_create(&spa->spa_vdev_txg_list, 964168404Spjd offsetof(struct vdev, vdev_txg_node)); 965168404Spjd 966168404Spjd avl_create(&spa->spa_errlist_scrub, 967168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 968168404Spjd offsetof(spa_error_entry_t, se_avl)); 969168404Spjd avl_create(&spa->spa_errlist_last, 970168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 971168404Spjd offsetof(spa_error_entry_t, se_avl)); 972168404Spjd} 973168404Spjd 974168404Spjd/* 975168404Spjd * Opposite of spa_activate(). 976168404Spjd */ 977168404Spjdstatic void 978168404Spjdspa_deactivate(spa_t *spa) 979168404Spjd{ 980168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 981168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 982168404Spjd ASSERT(spa->spa_root_vdev == NULL); 983209962Smm ASSERT(spa->spa_async_zio_root == NULL); 984168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 985168404Spjd 986168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 987168404Spjd 988185029Spjd list_destroy(&spa->spa_config_dirty_list); 989185029Spjd list_destroy(&spa->spa_state_dirty_list); 990168404Spjd 991185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 992185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 993211931Smm if (spa->spa_zio_taskq[t][q] != NULL) 994211931Smm taskq_destroy(spa->spa_zio_taskq[t][q]); 995185029Spjd spa->spa_zio_taskq[t][q] = NULL; 996185029Spjd } 997168404Spjd } 998168404Spjd 999168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1000168404Spjd spa->spa_normal_class = NULL; 1001168404Spjd 
1002185029Spjd metaslab_class_destroy(spa->spa_log_class); 1003185029Spjd spa->spa_log_class = NULL; 1004185029Spjd 1005168404Spjd /* 1006168404Spjd * If this was part of an import or the open otherwise failed, we may 1007168404Spjd * still have errors left in the queues. Empty them just in case. 1008168404Spjd */ 1009168404Spjd spa_errlog_drain(spa); 1010168404Spjd 1011168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1012168404Spjd avl_destroy(&spa->spa_errlist_last); 1013168404Spjd 1014168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1015219089Spjd 1016219089Spjd mutex_enter(&spa->spa_proc_lock); 1017219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1018219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1019219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1020219089Spjd cv_broadcast(&spa->spa_proc_cv); 1021219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1022219089Spjd ASSERT(spa->spa_proc != &p0); 1023219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1024219089Spjd } 1025219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1026219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1027219089Spjd } 1028219089Spjd ASSERT(spa->spa_proc == &p0); 1029219089Spjd mutex_exit(&spa->spa_proc_lock); 1030219089Spjd 1031219089Spjd#ifdef SPA_PROCESS 1032219089Spjd /* 1033219089Spjd * We want to make sure spa_thread() has actually exited the ZFS 1034219089Spjd * module, so that the module can't be unloaded out from underneath 1035219089Spjd * it. 1036219089Spjd */ 1037219089Spjd if (spa->spa_did != 0) { 1038219089Spjd thread_join(spa->spa_did); 1039219089Spjd spa->spa_did = 0; 1040219089Spjd } 1041219089Spjd#endif /* SPA_PROCESS */ 1042168404Spjd} 1043168404Spjd 1044168404Spjd/* 1045168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1046168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1047168404Spjd * in the CLOSED state. 
This will prep the pool before open/creation/import. 1048168404Spjd * All vdev validation is done by the vdev_alloc() routine. 1049168404Spjd */ 1050168404Spjdstatic int 1051168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1052168404Spjd uint_t id, int atype) 1053168404Spjd{ 1054168404Spjd nvlist_t **child; 1055219089Spjd uint_t children; 1056168404Spjd int error; 1057168404Spjd 1058168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1059168404Spjd return (error); 1060168404Spjd 1061168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1062168404Spjd return (0); 1063168404Spjd 1064185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1065185029Spjd &child, &children); 1066185029Spjd 1067185029Spjd if (error == ENOENT) 1068185029Spjd return (0); 1069185029Spjd 1070185029Spjd if (error) { 1071168404Spjd vdev_free(*vdp); 1072168404Spjd *vdp = NULL; 1073168404Spjd return (EINVAL); 1074168404Spjd } 1075168404Spjd 1076219089Spjd for (int c = 0; c < children; c++) { 1077168404Spjd vdev_t *vd; 1078168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1079168404Spjd atype)) != 0) { 1080168404Spjd vdev_free(*vdp); 1081168404Spjd *vdp = NULL; 1082168404Spjd return (error); 1083168404Spjd } 1084168404Spjd } 1085168404Spjd 1086168404Spjd ASSERT(*vdp != NULL); 1087168404Spjd 1088168404Spjd return (0); 1089168404Spjd} 1090168404Spjd 1091168404Spjd/* 1092168404Spjd * Opposite of spa_load(). 1093168404Spjd */ 1094168404Spjdstatic void 1095168404Spjdspa_unload(spa_t *spa) 1096168404Spjd{ 1097168404Spjd int i; 1098168404Spjd 1099185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1100185029Spjd 1101168404Spjd /* 1102168404Spjd * Stop async tasks. 1103168404Spjd */ 1104168404Spjd spa_async_suspend(spa); 1105168404Spjd 1106168404Spjd /* 1107168404Spjd * Stop syncing. 
1108168404Spjd */ 1109168404Spjd if (spa->spa_sync_on) { 1110168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1111168404Spjd spa->spa_sync_on = B_FALSE; 1112168404Spjd } 1113168404Spjd 1114168404Spjd /* 1115185029Spjd * Wait for any outstanding async I/O to complete. 1116168404Spjd */ 1117209962Smm if (spa->spa_async_zio_root != NULL) { 1118209962Smm (void) zio_wait(spa->spa_async_zio_root); 1119209962Smm spa->spa_async_zio_root = NULL; 1120209962Smm } 1121168404Spjd 1122219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1123219089Spjd 1124168404Spjd /* 1125168404Spjd * Close the dsl pool. 1126168404Spjd */ 1127168404Spjd if (spa->spa_dsl_pool) { 1128168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1129168404Spjd spa->spa_dsl_pool = NULL; 1130219089Spjd spa->spa_meta_objset = NULL; 1131168404Spjd } 1132168404Spjd 1133219089Spjd ddt_unload(spa); 1134219089Spjd 1135209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1136209962Smm 1137168404Spjd /* 1138209962Smm * Drop and purge level 2 cache 1139209962Smm */ 1140209962Smm spa_l2cache_drop(spa); 1141209962Smm 1142209962Smm /* 1143168404Spjd * Close all vdevs. 
1144168404Spjd */ 1145168404Spjd if (spa->spa_root_vdev) 1146168404Spjd vdev_free(spa->spa_root_vdev); 1147168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1148168404Spjd 1149185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1150185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1151185029Spjd if (spa->spa_spares.sav_vdevs) { 1152185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1153185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1154185029Spjd spa->spa_spares.sav_vdevs = NULL; 1155168404Spjd } 1156185029Spjd if (spa->spa_spares.sav_config) { 1157185029Spjd nvlist_free(spa->spa_spares.sav_config); 1158185029Spjd spa->spa_spares.sav_config = NULL; 1159168404Spjd } 1160185029Spjd spa->spa_spares.sav_count = 0; 1161168404Spjd 1162230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1163230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1164185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1165230514Smm } 1166185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1167185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1168185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1169185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1170185029Spjd } 1171185029Spjd if (spa->spa_l2cache.sav_config) { 1172185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 1173185029Spjd spa->spa_l2cache.sav_config = NULL; 1174185029Spjd } 1175185029Spjd spa->spa_l2cache.sav_count = 0; 1176185029Spjd 1177168404Spjd spa->spa_async_suspended = 0; 1178209962Smm 1179228103Smm if (spa->spa_comment != NULL) { 1180228103Smm spa_strfree(spa->spa_comment); 1181228103Smm spa->spa_comment = NULL; 1182228103Smm } 1183228103Smm 1184209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 1185168404Spjd} 1186168404Spjd 1187168404Spjd/* 1188168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1189168404Spjd * this pool. When this is called, we have some form of basic information in 1190185029Spjd * 'spa_spares.sav_config'. 
We parse this into vdevs, try to open them, and 1191185029Spjd * then re-generate a more complete list including status information. 1192168404Spjd */ 1193168404Spjdstatic void 1194168404Spjdspa_load_spares(spa_t *spa) 1195168404Spjd{ 1196168404Spjd nvlist_t **spares; 1197168404Spjd uint_t nspares; 1198168404Spjd int i; 1199168404Spjd vdev_t *vd, *tvd; 1200168404Spjd 1201185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1202185029Spjd 1203168404Spjd /* 1204168404Spjd * First, close and free any existing spare vdevs. 1205168404Spjd */ 1206185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1207185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1208168404Spjd 1209168404Spjd /* Undo the call to spa_activate() below */ 1210185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1211185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1212168404Spjd spa_spare_remove(tvd); 1213168404Spjd vdev_close(vd); 1214168404Spjd vdev_free(vd); 1215168404Spjd } 1216168404Spjd 1217185029Spjd if (spa->spa_spares.sav_vdevs) 1218185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1219185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1220168404Spjd 1221185029Spjd if (spa->spa_spares.sav_config == NULL) 1222168404Spjd nspares = 0; 1223168404Spjd else 1224185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1225168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1226168404Spjd 1227185029Spjd spa->spa_spares.sav_count = (int)nspares; 1228185029Spjd spa->spa_spares.sav_vdevs = NULL; 1229168404Spjd 1230168404Spjd if (nspares == 0) 1231168404Spjd return; 1232168404Spjd 1233168404Spjd /* 1234168404Spjd * Construct the array of vdevs, opening them to get status in the 1235168404Spjd * process. 
For each spare, there is potentially two different vdev_t 1236168404Spjd * structures associated with it: one in the list of spares (used only 1237168404Spjd * for basic validation purposes) and one in the active vdev 1238168404Spjd * configuration (if it's spared in). During this phase we open and 1239168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1240168404Spjd * active configuration, then we also mark this vdev as an active spare. 1241168404Spjd */ 1242185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1243185029Spjd KM_SLEEP); 1244185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1245168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1246168404Spjd VDEV_ALLOC_SPARE) == 0); 1247168404Spjd ASSERT(vd != NULL); 1248168404Spjd 1249185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1250168404Spjd 1251185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1252185029Spjd B_FALSE)) != NULL) { 1253168404Spjd if (!tvd->vdev_isspare) 1254168404Spjd spa_spare_add(tvd); 1255168404Spjd 1256168404Spjd /* 1257168404Spjd * We only mark the spare active if we were successfully 1258168404Spjd * able to load the vdev. Otherwise, importing a pool 1259168404Spjd * with a bad active spare would result in strange 1260168404Spjd * behavior, because multiple pool would think the spare 1261168404Spjd * is actively in use. 1262168404Spjd * 1263168404Spjd * There is a vulnerability here to an equally bizarre 1264168404Spjd * circumstance, where a dead active spare is later 1265168404Spjd * brought back to life (onlined or otherwise). Given 1266168404Spjd * the rarity of this scenario, and the extra complexity 1267168404Spjd * it adds, we ignore the possibility. 
1268168404Spjd */ 1269168404Spjd if (!vdev_is_dead(tvd)) 1270168404Spjd spa_spare_activate(tvd); 1271168404Spjd } 1272168404Spjd 1273185029Spjd vd->vdev_top = vd; 1274209962Smm vd->vdev_aux = &spa->spa_spares; 1275185029Spjd 1276168404Spjd if (vdev_open(vd) != 0) 1277168404Spjd continue; 1278168404Spjd 1279185029Spjd if (vdev_validate_aux(vd) == 0) 1280185029Spjd spa_spare_add(vd); 1281168404Spjd } 1282168404Spjd 1283168404Spjd /* 1284168404Spjd * Recompute the stashed list of spares, with status information 1285168404Spjd * this time. 1286168404Spjd */ 1287185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1288168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1289168404Spjd 1290185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1291185029Spjd KM_SLEEP); 1292185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1293185029Spjd spares[i] = vdev_config_generate(spa, 1294219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1295185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1296185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1297185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1298168404Spjd nvlist_free(spares[i]); 1299185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1300168404Spjd} 1301168404Spjd 1302185029Spjd/* 1303185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1304185029Spjd * this pool. When this is called, we have some form of basic information in 1305185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1306185029Spjd * then re-generate a more complete list including status information. 1307185029Spjd * Devices which are already active have their details maintained, and are 1308185029Spjd * not re-opened. 
1309185029Spjd */ 1310185029Spjdstatic void 1311185029Spjdspa_load_l2cache(spa_t *spa) 1312185029Spjd{ 1313185029Spjd nvlist_t **l2cache; 1314185029Spjd uint_t nl2cache; 1315185029Spjd int i, j, oldnvdevs; 1316219089Spjd uint64_t guid; 1317185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1318185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1319185029Spjd 1320185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1321185029Spjd 1322185029Spjd if (sav->sav_config != NULL) { 1323185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1324185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1325185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1326185029Spjd } else { 1327185029Spjd nl2cache = 0; 1328185029Spjd } 1329185029Spjd 1330185029Spjd oldvdevs = sav->sav_vdevs; 1331185029Spjd oldnvdevs = sav->sav_count; 1332185029Spjd sav->sav_vdevs = NULL; 1333185029Spjd sav->sav_count = 0; 1334185029Spjd 1335185029Spjd /* 1336185029Spjd * Process new nvlist of vdevs. 1337185029Spjd */ 1338185029Spjd for (i = 0; i < nl2cache; i++) { 1339185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1340185029Spjd &guid) == 0); 1341185029Spjd 1342185029Spjd newvdevs[i] = NULL; 1343185029Spjd for (j = 0; j < oldnvdevs; j++) { 1344185029Spjd vd = oldvdevs[j]; 1345185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1346185029Spjd /* 1347185029Spjd * Retain previous vdev for add/remove ops. 
1348185029Spjd */ 1349185029Spjd newvdevs[i] = vd; 1350185029Spjd oldvdevs[j] = NULL; 1351185029Spjd break; 1352185029Spjd } 1353185029Spjd } 1354185029Spjd 1355185029Spjd if (newvdevs[i] == NULL) { 1356185029Spjd /* 1357185029Spjd * Create new vdev 1358185029Spjd */ 1359185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1360185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1361185029Spjd ASSERT(vd != NULL); 1362185029Spjd newvdevs[i] = vd; 1363185029Spjd 1364185029Spjd /* 1365185029Spjd * Commit this vdev as an l2cache device, 1366185029Spjd * even if it fails to open. 1367185029Spjd */ 1368185029Spjd spa_l2cache_add(vd); 1369185029Spjd 1370185029Spjd vd->vdev_top = vd; 1371185029Spjd vd->vdev_aux = sav; 1372185029Spjd 1373185029Spjd spa_l2cache_activate(vd); 1374185029Spjd 1375185029Spjd if (vdev_open(vd) != 0) 1376185029Spjd continue; 1377185029Spjd 1378185029Spjd (void) vdev_validate_aux(vd); 1379185029Spjd 1380219089Spjd if (!vdev_is_dead(vd)) 1381219089Spjd l2arc_add_vdev(spa, vd); 1382185029Spjd } 1383185029Spjd } 1384185029Spjd 1385185029Spjd /* 1386185029Spjd * Purge vdevs that were dropped 1387185029Spjd */ 1388185029Spjd for (i = 0; i < oldnvdevs; i++) { 1389185029Spjd uint64_t pool; 1390185029Spjd 1391185029Spjd vd = oldvdevs[i]; 1392185029Spjd if (vd != NULL) { 1393230514Smm ASSERT(vd->vdev_isl2cache); 1394230514Smm 1395209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1396209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1397185029Spjd l2arc_remove_vdev(vd); 1398230514Smm vdev_clear_stats(vd); 1399230514Smm vdev_free(vd); 1400185029Spjd } 1401185029Spjd } 1402185029Spjd 1403185029Spjd if (oldvdevs) 1404185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1405185029Spjd 1406185029Spjd if (sav->sav_config == NULL) 1407185029Spjd goto out; 1408185029Spjd 1409185029Spjd sav->sav_vdevs = newvdevs; 1410185029Spjd sav->sav_count = (int)nl2cache; 1411185029Spjd 1412185029Spjd /* 1413185029Spjd * Recompute the stashed list of l2cache 
devices, with status 1414185029Spjd * information this time. 1415185029Spjd */ 1416185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1417185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1418185029Spjd 1419185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1420185029Spjd for (i = 0; i < sav->sav_count; i++) 1421185029Spjd l2cache[i] = vdev_config_generate(spa, 1422219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1423185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1424185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1425185029Spjdout: 1426185029Spjd for (i = 0; i < sav->sav_count; i++) 1427185029Spjd nvlist_free(l2cache[i]); 1428185029Spjd if (sav->sav_count) 1429185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1430185029Spjd} 1431185029Spjd 1432168404Spjdstatic int 1433168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1434168404Spjd{ 1435168404Spjd dmu_buf_t *db; 1436168404Spjd char *packed = NULL; 1437168404Spjd size_t nvsize = 0; 1438168404Spjd int error; 1439168404Spjd *value = NULL; 1440168404Spjd 1441168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1442168404Spjd nvsize = *(uint64_t *)db->db_data; 1443168404Spjd dmu_buf_rele(db, FTAG); 1444168404Spjd 1445168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1446209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1447209962Smm DMU_READ_PREFETCH); 1448168404Spjd if (error == 0) 1449168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1450168404Spjd kmem_free(packed, nvsize); 1451168404Spjd 1452168404Spjd return (error); 1453168404Spjd} 1454168404Spjd 1455168404Spjd/* 1456185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1457185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
1458185029Spjd */ 1459185029Spjdstatic void 1460185029Spjdspa_check_removed(vdev_t *vd) 1461185029Spjd{ 1462219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1463185029Spjd spa_check_removed(vd->vdev_child[c]); 1464185029Spjd 1465185029Spjd if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1466185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1467185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1468185029Spjd } 1469185029Spjd} 1470185029Spjd 1471185029Spjd/* 1472219089Spjd * Validate the current config against the MOS config 1473213197Smm */ 1474219089Spjdstatic boolean_t 1475219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1476213197Smm{ 1477219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1478219089Spjd nvlist_t *nv; 1479213197Smm 1480219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1481213197Smm 1482219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1483219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1484219089Spjd 1485219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1486219089Spjd 1487219089Spjd /* 1488219089Spjd * If we're doing a normal import, then build up any additional 1489219089Spjd * diagnostic information about missing devices in this config. 1490219089Spjd * We'll pass this up to the user for further processing. 
1491219089Spjd */ 1492219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1493219089Spjd nvlist_t **child, *nv; 1494219089Spjd uint64_t idx = 0; 1495219089Spjd 1496219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1497219089Spjd KM_SLEEP); 1498219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1499219089Spjd 1500219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1501219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1502219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1503219089Spjd 1504219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1505219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1506219089Spjd mtvd->vdev_islog) 1507219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1508219089Spjd B_FALSE, 0); 1509219089Spjd } 1510219089Spjd 1511219089Spjd if (idx) { 1512219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1513219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1514219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1515219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1516219089Spjd 1517219089Spjd for (int i = 0; i < idx; i++) 1518219089Spjd nvlist_free(child[i]); 1519219089Spjd } 1520219089Spjd nvlist_free(nv); 1521219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1522219089Spjd } 1523219089Spjd 1524219089Spjd /* 1525219089Spjd * Compare the root vdev tree with the information we have 1526219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1527219089Spjd * with the corresponding MOS config top-level (mtvd). 1528219089Spjd */ 1529219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1530213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1531219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1532213197Smm 1533219089Spjd /* 1534219089Spjd * Resolve any "missing" vdevs in the current configuration. 1535219089Spjd * If we find that the MOS config has more accurate information 1536219089Spjd * about the top-level vdev then use that vdev instead. 
1537219089Spjd */ 1538219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1539219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1540219089Spjd 1541219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1542219089Spjd continue; 1543219089Spjd 1544219089Spjd /* 1545219089Spjd * Device specific actions. 1546219089Spjd */ 1547219089Spjd if (mtvd->vdev_islog) { 1548219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1549219089Spjd } else { 1550219089Spjd /* 1551219089Spjd * XXX - once we have 'readonly' pool 1552219089Spjd * support we should be able to handle 1553219089Spjd * missing data devices by transitioning 1554219089Spjd * the pool to readonly. 1555219089Spjd */ 1556219089Spjd continue; 1557219089Spjd } 1558219089Spjd 1559219089Spjd /* 1560219089Spjd * Swap the missing vdev with the data we were 1561219089Spjd * able to obtain from the MOS config. 1562219089Spjd */ 1563219089Spjd vdev_remove_child(rvd, tvd); 1564219089Spjd vdev_remove_child(mrvd, mtvd); 1565219089Spjd 1566219089Spjd vdev_add_child(rvd, mtvd); 1567219089Spjd vdev_add_child(mrvd, tvd); 1568219089Spjd 1569219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1570219089Spjd vdev_load(mtvd); 1571219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1572219089Spjd 1573219089Spjd vdev_reopen(rvd); 1574219089Spjd } else if (mtvd->vdev_islog) { 1575219089Spjd /* 1576219089Spjd * Load the slog device's state from the MOS config 1577219089Spjd * since it's possible that the label does not 1578219089Spjd * contain the most up-to-date information. 1579219089Spjd */ 1580219089Spjd vdev_load_log_state(tvd, mtvd); 1581219089Spjd vdev_reopen(tvd); 1582219089Spjd } 1583213197Smm } 1584219089Spjd vdev_free(mrvd); 1585219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1586219089Spjd 1587219089Spjd /* 1588219089Spjd * Ensure we were able to validate the config. 
1589219089Spjd */ 1590219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1591213197Smm} 1592213197Smm 1593213197Smm/* 1594185029Spjd * Check for missing log devices 1595185029Spjd */ 1596219089Spjdstatic int 1597185029Spjdspa_check_logs(spa_t *spa) 1598185029Spjd{ 1599185029Spjd switch (spa->spa_log_state) { 1600185029Spjd case SPA_LOG_MISSING: 1601185029Spjd /* need to recheck in case slog has been restored */ 1602185029Spjd case SPA_LOG_UNKNOWN: 1603185029Spjd if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1604185029Spjd DS_FIND_CHILDREN)) { 1605219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1606185029Spjd return (1); 1607185029Spjd } 1608185029Spjd break; 1609185029Spjd } 1610185029Spjd return (0); 1611185029Spjd} 1612185029Spjd 1613219089Spjdstatic boolean_t 1614219089Spjdspa_passivate_log(spa_t *spa) 1615219089Spjd{ 1616219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1617219089Spjd boolean_t slog_found = B_FALSE; 1618219089Spjd 1619219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1620219089Spjd 1621219089Spjd if (!spa_has_slogs(spa)) 1622219089Spjd return (B_FALSE); 1623219089Spjd 1624219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1625219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1626219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1627219089Spjd 1628219089Spjd if (tvd->vdev_islog) { 1629219089Spjd metaslab_group_passivate(mg); 1630219089Spjd slog_found = B_TRUE; 1631219089Spjd } 1632219089Spjd } 1633219089Spjd 1634219089Spjd return (slog_found); 1635219089Spjd} 1636219089Spjd 1637219089Spjdstatic void 1638219089Spjdspa_activate_log(spa_t *spa) 1639219089Spjd{ 1640219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1641219089Spjd 1642219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1643219089Spjd 1644219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1645219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1646219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1647219089Spjd 1648219089Spjd if 
(tvd->vdev_islog) 1649219089Spjd metaslab_group_activate(mg); 1650219089Spjd } 1651219089Spjd} 1652219089Spjd 1653219089Spjdint 1654219089Spjdspa_offline_log(spa_t *spa) 1655219089Spjd{ 1656219089Spjd int error = 0; 1657219089Spjd 1658219089Spjd if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1659219089Spjd NULL, DS_FIND_CHILDREN)) == 0) { 1660219089Spjd 1661219089Spjd /* 1662219089Spjd * We successfully offlined the log device, sync out the 1663219089Spjd * current txg so that the "stubby" block can be removed 1664219089Spjd * by zil_sync(). 1665219089Spjd */ 1666219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1667219089Spjd } 1668219089Spjd return (error); 1669219089Spjd} 1670219089Spjd 1671219089Spjdstatic void 1672219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1673219089Spjd{ 1674219089Spjd int i; 1675219089Spjd 1676219089Spjd for (i = 0; i < sav->sav_count; i++) 1677219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1678219089Spjd} 1679219089Spjd 1680219089Spjdvoid 1681219089Spjdspa_claim_notify(zio_t *zio) 1682219089Spjd{ 1683219089Spjd spa_t *spa = zio->io_spa; 1684219089Spjd 1685219089Spjd if (zio->io_error) 1686219089Spjd return; 1687219089Spjd 1688219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1689219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1690219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1691219089Spjd mutex_exit(&spa->spa_props_lock); 1692219089Spjd} 1693219089Spjd 1694219089Spjdtypedef struct spa_load_error { 1695219089Spjd uint64_t sle_meta_count; 1696219089Spjd uint64_t sle_data_count; 1697219089Spjd} spa_load_error_t; 1698219089Spjd 1699219089Spjdstatic void 1700219089Spjdspa_load_verify_done(zio_t *zio) 1701219089Spjd{ 1702219089Spjd blkptr_t *bp = zio->io_bp; 1703219089Spjd spa_load_error_t *sle = zio->io_private; 1704219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1705219089Spjd int error = zio->io_error; 1706219089Spjd 1707219089Spjd if (error) { 1708236884Smm if 
((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1709219089Spjd type != DMU_OT_INTENT_LOG) 1710219089Spjd atomic_add_64(&sle->sle_meta_count, 1); 1711219089Spjd else 1712219089Spjd atomic_add_64(&sle->sle_data_count, 1); 1713219089Spjd } 1714219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1715219089Spjd} 1716219089Spjd 1717219089Spjd/*ARGSUSED*/ 1718219089Spjdstatic int 1719219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1720219089Spjd arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1721219089Spjd{ 1722219089Spjd if (bp != NULL) { 1723219089Spjd zio_t *rio = arg; 1724219089Spjd size_t size = BP_GET_PSIZE(bp); 1725219089Spjd void *data = zio_data_buf_alloc(size); 1726219089Spjd 1727219089Spjd zio_nowait(zio_read(rio, spa, bp, data, size, 1728219089Spjd spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1729219089Spjd ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1730219089Spjd ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1731219089Spjd } 1732219089Spjd return (0); 1733219089Spjd} 1734219089Spjd 1735219089Spjdstatic int 1736219089Spjdspa_load_verify(spa_t *spa) 1737219089Spjd{ 1738219089Spjd zio_t *rio; 1739219089Spjd spa_load_error_t sle = { 0 }; 1740219089Spjd zpool_rewind_policy_t policy; 1741219089Spjd boolean_t verify_ok = B_FALSE; 1742219089Spjd int error; 1743219089Spjd 1744219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1745219089Spjd 1746219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1747219089Spjd return (0); 1748219089Spjd 1749219089Spjd rio = zio_root(spa, NULL, &sle, 1750219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1751219089Spjd 1752219089Spjd error = traverse_pool(spa, spa->spa_verify_min_txg, 1753219089Spjd TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1754219089Spjd 1755219089Spjd (void) zio_wait(rio); 1756219089Spjd 1757219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1758219089Spjd spa->spa_load_data_errors = 
sle.sle_data_count; 1759219089Spjd 1760219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1761219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 1762219089Spjd int64_t loss = 0; 1763219089Spjd 1764219089Spjd verify_ok = B_TRUE; 1765219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1766219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1767219089Spjd 1768219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1769219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1770219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1771219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1772219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1773219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1774219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1775219089Spjd } else { 1776219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1777219089Spjd } 1778219089Spjd 1779219089Spjd if (error) { 1780219089Spjd if (error != ENXIO && error != EIO) 1781219089Spjd error = EIO; 1782219089Spjd return (error); 1783219089Spjd } 1784219089Spjd 1785219089Spjd return (verify_ok ? 0 : EIO); 1786219089Spjd} 1787219089Spjd 1788185029Spjd/* 1789219089Spjd * Find a value in the pool props object. 1790168404Spjd */ 1791219089Spjdstatic void 1792219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1793219089Spjd{ 1794219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1795219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1796219089Spjd} 1797219089Spjd 1798219089Spjd/* 1799219089Spjd * Find a value in the pool directory object. 
1800219089Spjd */ 1801168404Spjdstatic int 1802219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1803168404Spjd{ 1804219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1805219089Spjd name, sizeof (uint64_t), 1, val)); 1806219089Spjd} 1807168404Spjd 1808219089Spjdstatic int 1809219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1810219089Spjd{ 1811219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1812219089Spjd return (err); 1813219089Spjd} 1814219089Spjd 1815219089Spjd/* 1816219089Spjd * Fix up config after a partly-completed split. This is done with the 1817219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1818219089Spjd * pool have that entry in their config, but only the splitting one contains 1819219089Spjd * a list of all the guids of the vdevs that are being split off. 1820219089Spjd * 1821219089Spjd * This function determines what to do with that list: either rejoin 1822219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1823219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1824219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1825219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1826219089Spjd * then we call vdev_split() on each disk, and complete the split. 1827219089Spjd * 1828219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1829219089Spjd * the original pool. 
1830219089Spjd */ 1831219089Spjdstatic void 1832219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1833219089Spjd{ 1834219089Spjd uint_t extracted; 1835219089Spjd uint64_t *glist; 1836219089Spjd uint_t i, gcount; 1837219089Spjd nvlist_t *nvl; 1838219089Spjd vdev_t **vd; 1839219089Spjd boolean_t attempt_reopen; 1840219089Spjd 1841219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1842219089Spjd return; 1843219089Spjd 1844219089Spjd /* check that the config is complete */ 1845219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1846219089Spjd &glist, &gcount) != 0) 1847219089Spjd return; 1848219089Spjd 1849219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1850219089Spjd 1851219089Spjd /* attempt to online all the vdevs & validate */ 1852219089Spjd attempt_reopen = B_TRUE; 1853219089Spjd for (i = 0; i < gcount; i++) { 1854219089Spjd if (glist[i] == 0) /* vdev is hole */ 1855219089Spjd continue; 1856219089Spjd 1857219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1858219089Spjd if (vd[i] == NULL) { 1859219089Spjd /* 1860219089Spjd * Don't bother attempting to reopen the disks; 1861219089Spjd * just do the split. 
1862219089Spjd */ 1863219089Spjd attempt_reopen = B_FALSE; 1864219089Spjd } else { 1865219089Spjd /* attempt to re-online it */ 1866219089Spjd vd[i]->vdev_offline = B_FALSE; 1867219089Spjd } 1868219089Spjd } 1869219089Spjd 1870219089Spjd if (attempt_reopen) { 1871219089Spjd vdev_reopen(spa->spa_root_vdev); 1872219089Spjd 1873219089Spjd /* check each device to see what state it's in */ 1874219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 1875219089Spjd if (vd[i] != NULL && 1876219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1877219089Spjd break; 1878219089Spjd ++extracted; 1879219089Spjd } 1880219089Spjd } 1881219089Spjd 1882209962Smm /* 1883219089Spjd * If every disk has been moved to the new pool, or if we never 1884219089Spjd * even attempted to look at them, then we split them off for 1885219089Spjd * good. 1886209962Smm */ 1887219089Spjd if (!attempt_reopen || gcount == extracted) { 1888219089Spjd for (i = 0; i < gcount; i++) 1889219089Spjd if (vd[i] != NULL) 1890219089Spjd vdev_split(vd[i]); 1891219089Spjd vdev_reopen(spa->spa_root_vdev); 1892219089Spjd } 1893209962Smm 1894219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 1895219089Spjd} 1896185029Spjd 1897219089Spjdstatic int 1898219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1899219089Spjd boolean_t mosconfig) 1900219089Spjd{ 1901219089Spjd nvlist_t *config = spa->spa_config; 1902219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 1903228103Smm char *comment; 1904219089Spjd int error; 1905219089Spjd uint64_t pool_guid; 1906219089Spjd nvlist_t *nvl; 1907168404Spjd 1908219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1909219089Spjd return (EINVAL); 1910168404Spjd 1911228103Smm ASSERT(spa->spa_comment == NULL); 1912228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1913228103Smm spa->spa_comment = spa_strdup(comment); 1914228103Smm 1915168404Spjd /* 1916168404Spjd * Versioning wasn't explicitly added 
to the label until later, so if 1917168404Spjd * it's not present treat it as the initial version. 1918168404Spjd */ 1919219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1920219089Spjd &spa->spa_ubsync.ub_version) != 0) 1921219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1922168404Spjd 1923168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1924168404Spjd &spa->spa_config_txg); 1925168404Spjd 1926168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1927168404Spjd spa_guid_exists(pool_guid, 0)) { 1928168404Spjd error = EEXIST; 1929219089Spjd } else { 1930228103Smm spa->spa_config_guid = pool_guid; 1931219089Spjd 1932219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1933219089Spjd &nvl) == 0) { 1934219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1935219089Spjd KM_SLEEP) == 0); 1936219089Spjd } 1937219089Spjd 1938236884Smm nvlist_free(spa->spa_load_info); 1939236884Smm spa->spa_load_info = fnvlist_alloc(); 1940236884Smm 1941219089Spjd gethrestime(&spa->spa_loaded_ts); 1942219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 1943219089Spjd mosconfig, &ereport); 1944168404Spjd } 1945168404Spjd 1946219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 1947219089Spjd if (error) { 1948219089Spjd if (error != EEXIST) { 1949219089Spjd spa->spa_loaded_ts.tv_sec = 0; 1950219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 1951219089Spjd } 1952219089Spjd if (error != EBADF) { 1953219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1954219089Spjd } 1955219089Spjd } 1956219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1957219089Spjd spa->spa_ena = 0; 1958168404Spjd 1959219089Spjd return (error); 1960219089Spjd} 1961219089Spjd 1962219089Spjd/* 1963219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 1964219089Spjd * source of configuration information. 
1965219089Spjd */ 1966219089Spjdstatic int 1967219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1968219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1969219089Spjd char **ereport) 1970219089Spjd{ 1971219089Spjd int error = 0; 1972219089Spjd nvlist_t *nvroot = NULL; 1973236884Smm nvlist_t *label; 1974219089Spjd vdev_t *rvd; 1975219089Spjd uberblock_t *ub = &spa->spa_uberblock; 1976219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 1977219089Spjd int orig_mode = spa->spa_mode; 1978219089Spjd int parse; 1979219089Spjd uint64_t obj; 1980236884Smm boolean_t missing_feat_write = B_FALSE; 1981219089Spjd 1982168404Spjd /* 1983219089Spjd * If this is an untrusted config, access the pool in read-only mode. 1984219089Spjd * This prevents things like resilvering recently removed devices. 1985219089Spjd */ 1986219089Spjd if (!mosconfig) 1987219089Spjd spa->spa_mode = FREAD; 1988219089Spjd 1989219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1990219089Spjd 1991219089Spjd spa->spa_load_state = state; 1992219089Spjd 1993219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1994219089Spjd return (EINVAL); 1995219089Spjd 1996219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 1997219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1998219089Spjd 1999219089Spjd /* 2000209962Smm * Create "The Godfather" zio to hold all async IOs 2001209962Smm */ 2002209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2003209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2004209962Smm 2005209962Smm /* 2006168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2007168404Spjd * value that will be returned by spa_version() since parsing the 2008168404Spjd * configuration requires knowing the version number. 
2009168404Spjd */ 2010185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2011219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2012185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2013168404Spjd 2014168404Spjd if (error != 0) 2015219089Spjd return (error); 2016168404Spjd 2017168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2018168404Spjd 2019219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2020219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2021219089Spjd } 2022219089Spjd 2023168404Spjd /* 2024168404Spjd * Try to open all vdevs, loading each label in the process. 2025168404Spjd */ 2026185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2027168926Spjd error = vdev_open(rvd); 2028185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2029168926Spjd if (error != 0) 2030219089Spjd return (error); 2031168404Spjd 2032168404Spjd /* 2033209962Smm * We need to validate the vdev labels against the configuration that 2034209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2035209962Smm * mosconfig is true then we're validating the vdev labels based on 2036219089Spjd * that config. Otherwise, we're validating against the cached config 2037209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2038209962Smm * later we will recursively call spa_load() and validate against 2039209962Smm * the vdev config. 2040219089Spjd * 2041219089Spjd * If we're assembling a new pool that's been split off from an 2042219089Spjd * existing pool, the labels haven't yet been updated so we skip 2043219089Spjd * validation for now. 
2044168404Spjd */ 2045219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2046219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2047230514Smm error = vdev_validate(rvd, mosconfig); 2048219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2049168404Spjd 2050219089Spjd if (error != 0) 2051219089Spjd return (error); 2052219089Spjd 2053219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2054219089Spjd return (ENXIO); 2055168404Spjd } 2056168404Spjd 2057168404Spjd /* 2058168404Spjd * Find the best uberblock. 2059168404Spjd */ 2060236884Smm vdev_uberblock_load(rvd, ub, &label); 2061168404Spjd 2062168404Spjd /* 2063168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2064168404Spjd */ 2065236884Smm if (ub->ub_txg == 0) { 2066236884Smm nvlist_free(label); 2067219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2068236884Smm } 2069168404Spjd 2070168404Spjd /* 2071236884Smm * If the pool has an unsupported version we can't open it. 2072168404Spjd */ 2073236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2074236884Smm nvlist_free(label); 2075219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2076236884Smm } 2077168404Spjd 2078236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2079236884Smm nvlist_t *features; 2080236884Smm 2081236884Smm /* 2082236884Smm * If we weren't able to find what's necessary for reading the 2083236884Smm * MOS in the label, return failure. 2084236884Smm */ 2085236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2086236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2087236884Smm nvlist_free(label); 2088236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2089236884Smm ENXIO)); 2090236884Smm } 2091236884Smm 2092236884Smm /* 2093236884Smm * Update our in-core representation with the definitive values 2094236884Smm * from the label. 
2095236884Smm */ 2096236884Smm nvlist_free(spa->spa_label_features); 2097236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2098236884Smm } 2099236884Smm 2100236884Smm nvlist_free(label); 2101236884Smm 2102168404Spjd /* 2103236884Smm * Look through entries in the label nvlist's features_for_read. If 2104236884Smm * there is a feature listed there which we don't understand then we 2105236884Smm * cannot open a pool. 2106236884Smm */ 2107236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2108236884Smm nvlist_t *unsup_feat; 2109236884Smm 2110236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2111236884Smm 0); 2112236884Smm 2113236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2114236884Smm NULL); nvp != NULL; 2115236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2116236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2117236884Smm VERIFY(nvlist_add_string(unsup_feat, 2118236884Smm nvpair_name(nvp), "") == 0); 2119236884Smm } 2120236884Smm } 2121236884Smm 2122236884Smm if (!nvlist_empty(unsup_feat)) { 2123236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2124236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2125236884Smm nvlist_free(unsup_feat); 2126236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2127236884Smm ENOTSUP)); 2128236884Smm } 2129236884Smm 2130236884Smm nvlist_free(unsup_feat); 2131236884Smm } 2132236884Smm 2133236884Smm /* 2134168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2135219089Spjd * incomplete configuration. We first check to see if the pool 2136219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2137219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2138219089Spjd * can handle missing vdevs. 
2139168404Spjd */ 2140219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2141219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2142219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2143219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2144219089Spjd 2145219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2146219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2147219089Spjd spa_try_repair(spa, config); 2148219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2149219089Spjd nvlist_free(spa->spa_config_splitting); 2150219089Spjd spa->spa_config_splitting = NULL; 2151168404Spjd } 2152168404Spjd 2153168404Spjd /* 2154168404Spjd * Initialize internal SPA structures. 2155168404Spjd */ 2156168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2157168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2158219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2159219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2160219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2161219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2162219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2163219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2164219089Spjd 2165236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2166219089Spjd if (error) 2167219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2168168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2169168404Spjd 2170219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2171219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2172168404Spjd 2173236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2174236884Smm boolean_t missing_feat_read = B_FALSE; 2175236884Smm nvlist_t *unsup_feat; 2176236884Smm 2177236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2178236884Smm &spa->spa_feat_for_read_obj) != 0) { 2179236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2180236884Smm } 2181236884Smm 2182236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2183236884Smm &spa->spa_feat_for_write_obj) != 0) { 2184236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2185236884Smm } 2186236884Smm 2187236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2188236884Smm &spa->spa_feat_desc_obj) != 0) { 2189236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2190236884Smm } 2191236884Smm 2192236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2193236884Smm 0); 2194236884Smm 2195236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2196236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2197236884Smm unsup_feat)) 2198236884Smm missing_feat_read = B_TRUE; 2199236884Smm 2200236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2201236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2202236884Smm spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj, 
2203236884Smm unsup_feat)) 2204236884Smm missing_feat_write = B_TRUE; 2205236884Smm } 2206236884Smm 2207236884Smm if (!nvlist_empty(unsup_feat)) { 2208236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2209236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2210236884Smm } 2211236884Smm 2212236884Smm nvlist_free(unsup_feat); 2213236884Smm 2214236884Smm if (!missing_feat_read) { 2215236884Smm fnvlist_add_boolean(spa->spa_load_info, 2216236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2217236884Smm } 2218236884Smm 2219236884Smm /* 2220236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2221236884Smm * twofold: to determine whether the pool is available for 2222236884Smm * import in read-write mode and (if it is not) whether the 2223236884Smm * pool is available for import in read-only mode. If the pool 2224236884Smm * is available for import in read-write mode, it is displayed 2225236884Smm * as available in userland; if it is not available for import 2226236884Smm * in read-only mode, it is displayed as unavailable in 2227236884Smm * userland. If the pool is available for import in read-only 2228236884Smm * mode but not read-write mode, it is displayed as unavailable 2229236884Smm * in userland with a special note that the pool is actually 2230236884Smm * available for open in read-only mode. 2231236884Smm * 2232236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2233236884Smm * missing a feature for write, we must first determine whether 2234236884Smm * the pool can be opened read-only before returning to 2235236884Smm * userland in order to know whether to display the 2236236884Smm * abovementioned note. 
2237236884Smm */ 2238236884Smm if (missing_feat_read || (missing_feat_write && 2239236884Smm spa_writeable(spa))) { 2240236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2241236884Smm ENOTSUP)); 2242236884Smm } 2243236884Smm } 2244236884Smm 2245236884Smm spa->spa_is_initializing = B_TRUE; 2246236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2247236884Smm spa->spa_is_initializing = B_FALSE; 2248236884Smm if (error != 0) 2249236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2250236884Smm 2251168404Spjd if (!mosconfig) { 2252168498Spjd uint64_t hostid; 2253219089Spjd nvlist_t *policy = NULL, *nvconfig; 2254168404Spjd 2255219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2256219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2257168404Spjd 2258219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2259185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2260168498Spjd char *hostname; 2261168498Spjd unsigned long myhostid = 0; 2262168498Spjd 2263219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2264168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2265168498Spjd 2266219089Spjd#ifdef _KERNEL 2267219089Spjd myhostid = zone_get_hostid(NULL); 2268219089Spjd#else /* _KERNEL */ 2269219089Spjd /* 2270219089Spjd * We're emulating the system's hostid in userland, so 2271219089Spjd * we can't use zone_get_hostid(). 2272219089Spjd */ 2273168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2274219089Spjd#endif /* _KERNEL */ 2275204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2276219089Spjd hostid != myhostid) { 2277219089Spjd nvlist_free(nvconfig); 2278168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2279168498Spjd "loaded as it was last accessed by " 2280185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2281236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2282185029Spjd spa_name(spa), hostname, 2283168498Spjd (unsigned long)hostid); 2284219089Spjd return (EBADF); 2285168498Spjd } 2286168498Spjd } 2287219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2288219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2289219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2290219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2291168498Spjd 2292219089Spjd spa_config_set(spa, nvconfig); 2293168404Spjd spa_unload(spa); 2294168404Spjd spa_deactivate(spa); 2295209962Smm spa_activate(spa, orig_mode); 2296168404Spjd 2297219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2298168404Spjd } 2299168404Spjd 2300219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2301219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2302219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2303219089Spjd if (error != 0) 2304219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2305168404Spjd 2306168404Spjd /* 2307168404Spjd * Load the bit that tells us to use the new accounting function 2308168404Spjd * (raid-z deflation). If we have an older pool, this will not 2309168404Spjd * be present. 2310168404Spjd */ 2311219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2312219089Spjd if (error != 0 && error != ENOENT) 2313219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2314168404Spjd 2315219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2316219089Spjd &spa->spa_creation_version); 2317219089Spjd if (error != 0 && error != ENOENT) 2318219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2319219089Spjd 2320168404Spjd /* 2321168404Spjd * Load the persistent error log. If we have an older pool, this will 2322168404Spjd * not be present. 
2323168404Spjd */ 2324219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2325219089Spjd if (error != 0 && error != ENOENT) 2326219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2327168404Spjd 2328219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2329219089Spjd &spa->spa_errlog_scrub); 2330219089Spjd if (error != 0 && error != ENOENT) 2331219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2332168404Spjd 2333168404Spjd /* 2334168404Spjd * Load the history object. If we have an older pool, this 2335168404Spjd * will not be present. 2336168404Spjd */ 2337219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2338219089Spjd if (error != 0 && error != ENOENT) 2339219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2340168404Spjd 2341168404Spjd /* 2342219089Spjd * If we're assembling the pool from the split-off vdevs of 2343219089Spjd * an existing pool, we don't want to attach the spares & cache 2344219089Spjd * devices. 2345219089Spjd */ 2346219089Spjd 2347219089Spjd /* 2348168404Spjd * Load any hot spares for this pool. 
2349168404Spjd */ 2350219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2351219089Spjd if (error != 0 && error != ENOENT) 2352219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2353219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2354185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2355185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2356219089Spjd &spa->spa_spares.sav_config) != 0) 2357219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2358168404Spjd 2359185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2360168404Spjd spa_load_spares(spa); 2361185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2362219089Spjd } else if (error == 0) { 2363219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2364168404Spjd } 2365168404Spjd 2366185029Spjd /* 2367185029Spjd * Load any level 2 ARC devices for this pool. 2368185029Spjd */ 2369219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2370185029Spjd &spa->spa_l2cache.sav_object); 2371219089Spjd if (error != 0 && error != ENOENT) 2372219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2373219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2374185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2375185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2376219089Spjd &spa->spa_l2cache.sav_config) != 0) 2377219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2378185029Spjd 2379185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2380185029Spjd spa_load_l2cache(spa); 2381185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2382219089Spjd } else if (error == 0) { 2383219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2384185029Spjd } 2385185029Spjd 2386219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2387213197Smm 2388219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2389219089Spjd if (error && error != ENOENT) 
2390219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2391185029Spjd 2392219089Spjd if (error == 0) { 2393219089Spjd uint64_t autoreplace; 2394185029Spjd 2395219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2396219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2397219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2398219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2399219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2400219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2401219089Spjd &spa->spa_dedup_ditto); 2402185029Spjd 2403219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2404168404Spjd } 2405168404Spjd 2406168404Spjd /* 2407185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2408185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2409185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2410185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2411185029Spjd * over. 2412185029Spjd */ 2413219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2414185029Spjd spa_check_removed(spa->spa_root_vdev); 2415219089Spjd /* 2416219089Spjd * For the import case, this is done in spa_import(), because 2417219089Spjd * at this point we're using the spare definitions from 2418219089Spjd * the MOS config, not necessarily from the userland config. 2419219089Spjd */ 2420219089Spjd if (state != SPA_LOAD_IMPORT) { 2421219089Spjd spa_aux_check_removed(&spa->spa_spares); 2422219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2423219089Spjd } 2424219089Spjd } 2425185029Spjd 2426185029Spjd /* 2427168404Spjd * Load the vdev state for all toplevel vdevs. 2428168404Spjd */ 2429168404Spjd vdev_load(rvd); 2430168404Spjd 2431168404Spjd /* 2432168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2433168404Spjd */ 2434185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2435168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2436185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2437168404Spjd 2438168404Spjd /* 2439219089Spjd * Load the DDTs (dedup tables). 2440168404Spjd */ 2441219089Spjd error = ddt_load(spa); 2442219089Spjd if (error != 0) 2443219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2444219089Spjd 2445219089Spjd spa_update_dspace(spa); 2446219089Spjd 2447219089Spjd /* 2448219089Spjd * Validate the config, using the MOS config to fill in any 2449219089Spjd * information which might be missing. If we fail to validate 2450219089Spjd * the config then declare the pool unfit for use. If we're 2451219089Spjd * assembling a pool from a split, the log is not transferred 2452219089Spjd * over. 2453219089Spjd */ 2454219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2455219089Spjd nvlist_t *nvconfig; 2456219089Spjd 2457219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2458219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2459219089Spjd 2460219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2461219089Spjd nvlist_free(nvconfig); 2462219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2463219089Spjd ENXIO)); 2464219089Spjd } 2465219089Spjd nvlist_free(nvconfig); 2466219089Spjd 2467219089Spjd /* 2468236884Smm * Now that we've validated the config, check the state of the 2469219089Spjd * root vdev. If it can't be opened, it indicates one or 2470219089Spjd * more toplevel vdevs are faulted. 
2471219089Spjd */ 2472219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2473219089Spjd return (ENXIO); 2474219089Spjd 2475219089Spjd if (spa_check_logs(spa)) { 2476219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2477219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2478219089Spjd } 2479168404Spjd } 2480168404Spjd 2481236884Smm if (missing_feat_write) { 2482236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2483236884Smm 2484236884Smm /* 2485236884Smm * At this point, we know that we can open the pool in 2486236884Smm * read-only mode but not read-write mode. We now have enough 2487236884Smm * information and can return to userland. 2488236884Smm */ 2489236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2490236884Smm } 2491236884Smm 2492219089Spjd /* 2493219089Spjd * We've successfully opened the pool, verify that we're ready 2494219089Spjd * to start pushing transactions. 2495219089Spjd */ 2496219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2497219089Spjd if (error = spa_load_verify(spa)) 2498219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2499219089Spjd error)); 2500219089Spjd } 2501219089Spjd 2502219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2503219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2504168404Spjd dmu_tx_t *tx; 2505168404Spjd int need_update = B_FALSE; 2506168404Spjd 2507209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2508209962Smm 2509168404Spjd /* 2510168404Spjd * Claim log blocks that haven't been committed yet. 2511168404Spjd * This must all happen in a single txg. 2512219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2513219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2514219089Spjd * Price of rollback is that we abandon the log. 
2515168404Spjd */ 2516219089Spjd spa->spa_claiming = B_TRUE; 2517219089Spjd 2518168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2519168404Spjd spa_first_txg(spa)); 2520185029Spjd (void) dmu_objset_find(spa_name(spa), 2521168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2522168404Spjd dmu_tx_commit(tx); 2523168404Spjd 2524219089Spjd spa->spa_claiming = B_FALSE; 2525219089Spjd 2526219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2527168404Spjd spa->spa_sync_on = B_TRUE; 2528168404Spjd txg_sync_start(spa->spa_dsl_pool); 2529168404Spjd 2530168404Spjd /* 2531219089Spjd * Wait for all claims to sync. We sync up to the highest 2532219089Spjd * claimed log block birth time so that claimed log blocks 2533219089Spjd * don't appear to be from the future. spa_claim_max_txg 2534219089Spjd * will have been set for us by either zil_check_log_chain() 2535219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2536168404Spjd */ 2537219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2538168404Spjd 2539168404Spjd /* 2540168404Spjd * If the config cache is stale, or we have uninitialized 2541168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2542209962Smm * 2543219089Spjd * If this is a verbatim import, trust the current 2544209962Smm * in-core spa_config and update the disk labels. 2545168404Spjd */ 2546168404Spjd if (config_cache_txg != spa->spa_config_txg || 2547219089Spjd state == SPA_LOAD_IMPORT || 2548219089Spjd state == SPA_LOAD_RECOVER || 2549219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2550168404Spjd need_update = B_TRUE; 2551168404Spjd 2552209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2553168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2554168404Spjd need_update = B_TRUE; 2555168404Spjd 2556168404Spjd /* 2557168404Spjd * Update the config cache asychronously in case we're the 2558168404Spjd * root pool, in which case the config cache isn't writable yet. 
2559168404Spjd */ 2560168404Spjd if (need_update) 2561168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2562208683Spjd 2563208683Spjd /* 2564208683Spjd * Check all DTLs to see if anything needs resilvering. 2565208683Spjd */ 2566219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2567219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2568208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2569219089Spjd 2570219089Spjd /* 2571219089Spjd * Delete any inconsistent datasets. 2572219089Spjd */ 2573219089Spjd (void) dmu_objset_find(spa_name(spa), 2574219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2575219089Spjd 2576219089Spjd /* 2577219089Spjd * Clean up any stale temporary dataset userrefs. 2578219089Spjd */ 2579219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2580168404Spjd } 2581168404Spjd 2582219089Spjd return (0); 2583219089Spjd} 2584168404Spjd 2585219089Spjdstatic int 2586219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2587219089Spjd{ 2588219089Spjd int mode = spa->spa_mode; 2589219089Spjd 2590219089Spjd spa_unload(spa); 2591219089Spjd spa_deactivate(spa); 2592219089Spjd 2593219089Spjd spa->spa_load_max_txg--; 2594219089Spjd 2595219089Spjd spa_activate(spa, mode); 2596219089Spjd spa_async_suspend(spa); 2597219089Spjd 2598219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2599168404Spjd} 2600168404Spjd 2601236884Smm/* 2602236884Smm * If spa_load() fails this function will try loading prior txg's. If 2603236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2604236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2605236884Smm * function will not rewind the pool and will return the same error as 2606236884Smm * spa_load(). 
2607236884Smm */ 2608219089Spjdstatic int 2609219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2610219089Spjd uint64_t max_request, int rewind_flags) 2611219089Spjd{ 2612236884Smm nvlist_t *loadinfo = NULL; 2613219089Spjd nvlist_t *config = NULL; 2614219089Spjd int load_error, rewind_error; 2615219089Spjd uint64_t safe_rewind_txg; 2616219089Spjd uint64_t min_txg; 2617219089Spjd 2618219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2619219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2620219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2621219089Spjd } else { 2622219089Spjd spa->spa_load_max_txg = max_request; 2623219089Spjd } 2624219089Spjd 2625219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2626219089Spjd mosconfig); 2627219089Spjd if (load_error == 0) 2628219089Spjd return (0); 2629219089Spjd 2630219089Spjd if (spa->spa_root_vdev != NULL) 2631219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2632219089Spjd 2633219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2634219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2635219089Spjd 2636219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2637219089Spjd nvlist_free(config); 2638219089Spjd return (load_error); 2639219089Spjd } 2640219089Spjd 2641236884Smm if (state == SPA_LOAD_RECOVER) { 2642236884Smm /* Price of rolling back is discarding txgs, including log */ 2643219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2644236884Smm } else { 2645236884Smm /* 2646236884Smm * If we aren't rolling back save the load info from our first 2647236884Smm * import attempt so that we can restore it after attempting 2648236884Smm * to rewind. 
2649236884Smm */ 2650236884Smm loadinfo = spa->spa_load_info; 2651236884Smm spa->spa_load_info = fnvlist_alloc(); 2652236884Smm } 2653219089Spjd 2654219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2655219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2656219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2657219089Spjd TXG_INITIAL : safe_rewind_txg; 2658219089Spjd 2659219089Spjd /* 2660219089Spjd * Continue as long as we're finding errors, we're still within 2661219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2662219089Spjd */ 2663219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2664219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2665219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2666219089Spjd spa->spa_extreme_rewind = B_TRUE; 2667219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2668219089Spjd } 2669219089Spjd 2670219089Spjd spa->spa_extreme_rewind = B_FALSE; 2671219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2672219089Spjd 2673219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2674219089Spjd spa_config_set(spa, config); 2675219089Spjd 2676236884Smm if (state == SPA_LOAD_RECOVER) { 2677236884Smm ASSERT3P(loadinfo, ==, NULL); 2678236884Smm return (rewind_error); 2679236884Smm } else { 2680236884Smm /* Store the rewind info as part of the initial load info */ 2681236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2682236884Smm spa->spa_load_info); 2683236884Smm 2684236884Smm /* Restore the initial load info */ 2685236884Smm fnvlist_free(spa->spa_load_info); 2686236884Smm spa->spa_load_info = loadinfo; 2687236884Smm 2688236884Smm return (load_error); 2689236884Smm } 2690219089Spjd} 2691219089Spjd 2692168404Spjd/* 2693168404Spjd * Pool Open/Import 2694168404Spjd * 2695168404Spjd * The import case is identical to an open except that the configuration is sent 2696168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2697168404Spjd * case of an open, the pool configuration will exist in the 2698185029Spjd * POOL_STATE_UNINITIALIZED state. 2699168404Spjd * 2700168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2701168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2702168404Spjd * ambiguous state. 2703168404Spjd */ 2704168404Spjdstatic int 2705219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2706219089Spjd nvlist_t **config) 2707168404Spjd{ 2708168404Spjd spa_t *spa; 2709219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2710168404Spjd int error; 2711168404Spjd int locked = B_FALSE; 2712219089Spjd int firstopen = B_FALSE; 2713168404Spjd 2714168404Spjd *spapp = NULL; 2715168404Spjd 2716168404Spjd /* 2717168404Spjd * As disgusting as this is, we need to support recursive calls to this 2718168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2719168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2720168404Spjd * avoid dsl_dir_open() calling this in the first place. 2721168404Spjd */ 2722168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2723168404Spjd mutex_enter(&spa_namespace_lock); 2724168404Spjd locked = B_TRUE; 2725168404Spjd } 2726168404Spjd 2727168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2728168404Spjd if (locked) 2729168404Spjd mutex_exit(&spa_namespace_lock); 2730168404Spjd return (ENOENT); 2731168404Spjd } 2732219089Spjd 2733168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2734219089Spjd zpool_rewind_policy_t policy; 2735168404Spjd 2736219089Spjd firstopen = B_TRUE; 2737219089Spjd 2738219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2739219089Spjd &policy); 2740219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2741219089Spjd state = SPA_LOAD_RECOVER; 2742219089Spjd 2743209962Smm spa_activate(spa, spa_mode_global); 2744168404Spjd 2745219089Spjd if (state != SPA_LOAD_RECOVER) 2746219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2747168404Spjd 2748219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2749219089Spjd policy.zrp_request); 2750219089Spjd 2751168404Spjd if (error == EBADF) { 2752168404Spjd /* 2753168404Spjd * If vdev_validate() returns failure (indicated by 2754168404Spjd * EBADF), it indicates that one of the vdevs indicates 2755168404Spjd * that the pool has been exported or destroyed. If 2756168404Spjd * this is the case, the config cache is out of sync and 2757168404Spjd * we should remove the pool from the namespace. 2758168404Spjd */ 2759168404Spjd spa_unload(spa); 2760168404Spjd spa_deactivate(spa); 2761185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2762168404Spjd spa_remove(spa); 2763168404Spjd if (locked) 2764168404Spjd mutex_exit(&spa_namespace_lock); 2765168404Spjd return (ENOENT); 2766168404Spjd } 2767168404Spjd 2768168404Spjd if (error) { 2769168404Spjd /* 2770168404Spjd * We can't open the pool, but we still have useful 2771168404Spjd * information: the state of each vdev after the 2772168404Spjd * attempted vdev_open(). Return this to the user. 
2773168404Spjd */ 2774219089Spjd if (config != NULL && spa->spa_config) { 2775219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2776219089Spjd KM_SLEEP) == 0); 2777219089Spjd VERIFY(nvlist_add_nvlist(*config, 2778219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2779219089Spjd spa->spa_load_info) == 0); 2780219089Spjd } 2781168404Spjd spa_unload(spa); 2782168404Spjd spa_deactivate(spa); 2783219089Spjd spa->spa_last_open_failed = error; 2784168404Spjd if (locked) 2785168404Spjd mutex_exit(&spa_namespace_lock); 2786168404Spjd *spapp = NULL; 2787168404Spjd return (error); 2788168404Spjd } 2789168404Spjd } 2790168404Spjd 2791168404Spjd spa_open_ref(spa, tag); 2792185029Spjd 2793219089Spjd if (config != NULL) 2794219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2795219089Spjd 2796219089Spjd /* 2797219089Spjd * If we've recovered the pool, pass back any information we 2798219089Spjd * gathered while doing the load. 2799219089Spjd */ 2800219089Spjd if (state == SPA_LOAD_RECOVER) { 2801219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2802219089Spjd spa->spa_load_info) == 0); 2803219089Spjd } 2804219089Spjd 2805219089Spjd if (locked) { 2806219089Spjd spa->spa_last_open_failed = 0; 2807219089Spjd spa->spa_last_ubsync_txg = 0; 2808219089Spjd spa->spa_load_txg = 0; 2809168404Spjd mutex_exit(&spa_namespace_lock); 2810219089Spjd#ifdef __FreeBSD__ 2811219089Spjd#ifdef _KERNEL 2812219089Spjd if (firstopen) 2813219089Spjd zvol_create_minors(pool); 2814219089Spjd#endif 2815219089Spjd#endif 2816219089Spjd } 2817168404Spjd 2818168404Spjd *spapp = spa; 2819168404Spjd 2820168404Spjd return (0); 2821168404Spjd} 2822168404Spjd 2823168404Spjdint 2824219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2825219089Spjd nvlist_t **config) 2826219089Spjd{ 2827219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2828219089Spjd} 2829219089Spjd 2830219089Spjdint 2831168404Spjdspa_open(const char *name, spa_t **spapp, 
void *tag) 2832168404Spjd{ 2833219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2834168404Spjd} 2835168404Spjd 2836168404Spjd/* 2837168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 2838168404Spjd * preventing it from being exported or destroyed. 2839168404Spjd */ 2840168404Spjdspa_t * 2841168404Spjdspa_inject_addref(char *name) 2842168404Spjd{ 2843168404Spjd spa_t *spa; 2844168404Spjd 2845168404Spjd mutex_enter(&spa_namespace_lock); 2846168404Spjd if ((spa = spa_lookup(name)) == NULL) { 2847168404Spjd mutex_exit(&spa_namespace_lock); 2848168404Spjd return (NULL); 2849168404Spjd } 2850168404Spjd spa->spa_inject_ref++; 2851168404Spjd mutex_exit(&spa_namespace_lock); 2852168404Spjd 2853168404Spjd return (spa); 2854168404Spjd} 2855168404Spjd 2856168404Spjdvoid 2857168404Spjdspa_inject_delref(spa_t *spa) 2858168404Spjd{ 2859168404Spjd mutex_enter(&spa_namespace_lock); 2860168404Spjd spa->spa_inject_ref--; 2861168404Spjd mutex_exit(&spa_namespace_lock); 2862168404Spjd} 2863168404Spjd 2864185029Spjd/* 2865185029Spjd * Add spares device information to the nvlist. 
2866185029Spjd */ 2867168404Spjdstatic void 2868168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 2869168404Spjd{ 2870168404Spjd nvlist_t **spares; 2871168404Spjd uint_t i, nspares; 2872168404Spjd nvlist_t *nvroot; 2873168404Spjd uint64_t guid; 2874168404Spjd vdev_stat_t *vs; 2875168404Spjd uint_t vsc; 2876168404Spjd uint64_t pool; 2877168404Spjd 2878209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2879209962Smm 2880185029Spjd if (spa->spa_spares.sav_count == 0) 2881168404Spjd return; 2882168404Spjd 2883168404Spjd VERIFY(nvlist_lookup_nvlist(config, 2884168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2885185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2886168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2887168404Spjd if (nspares != 0) { 2888168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2889168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2890168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2891168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2892168404Spjd 2893168404Spjd /* 2894168404Spjd * Go through and find any spares which have since been 2895168404Spjd * repurposed as an active spare. If this is the case, update 2896168404Spjd * their status appropriately. 2897168404Spjd */ 2898168404Spjd for (i = 0; i < nspares; i++) { 2899168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2900168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2901185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 2902185029Spjd pool != 0ULL) { 2903168404Spjd VERIFY(nvlist_lookup_uint64_array( 2904219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 2905168404Spjd (uint64_t **)&vs, &vsc) == 0); 2906168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 2907168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 2908168404Spjd } 2909168404Spjd } 2910168404Spjd } 2911168404Spjd} 2912168404Spjd 2913185029Spjd/* 2914185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
2915185029Spjd */ 2916185029Spjdstatic void 2917185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 2918185029Spjd{ 2919185029Spjd nvlist_t **l2cache; 2920185029Spjd uint_t i, j, nl2cache; 2921185029Spjd nvlist_t *nvroot; 2922185029Spjd uint64_t guid; 2923185029Spjd vdev_t *vd; 2924185029Spjd vdev_stat_t *vs; 2925185029Spjd uint_t vsc; 2926185029Spjd 2927209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2928209962Smm 2929185029Spjd if (spa->spa_l2cache.sav_count == 0) 2930185029Spjd return; 2931185029Spjd 2932185029Spjd VERIFY(nvlist_lookup_nvlist(config, 2933185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2934185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2935185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2936185029Spjd if (nl2cache != 0) { 2937185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2938185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2939185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2940185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2941185029Spjd 2942185029Spjd /* 2943185029Spjd * Update level 2 cache device stats. 
2944185029Spjd */ 2945185029Spjd 2946185029Spjd for (i = 0; i < nl2cache; i++) { 2947185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 2948185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2949185029Spjd 2950185029Spjd vd = NULL; 2951185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2952185029Spjd if (guid == 2953185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2954185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 2955185029Spjd break; 2956185029Spjd } 2957185029Spjd } 2958185029Spjd ASSERT(vd != NULL); 2959185029Spjd 2960185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2961219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2962219089Spjd == 0); 2963185029Spjd vdev_get_stats(vd, vs); 2964185029Spjd } 2965185029Spjd } 2966185029Spjd} 2967185029Spjd 2968236884Smmstatic void 2969236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 2970236884Smm{ 2971236884Smm nvlist_t *features; 2972236884Smm zap_cursor_t zc; 2973236884Smm zap_attribute_t za; 2974236884Smm 2975236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2976236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2977236884Smm 2978236884Smm if (spa->spa_feat_for_read_obj != 0) { 2979236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 2980236884Smm spa->spa_feat_for_read_obj); 2981236884Smm zap_cursor_retrieve(&zc, &za) == 0; 2982236884Smm zap_cursor_advance(&zc)) { 2983236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 2984236884Smm za.za_num_integers == 1); 2985236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 2986236884Smm za.za_first_integer)); 2987236884Smm } 2988236884Smm zap_cursor_fini(&zc); 2989236884Smm } 2990236884Smm 2991236884Smm if (spa->spa_feat_for_write_obj != 0) { 2992236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 2993236884Smm spa->spa_feat_for_write_obj); 2994236884Smm zap_cursor_retrieve(&zc, &za) == 0; 2995236884Smm zap_cursor_advance(&zc)) { 2996236884Smm ASSERT(za.za_integer_length 
== sizeof (uint64_t) && 2997236884Smm za.za_num_integers == 1); 2998236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 2999236884Smm za.za_first_integer)); 3000236884Smm } 3001236884Smm zap_cursor_fini(&zc); 3002236884Smm } 3003236884Smm 3004236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3005236884Smm features) == 0); 3006236884Smm nvlist_free(features); 3007236884Smm} 3008236884Smm 3009168404Spjdint 3010236884Smmspa_get_stats(const char *name, nvlist_t **config, 3011236884Smm char *altroot, size_t buflen) 3012168404Spjd{ 3013168404Spjd int error; 3014168404Spjd spa_t *spa; 3015168404Spjd 3016168404Spjd *config = NULL; 3017219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3018168404Spjd 3019209962Smm if (spa != NULL) { 3020209962Smm /* 3021209962Smm * This still leaves a window of inconsistency where the spares 3022209962Smm * or l2cache devices could change and the config would be 3023209962Smm * self-inconsistent. 3024209962Smm */ 3025209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3026168404Spjd 3027209962Smm if (*config != NULL) { 3028219089Spjd uint64_t loadtimes[2]; 3029219089Spjd 3030219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3031219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3032219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3033219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3034219089Spjd 3035185029Spjd VERIFY(nvlist_add_uint64(*config, 3036209962Smm ZPOOL_CONFIG_ERRCOUNT, 3037209962Smm spa_get_errlog_size(spa)) == 0); 3038185029Spjd 3039209962Smm if (spa_suspended(spa)) 3040209962Smm VERIFY(nvlist_add_uint64(*config, 3041209962Smm ZPOOL_CONFIG_SUSPENDED, 3042209962Smm spa->spa_failmode) == 0); 3043209962Smm 3044209962Smm spa_add_spares(spa, *config); 3045209962Smm spa_add_l2cache(spa, *config); 3046236884Smm spa_add_feature_stats(spa, *config); 3047209962Smm } 3048168404Spjd } 3049168404Spjd 3050168404Spjd /* 3051168404Spjd * We want to get the alternate 
root even for faulted pools, so we cheat 3052168404Spjd * and call spa_lookup() directly. 3053168404Spjd */ 3054168404Spjd if (altroot) { 3055168404Spjd if (spa == NULL) { 3056168404Spjd mutex_enter(&spa_namespace_lock); 3057168404Spjd spa = spa_lookup(name); 3058168404Spjd if (spa) 3059168404Spjd spa_altroot(spa, altroot, buflen); 3060168404Spjd else 3061168404Spjd altroot[0] = '\0'; 3062168404Spjd spa = NULL; 3063168404Spjd mutex_exit(&spa_namespace_lock); 3064168404Spjd } else { 3065168404Spjd spa_altroot(spa, altroot, buflen); 3066168404Spjd } 3067168404Spjd } 3068168404Spjd 3069209962Smm if (spa != NULL) { 3070209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3071168404Spjd spa_close(spa, FTAG); 3072209962Smm } 3073168404Spjd 3074168404Spjd return (error); 3075168404Spjd} 3076168404Spjd 3077168404Spjd/* 3078185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3079185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3080185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3081185029Spjd * specified, as long as they are well-formed. 3082168404Spjd */ 3083168404Spjdstatic int 3084185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3085185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3086185029Spjd vdev_labeltype_t label) 3087168404Spjd{ 3088185029Spjd nvlist_t **dev; 3089185029Spjd uint_t i, ndev; 3090168404Spjd vdev_t *vd; 3091168404Spjd int error; 3092168404Spjd 3093185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3094185029Spjd 3095168404Spjd /* 3096185029Spjd * It's acceptable to have no devs specified. 
3097168404Spjd */ 3098185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3099168404Spjd return (0); 3100168404Spjd 3101185029Spjd if (ndev == 0) 3102168404Spjd return (EINVAL); 3103168404Spjd 3104168404Spjd /* 3105185029Spjd * Make sure the pool is formatted with a version that supports this 3106185029Spjd * device type. 3107168404Spjd */ 3108185029Spjd if (spa_version(spa) < version) 3109168404Spjd return (ENOTSUP); 3110168404Spjd 3111168404Spjd /* 3112185029Spjd * Set the pending device list so we correctly handle device in-use 3113168404Spjd * checking. 3114168404Spjd */ 3115185029Spjd sav->sav_pending = dev; 3116185029Spjd sav->sav_npending = ndev; 3117168404Spjd 3118185029Spjd for (i = 0; i < ndev; i++) { 3119185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3120168404Spjd mode)) != 0) 3121168404Spjd goto out; 3122168404Spjd 3123168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3124168404Spjd vdev_free(vd); 3125168404Spjd error = EINVAL; 3126168404Spjd goto out; 3127168404Spjd } 3128168404Spjd 3129185029Spjd /* 3130185029Spjd * The L2ARC currently only supports disk devices in 3131185029Spjd * kernel context. For user-level testing, we allow it. 
3132185029Spjd */ 3133185029Spjd#ifdef _KERNEL 3134185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3135185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3136185029Spjd error = ENOTBLK; 3137230514Smm vdev_free(vd); 3138185029Spjd goto out; 3139185029Spjd } 3140185029Spjd#endif 3141168404Spjd vd->vdev_top = vd; 3142168404Spjd 3143168404Spjd if ((error = vdev_open(vd)) == 0 && 3144185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3145185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3146168404Spjd vd->vdev_guid) == 0); 3147168404Spjd } 3148168404Spjd 3149168404Spjd vdev_free(vd); 3150168404Spjd 3151185029Spjd if (error && 3152185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3153168404Spjd goto out; 3154168404Spjd else 3155168404Spjd error = 0; 3156168404Spjd } 3157168404Spjd 3158168404Spjdout: 3159185029Spjd sav->sav_pending = NULL; 3160185029Spjd sav->sav_npending = 0; 3161168404Spjd return (error); 3162168404Spjd} 3163168404Spjd 3164185029Spjdstatic int 3165185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3166185029Spjd{ 3167185029Spjd int error; 3168185029Spjd 3169185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3170185029Spjd 3171185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3172185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3173185029Spjd VDEV_LABEL_SPARE)) != 0) { 3174185029Spjd return (error); 3175185029Spjd } 3176185029Spjd 3177185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3178185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3179185029Spjd VDEV_LABEL_L2CACHE)); 3180185029Spjd} 3181185029Spjd 3182185029Spjdstatic void 3183185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3184185029Spjd const char *config) 3185185029Spjd{ 3186185029Spjd int i; 3187185029Spjd 3188185029Spjd if (sav->sav_config != NULL) { 
3189185029Spjd nvlist_t **olddevs; 3190185029Spjd uint_t oldndevs; 3191185029Spjd nvlist_t **newdevs; 3192185029Spjd 3193185029Spjd /* 3194185029Spjd * Generate new dev list by concatentating with the 3195185029Spjd * current dev list. 3196185029Spjd */ 3197185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3198185029Spjd &olddevs, &oldndevs) == 0); 3199185029Spjd 3200185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3201185029Spjd (ndevs + oldndevs), KM_SLEEP); 3202185029Spjd for (i = 0; i < oldndevs; i++) 3203185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3204185029Spjd KM_SLEEP) == 0); 3205185029Spjd for (i = 0; i < ndevs; i++) 3206185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3207185029Spjd KM_SLEEP) == 0); 3208185029Spjd 3209185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3210185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3211185029Spjd 3212185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3213185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3214185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3215185029Spjd nvlist_free(newdevs[i]); 3216185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3217185029Spjd } else { 3218185029Spjd /* 3219185029Spjd * Generate a new dev list. 
3220185029Spjd */ 3221185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3222185029Spjd KM_SLEEP) == 0); 3223185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3224185029Spjd devs, ndevs) == 0); 3225185029Spjd } 3226185029Spjd} 3227185029Spjd 3228168404Spjd/* 3229185029Spjd * Stop and drop level 2 ARC devices 3230185029Spjd */ 3231185029Spjdvoid 3232185029Spjdspa_l2cache_drop(spa_t *spa) 3233185029Spjd{ 3234185029Spjd vdev_t *vd; 3235185029Spjd int i; 3236185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3237185029Spjd 3238185029Spjd for (i = 0; i < sav->sav_count; i++) { 3239185029Spjd uint64_t pool; 3240185029Spjd 3241185029Spjd vd = sav->sav_vdevs[i]; 3242185029Spjd ASSERT(vd != NULL); 3243185029Spjd 3244209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3245209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3246185029Spjd l2arc_remove_vdev(vd); 3247185029Spjd } 3248185029Spjd} 3249185029Spjd 3250185029Spjd/* 3251168404Spjd * Pool Creation 3252168404Spjd */ 3253168404Spjdint 3254185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3255185029Spjd const char *history_str, nvlist_t *zplprops) 3256168404Spjd{ 3257168404Spjd spa_t *spa; 3258185029Spjd char *altroot = NULL; 3259168404Spjd vdev_t *rvd; 3260168404Spjd dsl_pool_t *dp; 3261168404Spjd dmu_tx_t *tx; 3262219089Spjd int error = 0; 3263168404Spjd uint64_t txg = TXG_INITIAL; 3264185029Spjd nvlist_t **spares, **l2cache; 3265185029Spjd uint_t nspares, nl2cache; 3266219089Spjd uint64_t version, obj; 3267236884Smm boolean_t has_features; 3268168404Spjd 3269168404Spjd /* 3270168404Spjd * If this pool already exists, return failure. 3271168404Spjd */ 3272168404Spjd mutex_enter(&spa_namespace_lock); 3273168404Spjd if (spa_lookup(pool) != NULL) { 3274168404Spjd mutex_exit(&spa_namespace_lock); 3275168404Spjd return (EEXIST); 3276168404Spjd } 3277168404Spjd 3278168404Spjd /* 3279168404Spjd * Allocate a new spa_t structure. 
3280168404Spjd */ 3281185029Spjd (void) nvlist_lookup_string(props, 3282185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3283219089Spjd spa = spa_add(pool, NULL, altroot); 3284209962Smm spa_activate(spa, spa_mode_global); 3285168404Spjd 3286185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3287185029Spjd spa_deactivate(spa); 3288185029Spjd spa_remove(spa); 3289185029Spjd mutex_exit(&spa_namespace_lock); 3290185029Spjd return (error); 3291185029Spjd } 3292185029Spjd 3293236884Smm has_features = B_FALSE; 3294236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3295236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3296236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3297236884Smm has_features = B_TRUE; 3298236884Smm } 3299236884Smm 3300236884Smm if (has_features || nvlist_lookup_uint64(props, 3301236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3302185029Spjd version = SPA_VERSION; 3303236884Smm } 3304236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3305219089Spjd 3306219089Spjd spa->spa_first_txg = txg; 3307219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3308185029Spjd spa->spa_uberblock.ub_version = version; 3309168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3310168404Spjd 3311168404Spjd /* 3312209962Smm * Create "The Godfather" zio to hold all async IOs 3313209962Smm */ 3314209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3315209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3316209962Smm 3317209962Smm /* 3318168404Spjd * Create the root vdev. 
3319168404Spjd */ 3320185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3321168404Spjd 3322168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3323168404Spjd 3324168404Spjd ASSERT(error != 0 || rvd != NULL); 3325168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3326168404Spjd 3327185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3328168404Spjd error = EINVAL; 3329168404Spjd 3330168404Spjd if (error == 0 && 3331168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3332185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3333168404Spjd VDEV_ALLOC_ADD)) == 0) { 3334219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3335219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3336219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3337219089Spjd } 3338168404Spjd } 3339168404Spjd 3340185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3341168404Spjd 3342168404Spjd if (error != 0) { 3343168404Spjd spa_unload(spa); 3344168404Spjd spa_deactivate(spa); 3345168404Spjd spa_remove(spa); 3346168404Spjd mutex_exit(&spa_namespace_lock); 3347168404Spjd return (error); 3348168404Spjd } 3349168404Spjd 3350168404Spjd /* 3351168404Spjd * Get the list of spares, if specified. 3352168404Spjd */ 3353168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3354168404Spjd &spares, &nspares) == 0) { 3355185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3356168404Spjd KM_SLEEP) == 0); 3357185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3358168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3359185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3360168404Spjd spa_load_spares(spa); 3361185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3362185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3363168404Spjd } 3364168404Spjd 3365185029Spjd /* 3366185029Spjd * Get the list of level 2 cache devices, if specified. 
3367185029Spjd */ 3368185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3369185029Spjd &l2cache, &nl2cache) == 0) { 3370185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3371185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3372185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3373185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3374185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3375185029Spjd spa_load_l2cache(spa); 3376185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3377185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3378185029Spjd } 3379185029Spjd 3380236884Smm spa->spa_is_initializing = B_TRUE; 3381185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3382168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3383236884Smm spa->spa_is_initializing = B_FALSE; 3384168404Spjd 3385219089Spjd /* 3386219089Spjd * Create DDTs (dedup tables). 3387219089Spjd */ 3388219089Spjd ddt_create(spa); 3389219089Spjd 3390219089Spjd spa_update_dspace(spa); 3391219089Spjd 3392168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3393168404Spjd 3394168404Spjd /* 3395168404Spjd * Create the pool config object. 
3396168404Spjd */ 3397168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3398185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3399168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3400168404Spjd 3401168404Spjd if (zap_add(spa->spa_meta_objset, 3402168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3403168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3404168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3405168404Spjd } 3406168404Spjd 3407236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3408236884Smm spa_feature_create_zap_objects(spa, tx); 3409236884Smm 3410219089Spjd if (zap_add(spa->spa_meta_objset, 3411219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3412219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3413219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3414219089Spjd } 3415219089Spjd 3416185029Spjd /* Newly created pools with the right version are always deflated. */ 3417185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3418185029Spjd spa->spa_deflate = TRUE; 3419185029Spjd if (zap_add(spa->spa_meta_objset, 3420185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3421185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3422185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3423185029Spjd } 3424168404Spjd } 3425168404Spjd 3426168404Spjd /* 3427219089Spjd * Create the deferred-free bpobj. Turn off compression 3428168404Spjd * because sync-to-convergence takes longer if the blocksize 3429168404Spjd * keeps changing. 
3430168404Spjd */ 3431219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3432219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3433168404Spjd ZIO_COMPRESS_OFF, tx); 3434168404Spjd if (zap_add(spa->spa_meta_objset, 3435219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3436219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3437219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3438168404Spjd } 3439219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3440219089Spjd spa->spa_meta_objset, obj)); 3441168404Spjd 3442168404Spjd /* 3443168404Spjd * Create the pool's history object. 3444168404Spjd */ 3445185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3446185029Spjd spa_history_create_obj(spa, tx); 3447168404Spjd 3448185029Spjd /* 3449185029Spjd * Set pool properties. 3450185029Spjd */ 3451185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3452185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3453185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3454219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3455219089Spjd 3456209962Smm if (props != NULL) { 3457209962Smm spa_configfile_set(spa, props, B_FALSE); 3458219089Spjd spa_sync_props(spa, props, tx); 3459209962Smm } 3460185029Spjd 3461168404Spjd dmu_tx_commit(tx); 3462168404Spjd 3463168404Spjd spa->spa_sync_on = B_TRUE; 3464168404Spjd txg_sync_start(spa->spa_dsl_pool); 3465168404Spjd 3466168404Spjd /* 3467168404Spjd * We explicitly wait for the first transaction to complete so that our 3468168404Spjd * bean counters are appropriately updated. 
3469168404Spjd */ 3470168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3471168404Spjd 3472185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3473168404Spjd 3474185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3475185029Spjd (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3476219089Spjd spa_history_log_version(spa, LOG_POOL_CREATE); 3477185029Spjd 3478208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3479208442Smm 3480168404Spjd mutex_exit(&spa_namespace_lock); 3481168404Spjd 3482168404Spjd return (0); 3483168404Spjd} 3484168404Spjd 3485219089Spjd#if defined(sun) 3486185029Spjd#ifdef _KERNEL 3487185029Spjd/* 3488219089Spjd * Get the root pool information from the root disk, then import the root pool 3489219089Spjd * during the system boot up time. 3490185029Spjd */ 3491219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3492219089Spjd 3493219089Spjdstatic nvlist_t * 3494219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3495185029Spjd{ 3496219089Spjd nvlist_t *config; 3497185029Spjd nvlist_t *nvtop, *nvroot; 3498185029Spjd uint64_t pgid; 3499185029Spjd 3500219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3501219089Spjd return (NULL); 3502219089Spjd 3503168404Spjd /* 3504185029Spjd * Add this top-level vdev to the child array. 3505168404Spjd */ 3506219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3507219089Spjd &nvtop) == 0); 3508219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3509219089Spjd &pgid) == 0); 3510219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3511168404Spjd 3512185029Spjd /* 3513185029Spjd * Put this pool's top-level vdevs into a root vdev. 
3514185029Spjd */ 3515185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3516219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3517219089Spjd VDEV_TYPE_ROOT) == 0); 3518185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3519185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3520185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3521185029Spjd &nvtop, 1) == 0); 3522168404Spjd 3523168404Spjd /* 3524185029Spjd * Replace the existing vdev_tree with the new root vdev in 3525185029Spjd * this pool's configuration (remove the old, add the new). 3526168404Spjd */ 3527185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3528185029Spjd nvlist_free(nvroot); 3529219089Spjd return (config); 3530185029Spjd} 3531168404Spjd 3532185029Spjd/* 3533219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3534219089Spjd * configuration. A configuration is "better" if the label on that 3535219089Spjd * device has a more recent txg. 3536185029Spjd */ 3537219089Spjdstatic void 3538219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3539185029Spjd{ 3540219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3541219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3542185029Spjd 3543219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3544219089Spjd nvlist_t *label; 3545219089Spjd uint64_t label_txg; 3546185029Spjd 3547219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3548219089Spjd &label) != 0) 3549219089Spjd return; 3550185029Spjd 3551219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3552219089Spjd &label_txg) == 0); 3553168404Spjd 3554219089Spjd /* 3555219089Spjd * Do we have a better boot device? 
3556219089Spjd */ 3557219089Spjd if (label_txg > *txg) { 3558219089Spjd *txg = label_txg; 3559219089Spjd *avd = vd; 3560185029Spjd } 3561219089Spjd nvlist_free(label); 3562185029Spjd } 3563185029Spjd} 3564185029Spjd 3565185029Spjd/* 3566185029Spjd * Import a root pool. 3567185029Spjd * 3568185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3569185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3570185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3571185029Spjd * 3572185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3573185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3574185029Spjd * e.g. 3575185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3576185029Spjd */ 3577185029Spjdint 3578185029Spjdspa_import_rootpool(char *devpath, char *devid) 3579185029Spjd{ 3580219089Spjd spa_t *spa; 3581219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3582219089Spjd nvlist_t *config, *nvtop; 3583219089Spjd uint64_t guid, txg; 3584185029Spjd char *pname; 3585185029Spjd int error; 3586185029Spjd 3587185029Spjd /* 3588219089Spjd * Read the label from the boot device and generate a configuration. 
3589185029Spjd */ 3590219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3591219089Spjd#if defined(_OBP) && defined(_KERNEL) 3592219089Spjd if (config == NULL) { 3593219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3594219089Spjd /* iscsi boot */ 3595219089Spjd get_iscsi_bootpath_phy(devpath); 3596219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3597219089Spjd } 3598219089Spjd } 3599219089Spjd#endif 3600219089Spjd if (config == NULL) { 3601236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3602219089Spjd devpath); 3603219089Spjd return (EIO); 3604219089Spjd } 3605185029Spjd 3606219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3607219089Spjd &pname) == 0); 3608219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3609185029Spjd 3610209962Smm mutex_enter(&spa_namespace_lock); 3611209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3612209962Smm /* 3613209962Smm * Remove the existing root pool from the namespace so that we 3614209962Smm * can replace it with the correct config we just read in. 3615209962Smm */ 3616209962Smm spa_remove(spa); 3617209962Smm } 3618185029Spjd 3619219089Spjd spa = spa_add(pname, config, NULL); 3620209962Smm spa->spa_is_root = B_TRUE; 3621219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3622209962Smm 3623219089Spjd /* 3624219089Spjd * Build up a vdev tree based on the boot device's label config. 
3625219089Spjd */ 3626219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3627219089Spjd &nvtop) == 0); 3628219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3629219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3630219089Spjd VDEV_ALLOC_ROOTPOOL); 3631219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3632219089Spjd if (error) { 3633209962Smm mutex_exit(&spa_namespace_lock); 3634219089Spjd nvlist_free(config); 3635219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3636219089Spjd pname); 3637219089Spjd return (error); 3638209962Smm } 3639209962Smm 3640219089Spjd /* 3641219089Spjd * Get the boot vdev. 3642219089Spjd */ 3643219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3644219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3645219089Spjd (u_longlong_t)guid); 3646219089Spjd error = ENOENT; 3647219089Spjd goto out; 3648219089Spjd } 3649209962Smm 3650219089Spjd /* 3651219089Spjd * Determine if there is a better boot device. 3652219089Spjd */ 3653219089Spjd avd = bvd; 3654219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3655219089Spjd if (avd != bvd) { 3656219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3657219089Spjd "try booting from '%s'", avd->vdev_path); 3658219089Spjd error = EINVAL; 3659219089Spjd goto out; 3660219089Spjd } 3661209962Smm 3662219089Spjd /* 3663219089Spjd * If the boot device is part of a spare vdev then ensure that 3664219089Spjd * we're booting off the active spare. 3665219089Spjd */ 3666219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3667219089Spjd !bvd->vdev_isspare) { 3668219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3669219089Spjd "try booting from '%s'", 3670219089Spjd bvd->vdev_parent-> 3671219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3672219089Spjd error = EINVAL; 3673219089Spjd goto out; 3674219089Spjd } 3675209962Smm 3676219089Spjd error = 0; 3677219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 3678219089Spjdout: 3679219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3680219089Spjd vdev_free(rvd); 3681219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3682209962Smm mutex_exit(&spa_namespace_lock); 3683209962Smm 3684219089Spjd nvlist_free(config); 3685219089Spjd return (error); 3686185029Spjd} 3687185029Spjd 3688219089Spjd#endif 3689219089Spjd#endif /* sun */ 3690219089Spjd 3691209962Smm/* 3692209962Smm * Import a non-root pool into the system. 3693209962Smm */ 3694185029Spjdint 3695219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3696185029Spjd{ 3697209962Smm spa_t *spa; 3698209962Smm char *altroot = NULL; 3699219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 3700219089Spjd zpool_rewind_policy_t policy; 3701219089Spjd uint64_t mode = spa_mode_global; 3702219089Spjd uint64_t readonly = B_FALSE; 3703209962Smm int error; 3704209962Smm nvlist_t *nvroot; 3705209962Smm nvlist_t **spares, **l2cache; 3706209962Smm uint_t nspares, nl2cache; 3707209962Smm 3708209962Smm /* 3709209962Smm * If a pool with this name exists, return failure. 3710209962Smm */ 3711209962Smm mutex_enter(&spa_namespace_lock); 3712219089Spjd if (spa_lookup(pool) != NULL) { 3713209962Smm mutex_exit(&spa_namespace_lock); 3714209962Smm return (EEXIST); 3715209962Smm } 3716209962Smm 3717209962Smm /* 3718209962Smm * Create and initialize the spa structure. 
3719209962Smm */ 3720209962Smm (void) nvlist_lookup_string(props, 3721209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3722219089Spjd (void) nvlist_lookup_uint64(props, 3723219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3724219089Spjd if (readonly) 3725219089Spjd mode = FREAD; 3726219089Spjd spa = spa_add(pool, config, altroot); 3727219089Spjd spa->spa_import_flags = flags; 3728209962Smm 3729209962Smm /* 3730219089Spjd * Verbatim import - Take a pool and insert it into the namespace 3731219089Spjd * as if it had been loaded at boot. 3732219089Spjd */ 3733219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3734219089Spjd if (props != NULL) 3735219089Spjd spa_configfile_set(spa, props, B_FALSE); 3736219089Spjd 3737219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3738219089Spjd 3739219089Spjd mutex_exit(&spa_namespace_lock); 3740219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 3741219089Spjd 3742219089Spjd return (0); 3743219089Spjd } 3744219089Spjd 3745219089Spjd spa_activate(spa, mode); 3746219089Spjd 3747219089Spjd /* 3748209962Smm * Don't start async tasks until we know everything is healthy. 3749209962Smm */ 3750209962Smm spa_async_suspend(spa); 3751209962Smm 3752219089Spjd zpool_get_rewind_policy(config, &policy); 3753219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 3754219089Spjd state = SPA_LOAD_RECOVER; 3755219089Spjd 3756209962Smm /* 3757209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3758209962Smm * because the user-supplied config is actually the one to trust when 3759209962Smm * doing an import. 
3760209962Smm */ 3761219089Spjd if (state != SPA_LOAD_RECOVER) 3762219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3763209962Smm 3764219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3765219089Spjd policy.zrp_request); 3766219089Spjd 3767219089Spjd /* 3768219089Spjd * Propagate anything learned while loading the pool and pass it 3769219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 3770219089Spjd */ 3771219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3772219089Spjd spa->spa_load_info) == 0); 3773219089Spjd 3774209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3775209962Smm /* 3776209962Smm * Toss any existing sparelist, as it doesn't have any validity 3777209962Smm * anymore, and conflicts with spa_has_spare(). 3778209962Smm */ 3779209962Smm if (spa->spa_spares.sav_config) { 3780209962Smm nvlist_free(spa->spa_spares.sav_config); 3781209962Smm spa->spa_spares.sav_config = NULL; 3782209962Smm spa_load_spares(spa); 3783209962Smm } 3784209962Smm if (spa->spa_l2cache.sav_config) { 3785209962Smm nvlist_free(spa->spa_l2cache.sav_config); 3786209962Smm spa->spa_l2cache.sav_config = NULL; 3787209962Smm spa_load_l2cache(spa); 3788209962Smm } 3789209962Smm 3790209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3791209962Smm &nvroot) == 0); 3792209962Smm if (error == 0) 3793209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 3794209962Smm VDEV_ALLOC_SPARE); 3795209962Smm if (error == 0) 3796209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 3797209962Smm VDEV_ALLOC_L2CACHE); 3798209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 3799209962Smm 3800209962Smm if (props != NULL) 3801209962Smm spa_configfile_set(spa, props, B_FALSE); 3802209962Smm 3803209962Smm if (error != 0 || (props && spa_writeable(spa) && 3804209962Smm (error = spa_prop_set(spa, props)))) { 3805209962Smm spa_unload(spa); 3806209962Smm spa_deactivate(spa); 3807209962Smm spa_remove(spa); 3808209962Smm 
mutex_exit(&spa_namespace_lock); 3809209962Smm return (error); 3810209962Smm } 3811209962Smm 3812209962Smm spa_async_resume(spa); 3813209962Smm 3814209962Smm /* 3815209962Smm * Override any spares and level 2 cache devices as specified by 3816209962Smm * the user, as these may have correct device names/devids, etc. 3817209962Smm */ 3818209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3819209962Smm &spares, &nspares) == 0) { 3820209962Smm if (spa->spa_spares.sav_config) 3821209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3822209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3823209962Smm else 3824209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3825209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 3826209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3827209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3828209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3829209962Smm spa_load_spares(spa); 3830209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 3831209962Smm spa->spa_spares.sav_sync = B_TRUE; 3832209962Smm } 3833209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3834209962Smm &l2cache, &nl2cache) == 0) { 3835209962Smm if (spa->spa_l2cache.sav_config) 3836209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3837209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3838209962Smm else 3839209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3840209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 3841209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3842209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3843209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3844209962Smm spa_load_l2cache(spa); 3845209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 3846209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 3847209962Smm } 3848209962Smm 3849219089Spjd /* 3850219089Spjd * Check for any removed devices. 
3851219089Spjd */ 3852219089Spjd if (spa->spa_autoreplace) { 3853219089Spjd spa_aux_check_removed(&spa->spa_spares); 3854219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 3855219089Spjd } 3856219089Spjd 3857209962Smm if (spa_writeable(spa)) { 3858209962Smm /* 3859209962Smm * Update the config cache to include the newly-imported pool. 3860209962Smm */ 3861209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3862209962Smm } 3863209962Smm 3864219089Spjd /* 3865219089Spjd * It's possible that the pool was expanded while it was exported. 3866219089Spjd * We kick off an async task to handle this for us. 3867219089Spjd */ 3868219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3869219089Spjd 3870209962Smm mutex_exit(&spa_namespace_lock); 3871219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 3872209962Smm 3873219089Spjd#ifdef __FreeBSD__ 3874219089Spjd#ifdef _KERNEL 3875219089Spjd zvol_create_minors(pool); 3876219089Spjd#endif 3877219089Spjd#endif 3878209962Smm return (0); 3879185029Spjd} 3880185029Spjd 3881168404Spjdnvlist_t * 3882168404Spjdspa_tryimport(nvlist_t *tryconfig) 3883168404Spjd{ 3884168404Spjd nvlist_t *config = NULL; 3885168404Spjd char *poolname; 3886168404Spjd spa_t *spa; 3887168404Spjd uint64_t state; 3888208443Smm int error; 3889168404Spjd 3890168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3891168404Spjd return (NULL); 3892168404Spjd 3893168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3894168404Spjd return (NULL); 3895168404Spjd 3896168404Spjd /* 3897168404Spjd * Create and initialize the spa structure. 3898168404Spjd */ 3899168404Spjd mutex_enter(&spa_namespace_lock); 3900219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3901209962Smm spa_activate(spa, FREAD); 3902168404Spjd 3903168404Spjd /* 3904168404Spjd * Pass off the heavy lifting to spa_load(). 
3905168404Spjd * Pass TRUE for mosconfig because the user-supplied config 3906168404Spjd * is actually the one to trust when doing an import. 3907168404Spjd */ 3908219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3909168404Spjd 3910168404Spjd /* 3911168404Spjd * If 'tryconfig' was at least parsable, return the current config. 3912168404Spjd */ 3913168404Spjd if (spa->spa_root_vdev != NULL) { 3914168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3915168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3916168404Spjd poolname) == 0); 3917168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3918168404Spjd state) == 0); 3919168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3920168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 3921236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3922236884Smm spa->spa_load_info) == 0); 3923168404Spjd 3924168404Spjd /* 3925185029Spjd * If the bootfs property exists on this pool then we 3926185029Spjd * copy it out so that external consumers can tell which 3927185029Spjd * pools are bootable. 3928168404Spjd */ 3929208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 3930185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3931185029Spjd 3932185029Spjd /* 3933185029Spjd * We have to play games with the name since the 3934185029Spjd * pool was opened as TRYIMPORT_NAME. 
3935185029Spjd */ 3936185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 3937185029Spjd spa->spa_bootfs, tmpname) == 0) { 3938185029Spjd char *cp; 3939185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3940185029Spjd 3941185029Spjd cp = strchr(tmpname, '/'); 3942185029Spjd if (cp == NULL) { 3943185029Spjd (void) strlcpy(dsname, tmpname, 3944185029Spjd MAXPATHLEN); 3945185029Spjd } else { 3946185029Spjd (void) snprintf(dsname, MAXPATHLEN, 3947185029Spjd "%s/%s", poolname, ++cp); 3948185029Spjd } 3949185029Spjd VERIFY(nvlist_add_string(config, 3950185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3951185029Spjd kmem_free(dsname, MAXPATHLEN); 3952185029Spjd } 3953185029Spjd kmem_free(tmpname, MAXPATHLEN); 3954185029Spjd } 3955185029Spjd 3956185029Spjd /* 3957185029Spjd * Add the list of hot spares and level 2 cache devices. 3958185029Spjd */ 3959209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3960168404Spjd spa_add_spares(spa, config); 3961185029Spjd spa_add_l2cache(spa, config); 3962209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3963168404Spjd } 3964168404Spjd 3965168404Spjd spa_unload(spa); 3966168404Spjd spa_deactivate(spa); 3967168404Spjd spa_remove(spa); 3968168404Spjd mutex_exit(&spa_namespace_lock); 3969168404Spjd 3970168404Spjd return (config); 3971168404Spjd} 3972168404Spjd 3973168404Spjd/* 3974168404Spjd * Pool export/destroy 3975168404Spjd * 3976168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 3977168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 3978168404Spjd * update the pool state and sync all the labels to disk, removing the 3979207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3980207670Smm * we don't sync the labels or remove the configuration cache. 
3981168404Spjd */ 3982168404Spjdstatic int 3983185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3984207670Smm boolean_t force, boolean_t hardforce) 3985168404Spjd{ 3986168404Spjd spa_t *spa; 3987168404Spjd 3988168404Spjd if (oldconfig) 3989168404Spjd *oldconfig = NULL; 3990168404Spjd 3991209962Smm if (!(spa_mode_global & FWRITE)) 3992168404Spjd return (EROFS); 3993168404Spjd 3994168404Spjd mutex_enter(&spa_namespace_lock); 3995168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 3996168404Spjd mutex_exit(&spa_namespace_lock); 3997168404Spjd return (ENOENT); 3998168404Spjd } 3999168404Spjd 4000168404Spjd /* 4001168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4002168404Spjd * reacquire the namespace lock, and see if we can export. 4003168404Spjd */ 4004168404Spjd spa_open_ref(spa, FTAG); 4005168404Spjd mutex_exit(&spa_namespace_lock); 4006168404Spjd spa_async_suspend(spa); 4007168404Spjd mutex_enter(&spa_namespace_lock); 4008168404Spjd spa_close(spa, FTAG); 4009168404Spjd 4010168404Spjd /* 4011168404Spjd * The pool will be in core if it's openable, 4012168404Spjd * in which case we can modify its state. 4013168404Spjd */ 4014168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4015168404Spjd /* 4016168404Spjd * Objsets may be open only because they're dirty, so we 4017168404Spjd * have to force it to sync before checking spa_refcnt. 4018168404Spjd */ 4019168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4020168404Spjd 4021168404Spjd /* 4022168404Spjd * A pool cannot be exported or destroyed if there are active 4023168404Spjd * references. If we are resetting a pool, allow references by 4024168404Spjd * fault injection handlers. 
4025168404Spjd */ 4026168404Spjd if (!spa_refcount_zero(spa) || 4027168404Spjd (spa->spa_inject_ref != 0 && 4028168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4029168404Spjd spa_async_resume(spa); 4030168404Spjd mutex_exit(&spa_namespace_lock); 4031168404Spjd return (EBUSY); 4032168404Spjd } 4033168404Spjd 4034185029Spjd /* 4035185029Spjd * A pool cannot be exported if it has an active shared spare. 4036185029Spjd * This is to prevent other pools stealing the active spare 4037185029Spjd * from an exported pool. At user's own will, such pool can 4038185029Spjd * be forcedly exported. 4039185029Spjd */ 4040185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4041185029Spjd spa_has_active_shared_spare(spa)) { 4042185029Spjd spa_async_resume(spa); 4043185029Spjd mutex_exit(&spa_namespace_lock); 4044185029Spjd return (EXDEV); 4045185029Spjd } 4046168404Spjd 4047168404Spjd /* 4048168404Spjd * We want this to be reflected on every label, 4049168404Spjd * so mark them all dirty. spa_unload() will do the 4050168404Spjd * final sync that pushes these changes out. 
4051168404Spjd */ 4052207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4053185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4054168404Spjd spa->spa_state = new_state; 4055219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4056219089Spjd TXG_DEFER_SIZE + 1; 4057168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4058185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4059168404Spjd } 4060168404Spjd } 4061168404Spjd 4062185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4063185029Spjd 4064168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4065168404Spjd spa_unload(spa); 4066168404Spjd spa_deactivate(spa); 4067168404Spjd } 4068168404Spjd 4069168404Spjd if (oldconfig && spa->spa_config) 4070168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4071168404Spjd 4072168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4073207670Smm if (!hardforce) 4074207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4075168404Spjd spa_remove(spa); 4076168404Spjd } 4077168404Spjd mutex_exit(&spa_namespace_lock); 4078168404Spjd 4079168404Spjd return (0); 4080168404Spjd} 4081168404Spjd 4082168404Spjd/* 4083168404Spjd * Destroy a storage pool. 4084168404Spjd */ 4085168404Spjdint 4086168404Spjdspa_destroy(char *pool) 4087168404Spjd{ 4088207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4089207670Smm B_FALSE, B_FALSE)); 4090168404Spjd} 4091168404Spjd 4092168404Spjd/* 4093168404Spjd * Export a storage pool. 4094168404Spjd */ 4095168404Spjdint 4096207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4097207670Smm boolean_t hardforce) 4098168404Spjd{ 4099207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4100207670Smm force, hardforce)); 4101168404Spjd} 4102168404Spjd 4103168404Spjd/* 4104168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4105168404Spjd * from the namespace in any way. 
4106168404Spjd */ 4107168404Spjdint 4108168404Spjdspa_reset(char *pool) 4109168404Spjd{ 4110185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4111207670Smm B_FALSE, B_FALSE)); 4112168404Spjd} 4113168404Spjd 4114168404Spjd/* 4115168404Spjd * ========================================================================== 4116168404Spjd * Device manipulation 4117168404Spjd * ========================================================================== 4118168404Spjd */ 4119168404Spjd 4120168404Spjd/* 4121185029Spjd * Add a device to a storage pool. 4122168404Spjd */ 4123168404Spjdint 4124168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4125168404Spjd{ 4126219089Spjd uint64_t txg, id; 4127209962Smm int error; 4128168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4129168404Spjd vdev_t *vd, *tvd; 4130185029Spjd nvlist_t **spares, **l2cache; 4131185029Spjd uint_t nspares, nl2cache; 4132168404Spjd 4133219089Spjd ASSERT(spa_writeable(spa)); 4134219089Spjd 4135168404Spjd txg = spa_vdev_enter(spa); 4136168404Spjd 4137168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4138168404Spjd VDEV_ALLOC_ADD)) != 0) 4139168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4140168404Spjd 4141185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4142168404Spjd 4143185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4144185029Spjd &nspares) != 0) 4145168404Spjd nspares = 0; 4146168404Spjd 4147185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4148185029Spjd &nl2cache) != 0) 4149185029Spjd nl2cache = 0; 4150185029Spjd 4151185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4152168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4153168404Spjd 4154185029Spjd if (vd->vdev_children != 0 && 4155185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4156185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4157168404Spjd 4158168404Spjd /* 
4159185029Spjd * We must validate the spares and l2cache devices after checking the 4160185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4161168404Spjd */ 4162185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4163168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4164168404Spjd 4165168404Spjd /* 4166168404Spjd * Transfer each new top-level vdev from vd to rvd. 4167168404Spjd */ 4168209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4169219089Spjd 4170219089Spjd /* 4171219089Spjd * Set the vdev id to the first hole, if one exists. 4172219089Spjd */ 4173219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4174219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4175219089Spjd vdev_free(rvd->vdev_child[id]); 4176219089Spjd break; 4177219089Spjd } 4178219089Spjd } 4179168404Spjd tvd = vd->vdev_child[c]; 4180168404Spjd vdev_remove_child(vd, tvd); 4181219089Spjd tvd->vdev_id = id; 4182168404Spjd vdev_add_child(rvd, tvd); 4183168404Spjd vdev_config_dirty(tvd); 4184168404Spjd } 4185168404Spjd 4186168404Spjd if (nspares != 0) { 4187185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4188185029Spjd ZPOOL_CONFIG_SPARES); 4189168404Spjd spa_load_spares(spa); 4190185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4191168404Spjd } 4192168404Spjd 4193185029Spjd if (nl2cache != 0) { 4194185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4195185029Spjd ZPOOL_CONFIG_L2CACHE); 4196185029Spjd spa_load_l2cache(spa); 4197185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4198185029Spjd } 4199185029Spjd 4200168404Spjd /* 4201168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4202168404Spjd * If other threads start allocating from these vdevs before we 4203168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4204168404Spjd * fail to open the pool because there are DVAs that the config cache 4205168404Spjd * can't translate. 
Therefore, we first add the vdevs without 4206168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4207168404Spjd * and then let spa_config_update() initialize the new metaslabs. 4208168404Spjd * 4209168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4210168404Spjd * if we lose power at any point in this sequence, the remaining 4211168404Spjd * steps will be completed the next time we load the pool. 4212168404Spjd */ 4213168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4214168404Spjd 4215168404Spjd mutex_enter(&spa_namespace_lock); 4216168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4217168404Spjd mutex_exit(&spa_namespace_lock); 4218168404Spjd 4219168404Spjd return (0); 4220168404Spjd} 4221168404Spjd 4222168404Spjd/* 4223168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4224168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4225168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4226168404Spjd * 4227168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4228168404Spjd * existing device; in this case the two devices are made into their own 4229185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4230168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4231168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4232168404Spjd * completion of resilvering, the first disk (the one being replaced) 4233168404Spjd * is automatically detached. 
4234168404Spjd */ 4235168404Spjdint 4236168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4237168404Spjd{ 4238219089Spjd uint64_t txg, dtl_max_txg; 4239168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4240168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4241168404Spjd vdev_ops_t *pvops; 4242185029Spjd char *oldvdpath, *newvdpath; 4243185029Spjd int newvd_isspare; 4244185029Spjd int error; 4245168404Spjd 4246219089Spjd ASSERT(spa_writeable(spa)); 4247219089Spjd 4248168404Spjd txg = spa_vdev_enter(spa); 4249168404Spjd 4250185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4251168404Spjd 4252168404Spjd if (oldvd == NULL) 4253168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4254168404Spjd 4255168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4256168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4257168404Spjd 4258168404Spjd pvd = oldvd->vdev_parent; 4259168404Spjd 4260168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4261230514Smm VDEV_ALLOC_ATTACH)) != 0) 4262185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4263185029Spjd 4264185029Spjd if (newrootvd->vdev_children != 1) 4265168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4266168404Spjd 4267168404Spjd newvd = newrootvd->vdev_child[0]; 4268168404Spjd 4269168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4270168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4271168404Spjd 4272168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4273168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4274168404Spjd 4275185029Spjd /* 4276185029Spjd * Spares can't replace logs 4277185029Spjd */ 4278185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4279185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4280185029Spjd 4281168404Spjd if (!replacing) { 4282168404Spjd /* 4283168404Spjd * For attach, the only allowable parent is a mirror or the root 
4284168404Spjd * vdev. 4285168404Spjd */ 4286168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4287168404Spjd pvd->vdev_ops != &vdev_root_ops) 4288168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4289168404Spjd 4290168404Spjd pvops = &vdev_mirror_ops; 4291168404Spjd } else { 4292168404Spjd /* 4293168404Spjd * Active hot spares can only be replaced by inactive hot 4294168404Spjd * spares. 4295168404Spjd */ 4296168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4297219089Spjd oldvd->vdev_isspare && 4298168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4299168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4300168404Spjd 4301168404Spjd /* 4302168404Spjd * If the source is a hot spare, and the parent isn't already a 4303168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4304168404Spjd * want to create a replacing vdev. The user is not allowed to 4305168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4306168404Spjd * the same (spare replaces spare, non-spare replaces 4307168404Spjd * non-spare). 4308168404Spjd */ 4309219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4310219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4311168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4312219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4313219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4314168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4315219089Spjd } 4316219089Spjd 4317219089Spjd if (newvd->vdev_isspare) 4318168404Spjd pvops = &vdev_spare_ops; 4319168404Spjd else 4320168404Spjd pvops = &vdev_replacing_ops; 4321168404Spjd } 4322168404Spjd 4323168404Spjd /* 4324219089Spjd * Make sure the new device is big enough. 
4325168404Spjd */ 4326219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4327168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4328168404Spjd 4329168404Spjd /* 4330168404Spjd * The new device cannot have a higher alignment requirement 4331168404Spjd * than the top-level vdev. 4332168404Spjd */ 4333168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4334168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4335168404Spjd 4336168404Spjd /* 4337168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4338168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4339168404Spjd */ 4340168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4341168404Spjd spa_strfree(oldvd->vdev_path); 4342168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4343168404Spjd KM_SLEEP); 4344168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4345168404Spjd newvd->vdev_path, "old"); 4346168404Spjd if (oldvd->vdev_devid != NULL) { 4347168404Spjd spa_strfree(oldvd->vdev_devid); 4348168404Spjd oldvd->vdev_devid = NULL; 4349168404Spjd } 4350168404Spjd } 4351168404Spjd 4352219089Spjd /* mark the device being resilvered */ 4353219089Spjd newvd->vdev_resilvering = B_TRUE; 4354219089Spjd 4355168404Spjd /* 4356168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4357168404Spjd * mirror/replacing/spare vdev above oldvd. 4358168404Spjd */ 4359168404Spjd if (pvd->vdev_ops != pvops) 4360168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4361168404Spjd 4362168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4363168404Spjd ASSERT(pvd->vdev_ops == pvops); 4364168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4365168404Spjd 4366168404Spjd /* 4367168404Spjd * Extract the new device from its root and add it to pvd. 
4368168404Spjd */ 4369168404Spjd vdev_remove_child(newrootvd, newvd); 4370168404Spjd newvd->vdev_id = pvd->vdev_children; 4371219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4372168404Spjd vdev_add_child(pvd, newvd); 4373168404Spjd 4374168404Spjd tvd = newvd->vdev_top; 4375168404Spjd ASSERT(pvd->vdev_top == tvd); 4376168404Spjd ASSERT(tvd->vdev_parent == rvd); 4377168404Spjd 4378168404Spjd vdev_config_dirty(tvd); 4379168404Spjd 4380168404Spjd /* 4381219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4382219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4383219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 4384168404Spjd */ 4385219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4386168404Spjd 4387219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4388219089Spjd dtl_max_txg - TXG_INITIAL); 4389168404Spjd 4390209962Smm if (newvd->vdev_isspare) { 4391168404Spjd spa_spare_activate(newvd); 4392209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4393209962Smm } 4394209962Smm 4395185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4396185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4397185029Spjd newvd_isspare = newvd->vdev_isspare; 4398168404Spjd 4399168404Spjd /* 4400168404Spjd * Mark newvd's DTL dirty in this txg. 4401168404Spjd */ 4402168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4403168404Spjd 4404219089Spjd /* 4405219089Spjd * Restart the resilver 4406219089Spjd */ 4407219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4408168404Spjd 4409219089Spjd /* 4410219089Spjd * Commit the config 4411219089Spjd */ 4412219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4413185029Spjd 4414219089Spjd spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4415219089Spjd "%s vdev=%s %s vdev=%s", 4416219089Spjd replacing && newvd_isspare ? "spare in" : 4417219089Spjd replacing ? "replace" : "attach", newvdpath, 4418219089Spjd replacing ? 
"for" : "to", oldvdpath); 4419219089Spjd 4420185029Spjd spa_strfree(oldvdpath); 4421185029Spjd spa_strfree(newvdpath); 4422185029Spjd 4423219089Spjd if (spa->spa_bootfs) 4424219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4425168404Spjd 4426168404Spjd return (0); 4427168404Spjd} 4428168404Spjd 4429168404Spjd/* 4430168404Spjd * Detach a device from a mirror or replacing vdev. 4431168404Spjd * If 'replace_done' is specified, only detach if the parent 4432168404Spjd * is a replacing vdev. 4433168404Spjd */ 4434168404Spjdint 4435209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4436168404Spjd{ 4437168404Spjd uint64_t txg; 4438209962Smm int error; 4439168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4440168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4441168404Spjd boolean_t unspare = B_FALSE; 4442168404Spjd uint64_t unspare_guid; 4443219089Spjd char *vdpath; 4444168404Spjd 4445219089Spjd ASSERT(spa_writeable(spa)); 4446219089Spjd 4447168404Spjd txg = spa_vdev_enter(spa); 4448168404Spjd 4449185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4450168404Spjd 4451168404Spjd if (vd == NULL) 4452168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4453168404Spjd 4454168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4455168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4456168404Spjd 4457168404Spjd pvd = vd->vdev_parent; 4458168404Spjd 4459168404Spjd /* 4460209962Smm * If the parent/child relationship is not as expected, don't do it. 4461209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4462209962Smm * vdev that's replacing B with C. The user's intent in replacing 4463209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4464209962Smm * the replace by detaching C, the expected behavior is to end up 4465209962Smm * M(A,B). But suppose that right after deciding to detach C, 4466209962Smm * the replacement of B completes. 
We would have M(A,C), and then 4467209962Smm * ask to detach C, which would leave us with just A -- not what 4468209962Smm * the user wanted. To prevent this, we make sure that the 4469209962Smm * parent/child relationship hasn't changed -- in this example, 4470209962Smm * that C's parent is still the replacing vdev R. 4471209962Smm */ 4472209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4473209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4474209962Smm 4475209962Smm /* 4476219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4477168404Spjd */ 4478219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4479219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4480219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4481168404Spjd 4482168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4483185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4484168404Spjd 4485168404Spjd /* 4486168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4487168404Spjd */ 4488168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4489168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4490168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4491168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4492168404Spjd 4493168404Spjd /* 4494209962Smm * If this device has the only valid copy of some data, 4495209962Smm * we cannot safely detach it. 4496168404Spjd */ 4497209962Smm if (vdev_dtl_required(vd)) 4498168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4499168404Spjd 4500209962Smm ASSERT(pvd->vdev_children >= 2); 4501168404Spjd 4502168404Spjd /* 4503185029Spjd * If we are detaching the second disk from a replacing vdev, then 4504185029Spjd * check to see if we changed the original vdev's path to have "/old" 4505185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
4506168404Spjd */ 4507219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4508219089Spjd vd->vdev_path != NULL) { 4509219089Spjd size_t len = strlen(vd->vdev_path); 4510219089Spjd 4511219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4512219089Spjd cvd = pvd->vdev_child[c]; 4513219089Spjd 4514219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4515219089Spjd continue; 4516219089Spjd 4517219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4518219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4519219089Spjd spa_strfree(cvd->vdev_path); 4520219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4521219089Spjd break; 4522219089Spjd } 4523185029Spjd } 4524185029Spjd } 4525168404Spjd 4526168404Spjd /* 4527168404Spjd * If we are detaching the original disk from a spare, then it implies 4528168404Spjd * that the spare should become a real disk, and be removed from the 4529168404Spjd * active spare list for the pool. 4530168404Spjd */ 4531168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4532219089Spjd vd->vdev_id == 0 && 4533219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4534168404Spjd unspare = B_TRUE; 4535168404Spjd 4536168404Spjd /* 4537168404Spjd * Erase the disk labels so the disk can be used for other things. 4538168404Spjd * This must be done after all other error cases are handled, 4539168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4540168404Spjd * But if we can't do it, don't treat the error as fatal -- 4541168404Spjd * it may be that the unwritability of the disk is the reason 4542168404Spjd * it's being detached! 4543168404Spjd */ 4544168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4545168404Spjd 4546168404Spjd /* 4547168404Spjd * Remove vd from its parent and compact the parent's children. 
4548168404Spjd */ 4549168404Spjd vdev_remove_child(pvd, vd); 4550168404Spjd vdev_compact_children(pvd); 4551168404Spjd 4552168404Spjd /* 4553168404Spjd * Remember one of the remaining children so we can get tvd below. 4554168404Spjd */ 4555219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4556168404Spjd 4557168404Spjd /* 4558168404Spjd * If we need to remove the remaining child from the list of hot spares, 4559209962Smm * do it now, marking the vdev as no longer a spare in the process. 4560209962Smm * We must do this before vdev_remove_parent(), because that can 4561209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 4562209962Smm * reason, we must remove the spare now, in the same txg as the detach; 4563209962Smm * otherwise someone could attach a new sibling, change the GUID, and 4564209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4565168404Spjd */ 4566168404Spjd if (unspare) { 4567168404Spjd ASSERT(cvd->vdev_isspare); 4568168404Spjd spa_spare_remove(cvd); 4569168404Spjd unspare_guid = cvd->vdev_guid; 4570209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4571219089Spjd cvd->vdev_unspare = B_TRUE; 4572168404Spjd } 4573168404Spjd 4574168404Spjd /* 4575168404Spjd * If the parent mirror/replacing vdev only has one child, 4576168404Spjd * the parent is no longer needed. Remove it from the tree. 4577168404Spjd */ 4578219089Spjd if (pvd->vdev_children == 1) { 4579219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 4580219089Spjd cvd->vdev_unspare = B_FALSE; 4581168404Spjd vdev_remove_parent(cvd); 4582219089Spjd cvd->vdev_resilvering = B_FALSE; 4583219089Spjd } 4584168404Spjd 4585219089Spjd 4586168404Spjd /* 4587168404Spjd * We don't set tvd until now because the parent we just removed 4588168404Spjd * may have been the previous top-level vdev. 
4589168404Spjd */ 4590168404Spjd tvd = cvd->vdev_top; 4591168404Spjd ASSERT(tvd->vdev_parent == rvd); 4592168404Spjd 4593168404Spjd /* 4594168404Spjd * Reevaluate the parent vdev state. 4595168404Spjd */ 4596185029Spjd vdev_propagate_state(cvd); 4597168404Spjd 4598168404Spjd /* 4599219089Spjd * If the 'autoexpand' property is set on the pool then automatically 4600219089Spjd * try to expand the size of the pool. For example if the device we 4601219089Spjd * just detached was smaller than the others, it may be possible to 4602219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4603219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 4604168404Spjd */ 4605219089Spjd if (spa->spa_autoexpand) { 4606219089Spjd vdev_reopen(tvd); 4607219089Spjd vdev_expand(tvd, txg); 4608219089Spjd } 4609168404Spjd 4610168404Spjd vdev_config_dirty(tvd); 4611168404Spjd 4612168404Spjd /* 4613168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4614168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 4615168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 4616168404Spjd * prevent vd from being accessed after it's freed. 
4617168404Spjd */ 4618219089Spjd vdpath = spa_strdup(vd->vdev_path); 4619209962Smm for (int t = 0; t < TXG_SIZE; t++) 4620168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4621168404Spjd vd->vdev_detached = B_TRUE; 4622168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 4623168404Spjd 4624185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4625185029Spjd 4626219089Spjd /* hang on to the spa before we release the lock */ 4627219089Spjd spa_open_ref(spa, FTAG); 4628219089Spjd 4629168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 4630168404Spjd 4631219089Spjd spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4632219089Spjd "vdev=%s", vdpath); 4633219089Spjd spa_strfree(vdpath); 4634219089Spjd 4635168404Spjd /* 4636168404Spjd * If this was the removal of the original device in a hot spare vdev, 4637168404Spjd * then we want to go through and remove the device from the hot spare 4638168404Spjd * list of every other pool. 4639168404Spjd */ 4640168404Spjd if (unspare) { 4641219089Spjd spa_t *altspa = NULL; 4642219089Spjd 4643168404Spjd mutex_enter(&spa_namespace_lock); 4644219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 4645219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 4646219089Spjd altspa == spa) 4647168404Spjd continue; 4648219089Spjd 4649219089Spjd spa_open_ref(altspa, FTAG); 4650185029Spjd mutex_exit(&spa_namespace_lock); 4651219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4652185029Spjd mutex_enter(&spa_namespace_lock); 4653219089Spjd spa_close(altspa, FTAG); 4654168404Spjd } 4655168404Spjd mutex_exit(&spa_namespace_lock); 4656219089Spjd 4657219089Spjd /* search the rest of the vdevs for spares to remove */ 4658219089Spjd spa_vdev_resilver_done(spa); 4659168404Spjd } 4660168404Spjd 4661219089Spjd /* all done with the spa; OK to release */ 4662219089Spjd mutex_enter(&spa_namespace_lock); 4663219089Spjd spa_close(spa, FTAG); 4664219089Spjd mutex_exit(&spa_namespace_lock); 4665219089Spjd 4666168404Spjd 
return (error); 4667168404Spjd} 4668168404Spjd 4669219089Spjd/* 4670219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 4671219089Spjd */ 4672219089Spjdint 4673219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4674219089Spjd nvlist_t *props, boolean_t exp) 4675219089Spjd{ 4676219089Spjd int error = 0; 4677219089Spjd uint64_t txg, *glist; 4678219089Spjd spa_t *newspa; 4679219089Spjd uint_t c, children, lastlog; 4680219089Spjd nvlist_t **child, *nvl, *tmp; 4681219089Spjd dmu_tx_t *tx; 4682219089Spjd char *altroot = NULL; 4683219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4684219089Spjd boolean_t activate_slog; 4685219089Spjd 4686219089Spjd ASSERT(spa_writeable(spa)); 4687219089Spjd 4688219089Spjd txg = spa_vdev_enter(spa); 4689219089Spjd 4690219089Spjd /* clear the log and flush everything up to now */ 4691219089Spjd activate_slog = spa_passivate_log(spa); 4692219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4693219089Spjd error = spa_offline_log(spa); 4694219089Spjd txg = spa_vdev_config_enter(spa); 4695219089Spjd 4696219089Spjd if (activate_slog) 4697219089Spjd spa_activate_log(spa); 4698219089Spjd 4699219089Spjd if (error != 0) 4700219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4701219089Spjd 4702219089Spjd /* check new spa name before going any further */ 4703219089Spjd if (spa_lookup(newname) != NULL) 4704219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4705219089Spjd 4706219089Spjd /* 4707219089Spjd * scan through all the children to ensure they're all mirrors 4708219089Spjd */ 4709219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4710219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4711219089Spjd &children) != 0) 4712219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4713219089Spjd 4714219089Spjd /* first, check to ensure we've got the right child count */ 4715219089Spjd rvd = 
spa->spa_root_vdev; 4716219089Spjd lastlog = 0; 4717219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 4718219089Spjd vdev_t *vd = rvd->vdev_child[c]; 4719219089Spjd 4720219089Spjd /* don't count the holes & logs as children */ 4721219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 4722219089Spjd if (lastlog == 0) 4723219089Spjd lastlog = c; 4724219089Spjd continue; 4725219089Spjd } 4726219089Spjd 4727219089Spjd lastlog = 0; 4728219089Spjd } 4729219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4730219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4731219089Spjd 4732219089Spjd /* next, ensure no spare or cache devices are part of the split */ 4733219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4734219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4735219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4736219089Spjd 4737219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4738219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4739219089Spjd 4740219089Spjd /* then, loop over each vdev and validate it */ 4741219089Spjd for (c = 0; c < children; c++) { 4742219089Spjd uint64_t is_hole = 0; 4743219089Spjd 4744219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4745219089Spjd &is_hole); 4746219089Spjd 4747219089Spjd if (is_hole != 0) { 4748219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4749219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4750219089Spjd continue; 4751219089Spjd } else { 4752219089Spjd error = EINVAL; 4753219089Spjd break; 4754219089Spjd } 4755219089Spjd } 4756219089Spjd 4757219089Spjd /* which disk is going to be split? 
*/ 4758219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4759219089Spjd &glist[c]) != 0) { 4760219089Spjd error = EINVAL; 4761219089Spjd break; 4762219089Spjd } 4763219089Spjd 4764219089Spjd /* look it up in the spa */ 4765219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4766219089Spjd if (vml[c] == NULL) { 4767219089Spjd error = ENODEV; 4768219089Spjd break; 4769219089Spjd } 4770219089Spjd 4771219089Spjd /* make sure there's nothing stopping the split */ 4772219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4773219089Spjd vml[c]->vdev_islog || 4774219089Spjd vml[c]->vdev_ishole || 4775219089Spjd vml[c]->vdev_isspare || 4776219089Spjd vml[c]->vdev_isl2cache || 4777219089Spjd !vdev_writeable(vml[c]) || 4778219089Spjd vml[c]->vdev_children != 0 || 4779219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4780219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4781219089Spjd error = EINVAL; 4782219089Spjd break; 4783219089Spjd } 4784219089Spjd 4785219089Spjd if (vdev_dtl_required(vml[c])) { 4786219089Spjd error = EBUSY; 4787219089Spjd break; 4788219089Spjd } 4789219089Spjd 4790219089Spjd /* we need certain info from the top level */ 4791219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4792219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 4793219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4794219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 4795219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4796219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 4797219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4798219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 4799219089Spjd } 4800219089Spjd 4801219089Spjd if (error != 0) { 4802219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 4803219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 4804219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4805219089Spjd } 
4806219089Spjd 4807219089Spjd /* stop writers from using the disks */ 4808219089Spjd for (c = 0; c < children; c++) { 4809219089Spjd if (vml[c] != NULL) 4810219089Spjd vml[c]->vdev_offline = B_TRUE; 4811219089Spjd } 4812219089Spjd vdev_reopen(spa->spa_root_vdev); 4813219089Spjd 4814219089Spjd /* 4815219089Spjd * Temporarily record the splitting vdevs in the spa config. This 4816219089Spjd * will disappear once the config is regenerated. 4817219089Spjd */ 4818219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4819219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4820219089Spjd glist, children) == 0); 4821219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 4822219089Spjd 4823219089Spjd mutex_enter(&spa->spa_props_lock); 4824219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4825219089Spjd nvl) == 0); 4826219089Spjd mutex_exit(&spa->spa_props_lock); 4827219089Spjd spa->spa_config_splitting = nvl; 4828219089Spjd vdev_config_dirty(spa->spa_root_vdev); 4829219089Spjd 4830219089Spjd /* configure and create the new pool */ 4831219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4832219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4833219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4834219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4835219089Spjd spa_version(spa)) == 0); 4836219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4837219089Spjd spa->spa_config_txg) == 0); 4838219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4839219089Spjd spa_generate_guid(NULL)) == 0); 4840219089Spjd (void) nvlist_lookup_string(props, 4841219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4842219089Spjd 4843219089Spjd /* add the new pool to the namespace */ 4844219089Spjd newspa = spa_add(newname, config, altroot); 4845219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 4846219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 4847219089Spjd 4848219089Spjd /* release the spa config lock, retaining the namespace lock */ 4849219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4850219089Spjd 4851219089Spjd if (zio_injection_enabled) 4852219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 4853219089Spjd 4854219089Spjd spa_activate(newspa, spa_mode_global); 4855219089Spjd spa_async_suspend(newspa); 4856219089Spjd 4857219089Spjd#ifndef sun 4858219089Spjd /* mark that we are creating new spa by splitting */ 4859219089Spjd newspa->spa_splitting_newspa = B_TRUE; 4860219089Spjd#endif 4861219089Spjd /* create the new pool from the disks of the original pool */ 4862219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4863219089Spjd#ifndef sun 4864219089Spjd newspa->spa_splitting_newspa = B_FALSE; 4865219089Spjd#endif 4866219089Spjd if (error) 4867219089Spjd goto out; 4868219089Spjd 4869219089Spjd /* if that worked, generate a real config for the new pool */ 4870219089Spjd if (newspa->spa_root_vdev != NULL) { 4871219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4872219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 4873219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4874219089Spjd 
ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4875219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4876219089Spjd B_TRUE)); 4877219089Spjd } 4878219089Spjd 4879219089Spjd /* set the props */ 4880219089Spjd if (props != NULL) { 4881219089Spjd spa_configfile_set(newspa, props, B_FALSE); 4882219089Spjd error = spa_prop_set(newspa, props); 4883219089Spjd if (error) 4884219089Spjd goto out; 4885219089Spjd } 4886219089Spjd 4887219089Spjd /* flush everything */ 4888219089Spjd txg = spa_vdev_config_enter(newspa); 4889219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 4890219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4891219089Spjd 4892219089Spjd if (zio_injection_enabled) 4893219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 4894219089Spjd 4895219089Spjd spa_async_resume(newspa); 4896219089Spjd 4897219089Spjd /* finally, update the original pool's config */ 4898219089Spjd txg = spa_vdev_config_enter(spa); 4899219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4900219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 4901219089Spjd if (error != 0) 4902219089Spjd dmu_tx_abort(tx); 4903219089Spjd for (c = 0; c < children; c++) { 4904219089Spjd if (vml[c] != NULL) { 4905219089Spjd vdev_split(vml[c]); 4906219089Spjd if (error == 0) 4907219089Spjd spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4908219089Spjd spa, tx, "vdev=%s", 4909219089Spjd vml[c]->vdev_path); 4910219089Spjd vdev_free(vml[c]); 4911219089Spjd } 4912219089Spjd } 4913219089Spjd vdev_config_dirty(spa->spa_root_vdev); 4914219089Spjd spa->spa_config_splitting = NULL; 4915219089Spjd nvlist_free(nvl); 4916219089Spjd if (error == 0) 4917219089Spjd dmu_tx_commit(tx); 4918219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 4919219089Spjd 4920219089Spjd if (zio_injection_enabled) 4921219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 4922219089Spjd 4923219089Spjd /* split is complete; log a history record */ 4924219089Spjd 
spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4925219089Spjd "split new pool %s from pool %s", newname, spa_name(spa)); 4926219089Spjd 4927219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 4928219089Spjd 4929219089Spjd /* if we're not going to mount the filesystems in userland, export */ 4930219089Spjd if (exp) 4931219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4932219089Spjd B_FALSE, B_FALSE); 4933219089Spjd 4934219089Spjd return (error); 4935219089Spjd 4936219089Spjdout: 4937219089Spjd spa_unload(newspa); 4938219089Spjd spa_deactivate(newspa); 4939219089Spjd spa_remove(newspa); 4940219089Spjd 4941219089Spjd txg = spa_vdev_config_enter(spa); 4942219089Spjd 4943219089Spjd /* re-online all offlined disks */ 4944219089Spjd for (c = 0; c < children; c++) { 4945219089Spjd if (vml[c] != NULL) 4946219089Spjd vml[c]->vdev_offline = B_FALSE; 4947219089Spjd } 4948219089Spjd vdev_reopen(spa->spa_root_vdev); 4949219089Spjd 4950219089Spjd nvlist_free(spa->spa_config_splitting); 4951219089Spjd spa->spa_config_splitting = NULL; 4952219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 4953219089Spjd 4954219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 4955219089Spjd return (error); 4956219089Spjd} 4957219089Spjd 4958185029Spjdstatic nvlist_t * 4959185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4960185029Spjd{ 4961185029Spjd for (int i = 0; i < count; i++) { 4962185029Spjd uint64_t guid; 4963185029Spjd 4964185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4965185029Spjd &guid) == 0); 4966185029Spjd 4967185029Spjd if (guid == target_guid) 4968185029Spjd return (nvpp[i]); 4969185029Spjd } 4970185029Spjd 4971185029Spjd return (NULL); 4972185029Spjd} 4973185029Spjd 4974185029Spjdstatic void 4975185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4976185029Spjd nvlist_t *dev_to_remove) 4977185029Spjd{ 4978185029Spjd nvlist_t **newdev = NULL; 
4979185029Spjd 4980185029Spjd if (count > 1) 4981185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4982185029Spjd 4983185029Spjd for (int i = 0, j = 0; i < count; i++) { 4984185029Spjd if (dev[i] == dev_to_remove) 4985185029Spjd continue; 4986185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4987185029Spjd } 4988185029Spjd 4989185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4990185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4991185029Spjd 4992185029Spjd for (int i = 0; i < count - 1; i++) 4993185029Spjd nvlist_free(newdev[i]); 4994185029Spjd 4995185029Spjd if (count > 1) 4996185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 4997185029Spjd} 4998185029Spjd 4999168404Spjd/* 5000219089Spjd * Evacuate the device. 5001219089Spjd */ 5002219089Spjdstatic int 5003219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5004219089Spjd{ 5005219089Spjd uint64_t txg; 5006219089Spjd int error = 0; 5007219089Spjd 5008219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5009219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5010219089Spjd ASSERT(vd == vd->vdev_top); 5011219089Spjd 5012219089Spjd /* 5013219089Spjd * Evacuate the device. We don't hold the config lock as writer 5014219089Spjd * since we need to do I/O but we do keep the 5015219089Spjd * spa_namespace_lock held. Once this completes the device 5016219089Spjd * should no longer have any blocks allocated on it. 5017219089Spjd */ 5018219089Spjd if (vd->vdev_islog) { 5019219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5020219089Spjd error = spa_offline_log(spa); 5021219089Spjd } else { 5022219089Spjd error = ENOTSUP; 5023219089Spjd } 5024219089Spjd 5025219089Spjd if (error) 5026219089Spjd return (error); 5027219089Spjd 5028219089Spjd /* 5029219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5030219089Spjd * associated with this vdev, and wait for these changes to sync. 
5031219089Spjd */ 5032219089Spjd ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 5033219089Spjd txg = spa_vdev_config_enter(spa); 5034219089Spjd vd->vdev_removing = B_TRUE; 5035219089Spjd vdev_dirty(vd, 0, NULL, txg); 5036219089Spjd vdev_config_dirty(vd); 5037219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5038219089Spjd 5039219089Spjd return (0); 5040219089Spjd} 5041219089Spjd 5042219089Spjd/* 5043219089Spjd * Complete the removal by cleaning up the namespace. 5044219089Spjd */ 5045219089Spjdstatic void 5046219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5047219089Spjd{ 5048219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5049219089Spjd uint64_t id = vd->vdev_id; 5050219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5051219089Spjd 5052219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5053219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5054219089Spjd ASSERT(vd == vd->vdev_top); 5055219089Spjd 5056219089Spjd /* 5057219089Spjd * Only remove any devices which are empty. 5058219089Spjd */ 5059219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5060219089Spjd return; 5061219089Spjd 5062219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5063219089Spjd 5064219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5065219089Spjd vdev_state_clean(vd); 5066219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5067219089Spjd vdev_config_clean(vd); 5068219089Spjd 5069219089Spjd vdev_free(vd); 5070219089Spjd 5071219089Spjd if (last_vdev) { 5072219089Spjd vdev_compact_children(rvd); 5073219089Spjd } else { 5074219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5075219089Spjd vdev_add_child(rvd, vd); 5076219089Spjd } 5077219089Spjd vdev_config_dirty(rvd); 5078219089Spjd 5079219089Spjd /* 5080219089Spjd * Reassess the health of our root vdev. 
5081219089Spjd */ 5082219089Spjd vdev_reopen(rvd); 5083219089Spjd} 5084219089Spjd 5085219089Spjd/* 5086219089Spjd * Remove a device from the pool - 5087219089Spjd * 5088219089Spjd * Removing a device from the vdev namespace requires several steps 5089219089Spjd * and can take a significant amount of time. As a result we use 5090219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5091219089Spjd * grab and release the spa_config_lock while still holding the namespace 5092219089Spjd * lock. During each step the configuration is synced out. 5093219089Spjd */ 5094219089Spjd 5095219089Spjd/* 5096168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 5097219089Spjd * spares, slogs, and level 2 ARC devices. 5098168404Spjd */ 5099168404Spjdint 5100168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5101168404Spjd{ 5102168404Spjd vdev_t *vd; 5103219089Spjd metaslab_group_t *mg; 5104185029Spjd nvlist_t **spares, **l2cache, *nv; 5105219089Spjd uint64_t txg = 0; 5106185029Spjd uint_t nspares, nl2cache; 5107185029Spjd int error = 0; 5108209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5109168404Spjd 5110219089Spjd ASSERT(spa_writeable(spa)); 5111219089Spjd 5112209962Smm if (!locked) 5113209962Smm txg = spa_vdev_enter(spa); 5114168404Spjd 5115185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5116168404Spjd 5117185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5118185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5119185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5120185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5121185029Spjd /* 5122185029Spjd * Only remove the hot spare if it's not currently in use 5123185029Spjd * in this pool. 
5124185029Spjd */ 5125185029Spjd if (vd == NULL || unspare) { 5126185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5127185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5128185029Spjd spa_load_spares(spa); 5129185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5130185029Spjd } else { 5131185029Spjd error = EBUSY; 5132168404Spjd } 5133185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5134185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5135185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5136185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5137185029Spjd /* 5138185029Spjd * Cache devices can always be removed. 5139185029Spjd */ 5140185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5141185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5142185029Spjd spa_load_l2cache(spa); 5143185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5144219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5145219089Spjd ASSERT(!locked); 5146219089Spjd ASSERT(vd == vd->vdev_top); 5147219089Spjd 5148219089Spjd /* 5149219089Spjd * XXX - Once we have bp-rewrite this should 5150219089Spjd * become the common case. 5151219089Spjd */ 5152219089Spjd 5153219089Spjd mg = vd->vdev_mg; 5154219089Spjd 5155219089Spjd /* 5156219089Spjd * Stop allocating from this vdev. 5157219089Spjd */ 5158219089Spjd metaslab_group_passivate(mg); 5159219089Spjd 5160219089Spjd /* 5161219089Spjd * Wait for the youngest allocations and frees to sync, 5162219089Spjd * and then wait for the deferral of those frees to finish. 5163219089Spjd */ 5164219089Spjd spa_vdev_config_exit(spa, NULL, 5165219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5166219089Spjd 5167219089Spjd /* 5168219089Spjd * Attempt to evacuate the vdev. 
5169219089Spjd */ 5170219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5171219089Spjd 5172219089Spjd txg = spa_vdev_config_enter(spa); 5173219089Spjd 5174219089Spjd /* 5175219089Spjd * If we couldn't evacuate the vdev, unwind. 5176219089Spjd */ 5177219089Spjd if (error) { 5178219089Spjd metaslab_group_activate(mg); 5179219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5180219089Spjd } 5181219089Spjd 5182219089Spjd /* 5183219089Spjd * Clean up the vdev namespace. 5184219089Spjd */ 5185219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5186219089Spjd 5187185029Spjd } else if (vd != NULL) { 5188185029Spjd /* 5189185029Spjd * Normal vdevs cannot be removed (yet). 5190185029Spjd */ 5191185029Spjd error = ENOTSUP; 5192168404Spjd } else { 5193185029Spjd /* 5194185029Spjd * There is no vdev of any kind with the specified guid. 5195185029Spjd */ 5196185029Spjd error = ENOENT; 5197168404Spjd } 5198168404Spjd 5199209962Smm if (!locked) 5200209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5201209962Smm 5202209962Smm return (error); 5203168404Spjd} 5204168404Spjd 5205168404Spjd/* 5206185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5207185029Spjd * currently spared, so we can detach it. 5208168404Spjd */ 5209168404Spjdstatic vdev_t * 5210185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5211168404Spjd{ 5212168404Spjd vdev_t *newvd, *oldvd; 5213168404Spjd 5214219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5215185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5216168404Spjd if (oldvd != NULL) 5217168404Spjd return (oldvd); 5218168404Spjd } 5219168404Spjd 5220185029Spjd /* 5221219089Spjd * Check for a completed replacement. We always consider the first 5222219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5223219089Spjd * the newest (see spa_vdev_attach() for how that works).
In 5224219089Spjd * the case where the newest vdev is faulted, we will not automatically 5225219089Spjd * remove it after a resilver completes. This is OK as it will require 5226219089Spjd * user intervention to determine which disk the admin wishes to keep. 5227185029Spjd */ 5228219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5229219089Spjd ASSERT(vd->vdev_children > 1); 5230219089Spjd 5231219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5232168404Spjd oldvd = vd->vdev_child[0]; 5233168404Spjd 5234209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5235219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5236209962Smm !vdev_dtl_required(oldvd)) 5237168404Spjd return (oldvd); 5238168404Spjd } 5239168404Spjd 5240185029Spjd /* 5241185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5242185029Spjd */ 5243219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5244219089Spjd vdev_t *first = vd->vdev_child[0]; 5245219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5246185029Spjd 5247219089Spjd if (last->vdev_unspare) { 5248219089Spjd oldvd = first; 5249219089Spjd newvd = last; 5250219089Spjd } else if (first->vdev_unspare) { 5251219089Spjd oldvd = last; 5252219089Spjd newvd = first; 5253219089Spjd } else { 5254219089Spjd oldvd = NULL; 5255219089Spjd } 5256219089Spjd 5257219089Spjd if (oldvd != NULL && 5258209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5259219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5260219089Spjd !vdev_dtl_required(oldvd)) 5261185029Spjd return (oldvd); 5262219089Spjd 5263219089Spjd /* 5264219089Spjd * If there are more than two spares attached to a disk, 5265219089Spjd * and those spares are not required, then we want to 5266219089Spjd * attempt to free them up now so that they can be used 5267219089Spjd * by other pools. Once we're back down to a single 5268219089Spjd * disk+spare, we stop removing them. 
5269219089Spjd */ 5270219089Spjd if (vd->vdev_children > 2) { 5271219089Spjd newvd = vd->vdev_child[1]; 5272219089Spjd 5273219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5274219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5275219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5276219089Spjd !vdev_dtl_required(newvd)) 5277219089Spjd return (newvd); 5278185029Spjd } 5279185029Spjd } 5280185029Spjd 5281168404Spjd return (NULL); 5282168404Spjd} 5283168404Spjd 5284168404Spjdstatic void 5285185029Spjdspa_vdev_resilver_done(spa_t *spa) 5286168404Spjd{ 5287209962Smm vdev_t *vd, *pvd, *ppvd; 5288209962Smm uint64_t guid, sguid, pguid, ppguid; 5289168404Spjd 5290209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5291168404Spjd 5292185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5293209962Smm pvd = vd->vdev_parent; 5294209962Smm ppvd = pvd->vdev_parent; 5295168404Spjd guid = vd->vdev_guid; 5296209962Smm pguid = pvd->vdev_guid; 5297209962Smm ppguid = ppvd->vdev_guid; 5298209962Smm sguid = 0; 5299168404Spjd /* 5300168404Spjd * If we have just finished replacing a hot spared device, then 5301168404Spjd * we need to detach the parent's first child (the original hot 5302168404Spjd * spare) as well. 
5303168404Spjd */ 5304219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5305219089Spjd ppvd->vdev_children == 2) { 5306168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5307209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5308168404Spjd } 5309209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5310209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5311168404Spjd return; 5312209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5313168404Spjd return; 5314209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5315168404Spjd } 5316168404Spjd 5317209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5318168404Spjd} 5319168404Spjd 5320168404Spjd/* 5321219089Spjd * Update the stored path or FRU for this vdev. 5322168404Spjd */ 5323168404Spjdint 5324209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5325209962Smm boolean_t ispath) 5326168404Spjd{ 5327185029Spjd vdev_t *vd; 5328219089Spjd boolean_t sync = B_FALSE; 5329168404Spjd 5330219089Spjd ASSERT(spa_writeable(spa)); 5331168404Spjd 5332219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5333219089Spjd 5334209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5335219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5336168404Spjd 5337168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5338219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5339168404Spjd 5340209962Smm if (ispath) { 5341219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5342219089Spjd spa_strfree(vd->vdev_path); 5343219089Spjd vd->vdev_path = spa_strdup(value); 5344219089Spjd sync = B_TRUE; 5345219089Spjd } 5346209962Smm } else { 5347219089Spjd if (vd->vdev_fru == NULL) { 5348219089Spjd vd->vdev_fru = spa_strdup(value); 5349219089Spjd sync = B_TRUE; 5350219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5351209962Smm spa_strfree(vd->vdev_fru); 5352219089Spjd vd->vdev_fru = spa_strdup(value); 5353219089Spjd sync = B_TRUE; 5354219089Spjd } 5355209962Smm } 
5356168404Spjd 5357219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5358168404Spjd} 5359168404Spjd /* Set the stored path of the vdev identified by guid to newpath; a thin wrapper that calls spa_vdev_set_common() with ispath = B_TRUE and returns its result. */ 5360209962Smmint 5361209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5362209962Smm{ 5363209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5364209962Smm} 5365209962Smm /* Set the stored FRU of the vdev identified by guid to newfru; a thin wrapper that calls spa_vdev_set_common() with ispath = B_FALSE and returns its result. */ 5366209962Smmint 5367209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5368209962Smm{ 5369209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5370209962Smm} 5371209962Smm 5372168404Spjd/* 5373168404Spjd * ========================================================================== 5374219089Spjd * SPA Scanning 5375168404Spjd * ========================================================================== 5376168404Spjd */ 5377168404Spjd /* Cancel the current scan. Returns EBUSY without cancelling when dsl_scan_resilvering() reports a resilver; otherwise returns whatever dsl_scan_cancel() returns. The caller must not hold the config lock as writer (asserted). */ 5378168404Spjdint 5379219089Spjdspa_scan_stop(spa_t *spa) 5380168404Spjd{ 5381185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5382219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5383219089Spjd return (EBUSY); 5384219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5385219089Spjd} 5386168404Spjd /* Start a scan of type func. Returns ENOTSUP for POOL_SCAN_NONE or any value >= POOL_SCAN_FUNCS; a resilver request with nothing to resilver only queues SPA_ASYNC_RESILVER_DONE and returns 0; everything else is handed to dsl_scan(). The caller must not hold the config lock as writer (asserted). */ 5387219089Spjdint 5388219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5389219089Spjd{ 5390219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5391219089Spjd 5392219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5393168404Spjd return (ENOTSUP); 5394168404Spjd 5395168404Spjd /* 5396185029Spjd * If a resilver was requested, but there is no DTL on a 5397185029Spjd * writeable leaf device, we have nothing to do.
5398168404Spjd */ 5399219089Spjd if (func == POOL_SCAN_RESILVER && 5400185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5401185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5402168404Spjd return (0); 5403168404Spjd } 5404168404Spjd 5405219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5406168404Spjd} 5407168404Spjd 5408168404Spjd/* 5409168404Spjd * ========================================================================== 5410168404Spjd * SPA async task processing 5411168404Spjd * ========================================================================== 5412168404Spjd */ 5413168404Spjd 5414168404Spjdstatic void 5415185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5416168404Spjd{ 5417185029Spjd if (vd->vdev_remove_wanted) { 5418219089Spjd vd->vdev_remove_wanted = B_FALSE; 5419219089Spjd vd->vdev_delayed_close = B_FALSE; 5420185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5421209962Smm 5422209962Smm /* 5423209962Smm * We want to clear the stats, but we don't want to do a full 5424209962Smm * vdev_clear() as that will cause us to throw away 5425209962Smm * degraded/faulted state as well as attempt to reopen the 5426209962Smm * device, all of which is a waste. 
5427209962Smm */ 5428209962Smm vd->vdev_stat.vs_read_errors = 0; 5429209962Smm vd->vdev_stat.vs_write_errors = 0; 5430209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5431209962Smm 5432185029Spjd vdev_state_dirty(vd->vdev_top); 5433185029Spjd } 5434168404Spjd 5435185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5436185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5437185029Spjd} 5438168404Spjd 5439185029Spjdstatic void 5440185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5441185029Spjd{ 5442185029Spjd if (vd->vdev_probe_wanted) { 5443219089Spjd vd->vdev_probe_wanted = B_FALSE; 5444185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5445168404Spjd } 5446168404Spjd 5447185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5448185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5449168404Spjd} 5450168404Spjd 5451168404Spjdstatic void 5452219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5453219089Spjd{ 5454219089Spjd sysevent_id_t eid; 5455219089Spjd nvlist_t *attr; 5456219089Spjd char *physpath; 5457219089Spjd 5458219089Spjd if (!spa->spa_autoexpand) 5459219089Spjd return; 5460219089Spjd 5461219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5462219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5463219089Spjd spa_async_autoexpand(spa, cvd); 5464219089Spjd } 5465219089Spjd 5466219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5467219089Spjd return; 5468219089Spjd 5469219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5470219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5471219089Spjd 5472219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5473219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5474219089Spjd 5475219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5476219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5477219089Spjd 5478219089Spjd nvlist_free(attr); 5479219089Spjd kmem_free(physpath, 
MAXPATHLEN); 5480219089Spjd} 5481219089Spjd 5482219089Spjdstatic void 5483168404Spjdspa_async_thread(void *arg) 5484168404Spjd{ 5485168404Spjd spa_t *spa = arg; 5486168404Spjd int tasks; 5487168404Spjd 5488168404Spjd ASSERT(spa->spa_sync_on); 5489168404Spjd 5490168404Spjd mutex_enter(&spa->spa_async_lock); 5491168404Spjd tasks = spa->spa_async_tasks; 5492168404Spjd spa->spa_async_tasks = 0; 5493168404Spjd mutex_exit(&spa->spa_async_lock); 5494168404Spjd 5495168404Spjd /* 5496168404Spjd * See if the config needs to be updated. 5497168404Spjd */ 5498168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5499219089Spjd uint64_t old_space, new_space; 5500219089Spjd 5501168404Spjd mutex_enter(&spa_namespace_lock); 5502219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5503168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5504219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5505168404Spjd mutex_exit(&spa_namespace_lock); 5506219089Spjd 5507219089Spjd /* 5508219089Spjd * If the pool grew as a result of the config update, 5509219089Spjd * then log an internal history event. 5510219089Spjd */ 5511219089Spjd if (new_space != old_space) { 5512219089Spjd spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5513219089Spjd spa, NULL, 5514219089Spjd "pool '%s' size: %llu(+%llu)", 5515219089Spjd spa_name(spa), new_space, new_space - old_space); 5516219089Spjd } 5517168404Spjd } 5518168404Spjd 5519168404Spjd /* 5520185029Spjd * See if any devices need to be marked REMOVED. 
5521168404Spjd */ 5522185029Spjd if (tasks & SPA_ASYNC_REMOVE) { 5523219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5524185029Spjd spa_async_remove(spa, spa->spa_root_vdev); 5525185029Spjd for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5526185029Spjd spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5527185029Spjd for (int i = 0; i < spa->spa_spares.sav_count; i++) 5528185029Spjd spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5529185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5530185029Spjd } 5531168404Spjd 5532219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5533219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5534219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5535219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5536219089Spjd } 5537219089Spjd 5538168404Spjd /* 5539185029Spjd * See if any devices need to be probed. 5540168404Spjd */ 5541185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5542219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5543185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5544185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5545185029Spjd } 5546168404Spjd 5547168404Spjd /* 5548185029Spjd * If any devices are done replacing, detach them. 5549168404Spjd */ 5550185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5551185029Spjd spa_vdev_resilver_done(spa); 5552168404Spjd 5553168404Spjd /* 5554168404Spjd * Kick off a resilver. 5555168404Spjd */ 5556168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5557219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5558168404Spjd 5559168404Spjd /* 5560168404Spjd * Let the world know that we're done. 
5561168404Spjd */ 5562168404Spjd mutex_enter(&spa->spa_async_lock); 5563168404Spjd spa->spa_async_thread = NULL; 5564168404Spjd cv_broadcast(&spa->spa_async_cv); 5565168404Spjd mutex_exit(&spa->spa_async_lock); 5566168404Spjd thread_exit(); 5567168404Spjd} 5568168404Spjd /* Increment the suspend count, then wait on spa_async_cv until spa_async_thread is NULL, i.e. until no async thread is recorded for this pool. */ 5569168404Spjdvoid 5570168404Spjdspa_async_suspend(spa_t *spa) 5571168404Spjd{ 5572168404Spjd mutex_enter(&spa->spa_async_lock); 5573168404Spjd spa->spa_async_suspended++; 5574168404Spjd while (spa->spa_async_thread != NULL) 5575168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5576168404Spjd mutex_exit(&spa->spa_async_lock); 5577168404Spjd} 5578168404Spjd /* Drop one suspend count taken by spa_async_suspend(); asserts that the count is non-zero. Does not start a thread itself. */ 5579168404Spjdvoid 5580168404Spjdspa_async_resume(spa_t *spa) 5581168404Spjd{ 5582168404Spjd mutex_enter(&spa->spa_async_lock); 5583168404Spjd ASSERT(spa->spa_async_suspended != 0); 5584168404Spjd spa->spa_async_suspended--; 5585168404Spjd mutex_exit(&spa->spa_async_lock); 5586168404Spjd} 5587168404Spjd /* Create the spa_async_thread() worker, but only when tasks are pending, the suspend count is zero, no worker is already recorded, and rootdir exists and is not read-only. */ 5588168404Spjdstatic void 5589168404Spjdspa_async_dispatch(spa_t *spa) 5590168404Spjd{ 5591168404Spjd mutex_enter(&spa->spa_async_lock); 5592168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 5593168404Spjd spa->spa_async_thread == NULL && 5594168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 5595168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 5596168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5597168404Spjd mutex_exit(&spa->spa_async_lock); 5598168404Spjd} 5599168404Spjd /* Record task (a mask of SPA_ASYNC_* bits) in spa_async_tasks under spa_async_lock and emit a debug message. Nothing is dispatched here. */ 5600168404Spjdvoid 5601168404Spjdspa_async_request(spa_t *spa, int task) 5602168404Spjd{ 5603219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5604168404Spjd mutex_enter(&spa->spa_async_lock); 5605168404Spjd spa->spa_async_tasks |= task; 5606168404Spjd mutex_exit(&spa->spa_async_lock); 5607168404Spjd} 5608168404Spjd 5609168404Spjd/* 5610168404Spjd * ========================================================================== 5611168404Spjd * SPA syncing routines 5612168404Spjd *
========================================================================== 5613168404Spjd */ 5614168404Spjd 5615219089Spjdstatic int 5616219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5617168404Spjd{ 5618219089Spjd bpobj_t *bpo = arg; 5619219089Spjd bpobj_enqueue(bpo, bp, tx); 5620219089Spjd return (0); 5621219089Spjd} 5622168404Spjd 5623219089Spjdstatic int 5624219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5625219089Spjd{ 5626219089Spjd zio_t *zio = arg; 5627168404Spjd 5628219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5629219089Spjd zio->io_flags)); 5630219089Spjd return (0); 5631168404Spjd} 5632168404Spjd 5633168404Spjdstatic void 5634168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5635168404Spjd{ 5636168404Spjd char *packed = NULL; 5637185029Spjd size_t bufsize; 5638168404Spjd size_t nvsize = 0; 5639168404Spjd dmu_buf_t *db; 5640168404Spjd 5641168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5642168404Spjd 5643185029Spjd /* 5644185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5645185029Spjd * information. This avoids the dbuf_will_dirty() path and 5646185029Spjd * saves us a pre-read to get data we don't actually care about. 
5647185029Spjd */ 5648236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5649185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 5650168404Spjd 5651168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5652168404Spjd KM_SLEEP) == 0); 5653185029Spjd bzero(packed + nvsize, bufsize - nvsize); 5654168404Spjd 5655185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5656168404Spjd 5657185029Spjd kmem_free(packed, bufsize); 5658168404Spjd 5659168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5660168404Spjd dmu_buf_will_dirty(db, tx); 5661168404Spjd *(uint64_t *)db->db_data = nvsize; 5662168404Spjd dmu_buf_rele(db, FTAG); 5663168404Spjd} 5664168404Spjd 5665168404Spjdstatic void 5666185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5667185029Spjd const char *config, const char *entry) 5668168404Spjd{ 5669168404Spjd nvlist_t *nvroot; 5670185029Spjd nvlist_t **list; 5671168404Spjd int i; 5672168404Spjd 5673185029Spjd if (!sav->sav_sync) 5674168404Spjd return; 5675168404Spjd 5676168404Spjd /* 5677185029Spjd * Update the MOS nvlist describing the list of available devices. 5678185029Spjd * spa_validate_aux() will have already made sure this nvlist is 5679185029Spjd * valid and the vdevs are labeled appropriately. 
5680168404Spjd */ 5681185029Spjd if (sav->sav_object == 0) { 5682185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5683185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5684185029Spjd sizeof (uint64_t), tx); 5685168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 5686185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5687185029Spjd &sav->sav_object, tx) == 0); 5688168404Spjd } 5689168404Spjd 5690168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5691185029Spjd if (sav->sav_count == 0) { 5692185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5693168404Spjd } else { 5694185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5695185029Spjd for (i = 0; i < sav->sav_count; i++) 5696185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5697219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 5698185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5699185029Spjd sav->sav_count) == 0); 5700185029Spjd for (i = 0; i < sav->sav_count; i++) 5701185029Spjd nvlist_free(list[i]); 5702185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 5703168404Spjd } 5704168404Spjd 5705185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5706168404Spjd nvlist_free(nvroot); 5707168404Spjd 5708185029Spjd sav->sav_sync = B_FALSE; 5709168404Spjd} 5710168404Spjd 5711168404Spjdstatic void 5712168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5713168404Spjd{ 5714168404Spjd nvlist_t *config; 5715168404Spjd 5716185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 5717168404Spjd return; 5718168404Spjd 5719185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5720168404Spjd 5721185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 5722185029Spjd dmu_tx_get_txg(tx), B_FALSE); 5723185029Spjd 5724185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 5725185029Spjd 5726168404Spjd if (spa->spa_config_syncing) 5727168404Spjd 
nvlist_free(spa->spa_config_syncing); 5728168404Spjd spa->spa_config_syncing = config; 5729168404Spjd 5730168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5731168404Spjd} 5732168404Spjd /* Sync task that raises the pool version: arg1 is the spa_t, arg2 points at the new uint64_t version. Stores it in the in-core uberblock and dirties the root vdev config. */ 5733236884Smmstatic void 5734236884Smmspa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) 5735236884Smm{ 5736236884Smm spa_t *spa = arg1; 5737236884Smm uint64_t version = *(uint64_t *)arg2; 5738236884Smm 5739236884Smm /* 5740236884Smm * Setting the version is special cased when first creating the pool. 5741236884Smm */ 5742236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 5743236884Smm 5744236884Smm ASSERT(version <= SPA_VERSION); 5745236884Smm ASSERT(version >= spa_version(spa)); 5746236884Smm 5747236884Smm spa->spa_uberblock.ub_version = version; 5748236884Smm vdev_config_dirty(spa->spa_root_vdev); 5749236884Smm} 5750236884Smm 5751185029Spjd/* 5752185029Spjd * Set zpool properties. Sync task: arg1 is the spa_t and arg2 the nvlist of properties to apply; the whole list is processed under spa_props_lock. 5753185029Spjd */ 5754168404Spjdstatic void 5755219089Spjdspa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5756168404Spjd{ 5757168404Spjd spa_t *spa = arg1; 5758185029Spjd objset_t *mos = spa->spa_meta_objset; 5759168404Spjd nvlist_t *nvp = arg2; 5760236884Smm nvpair_t *elem = NULL; 5761168404Spjd 5762168404Spjd mutex_enter(&spa->spa_props_lock); 5763168404Spjd 5764185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 5765236884Smm uint64_t intval = 0; /* stays 0 for properties that carry no integer value, so the history record written at the end of each iteration never prints an indeterminate value */ 5766236884Smm char *strval, *fname; 5767236884Smm zpool_prop_t prop; 5768236884Smm const char *propname; 5769236884Smm zprop_type_t proptype; 5770236884Smm zfeature_info_t *feature; 5771236884Smm 5772185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5773236884Smm case ZPROP_INVAL: 5774236884Smm /* 5775236884Smm * We checked this earlier in spa_prop_validate().
5776236884Smm */ 5777236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 5778236884Smm 5779236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 5780236884Smm VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); 5781236884Smm 5782236884Smm spa_feature_enable(spa, feature, tx); 5783236884Smm break; 5784236884Smm 5785185029Spjd case ZPOOL_PROP_VERSION: 5786236884Smm VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5787185029Spjd /* 5788236884Smm * The version is synced separately before other 5789236884Smm * properties and should be correct by now. 5790185029Spjd */ 5791236884Smm ASSERT3U(spa_version(spa), >=, intval); 5792185029Spjd break; 5793168404Spjd 5794185029Spjd case ZPOOL_PROP_ALTROOT: 5795185029Spjd /* 5796185029Spjd * 'altroot' is a non-persistent property. It should 5797185029Spjd * have been set temporarily at creation or import time. 5798185029Spjd */ 5799185029Spjd ASSERT(spa->spa_root != NULL); 5800185029Spjd break; 5801168404Spjd 5802219089Spjd case ZPOOL_PROP_READONLY: 5803185029Spjd case ZPOOL_PROP_CACHEFILE: 5804185029Spjd /* 5805219089Spjd * 'readonly' and 'cachefile' are also non-persistent 5806219089Spjd * properties. 5807185029Spjd */ 5808168404Spjd break; 5809228103Smm case ZPOOL_PROP_COMMENT: 5810228103Smm VERIFY(nvpair_value_string(elem, &strval) == 0); 5811228103Smm if (spa->spa_comment != NULL) 5812228103Smm spa_strfree(spa->spa_comment); 5813228103Smm spa->spa_comment = spa_strdup(strval); 5814228103Smm /* 5815228103Smm * We need to dirty the configuration on all the vdevs 5816228103Smm * so that their labels get updated. It's unnecessary 5817228103Smm * to do this for pool creation since the vdev's 5818228103Smm * configuration has already been dirtied. 5819228103Smm */ 5820228103Smm if (tx->tx_txg != TXG_INITIAL) 5821228103Smm vdev_config_dirty(spa->spa_root_vdev); 5822228103Smm break; 5823185029Spjd default: 5824185029Spjd /* 5825185029Spjd * Set pool property values in the poolprops mos object.
5826185029Spjd */ 5827185029Spjd if (spa->spa_pool_props_object == 0) { 5828236884Smm spa->spa_pool_props_object = 5829236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 5830185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 5831236884Smm tx); 5832185029Spjd } 5833185029Spjd 5834185029Spjd /* normalize the property name */ 5835185029Spjd propname = zpool_prop_to_name(prop); 5836185029Spjd proptype = zpool_prop_get_type(prop); 5837185029Spjd 5838185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 5839185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 5840185029Spjd VERIFY(nvpair_value_string(elem, &strval) == 0); 5841185029Spjd VERIFY(zap_update(mos, 5842185029Spjd spa->spa_pool_props_object, propname, 5843185029Spjd 1, strlen(strval) + 1, strval, tx) == 0); 5844185029Spjd 5845185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 5846185029Spjd VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5847185029Spjd 5848185029Spjd if (proptype == PROP_TYPE_INDEX) { 5849185029Spjd const char *unused; 5850185029Spjd VERIFY(zpool_prop_index_to_string( 5851185029Spjd prop, intval, &unused) == 0); 5852185029Spjd } 5853185029Spjd VERIFY(zap_update(mos, 5854185029Spjd spa->spa_pool_props_object, propname, 5855185029Spjd 8, 1, &intval, tx) == 0); 5856185029Spjd } else { 5857185029Spjd ASSERT(0); /* not allowed */ 5858185029Spjd } 5859185029Spjd 5860185029Spjd switch (prop) { 5861185029Spjd case ZPOOL_PROP_DELEGATION: 5862185029Spjd spa->spa_delegation = intval; 5863185029Spjd break; 5864185029Spjd case ZPOOL_PROP_BOOTFS: 5865185029Spjd spa->spa_bootfs = intval; 5866185029Spjd break; 5867185029Spjd case ZPOOL_PROP_FAILUREMODE: 5868185029Spjd spa->spa_failmode = intval; 5869185029Spjd break; 5870219089Spjd case ZPOOL_PROP_AUTOEXPAND: 5871219089Spjd spa->spa_autoexpand = intval; 5872219089Spjd if (tx->tx_txg != TXG_INITIAL) 5873219089Spjd spa_async_request(spa, 5874219089Spjd SPA_ASYNC_AUTOEXPAND); 5875219089Spjd break; 5876219089Spjd case ZPOOL_PROP_DEDUPDITTO: 
5877219089Spjd spa->spa_dedup_ditto = intval; 5878219089Spjd break; 5879185029Spjd default: 5880185029Spjd break; 5881185029Spjd } 5882168404Spjd } 5883185029Spjd 5884185029Spjd /* log internal history if this is not a zpool create */ 5885185029Spjd if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 5886185029Spjd tx->tx_txg != TXG_INITIAL) { 5887219089Spjd spa_history_log_internal(LOG_POOL_PROPSET, 5888219089Spjd spa, tx, "%s %lld %s", 5889185029Spjd nvpair_name(elem), intval, spa_name(spa)); 5890185029Spjd } 5891168404Spjd } 5892185029Spjd 5893185029Spjd mutex_exit(&spa->spa_props_lock); 5894168404Spjd} 5895168404Spjd 5896168404Spjd/* 5897219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 5898219089Spjd * reflect the new version this txg, so there must be no changes this 5899219089Spjd * txg to anything that the upgrade code depends on after it executes. 5900219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 5901219089Spjd * tasks. 
5902219089Spjd */ 5903219089Spjdstatic void 5904219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5905219089Spjd{ 5906219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 5907219089Spjd 5908219089Spjd ASSERT(spa->spa_sync_pass == 1); 5909219089Spjd 5910219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5911219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5912219089Spjd dsl_pool_create_origin(dp, tx); 5913219089Spjd 5914219089Spjd /* Keeping the origin open increases spa_minref */ 5915219089Spjd spa->spa_minref += 3; 5916219089Spjd } 5917219089Spjd 5918219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5919219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5920219089Spjd dsl_pool_upgrade_clones(dp, tx); 5921219089Spjd } 5922219089Spjd 5923219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5924219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5925219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 5926219089Spjd 5927219089Spjd /* Keeping the freedir open increases spa_minref */ 5928219089Spjd spa->spa_minref += 3; 5929219089Spjd } 5930236884Smm 5931236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 5932236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 5933236884Smm spa_feature_create_zap_objects(spa, tx); 5934236884Smm } 5935219089Spjd} 5936219089Spjd 5937219089Spjd/* 5938168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 5939168404Spjd * part of the process, so we iterate until it converges. 
5940168404Spjd */ 5941168404Spjdvoid 5942168404Spjdspa_sync(spa_t *spa, uint64_t txg) 5943168404Spjd{ 5944168404Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 5945168404Spjd objset_t *mos = spa->spa_meta_objset; 5946219089Spjd bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5947219089Spjd bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5948168404Spjd vdev_t *rvd = spa->spa_root_vdev; 5949168404Spjd vdev_t *vd; 5950168404Spjd dmu_tx_t *tx; 5951185029Spjd int error; 5952168404Spjd 5953219089Spjd VERIFY(spa_writeable(spa)); 5954219089Spjd 5955168404Spjd /* 5956168404Spjd * Lock out configuration changes. 5957168404Spjd */ 5958185029Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5959168404Spjd 5960168404Spjd spa->spa_syncing_txg = txg; 5961168404Spjd spa->spa_sync_pass = 0; 5962168404Spjd 5963185029Spjd /* 5964185029Spjd * If there are any pending vdev state changes, convert them 5965185029Spjd * into config changes that go out with this transaction group. 5966185029Spjd */ 5967185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5968209962Smm while (list_head(&spa->spa_state_dirty_list) != NULL) { 5969209962Smm /* 5970209962Smm * We need the write lock here because, for aux vdevs, 5971209962Smm * calling vdev_config_dirty() modifies sav_config. 5972209962Smm * This is ugly and will become unnecessary when we 5973209962Smm * eliminate the aux vdev wart by integrating all vdevs 5974209962Smm * into the root vdev tree. 
5975209962Smm */ 5976209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5977209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5978209962Smm while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5979209962Smm vdev_state_clean(vd); 5980209962Smm vdev_config_dirty(vd); 5981209962Smm } 5982209962Smm spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5983209962Smm spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5984185029Spjd } 5985185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 5986185029Spjd 5987168404Spjd tx = dmu_tx_create_assigned(dp, txg); 5988168404Spjd 5989168404Spjd /* 5990185029Spjd * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5991168404Spjd * set spa_deflate if we have no raid-z vdevs. 5992168404Spjd */ 5993185029Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5994185029Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5995168404Spjd int i; 5996168404Spjd 5997168404Spjd for (i = 0; i < rvd->vdev_children; i++) { 5998168404Spjd vd = rvd->vdev_child[i]; 5999168404Spjd if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6000168404Spjd break; 6001168404Spjd } 6002168404Spjd if (i == rvd->vdev_children) { 6003168404Spjd spa->spa_deflate = TRUE; 6004168404Spjd VERIFY(0 == zap_add(spa->spa_meta_objset, 6005168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6006168404Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6007168404Spjd } 6008168404Spjd } 6009168404Spjd 6010168404Spjd /* 6011219089Spjd * If anything has changed in this txg, or if someone is waiting 6012219089Spjd * for this txg to sync (eg, spa_vdev_remove()), push the 6013219089Spjd * deferred frees from the previous txg. If not, leave them 6014219089Spjd * alone so that we don't generate work on an otherwise idle 6015219089Spjd * system. 
6016168404Spjd */ 6017168404Spjd if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 6018168404Spjd !txg_list_empty(&dp->dp_dirty_dirs, txg) || 6019219089Spjd !txg_list_empty(&dp->dp_sync_tasks, txg) || 6020219089Spjd ((dsl_scan_active(dp->dp_scan) || 6021219089Spjd txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 6022219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6023219089Spjd VERIFY3U(bpobj_iterate(defer_bpo, 6024219089Spjd spa_free_sync_cb, zio, tx), ==, 0); 6025219089Spjd VERIFY3U(zio_wait(zio), ==, 0); 6026219089Spjd } 6027168404Spjd 6028168404Spjd /* 6029168404Spjd * Iterate to convergence. 6030168404Spjd */ 6031168404Spjd do { 6032219089Spjd int pass = ++spa->spa_sync_pass; 6033168404Spjd 6034168404Spjd spa_sync_config_object(spa, tx); 6035185029Spjd spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6036185029Spjd ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6037185029Spjd spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6038185029Spjd ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6039168404Spjd spa_errlog_sync(spa, txg); 6040168404Spjd dsl_pool_sync(dp, txg); 6041168404Spjd 6042219089Spjd if (pass <= SYNC_PASS_DEFERRED_FREE) { 6043219089Spjd zio_t *zio = zio_root(spa, NULL, NULL, 0); 6044219089Spjd bplist_iterate(free_bpl, spa_free_sync_cb, 6045219089Spjd zio, tx); 6046219089Spjd VERIFY(zio_wait(zio) == 0); 6047219089Spjd } else { 6048219089Spjd bplist_iterate(free_bpl, bpobj_enqueue_cb, 6049219089Spjd defer_bpo, tx); 6050168404Spjd } 6051168404Spjd 6052219089Spjd ddt_sync(spa, txg); 6053219089Spjd dsl_scan_sync(dp, tx); 6054168404Spjd 6055219089Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6056219089Spjd vdev_sync(vd, txg); 6057168404Spjd 6058219089Spjd if (pass == 1) 6059219089Spjd spa_sync_upgrades(spa, tx); 6060168404Spjd 6061219089Spjd } while (dmu_objset_is_dirty(mos, txg)); 6062219089Spjd 6063168404Spjd /* 6064168404Spjd * Rewrite the vdev configuration (which includes the uberblock) 6065168404Spjd * to commit the transaction group. 
6066168404Spjd * 6067185029Spjd * If there are no dirty vdevs, we sync the uberblock to a few 6068185029Spjd * random top-level vdevs that are known to be visible in the 6069185029Spjd * config cache (see spa_vdev_add() for a complete description). 6070185029Spjd * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6071168404Spjd */ 6072185029Spjd for (;;) { 6073185029Spjd /* 6074185029Spjd * We hold SCL_STATE to prevent vdev open/close/etc. 6075185029Spjd * while we're attempting to write the vdev labels. 6076185029Spjd */ 6077185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6078168404Spjd 6079185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) { 6080185029Spjd vdev_t *svd[SPA_DVAS_PER_BP]; 6081185029Spjd int svdcount = 0; 6082185029Spjd int children = rvd->vdev_children; 6083185029Spjd int c0 = spa_get_random(children); 6084185029Spjd 6085219089Spjd for (int c = 0; c < children; c++) { 6086185029Spjd vd = rvd->vdev_child[(c0 + c) % children]; 6087185029Spjd if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6088185029Spjd continue; 6089185029Spjd svd[svdcount++] = vd; 6090185029Spjd if (svdcount == SPA_DVAS_PER_BP) 6091185029Spjd break; 6092185029Spjd } 6093213198Smm error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6094213198Smm if (error != 0) 6095213198Smm error = vdev_config_sync(svd, svdcount, txg, 6096213198Smm B_TRUE); 6097185029Spjd } else { 6098185029Spjd error = vdev_config_sync(rvd->vdev_child, 6099213198Smm rvd->vdev_children, txg, B_FALSE); 6100213198Smm if (error != 0) 6101213198Smm error = vdev_config_sync(rvd->vdev_child, 6102213198Smm rvd->vdev_children, txg, B_TRUE); 6103168404Spjd } 6104185029Spjd 6105185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6106185029Spjd 6107185029Spjd if (error == 0) 6108185029Spjd break; 6109185029Spjd zio_suspend(spa, NULL); 6110185029Spjd zio_resume_wait(spa); 6111168404Spjd } 6112168404Spjd dmu_tx_commit(tx); 6113168404Spjd 6114168404Spjd /* 6115168404Spjd * Clear the dirty config 
list. 6116168404Spjd */ 6117185029Spjd while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6118168404Spjd vdev_config_clean(vd); 6119168404Spjd 6120168404Spjd /* 6121168404Spjd * Now that the new config has synced transactionally, 6122168404Spjd * let it become visible to the config cache. 6123168404Spjd */ 6124168404Spjd if (spa->spa_config_syncing != NULL) { 6125168404Spjd spa_config_set(spa, spa->spa_config_syncing); 6126168404Spjd spa->spa_config_txg = txg; 6127168404Spjd spa->spa_config_syncing = NULL; 6128168404Spjd } 6129168404Spjd 6130168404Spjd spa->spa_ubsync = spa->spa_uberblock; 6131168404Spjd 6132219089Spjd dsl_pool_sync_done(dp, txg); 6133168404Spjd 6134168404Spjd /* 6135168404Spjd * Update usable space statistics. 6136168404Spjd */ 6137168404Spjd while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6138168404Spjd vdev_sync_done(vd, txg); 6139168404Spjd 6140219089Spjd spa_update_dspace(spa); 6141219089Spjd 6142168404Spjd /* 6143168404Spjd * It had better be the case that we didn't dirty anything 6144168404Spjd * since vdev_config_sync(). 6145168404Spjd */ 6146168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6147168404Spjd ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6148168404Spjd ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6149168404Spjd 6150219089Spjd spa->spa_sync_pass = 0; 6151219089Spjd 6152185029Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 6153168404Spjd 6154219089Spjd spa_handle_ignored_writes(spa); 6155219089Spjd 6156168404Spjd /* 6157168404Spjd * If any async tasks have been requested, kick them off. 6158168404Spjd */ 6159168404Spjd spa_async_dispatch(spa); 6160168404Spjd} 6161168404Spjd 6162168404Spjd/* 6163168404Spjd * Sync all pools. We don't want to hold the namespace lock across these 6164168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6165168404Spjd * sync. 
6166168404Spjd */ 6167168404Spjdvoid 6168168404Spjdspa_sync_allpools(void) 6169168404Spjd{ 6170168404Spjd spa_t *spa = NULL; 6171168404Spjd mutex_enter(&spa_namespace_lock); 6172168404Spjd while ((spa = spa_next(spa)) != NULL) { 6173219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6174219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6175168404Spjd continue; 6176168404Spjd spa_open_ref(spa, FTAG); 6177168404Spjd mutex_exit(&spa_namespace_lock); 6178168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6179168404Spjd mutex_enter(&spa_namespace_lock); 6180168404Spjd spa_close(spa, FTAG); 6181168404Spjd } 6182168404Spjd mutex_exit(&spa_namespace_lock); 6183168404Spjd} 6184168404Spjd 6185168404Spjd/* 6186168404Spjd * ========================================================================== 6187168404Spjd * Miscellaneous routines 6188168404Spjd * ========================================================================== 6189168404Spjd */ 6190168404Spjd 6191168404Spjd/* 6192168404Spjd * Remove all pools in the system. 6193168404Spjd */ 6194168404Spjdvoid 6195168404Spjdspa_evict_all(void) 6196168404Spjd{ 6197168404Spjd spa_t *spa; 6198168404Spjd 6199168404Spjd /* 6200168404Spjd * Remove all cached state. All pools should be closed now, 6201168404Spjd * so every spa in the AVL tree should be unreferenced. 6202168404Spjd */ 6203168404Spjd mutex_enter(&spa_namespace_lock); 6204168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6205168404Spjd /* 6206168404Spjd * Stop async tasks. The async thread may need to detach 6207168404Spjd * a device that's been replaced, which requires grabbing 6208168404Spjd * spa_namespace_lock, so we must drop it here. 
6209168404Spjd */ 6210168404Spjd spa_open_ref(spa, FTAG); 6211168404Spjd mutex_exit(&spa_namespace_lock); 6212168404Spjd spa_async_suspend(spa); 6213168404Spjd mutex_enter(&spa_namespace_lock); 6214168404Spjd spa_close(spa, FTAG); 6215168404Spjd 6216168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6217168404Spjd spa_unload(spa); 6218168404Spjd spa_deactivate(spa); 6219168404Spjd } 6220168404Spjd spa_remove(spa); 6221168404Spjd } 6222168404Spjd mutex_exit(&spa_namespace_lock); 6223168404Spjd} 6224168404Spjd 6225168404Spjdvdev_t * 6226209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6227168404Spjd{ 6228185029Spjd vdev_t *vd; 6229185029Spjd int i; 6230185029Spjd 6231185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6232185029Spjd return (vd); 6233185029Spjd 6234209962Smm if (aux) { 6235185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6236185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6237185029Spjd if (vd->vdev_guid == guid) 6238185029Spjd return (vd); 6239185029Spjd } 6240209962Smm 6241209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6242209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6243209962Smm if (vd->vdev_guid == guid) 6244209962Smm return (vd); 6245209962Smm } 6246185029Spjd } 6247185029Spjd 6248185029Spjd return (NULL); 6249168404Spjd} 6250168404Spjd 6251168404Spjdvoid 6252185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6253168404Spjd{ 6254219089Spjd ASSERT(spa_writeable(spa)); 6255219089Spjd 6256185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6257168404Spjd 6258168404Spjd /* 6259168404Spjd * This should only be called for a non-faulted pool, and since a 6260168404Spjd * future version would result in an unopenable pool, this shouldn't be 6261168404Spjd * possible. 
6262168404Spjd */ 6263185029Spjd ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 6264185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6265168404Spjd 6266185029Spjd spa->spa_uberblock.ub_version = version; 6267168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6268168404Spjd 6269185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6270168404Spjd 6271168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6272168404Spjd} 6273168404Spjd 6274168404Spjdboolean_t 6275168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6276168404Spjd{ 6277168404Spjd int i; 6278168404Spjd uint64_t spareguid; 6279185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6280168404Spjd 6281185029Spjd for (i = 0; i < sav->sav_count; i++) 6282185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6283168404Spjd return (B_TRUE); 6284168404Spjd 6285185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6286185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6287185029Spjd &spareguid) == 0 && spareguid == guid) 6288168404Spjd return (B_TRUE); 6289168404Spjd } 6290168404Spjd 6291168404Spjd return (B_FALSE); 6292168404Spjd} 6293168404Spjd 6294185029Spjd/* 6295185029Spjd * Check if a pool has an active shared spare device. 6296185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6297185029Spjd */ 6298185029Spjdstatic boolean_t 6299185029Spjdspa_has_active_shared_spare(spa_t *spa) 6300168404Spjd{ 6301185029Spjd int i, refcnt; 6302185029Spjd uint64_t pool; 6303185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6304185029Spjd 6305185029Spjd for (i = 0; i < sav->sav_count; i++) { 6306185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6307185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6308185029Spjd refcnt > 2) 6309185029Spjd return (B_TRUE); 6310185029Spjd } 6311185029Spjd 6312185029Spjd return (B_FALSE); 6313168404Spjd} 6314168404Spjd 6315185029Spjd/* 6316185029Spjd * Post a sysevent corresponding to the given event. 
The 'name' must be one of 6317185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6318185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6319185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6320185029Spjd * or zdb as real changes. 6321185029Spjd */ 6322185029Spjdvoid 6323185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6324168404Spjd{ 6325185029Spjd#ifdef _KERNEL 6326185029Spjd sysevent_t *ev; 6327185029Spjd sysevent_attr_list_t *attr = NULL; 6328185029Spjd sysevent_value_t value; 6329185029Spjd sysevent_id_t eid; 6330168404Spjd 6331185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6332185029Spjd SE_SLEEP); 6333168404Spjd 6334185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6335185029Spjd value.value.sv_string = spa_name(spa); 6336185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6337185029Spjd goto done; 6338168404Spjd 6339185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6340185029Spjd value.value.sv_uint64 = spa_guid(spa); 6341185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6342185029Spjd goto done; 6343168404Spjd 6344185029Spjd if (vd) { 6345185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6346185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6347185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6348185029Spjd SE_SLEEP) != 0) 6349185029Spjd goto done; 6350168404Spjd 6351185029Spjd if (vd->vdev_path) { 6352185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6353185029Spjd value.value.sv_string = vd->vdev_path; 6354185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6355185029Spjd &value, SE_SLEEP) != 0) 6356185029Spjd goto done; 6357168404Spjd } 6358168404Spjd } 6359168404Spjd 6360185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6361185029Spjd goto done; 6362185029Spjd attr = NULL; 6363168404Spjd 6364185029Spjd 
(void) log_sysevent(ev, SE_SLEEP, &eid); 6365185029Spjd 6366185029Spjddone: 6367185029Spjd if (attr) 6368185029Spjd sysevent_free_attr(attr); 6369185029Spjd sysevent_free(ev); 6370185029Spjd#endif 6371168404Spjd} 6372