spa.c revision 240415
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24236155Smm * Copyright (c) 2012 by Delphix. All rights reserved. 25168404Spjd */ 26168404Spjd 27168404Spjd/* 28168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 29168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 30168404Spjd * pool. 
31168404Spjd */ 32168404Spjd 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/fm/fs/zfs.h> 35168404Spjd#include <sys/spa_impl.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/dmu.h> 39168404Spjd#include <sys/dmu_tx.h> 40168404Spjd#include <sys/zap.h> 41168404Spjd#include <sys/zil.h> 42219089Spjd#include <sys/ddt.h> 43168404Spjd#include <sys/vdev_impl.h> 44168404Spjd#include <sys/metaslab.h> 45219089Spjd#include <sys/metaslab_impl.h> 46168404Spjd#include <sys/uberblock_impl.h> 47168404Spjd#include <sys/txg.h> 48168404Spjd#include <sys/avl.h> 49168404Spjd#include <sys/dmu_traverse.h> 50168404Spjd#include <sys/dmu_objset.h> 51168404Spjd#include <sys/unique.h> 52168404Spjd#include <sys/dsl_pool.h> 53168404Spjd#include <sys/dsl_dataset.h> 54168404Spjd#include <sys/dsl_dir.h> 55168404Spjd#include <sys/dsl_prop.h> 56168404Spjd#include <sys/dsl_synctask.h> 57168404Spjd#include <sys/fs/zfs.h> 58185029Spjd#include <sys/arc.h> 59168404Spjd#include <sys/callb.h> 60185029Spjd#include <sys/spa_boot.h> 61219089Spjd#include <sys/zfs_ioctl.h> 62219089Spjd#include <sys/dsl_scan.h> 63236884Smm#include <sys/zfeature.h> 64219089Spjd#include <sys/zvol.h> 65168404Spjd 66219089Spjd#ifdef _KERNEL 67219089Spjd#include <sys/callb.h> 68219089Spjd#include <sys/cpupart.h> 69219089Spjd#include <sys/zone.h> 70219089Spjd#endif /* _KERNEL */ 71219089Spjd 72185029Spjd#include "zfs_prop.h" 73185029Spjd#include "zfs_comutil.h" 74168404Spjd 75204073Spjd/* Check hostid on import? 
*/ 76204073Spjdstatic int check_hostid = 1; 77204073Spjd 78204073SpjdSYSCTL_DECL(_vfs_zfs); 79204073SpjdTUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 80204073SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 81204073Spjd "Check hostid on import?"); 82204073Spjd 83219089Spjdtypedef enum zti_modes { 84209962Smm zti_mode_fixed, /* value is # of threads (min 1) */ 85209962Smm zti_mode_online_percent, /* value is % of online CPUs */ 86219089Spjd zti_mode_batch, /* cpu-intensive; value is ignored */ 87211931Smm zti_mode_null, /* don't create a taskq */ 88209962Smm zti_nmodes 89219089Spjd} zti_modes_t; 90168712Spjd 91211931Smm#define ZTI_FIX(n) { zti_mode_fixed, (n) } 92211931Smm#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 93219089Spjd#define ZTI_BATCH { zti_mode_batch, 0 } 94211931Smm#define ZTI_NULL { zti_mode_null, 0 } 95209962Smm 96211931Smm#define ZTI_ONE ZTI_FIX(1) 97209962Smm 98209962Smmtypedef struct zio_taskq_info { 99211931Smm enum zti_modes zti_mode; 100211931Smm uint_t zti_value; 101209962Smm} zio_taskq_info_t; 102209962Smm 103209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 104219089Spjd "issue", "issue_high", "intr", "intr_high" 105209962Smm}; 106209962Smm 107211931Smm/* 108211931Smm * Define the taskq threads for the following I/O types: 109211931Smm * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 110211931Smm */ 111211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 112211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 113211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 114219089Spjd { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 115219089Spjd { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 116219089Spjd { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 117211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 118211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 119209962Smm}; 120209962Smm 121236884Smmstatic dsl_syncfunc_t spa_sync_version; 122219089Spjdstatic dsl_syncfunc_t 
spa_sync_props; 123239620Smmstatic dsl_checkfunc_t spa_change_guid_check; 124239620Smmstatic dsl_syncfunc_t spa_change_guid_sync; 125185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 126219089Spjdstatic int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 127219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 128219089Spjd char **ereport); 129219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 130185029Spjd 131219089Spjduint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 132219089Spjd#ifdef PSRSET_BIND 133219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 134219089Spjd#endif 135219089Spjd#ifdef SYSDC 136219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 137219089Spjd#endif 138219089Spjduint_t zio_taskq_basedc = 80; /* base duty cycle */ 139219089Spjd 140219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 141219089Spjd 142168404Spjd/* 143219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 144219089Spjd * to get the vdev stats associated with the imported devices. 145219089Spjd */ 146219089Spjd#define TRYIMPORT_NAME "$import" 147219089Spjd 148219089Spjd/* 149168404Spjd * ========================================================================== 150185029Spjd * SPA properties routines 151185029Spjd * ========================================================================== 152185029Spjd */ 153185029Spjd 154185029Spjd/* 155185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 
156185029Spjd */ 157185029Spjdstatic void 158185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 159185029Spjd uint64_t intval, zprop_source_t src) 160185029Spjd{ 161185029Spjd const char *propname = zpool_prop_to_name(prop); 162185029Spjd nvlist_t *propval; 163185029Spjd 164185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 165185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 166185029Spjd 167185029Spjd if (strval != NULL) 168185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 169185029Spjd else 170185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 171185029Spjd 172185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 173185029Spjd nvlist_free(propval); 174185029Spjd} 175185029Spjd 176185029Spjd/* 177185029Spjd * Get property values from the spa configuration. 178185029Spjd */ 179185029Spjdstatic void 180185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 181185029Spjd{ 182236155Smm vdev_t *rvd = spa->spa_root_vdev; 183236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 184209962Smm uint64_t size; 185219089Spjd uint64_t alloc; 186236155Smm uint64_t space; 187185029Spjd uint64_t cap, version; 188185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 189185029Spjd spa_config_dirent_t *dp; 190185029Spjd 191185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 192185029Spjd 193236155Smm if (rvd != NULL) { 194219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 195219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 196209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 197209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 198219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 199219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 200219089Spjd size - alloc, src); 201236155Smm 202236155Smm space = 0; 203236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 204236155Smm vdev_t *tvd = rvd->vdev_child[c]; 205236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 206236155Smm } 207236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 208236155Smm src); 209236155Smm 210219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 211219089Spjd (spa_mode(spa) == FREAD), src); 212185029Spjd 213219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 214209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 215185029Spjd 216219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 217219089Spjd ddt_get_pool_dedup_ratio(spa), src); 218219089Spjd 219209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 220236155Smm rvd->vdev_state, src); 221209962Smm 222209962Smm version = spa_version(spa); 223209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 224209962Smm src = ZPROP_SRC_DEFAULT; 225209962Smm else 226209962Smm src = ZPROP_SRC_LOCAL; 227209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 228209962Smm } 229209962Smm 230236884Smm if (pool != NULL) { 231236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 232236884Smm 233236884Smm /* 234236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 235236884Smm * when opening pools before this version freedir will be NULL. 
236236884Smm */ 237236884Smm if (freedir != NULL) { 238236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 239236884Smm freedir->dd_phys->dd_used_bytes, src); 240236884Smm } else { 241236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 242236884Smm NULL, 0, src); 243236884Smm } 244236884Smm } 245236884Smm 246185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 247185029Spjd 248228103Smm if (spa->spa_comment != NULL) { 249228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 250228103Smm 0, ZPROP_SRC_LOCAL); 251228103Smm } 252228103Smm 253185029Spjd if (spa->spa_root != NULL) 254185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 255185029Spjd 0, ZPROP_SRC_LOCAL); 256185029Spjd 257185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 258185029Spjd if (dp->scd_path == NULL) { 259185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 260185029Spjd "none", 0, ZPROP_SRC_LOCAL); 261185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 262185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 263185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 264185029Spjd } 265185029Spjd } 266185029Spjd} 267185029Spjd 268185029Spjd/* 269185029Spjd * Get zpool property values. 270185029Spjd */ 271185029Spjdint 272185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 273185029Spjd{ 274219089Spjd objset_t *mos = spa->spa_meta_objset; 275185029Spjd zap_cursor_t zc; 276185029Spjd zap_attribute_t za; 277185029Spjd int err; 278185029Spjd 279185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 280185029Spjd 281185029Spjd mutex_enter(&spa->spa_props_lock); 282185029Spjd 283185029Spjd /* 284185029Spjd * Get properties from the spa config. 285185029Spjd */ 286185029Spjd spa_prop_get_config(spa, nvp); 287185029Spjd 288185029Spjd /* If no pool property object, no more prop to get. 
*/ 289219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 290185029Spjd mutex_exit(&spa->spa_props_lock); 291185029Spjd return (0); 292185029Spjd } 293185029Spjd 294185029Spjd /* 295185029Spjd * Get properties from the MOS pool property object. 296185029Spjd */ 297185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 298185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 299185029Spjd zap_cursor_advance(&zc)) { 300185029Spjd uint64_t intval = 0; 301185029Spjd char *strval = NULL; 302185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 303185029Spjd zpool_prop_t prop; 304185029Spjd 305185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 306185029Spjd continue; 307185029Spjd 308185029Spjd switch (za.za_integer_length) { 309185029Spjd case 8: 310185029Spjd /* integer property */ 311185029Spjd if (za.za_first_integer != 312185029Spjd zpool_prop_default_numeric(prop)) 313185029Spjd src = ZPROP_SRC_LOCAL; 314185029Spjd 315185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 316185029Spjd dsl_pool_t *dp; 317185029Spjd dsl_dataset_t *ds = NULL; 318185029Spjd 319185029Spjd dp = spa_get_dsl(spa); 320185029Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 321185029Spjd if (err = dsl_dataset_hold_obj(dp, 322185029Spjd za.za_first_integer, FTAG, &ds)) { 323185029Spjd rw_exit(&dp->dp_config_rwlock); 324185029Spjd break; 325185029Spjd } 326185029Spjd 327185029Spjd strval = kmem_alloc( 328185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 329185029Spjd KM_SLEEP); 330185029Spjd dsl_dataset_name(ds, strval); 331185029Spjd dsl_dataset_rele(ds, FTAG); 332185029Spjd rw_exit(&dp->dp_config_rwlock); 333185029Spjd } else { 334185029Spjd strval = NULL; 335185029Spjd intval = za.za_first_integer; 336185029Spjd } 337185029Spjd 338185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 339185029Spjd 340185029Spjd if (strval != NULL) 341185029Spjd kmem_free(strval, 342185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 343185029Spjd 344185029Spjd 
break; 345185029Spjd 346185029Spjd case 1: 347185029Spjd /* string property */ 348185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 349185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 350185029Spjd za.za_name, 1, za.za_num_integers, strval); 351185029Spjd if (err) { 352185029Spjd kmem_free(strval, za.za_num_integers); 353185029Spjd break; 354185029Spjd } 355185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 356185029Spjd kmem_free(strval, za.za_num_integers); 357185029Spjd break; 358185029Spjd 359185029Spjd default: 360185029Spjd break; 361185029Spjd } 362185029Spjd } 363185029Spjd zap_cursor_fini(&zc); 364185029Spjd mutex_exit(&spa->spa_props_lock); 365185029Spjdout: 366185029Spjd if (err && err != ENOENT) { 367185029Spjd nvlist_free(*nvp); 368185029Spjd *nvp = NULL; 369185029Spjd return (err); 370185029Spjd } 371185029Spjd 372185029Spjd return (0); 373185029Spjd} 374185029Spjd 375185029Spjd/* 376185029Spjd * Validate the given pool properties nvlist and modify the list 377185029Spjd * for the property values to be set. 378185029Spjd */ 379185029Spjdstatic int 380185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 381185029Spjd{ 382185029Spjd nvpair_t *elem; 383185029Spjd int error = 0, reset_bootfs = 0; 384185029Spjd uint64_t objnum; 385236884Smm boolean_t has_feature = B_FALSE; 386185029Spjd 387185029Spjd elem = NULL; 388185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 389185029Spjd uint64_t intval; 390236884Smm char *strval, *slash, *check, *fname; 391236884Smm const char *propname = nvpair_name(elem); 392236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 393185029Spjd 394236884Smm switch (prop) { 395236884Smm case ZPROP_INVAL: 396236884Smm if (!zpool_prop_feature(propname)) { 397236884Smm error = EINVAL; 398236884Smm break; 399236884Smm } 400185029Spjd 401236884Smm /* 402236884Smm * Sanitize the input. 
403236884Smm */ 404236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 405236884Smm error = EINVAL; 406236884Smm break; 407236884Smm } 408185029Spjd 409236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 410236884Smm error = EINVAL; 411236884Smm break; 412236884Smm } 413236884Smm 414236884Smm if (intval != 0) { 415236884Smm error = EINVAL; 416236884Smm break; 417236884Smm } 418236884Smm 419236884Smm fname = strchr(propname, '@') + 1; 420236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 421236884Smm error = EINVAL; 422236884Smm break; 423236884Smm } 424236884Smm 425236884Smm has_feature = B_TRUE; 426236884Smm break; 427236884Smm 428185029Spjd case ZPOOL_PROP_VERSION: 429185029Spjd error = nvpair_value_uint64(elem, &intval); 430185029Spjd if (!error && 431236884Smm (intval < spa_version(spa) || 432236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 433236884Smm has_feature)) 434185029Spjd error = EINVAL; 435185029Spjd break; 436185029Spjd 437185029Spjd case ZPOOL_PROP_DELEGATION: 438185029Spjd case ZPOOL_PROP_AUTOREPLACE: 439185029Spjd case ZPOOL_PROP_LISTSNAPS: 440219089Spjd case ZPOOL_PROP_AUTOEXPAND: 441185029Spjd error = nvpair_value_uint64(elem, &intval); 442185029Spjd if (!error && intval > 1) 443185029Spjd error = EINVAL; 444185029Spjd break; 445185029Spjd 446185029Spjd case ZPOOL_PROP_BOOTFS: 447209962Smm /* 448209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 449209962Smm * or the pool is still being created (version == 0), 450209962Smm * the bootfs property cannot be set. 
451209962Smm */ 452185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 453185029Spjd error = ENOTSUP; 454185029Spjd break; 455185029Spjd } 456185029Spjd 457185029Spjd /* 458185029Spjd * Make sure the vdev config is bootable 459185029Spjd */ 460185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 461185029Spjd error = ENOTSUP; 462185029Spjd break; 463185029Spjd } 464185029Spjd 465185029Spjd reset_bootfs = 1; 466185029Spjd 467185029Spjd error = nvpair_value_string(elem, &strval); 468185029Spjd 469185029Spjd if (!error) { 470236884Smm objset_t *os; 471185029Spjd uint64_t compress; 472185029Spjd 473185029Spjd if (strval == NULL || strval[0] == '\0') { 474185029Spjd objnum = zpool_prop_default_numeric( 475185029Spjd ZPOOL_PROP_BOOTFS); 476185029Spjd break; 477185029Spjd } 478185029Spjd 479219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 480185029Spjd break; 481185029Spjd 482219089Spjd /* Must be ZPL and not gzip compressed. */ 483219089Spjd 484219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 485219089Spjd error = ENOTSUP; 486219089Spjd } else if ((error = dsl_prop_get_integer(strval, 487185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 488185029Spjd &compress, NULL)) == 0 && 489185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 490185029Spjd error = ENOTSUP; 491185029Spjd } else { 492185029Spjd objnum = dmu_objset_id(os); 493185029Spjd } 494219089Spjd dmu_objset_rele(os, FTAG); 495185029Spjd } 496185029Spjd break; 497185029Spjd 498185029Spjd case ZPOOL_PROP_FAILUREMODE: 499185029Spjd error = nvpair_value_uint64(elem, &intval); 500185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 501185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 502185029Spjd error = EINVAL; 503185029Spjd 504185029Spjd /* 505185029Spjd * This is a special case which only occurs when 506185029Spjd * the pool has completely failed. 
This allows 507185029Spjd * the user to change the in-core failmode property 508185029Spjd * without syncing it out to disk (I/Os might 509185029Spjd * currently be blocked). We do this by returning 510185029Spjd * EIO to the caller (spa_prop_set) to trick it 511185029Spjd * into thinking we encountered a property validation 512185029Spjd * error. 513185029Spjd */ 514185029Spjd if (!error && spa_suspended(spa)) { 515185029Spjd spa->spa_failmode = intval; 516185029Spjd error = EIO; 517185029Spjd } 518185029Spjd break; 519185029Spjd 520185029Spjd case ZPOOL_PROP_CACHEFILE: 521185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 522185029Spjd break; 523185029Spjd 524185029Spjd if (strval[0] == '\0') 525185029Spjd break; 526185029Spjd 527185029Spjd if (strcmp(strval, "none") == 0) 528185029Spjd break; 529185029Spjd 530185029Spjd if (strval[0] != '/') { 531185029Spjd error = EINVAL; 532185029Spjd break; 533185029Spjd } 534185029Spjd 535185029Spjd slash = strrchr(strval, '/'); 536185029Spjd ASSERT(slash != NULL); 537185029Spjd 538185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 539185029Spjd strcmp(slash, "/..") == 0) 540185029Spjd error = EINVAL; 541185029Spjd break; 542219089Spjd 543228103Smm case ZPOOL_PROP_COMMENT: 544228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 545228103Smm break; 546228103Smm for (check = strval; *check != '\0'; check++) { 547228103Smm /* 548228103Smm * The kernel doesn't have an easy isprint() 549228103Smm * check. For this kernel check, we merely 550228103Smm * check ASCII apart from DEL. Fix this if 551228103Smm * there is an easy-to-use kernel isprint(). 
552228103Smm */ 553228103Smm if (*check >= 0x7f) { 554228103Smm error = EINVAL; 555228103Smm break; 556228103Smm } 557228103Smm check++; 558228103Smm } 559228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 560228103Smm error = E2BIG; 561228103Smm break; 562228103Smm 563219089Spjd case ZPOOL_PROP_DEDUPDITTO: 564219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 565219089Spjd error = ENOTSUP; 566219089Spjd else 567219089Spjd error = nvpair_value_uint64(elem, &intval); 568219089Spjd if (error == 0 && 569219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 570219089Spjd error = EINVAL; 571219089Spjd break; 572185029Spjd } 573185029Spjd 574185029Spjd if (error) 575185029Spjd break; 576185029Spjd } 577185029Spjd 578185029Spjd if (!error && reset_bootfs) { 579185029Spjd error = nvlist_remove(props, 580185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 581185029Spjd 582185029Spjd if (!error) { 583185029Spjd error = nvlist_add_uint64(props, 584185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 585185029Spjd } 586185029Spjd } 587185029Spjd 588185029Spjd return (error); 589185029Spjd} 590185029Spjd 591209962Smmvoid 592209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 593209962Smm{ 594209962Smm char *cachefile; 595209962Smm spa_config_dirent_t *dp; 596209962Smm 597209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 598209962Smm &cachefile) != 0) 599209962Smm return; 600209962Smm 601209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 602209962Smm KM_SLEEP); 603209962Smm 604209962Smm if (cachefile[0] == '\0') 605209962Smm dp->scd_path = spa_strdup(spa_config_path); 606209962Smm else if (strcmp(cachefile, "none") == 0) 607209962Smm dp->scd_path = NULL; 608209962Smm else 609209962Smm dp->scd_path = spa_strdup(cachefile); 610209962Smm 611209962Smm list_insert_head(&spa->spa_config_list, dp); 612209962Smm if (need_sync) 613209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 614209962Smm} 
615209962Smm 616185029Spjdint 617185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 618185029Spjd{ 619185029Spjd int error; 620236884Smm nvpair_t *elem = NULL; 621209962Smm boolean_t need_sync = B_FALSE; 622185029Spjd 623185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 624185029Spjd return (error); 625185029Spjd 626209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 627236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 628209962Smm 629219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 630219089Spjd prop == ZPOOL_PROP_ALTROOT || 631219089Spjd prop == ZPOOL_PROP_READONLY) 632209962Smm continue; 633209962Smm 634236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 635236884Smm uint64_t ver; 636236884Smm 637236884Smm if (prop == ZPOOL_PROP_VERSION) { 638236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 639236884Smm } else { 640236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 641236884Smm ver = SPA_VERSION_FEATURES; 642236884Smm need_sync = B_TRUE; 643236884Smm } 644236884Smm 645236884Smm /* Save time if the version is already set. */ 646236884Smm if (ver == spa_version(spa)) 647236884Smm continue; 648236884Smm 649236884Smm /* 650236884Smm * In addition to the pool directory object, we might 651236884Smm * create the pool properties object, the features for 652236884Smm * read object, the features for write object, or the 653236884Smm * feature descriptions object. 
654236884Smm */ 655236884Smm error = dsl_sync_task_do(spa_get_dsl(spa), NULL, 656236884Smm spa_sync_version, spa, &ver, 6); 657236884Smm if (error) 658236884Smm return (error); 659236884Smm continue; 660236884Smm } 661236884Smm 662209962Smm need_sync = B_TRUE; 663209962Smm break; 664209962Smm } 665209962Smm 666236884Smm if (need_sync) { 667209962Smm return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 668236884Smm spa, nvp, 6)); 669236884Smm } 670236884Smm 671236884Smm return (0); 672185029Spjd} 673185029Spjd 674185029Spjd/* 675185029Spjd * If the bootfs property value is dsobj, clear it. 676185029Spjd */ 677185029Spjdvoid 678185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 679185029Spjd{ 680185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 681185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 682185029Spjd spa->spa_pool_props_object, 683185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 684185029Spjd spa->spa_bootfs = 0; 685185029Spjd } 686185029Spjd} 687185029Spjd 688239620Smm/*ARGSUSED*/ 689239620Smmstatic int 690239620Smmspa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) 691239620Smm{ 692239620Smm spa_t *spa = arg1; 693239620Smm uint64_t *newguid = arg2; 694239620Smm vdev_t *rvd = spa->spa_root_vdev; 695239620Smm uint64_t vdev_state; 696239620Smm 697239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 698239620Smm vdev_state = rvd->vdev_state; 699239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 700239620Smm 701239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 702239620Smm return (ENXIO); 703239620Smm 704239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 705239620Smm 706239620Smm return (0); 707239620Smm} 708239620Smm 709239620Smmstatic void 710239620Smmspa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) 711239620Smm{ 712239620Smm spa_t *spa = arg1; 713239620Smm uint64_t *newguid = arg2; 714239620Smm uint64_t oldguid; 715239620Smm vdev_t *rvd = spa->spa_root_vdev; 
716239620Smm 717239620Smm oldguid = spa_guid(spa); 718239620Smm 719239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 720239620Smm rvd->vdev_guid = *newguid; 721239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 722239620Smm vdev_config_dirty(rvd); 723239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 724239620Smm 725239620Smm#ifdef __FreeBSD__ 726239620Smm /* 727239620Smm * TODO: until recent illumos logging changes are merged 728239620Smm * log reguid as pool property change 729239620Smm */ 730239620Smm spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, 731239620Smm "guid change old=%llu new=%llu", oldguid, *newguid); 732239620Smm#else 733239620Smm spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", 734239620Smm oldguid, *newguid); 735239620Smm#endif 736239620Smm} 737239620Smm 738185029Spjd/* 739228103Smm * Change the GUID for the pool. This is done so that we can later 740228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 741228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 742228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 743228103Smm * online when we do this, or else any vdevs that weren't present 744228103Smm * would be orphaned from our pool. We are also going to issue a 745228103Smm * sysevent to update any watchers. 
746228103Smm */ 747228103Smmint 748228103Smmspa_change_guid(spa_t *spa) 749228103Smm{ 750239620Smm int error; 751239620Smm uint64_t guid; 752228103Smm 753239620Smm mutex_enter(&spa_namespace_lock); 754239620Smm guid = spa_generate_guid(NULL); 755228103Smm 756239620Smm error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, 757239620Smm spa_change_guid_sync, spa, &guid, 5); 758228103Smm 759239620Smm if (error == 0) { 760239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 761239620Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 762239620Smm } 763228103Smm 764239620Smm mutex_exit(&spa_namespace_lock); 765228103Smm 766239620Smm return (error); 767228103Smm} 768228103Smm 769228103Smm/* 770185029Spjd * ========================================================================== 771168404Spjd * SPA state manipulation (open/create/destroy/import/export) 772168404Spjd * ========================================================================== 773168404Spjd */ 774168404Spjd 775168404Spjdstatic int 776168404Spjdspa_error_entry_compare(const void *a, const void *b) 777168404Spjd{ 778168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 779168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 780168404Spjd int ret; 781168404Spjd 782168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 783168404Spjd sizeof (zbookmark_t)); 784168404Spjd 785168404Spjd if (ret < 0) 786168404Spjd return (-1); 787168404Spjd else if (ret > 0) 788168404Spjd return (1); 789168404Spjd else 790168404Spjd return (0); 791168404Spjd} 792168404Spjd 793168404Spjd/* 794168404Spjd * Utility function which retrieves copies of the current logs and 795168404Spjd * re-initializes them in the process. 
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	/*
	 * Hand the current error trees to the caller by structure copy;
	 * the caller now owns the nodes they contain.
	 */
	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	/*
	 * Re-initialize the spa's trees so new errors accumulate into
	 * fresh, empty lists.
	 */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create one zio taskq for the given name/mode/value tuple.  Returns NULL
 * for zti_mode_null (no taskq wanted).  When SYSDC is available and the
 * pool has its own process, a duty-cycle taskq is created instead of a
 * plain per-process taskq.
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		/* 'value' is the fixed thread count; clamp to at least 1 */
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		/* thread count is a percentage of online CPUs */
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		/* caller-supplied 'value' is already a CPU percentage */
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

/*
 * Instantiate the full ZIO_TYPES x ZIO_TASKQ_TYPES grid of taskqs for this
 * spa, naming each one "<ziotype>_<queuetype>" (e.g. "write_issue").
 * Entries whose mode is zti_mode_null come back NULL from
 * spa_taskq_create() and are stored as such.
 */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
/*
 * Main entry point for the per-pool "zpool-<name>" process.  Sets up the
 * process name, optionally binds to a psrset and enters SDC scheduling,
 * creates the zio taskqs, then parks in a CPR-safe cv_wait() loop until
 * spa_deactivate() requests teardown (SPA_PROC_DEACTIVATE), at which point
 * it marks itself SPA_PROC_GONE and exits via lwp_exit().
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		/* lock order: pool_lock -> cpu_lock -> pidlock -> p_lock */
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			/* binding failure is non-fatal; just warn */
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* wake up spa_activate(), which is waiting for SPA_PROC_ACTIVE */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	/* both classes currently share the same metaslab ops vector */
	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* wait for spa_thread() to signal SPA_PROC_ACTIVE */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			/* fall back to taskq creation in this context */
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	/* caller must have already torn down syncing, the pool, and vdevs */
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			/* NULL entries are zti_mode_null slots */
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * Tell spa_thread() (if one exists) to tear down, and wait for it
	 * to acknowledge by transitioning through DEACTIVATE to GONE.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	/* leaves have no children; recursion bottoms out here */
	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	/* an interior vdev with no children array is acceptable */
	if (error == ENOENT)
		return (0);

	if (error) {
		/* on any other lookup failure, unwind the vdev we allocated */
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			/* freeing the root frees any children built so far */
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/* release the spare vdevs, their array, and their config nvlist */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* likewise for the l2cache devices (stats cleared before freeing) */
	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.
 * We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pool would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		/* spares are their own top-level vdevs in the aux list */
		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them,
 * and then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	/* detach the current list; survivors are migrated into newvdevs */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			/* hand healthy devices to the L2ARC */
			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	/*
	 * NOTE(review): on the goto path sav_count is 0, so neither the
	 * nvlist_free loop nor the kmem_free below touch l2cache here.
	 */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read a packed nvlist from the object 'obj' in the MOS: the bonus buffer
 * holds the packed size, and the object body holds the packed bytes.
 * On success '*value' is the unpacked nvlist (caller frees); on failure the
 * dmu_read()/nvlist_unpack() error is returned and '*value' stays NULL.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	/* depth-first walk of the whole subtree rooted at 'vd' */
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	/* build a second vdev tree (mrvd) from the MOS copy of the config */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd  = mrvd->vdev_child[c];

			/* report log devices present in MOS but missing here */
			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		/*
		 * NOTE(review): freed with sizeof (char **) though allocated
		 * with sizeof (nvlist_t **); the sizes are identical, so this
		 * is cosmetic only.
		 */
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			/* vdev_load() may sleep; drop the config locks */
			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static int
spa_check_logs(spa_t *spa)
{
	/* NOTE(review): no default case; other log states fall through to 0 */
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
		    DS_FIND_CHILDREN)) {
			spa_set_log_state(spa, SPA_LOG_MISSING);
			return (1);
		}
		break;
	}
	return (0);
}

/*
 * Passivate the metaslab groups of all top-level log vdevs so no further
 * allocations land on them.  Returns B_TRUE if at least one slog was found.
 */
static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

/*
 * Undo spa_passivate_log(): re-activate the metaslab groups of all
 * top-level log vdevs.
 */
static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

/*
 * Offline the ZIL for every dataset in the pool (zil_vdev_offline), then
 * sync a txg so zil_sync() can reclaim the stubby blocks.  Returns the
 * dmu_objset_find() error, 0 on success.
 */
int
spa_offline_log(spa_t *spa)
{
	int error = 0;

	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN)) == 0) {

		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

/*
 * Run spa_check_removed() over every device in an aux vdev list
 * (spares or l2cache).
 */
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

/*
 * Called for each successful ZIL-claim zio; records the highest block
 * birth txg seen so the claim can later be bounded by spa_claim_max_txg.
 */
void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

/* error tallies accumulated by spa_load_verify_done() */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;	/* failed metadata reads */
	uint64_t	sle_data_count;	/* failed data reads */
} spa_load_error_t;

/*
 * Completion callback for the verification reads issued by
 * spa_load_verify_cb(): classify any failure as metadata or data
 * (intent-log blocks count as data) and free the read buffer.
 */
static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*
 * traverse_pool() callback: issue a speculative scrub read for every block
 * pointer visited, parented to the root zio passed in 'arg'.
 */
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		/* buffer is freed in spa_load_verify_done() */
		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

/*
 * Walk the pool from spa_verify_min_txg and count unreadable blocks,
 * honoring the rewind policy in spa_config.  On an acceptable result,
 * record load/rewind statistics in spa_load_info and return 0; otherwise
 * return EIO (or the traversal error) so the caller can consider rewinding.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	/* nothing to verify if the user forbade rewind */
	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	/* absent keys leave '*val' untouched; lookup errors are ignored */
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
1846219089Spjd */ 1847168404Spjdstatic int 1848219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1849168404Spjd{ 1850219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1851219089Spjd name, sizeof (uint64_t), 1, val)); 1852219089Spjd} 1853168404Spjd 1854219089Spjdstatic int 1855219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1856219089Spjd{ 1857219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1858219089Spjd return (err); 1859219089Spjd} 1860219089Spjd 1861219089Spjd/* 1862219089Spjd * Fix up config after a partly-completed split. This is done with the 1863219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1864219089Spjd * pool have that entry in their config, but only the splitting one contains 1865219089Spjd * a list of all the guids of the vdevs that are being split off. 1866219089Spjd * 1867219089Spjd * This function determines what to do with that list: either rejoin 1868219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1869219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1870219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1871219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1872219089Spjd * then we call vdev_split() on each disk, and complete the split. 1873219089Spjd * 1874219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1875219089Spjd * the original pool. 
1876219089Spjd */ 1877219089Spjdstatic void 1878219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1879219089Spjd{ 1880219089Spjd uint_t extracted; 1881219089Spjd uint64_t *glist; 1882219089Spjd uint_t i, gcount; 1883219089Spjd nvlist_t *nvl; 1884219089Spjd vdev_t **vd; 1885219089Spjd boolean_t attempt_reopen; 1886219089Spjd 1887219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1888219089Spjd return; 1889219089Spjd 1890219089Spjd /* check that the config is complete */ 1891219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1892219089Spjd &glist, &gcount) != 0) 1893219089Spjd return; 1894219089Spjd 1895219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1896219089Spjd 1897219089Spjd /* attempt to online all the vdevs & validate */ 1898219089Spjd attempt_reopen = B_TRUE; 1899219089Spjd for (i = 0; i < gcount; i++) { 1900219089Spjd if (glist[i] == 0) /* vdev is hole */ 1901219089Spjd continue; 1902219089Spjd 1903219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1904219089Spjd if (vd[i] == NULL) { 1905219089Spjd /* 1906219089Spjd * Don't bother attempting to reopen the disks; 1907219089Spjd * just do the split. 
1908219089Spjd */ 1909219089Spjd attempt_reopen = B_FALSE; 1910219089Spjd } else { 1911219089Spjd /* attempt to re-online it */ 1912219089Spjd vd[i]->vdev_offline = B_FALSE; 1913219089Spjd } 1914219089Spjd } 1915219089Spjd 1916219089Spjd if (attempt_reopen) { 1917219089Spjd vdev_reopen(spa->spa_root_vdev); 1918219089Spjd 1919219089Spjd /* check each device to see what state it's in */ 1920219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 1921219089Spjd if (vd[i] != NULL && 1922219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1923219089Spjd break; 1924219089Spjd ++extracted; 1925219089Spjd } 1926219089Spjd } 1927219089Spjd 1928209962Smm /* 1929219089Spjd * If every disk has been moved to the new pool, or if we never 1930219089Spjd * even attempted to look at them, then we split them off for 1931219089Spjd * good. 1932209962Smm */ 1933219089Spjd if (!attempt_reopen || gcount == extracted) { 1934219089Spjd for (i = 0; i < gcount; i++) 1935219089Spjd if (vd[i] != NULL) 1936219089Spjd vdev_split(vd[i]); 1937219089Spjd vdev_reopen(spa->spa_root_vdev); 1938219089Spjd } 1939209962Smm 1940219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 1941219089Spjd} 1942185029Spjd 1943219089Spjdstatic int 1944219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1945219089Spjd boolean_t mosconfig) 1946219089Spjd{ 1947219089Spjd nvlist_t *config = spa->spa_config; 1948219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 1949228103Smm char *comment; 1950219089Spjd int error; 1951219089Spjd uint64_t pool_guid; 1952219089Spjd nvlist_t *nvl; 1953168404Spjd 1954219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1955219089Spjd return (EINVAL); 1956168404Spjd 1957228103Smm ASSERT(spa->spa_comment == NULL); 1958228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1959228103Smm spa->spa_comment = spa_strdup(comment); 1960228103Smm 1961168404Spjd /* 1962168404Spjd * Versioning wasn't explicitly added 
to the label until later, so if 1963168404Spjd * it's not present treat it as the initial version. 1964168404Spjd */ 1965219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1966219089Spjd &spa->spa_ubsync.ub_version) != 0) 1967219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1968168404Spjd 1969168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1970168404Spjd &spa->spa_config_txg); 1971168404Spjd 1972168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1973168404Spjd spa_guid_exists(pool_guid, 0)) { 1974168404Spjd error = EEXIST; 1975219089Spjd } else { 1976228103Smm spa->spa_config_guid = pool_guid; 1977219089Spjd 1978219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1979219089Spjd &nvl) == 0) { 1980219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1981219089Spjd KM_SLEEP) == 0); 1982219089Spjd } 1983219089Spjd 1984236884Smm nvlist_free(spa->spa_load_info); 1985236884Smm spa->spa_load_info = fnvlist_alloc(); 1986236884Smm 1987219089Spjd gethrestime(&spa->spa_loaded_ts); 1988219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 1989219089Spjd mosconfig, &ereport); 1990168404Spjd } 1991168404Spjd 1992219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 1993219089Spjd if (error) { 1994219089Spjd if (error != EEXIST) { 1995219089Spjd spa->spa_loaded_ts.tv_sec = 0; 1996219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 1997219089Spjd } 1998219089Spjd if (error != EBADF) { 1999219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2000219089Spjd } 2001219089Spjd } 2002219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2003219089Spjd spa->spa_ena = 0; 2004168404Spjd 2005219089Spjd return (error); 2006219089Spjd} 2007219089Spjd 2008219089Spjd/* 2009219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2010219089Spjd * source of configuration information. 
2011219089Spjd */ 2012219089Spjdstatic int 2013219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2014219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2015219089Spjd char **ereport) 2016219089Spjd{ 2017219089Spjd int error = 0; 2018219089Spjd nvlist_t *nvroot = NULL; 2019236884Smm nvlist_t *label; 2020219089Spjd vdev_t *rvd; 2021219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2022219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2023219089Spjd int orig_mode = spa->spa_mode; 2024219089Spjd int parse; 2025219089Spjd uint64_t obj; 2026236884Smm boolean_t missing_feat_write = B_FALSE; 2027219089Spjd 2028168404Spjd /* 2029219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2030219089Spjd * This prevents things like resilvering recently removed devices. 2031219089Spjd */ 2032219089Spjd if (!mosconfig) 2033219089Spjd spa->spa_mode = FREAD; 2034219089Spjd 2035219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2036219089Spjd 2037219089Spjd spa->spa_load_state = state; 2038219089Spjd 2039219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2040219089Spjd return (EINVAL); 2041219089Spjd 2042219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2043219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2044219089Spjd 2045219089Spjd /* 2046209962Smm * Create "The Godfather" zio to hold all async IOs 2047209962Smm */ 2048209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2049209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2050209962Smm 2051209962Smm /* 2052168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2053168404Spjd * value that will be returned by spa_version() since parsing the 2054168404Spjd * configuration requires knowing the version number. 
2055168404Spjd */ 2056185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2057219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2058185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2059168404Spjd 2060168404Spjd if (error != 0) 2061219089Spjd return (error); 2062168404Spjd 2063168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2064168404Spjd 2065219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2066219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2067219089Spjd } 2068219089Spjd 2069168404Spjd /* 2070168404Spjd * Try to open all vdevs, loading each label in the process. 2071168404Spjd */ 2072185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2073168926Spjd error = vdev_open(rvd); 2074185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2075168926Spjd if (error != 0) 2076219089Spjd return (error); 2077168404Spjd 2078168404Spjd /* 2079209962Smm * We need to validate the vdev labels against the configuration that 2080209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2081209962Smm * mosconfig is true then we're validating the vdev labels based on 2082219089Spjd * that config. Otherwise, we're validating against the cached config 2083209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2084209962Smm * later we will recursively call spa_load() and validate against 2085209962Smm * the vdev config. 2086219089Spjd * 2087219089Spjd * If we're assembling a new pool that's been split off from an 2088219089Spjd * existing pool, the labels haven't yet been updated so we skip 2089219089Spjd * validation for now. 
2090168404Spjd */ 2091219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2092219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2093230514Smm error = vdev_validate(rvd, mosconfig); 2094219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2095168404Spjd 2096219089Spjd if (error != 0) 2097219089Spjd return (error); 2098219089Spjd 2099219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2100219089Spjd return (ENXIO); 2101168404Spjd } 2102168404Spjd 2103168404Spjd /* 2104168404Spjd * Find the best uberblock. 2105168404Spjd */ 2106236884Smm vdev_uberblock_load(rvd, ub, &label); 2107168404Spjd 2108168404Spjd /* 2109168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2110168404Spjd */ 2111236884Smm if (ub->ub_txg == 0) { 2112236884Smm nvlist_free(label); 2113219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2114236884Smm } 2115168404Spjd 2116168404Spjd /* 2117236884Smm * If the pool has an unsupported version we can't open it. 2118168404Spjd */ 2119236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2120236884Smm nvlist_free(label); 2121219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2122236884Smm } 2123168404Spjd 2124236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2125236884Smm nvlist_t *features; 2126236884Smm 2127236884Smm /* 2128236884Smm * If we weren't able to find what's necessary for reading the 2129236884Smm * MOS in the label, return failure. 2130236884Smm */ 2131236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2132236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2133236884Smm nvlist_free(label); 2134236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2135236884Smm ENXIO)); 2136236884Smm } 2137236884Smm 2138236884Smm /* 2139236884Smm * Update our in-core representation with the definitive values 2140236884Smm * from the label. 
2141236884Smm */ 2142236884Smm nvlist_free(spa->spa_label_features); 2143236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2144236884Smm } 2145236884Smm 2146236884Smm nvlist_free(label); 2147236884Smm 2148168404Spjd /* 2149236884Smm * Look through entries in the label nvlist's features_for_read. If 2150236884Smm * there is a feature listed there which we don't understand then we 2151236884Smm * cannot open a pool. 2152236884Smm */ 2153236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2154236884Smm nvlist_t *unsup_feat; 2155236884Smm 2156236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2157236884Smm 0); 2158236884Smm 2159236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2160236884Smm NULL); nvp != NULL; 2161236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2162236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2163236884Smm VERIFY(nvlist_add_string(unsup_feat, 2164236884Smm nvpair_name(nvp), "") == 0); 2165236884Smm } 2166236884Smm } 2167236884Smm 2168236884Smm if (!nvlist_empty(unsup_feat)) { 2169236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2170236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2171236884Smm nvlist_free(unsup_feat); 2172236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2173236884Smm ENOTSUP)); 2174236884Smm } 2175236884Smm 2176236884Smm nvlist_free(unsup_feat); 2177236884Smm } 2178236884Smm 2179236884Smm /* 2180168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2181219089Spjd * incomplete configuration. We first check to see if the pool 2182219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2183219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2184219089Spjd * can handle missing vdevs. 
2185168404Spjd */ 2186219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2187219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2188219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2189219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2190219089Spjd 2191219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2192219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2193219089Spjd spa_try_repair(spa, config); 2194219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2195219089Spjd nvlist_free(spa->spa_config_splitting); 2196219089Spjd spa->spa_config_splitting = NULL; 2197168404Spjd } 2198168404Spjd 2199168404Spjd /* 2200168404Spjd * Initialize internal SPA structures. 2201168404Spjd */ 2202168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2203168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2204219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2205219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2206219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2207219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2208219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2209219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2210219089Spjd 2211236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2212219089Spjd if (error) 2213219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2214168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2215168404Spjd 2216219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2217219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2218168404Spjd 2219236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2220236884Smm boolean_t missing_feat_read = B_FALSE; 2221238926Smm nvlist_t *unsup_feat, *enabled_feat; 2222236884Smm 2223236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2224236884Smm &spa->spa_feat_for_read_obj) != 0) { 2225236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2226236884Smm } 2227236884Smm 2228236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2229236884Smm &spa->spa_feat_for_write_obj) != 0) { 2230236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2231236884Smm } 2232236884Smm 2233236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2234236884Smm &spa->spa_feat_desc_obj) != 0) { 2235236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2236236884Smm } 2237236884Smm 2238238926Smm enabled_feat = fnvlist_alloc(); 2239238926Smm unsup_feat = fnvlist_alloc(); 2240236884Smm 2241236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2242236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2243238926Smm unsup_feat, enabled_feat)) 2244236884Smm missing_feat_read = B_TRUE; 2245236884Smm 2246236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2247236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2248236884Smm spa->spa_feat_for_write_obj, 
spa->spa_feat_desc_obj, 2249238926Smm unsup_feat, enabled_feat)) { 2250236884Smm missing_feat_write = B_TRUE; 2251238926Smm } 2252236884Smm } 2253236884Smm 2254238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2255238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2256238926Smm 2257236884Smm if (!nvlist_empty(unsup_feat)) { 2258238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2259238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2260236884Smm } 2261236884Smm 2262238926Smm fnvlist_free(enabled_feat); 2263238926Smm fnvlist_free(unsup_feat); 2264236884Smm 2265236884Smm if (!missing_feat_read) { 2266236884Smm fnvlist_add_boolean(spa->spa_load_info, 2267236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2268236884Smm } 2269236884Smm 2270236884Smm /* 2271236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2272236884Smm * twofold: to determine whether the pool is available for 2273236884Smm * import in read-write mode and (if it is not) whether the 2274236884Smm * pool is available for import in read-only mode. If the pool 2275236884Smm * is available for import in read-write mode, it is displayed 2276236884Smm * as available in userland; if it is not available for import 2277236884Smm * in read-only mode, it is displayed as unavailable in 2278236884Smm * userland. If the pool is available for import in read-only 2279236884Smm * mode but not read-write mode, it is displayed as unavailable 2280236884Smm * in userland with a special note that the pool is actually 2281236884Smm * available for open in read-only mode. 2282236884Smm * 2283236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2284236884Smm * missing a feature for write, we must first determine whether 2285236884Smm * the pool can be opened read-only before returning to 2286236884Smm * userland in order to know whether to display the 2287236884Smm * abovementioned note. 
2288236884Smm */ 2289236884Smm if (missing_feat_read || (missing_feat_write && 2290236884Smm spa_writeable(spa))) { 2291236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2292236884Smm ENOTSUP)); 2293236884Smm } 2294236884Smm } 2295236884Smm 2296236884Smm spa->spa_is_initializing = B_TRUE; 2297236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2298236884Smm spa->spa_is_initializing = B_FALSE; 2299236884Smm if (error != 0) 2300236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2301236884Smm 2302168404Spjd if (!mosconfig) { 2303168498Spjd uint64_t hostid; 2304219089Spjd nvlist_t *policy = NULL, *nvconfig; 2305168404Spjd 2306219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2307219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2308168404Spjd 2309219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2310185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2311168498Spjd char *hostname; 2312168498Spjd unsigned long myhostid = 0; 2313168498Spjd 2314219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2315168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2316168498Spjd 2317219089Spjd#ifdef _KERNEL 2318219089Spjd myhostid = zone_get_hostid(NULL); 2319219089Spjd#else /* _KERNEL */ 2320219089Spjd /* 2321219089Spjd * We're emulating the system's hostid in userland, so 2322219089Spjd * we can't use zone_get_hostid(). 2323219089Spjd */ 2324168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2325219089Spjd#endif /* _KERNEL */ 2326204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2327219089Spjd hostid != myhostid) { 2328219089Spjd nvlist_free(nvconfig); 2329168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2330168498Spjd "loaded as it was last accessed by " 2331185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2332236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2333185029Spjd spa_name(spa), hostname, 2334168498Spjd (unsigned long)hostid); 2335219089Spjd return (EBADF); 2336168498Spjd } 2337168498Spjd } 2338219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2339219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2340219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2341219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2342168498Spjd 2343219089Spjd spa_config_set(spa, nvconfig); 2344168404Spjd spa_unload(spa); 2345168404Spjd spa_deactivate(spa); 2346209962Smm spa_activate(spa, orig_mode); 2347168404Spjd 2348219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2349168404Spjd } 2350168404Spjd 2351219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2352219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2353219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2354219089Spjd if (error != 0) 2355219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2356168404Spjd 2357168404Spjd /* 2358168404Spjd * Load the bit that tells us to use the new accounting function 2359168404Spjd * (raid-z deflation). If we have an older pool, this will not 2360168404Spjd * be present. 2361168404Spjd */ 2362219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2363219089Spjd if (error != 0 && error != ENOENT) 2364219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2365168404Spjd 2366219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2367219089Spjd &spa->spa_creation_version); 2368219089Spjd if (error != 0 && error != ENOENT) 2369219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2370219089Spjd 2371168404Spjd /* 2372168404Spjd * Load the persistent error log. If we have an older pool, this will 2373168404Spjd * not be present. 
2374168404Spjd */ 2375219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2376219089Spjd if (error != 0 && error != ENOENT) 2377219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2378168404Spjd 2379219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2380219089Spjd &spa->spa_errlog_scrub); 2381219089Spjd if (error != 0 && error != ENOENT) 2382219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2383168404Spjd 2384168404Spjd /* 2385168404Spjd * Load the history object. If we have an older pool, this 2386168404Spjd * will not be present. 2387168404Spjd */ 2388219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2389219089Spjd if (error != 0 && error != ENOENT) 2390219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2391168404Spjd 2392168404Spjd /* 2393219089Spjd * If we're assembling the pool from the split-off vdevs of 2394219089Spjd * an existing pool, we don't want to attach the spares & cache 2395219089Spjd * devices. 2396219089Spjd */ 2397219089Spjd 2398219089Spjd /* 2399168404Spjd * Load any hot spares for this pool. 
2400168404Spjd */ 2401219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2402219089Spjd if (error != 0 && error != ENOENT) 2403219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2404219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2405185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2406185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2407219089Spjd &spa->spa_spares.sav_config) != 0) 2408219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2409168404Spjd 2410185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2411168404Spjd spa_load_spares(spa); 2412185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2413219089Spjd } else if (error == 0) { 2414219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2415168404Spjd } 2416168404Spjd 2417185029Spjd /* 2418185029Spjd * Load any level 2 ARC devices for this pool. 2419185029Spjd */ 2420219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2421185029Spjd &spa->spa_l2cache.sav_object); 2422219089Spjd if (error != 0 && error != ENOENT) 2423219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2424219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2425185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2426185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2427219089Spjd &spa->spa_l2cache.sav_config) != 0) 2428219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2429185029Spjd 2430185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2431185029Spjd spa_load_l2cache(spa); 2432185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2433219089Spjd } else if (error == 0) { 2434219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2435185029Spjd } 2436185029Spjd 2437219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2438213197Smm 2439219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2440219089Spjd if (error && error != ENOENT) 
2441219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2442185029Spjd 2443219089Spjd if (error == 0) { 2444219089Spjd uint64_t autoreplace; 2445185029Spjd 2446219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2447219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2448219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2449219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2450219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2451219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2452219089Spjd &spa->spa_dedup_ditto); 2453185029Spjd 2454219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2455168404Spjd } 2456168404Spjd 2457168404Spjd /* 2458185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2459185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2460185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2461185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2462185029Spjd * over. 2463185029Spjd */ 2464219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2465185029Spjd spa_check_removed(spa->spa_root_vdev); 2466219089Spjd /* 2467219089Spjd * For the import case, this is done in spa_import(), because 2468219089Spjd * at this point we're using the spare definitions from 2469219089Spjd * the MOS config, not necessarily from the userland config. 2470219089Spjd */ 2471219089Spjd if (state != SPA_LOAD_IMPORT) { 2472219089Spjd spa_aux_check_removed(&spa->spa_spares); 2473219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2474219089Spjd } 2475219089Spjd } 2476185029Spjd 2477185029Spjd /* 2478168404Spjd * Load the vdev state for all toplevel vdevs. 2479168404Spjd */ 2480168404Spjd vdev_load(rvd); 2481168404Spjd 2482168404Spjd /* 2483168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2484168404Spjd */ 2485185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2486168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2487185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2488168404Spjd 2489168404Spjd /* 2490219089Spjd * Load the DDTs (dedup tables). 2491168404Spjd */ 2492219089Spjd error = ddt_load(spa); 2493219089Spjd if (error != 0) 2494219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2495219089Spjd 2496219089Spjd spa_update_dspace(spa); 2497219089Spjd 2498219089Spjd /* 2499219089Spjd * Validate the config, using the MOS config to fill in any 2500219089Spjd * information which might be missing. If we fail to validate 2501219089Spjd * the config then declare the pool unfit for use. If we're 2502219089Spjd * assembling a pool from a split, the log is not transferred 2503219089Spjd * over. 2504219089Spjd */ 2505219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2506219089Spjd nvlist_t *nvconfig; 2507219089Spjd 2508219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2509219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2510219089Spjd 2511219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2512219089Spjd nvlist_free(nvconfig); 2513219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2514219089Spjd ENXIO)); 2515219089Spjd } 2516219089Spjd nvlist_free(nvconfig); 2517219089Spjd 2518219089Spjd /* 2519236884Smm * Now that we've validated the config, check the state of the 2520219089Spjd * root vdev. If it can't be opened, it indicates one or 2521219089Spjd * more toplevel vdevs are faulted. 
2522219089Spjd */ 2523219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2524219089Spjd return (ENXIO); 2525219089Spjd 2526219089Spjd if (spa_check_logs(spa)) { 2527219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2528219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2529219089Spjd } 2530168404Spjd } 2531168404Spjd 2532236884Smm if (missing_feat_write) { 2533236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2534236884Smm 2535236884Smm /* 2536236884Smm * At this point, we know that we can open the pool in 2537236884Smm * read-only mode but not read-write mode. We now have enough 2538236884Smm * information and can return to userland. 2539236884Smm */ 2540236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2541236884Smm } 2542236884Smm 2543219089Spjd /* 2544219089Spjd * We've successfully opened the pool, verify that we're ready 2545219089Spjd * to start pushing transactions. 2546219089Spjd */ 2547219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2548219089Spjd if (error = spa_load_verify(spa)) 2549219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2550219089Spjd error)); 2551219089Spjd } 2552219089Spjd 2553219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2554219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2555168404Spjd dmu_tx_t *tx; 2556168404Spjd int need_update = B_FALSE; 2557168404Spjd 2558209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2559209962Smm 2560168404Spjd /* 2561168404Spjd * Claim log blocks that haven't been committed yet. 2562168404Spjd * This must all happen in a single txg. 2563219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2564219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2565219089Spjd * Price of rollback is that we abandon the log. 
2566168404Spjd */ 2567219089Spjd spa->spa_claiming = B_TRUE; 2568219089Spjd 2569168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2570168404Spjd spa_first_txg(spa)); 2571185029Spjd (void) dmu_objset_find(spa_name(spa), 2572168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2573168404Spjd dmu_tx_commit(tx); 2574168404Spjd 2575219089Spjd spa->spa_claiming = B_FALSE; 2576219089Spjd 2577219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2578168404Spjd spa->spa_sync_on = B_TRUE; 2579168404Spjd txg_sync_start(spa->spa_dsl_pool); 2580168404Spjd 2581168404Spjd /* 2582219089Spjd * Wait for all claims to sync. We sync up to the highest 2583219089Spjd * claimed log block birth time so that claimed log blocks 2584219089Spjd * don't appear to be from the future. spa_claim_max_txg 2585219089Spjd * will have been set for us by either zil_check_log_chain() 2586219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2587168404Spjd */ 2588219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2589168404Spjd 2590168404Spjd /* 2591168404Spjd * If the config cache is stale, or we have uninitialized 2592168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2593209962Smm * 2594219089Spjd * If this is a verbatim import, trust the current 2595209962Smm * in-core spa_config and update the disk labels. 2596168404Spjd */ 2597168404Spjd if (config_cache_txg != spa->spa_config_txg || 2598219089Spjd state == SPA_LOAD_IMPORT || 2599219089Spjd state == SPA_LOAD_RECOVER || 2600219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2601168404Spjd need_update = B_TRUE; 2602168404Spjd 2603209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2604168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2605168404Spjd need_update = B_TRUE; 2606168404Spjd 2607168404Spjd /* 2608168404Spjd * Update the config cache asychronously in case we're the 2609168404Spjd * root pool, in which case the config cache isn't writable yet. 
2610168404Spjd */ 2611168404Spjd if (need_update) 2612168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2613208683Spjd 2614208683Spjd /* 2615208683Spjd * Check all DTLs to see if anything needs resilvering. 2616208683Spjd */ 2617219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2618219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2619208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2620219089Spjd 2621219089Spjd /* 2622219089Spjd * Delete any inconsistent datasets. 2623219089Spjd */ 2624219089Spjd (void) dmu_objset_find(spa_name(spa), 2625219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2626219089Spjd 2627219089Spjd /* 2628219089Spjd * Clean up any stale temporary dataset userrefs. 2629219089Spjd */ 2630219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2631168404Spjd } 2632168404Spjd 2633219089Spjd return (0); 2634219089Spjd} 2635168404Spjd 2636219089Spjdstatic int 2637219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2638219089Spjd{ 2639219089Spjd int mode = spa->spa_mode; 2640219089Spjd 2641219089Spjd spa_unload(spa); 2642219089Spjd spa_deactivate(spa); 2643219089Spjd 2644219089Spjd spa->spa_load_max_txg--; 2645219089Spjd 2646219089Spjd spa_activate(spa, mode); 2647219089Spjd spa_async_suspend(spa); 2648219089Spjd 2649219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2650168404Spjd} 2651168404Spjd 2652236884Smm/* 2653236884Smm * If spa_load() fails this function will try loading prior txg's. If 2654236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2655236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2656236884Smm * function will not rewind the pool and will return the same error as 2657236884Smm * spa_load(). 
2658236884Smm */ 2659219089Spjdstatic int 2660219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2661219089Spjd uint64_t max_request, int rewind_flags) 2662219089Spjd{ 2663236884Smm nvlist_t *loadinfo = NULL; 2664219089Spjd nvlist_t *config = NULL; 2665219089Spjd int load_error, rewind_error; 2666219089Spjd uint64_t safe_rewind_txg; 2667219089Spjd uint64_t min_txg; 2668219089Spjd 2669219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2670219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2671219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2672219089Spjd } else { 2673219089Spjd spa->spa_load_max_txg = max_request; 2674219089Spjd } 2675219089Spjd 2676219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2677219089Spjd mosconfig); 2678219089Spjd if (load_error == 0) 2679219089Spjd return (0); 2680219089Spjd 2681219089Spjd if (spa->spa_root_vdev != NULL) 2682219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2683219089Spjd 2684219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2685219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2686219089Spjd 2687219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2688219089Spjd nvlist_free(config); 2689219089Spjd return (load_error); 2690219089Spjd } 2691219089Spjd 2692236884Smm if (state == SPA_LOAD_RECOVER) { 2693236884Smm /* Price of rolling back is discarding txgs, including log */ 2694219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2695236884Smm } else { 2696236884Smm /* 2697236884Smm * If we aren't rolling back save the load info from our first 2698236884Smm * import attempt so that we can restore it after attempting 2699236884Smm * to rewind. 
2700236884Smm */ 2701236884Smm loadinfo = spa->spa_load_info; 2702236884Smm spa->spa_load_info = fnvlist_alloc(); 2703236884Smm } 2704219089Spjd 2705219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2706219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2707219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2708219089Spjd TXG_INITIAL : safe_rewind_txg; 2709219089Spjd 2710219089Spjd /* 2711219089Spjd * Continue as long as we're finding errors, we're still within 2712219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2713219089Spjd */ 2714219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2715219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2716219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2717219089Spjd spa->spa_extreme_rewind = B_TRUE; 2718219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2719219089Spjd } 2720219089Spjd 2721219089Spjd spa->spa_extreme_rewind = B_FALSE; 2722219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2723219089Spjd 2724219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2725219089Spjd spa_config_set(spa, config); 2726219089Spjd 2727236884Smm if (state == SPA_LOAD_RECOVER) { 2728236884Smm ASSERT3P(loadinfo, ==, NULL); 2729236884Smm return (rewind_error); 2730236884Smm } else { 2731236884Smm /* Store the rewind info as part of the initial load info */ 2732236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2733236884Smm spa->spa_load_info); 2734236884Smm 2735236884Smm /* Restore the initial load info */ 2736236884Smm fnvlist_free(spa->spa_load_info); 2737236884Smm spa->spa_load_info = loadinfo; 2738236884Smm 2739236884Smm return (load_error); 2740236884Smm } 2741219089Spjd} 2742219089Spjd 2743168404Spjd/* 2744168404Spjd * Pool Open/Import 2745168404Spjd * 2746168404Spjd * The import case is identical to an open except that the configuration is sent 2747168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2748168404Spjd * case of an open, the pool configuration will exist in the 2749185029Spjd * POOL_STATE_UNINITIALIZED state. 2750168404Spjd * 2751168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2752168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2753168404Spjd * ambiguous state. 2754168404Spjd */ 2755168404Spjdstatic int 2756219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2757219089Spjd nvlist_t **config) 2758168404Spjd{ 2759168404Spjd spa_t *spa; 2760219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2761168404Spjd int error; 2762168404Spjd int locked = B_FALSE; 2763219089Spjd int firstopen = B_FALSE; 2764168404Spjd 2765168404Spjd *spapp = NULL; 2766168404Spjd 2767168404Spjd /* 2768168404Spjd * As disgusting as this is, we need to support recursive calls to this 2769168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2770168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2771168404Spjd * avoid dsl_dir_open() calling this in the first place. 2772168404Spjd */ 2773168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2774168404Spjd mutex_enter(&spa_namespace_lock); 2775168404Spjd locked = B_TRUE; 2776168404Spjd } 2777168404Spjd 2778168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2779168404Spjd if (locked) 2780168404Spjd mutex_exit(&spa_namespace_lock); 2781168404Spjd return (ENOENT); 2782168404Spjd } 2783219089Spjd 2784168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2785219089Spjd zpool_rewind_policy_t policy; 2786168404Spjd 2787219089Spjd firstopen = B_TRUE; 2788219089Spjd 2789219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2790219089Spjd &policy); 2791219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2792219089Spjd state = SPA_LOAD_RECOVER; 2793219089Spjd 2794209962Smm spa_activate(spa, spa_mode_global); 2795168404Spjd 2796219089Spjd if (state != SPA_LOAD_RECOVER) 2797219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2798168404Spjd 2799219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2800219089Spjd policy.zrp_request); 2801219089Spjd 2802168404Spjd if (error == EBADF) { 2803168404Spjd /* 2804168404Spjd * If vdev_validate() returns failure (indicated by 2805168404Spjd * EBADF), it indicates that one of the vdevs indicates 2806168404Spjd * that the pool has been exported or destroyed. If 2807168404Spjd * this is the case, the config cache is out of sync and 2808168404Spjd * we should remove the pool from the namespace. 2809168404Spjd */ 2810168404Spjd spa_unload(spa); 2811168404Spjd spa_deactivate(spa); 2812185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2813168404Spjd spa_remove(spa); 2814168404Spjd if (locked) 2815168404Spjd mutex_exit(&spa_namespace_lock); 2816168404Spjd return (ENOENT); 2817168404Spjd } 2818168404Spjd 2819168404Spjd if (error) { 2820168404Spjd /* 2821168404Spjd * We can't open the pool, but we still have useful 2822168404Spjd * information: the state of each vdev after the 2823168404Spjd * attempted vdev_open(). Return this to the user. 
2824168404Spjd */ 2825219089Spjd if (config != NULL && spa->spa_config) { 2826219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2827219089Spjd KM_SLEEP) == 0); 2828219089Spjd VERIFY(nvlist_add_nvlist(*config, 2829219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2830219089Spjd spa->spa_load_info) == 0); 2831219089Spjd } 2832168404Spjd spa_unload(spa); 2833168404Spjd spa_deactivate(spa); 2834219089Spjd spa->spa_last_open_failed = error; 2835168404Spjd if (locked) 2836168404Spjd mutex_exit(&spa_namespace_lock); 2837168404Spjd *spapp = NULL; 2838168404Spjd return (error); 2839168404Spjd } 2840168404Spjd } 2841168404Spjd 2842168404Spjd spa_open_ref(spa, tag); 2843185029Spjd 2844219089Spjd if (config != NULL) 2845219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2846219089Spjd 2847219089Spjd /* 2848219089Spjd * If we've recovered the pool, pass back any information we 2849219089Spjd * gathered while doing the load. 2850219089Spjd */ 2851219089Spjd if (state == SPA_LOAD_RECOVER) { 2852219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2853219089Spjd spa->spa_load_info) == 0); 2854219089Spjd } 2855219089Spjd 2856219089Spjd if (locked) { 2857219089Spjd spa->spa_last_open_failed = 0; 2858219089Spjd spa->spa_last_ubsync_txg = 0; 2859219089Spjd spa->spa_load_txg = 0; 2860168404Spjd mutex_exit(&spa_namespace_lock); 2861219089Spjd#ifdef __FreeBSD__ 2862219089Spjd#ifdef _KERNEL 2863219089Spjd if (firstopen) 2864219089Spjd zvol_create_minors(pool); 2865219089Spjd#endif 2866219089Spjd#endif 2867219089Spjd } 2868168404Spjd 2869168404Spjd *spapp = spa; 2870168404Spjd 2871168404Spjd return (0); 2872168404Spjd} 2873168404Spjd 2874168404Spjdint 2875219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2876219089Spjd nvlist_t **config) 2877219089Spjd{ 2878219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2879219089Spjd} 2880219089Spjd 2881219089Spjdint 2882168404Spjdspa_open(const char *name, spa_t **spapp, 
void *tag) 2883168404Spjd{ 2884219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2885168404Spjd} 2886168404Spjd 2887168404Spjd/* 2888168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 2889168404Spjd * preventing it from being exported or destroyed. 2890168404Spjd */ 2891168404Spjdspa_t * 2892168404Spjdspa_inject_addref(char *name) 2893168404Spjd{ 2894168404Spjd spa_t *spa; 2895168404Spjd 2896168404Spjd mutex_enter(&spa_namespace_lock); 2897168404Spjd if ((spa = spa_lookup(name)) == NULL) { 2898168404Spjd mutex_exit(&spa_namespace_lock); 2899168404Spjd return (NULL); 2900168404Spjd } 2901168404Spjd spa->spa_inject_ref++; 2902168404Spjd mutex_exit(&spa_namespace_lock); 2903168404Spjd 2904168404Spjd return (spa); 2905168404Spjd} 2906168404Spjd 2907168404Spjdvoid 2908168404Spjdspa_inject_delref(spa_t *spa) 2909168404Spjd{ 2910168404Spjd mutex_enter(&spa_namespace_lock); 2911168404Spjd spa->spa_inject_ref--; 2912168404Spjd mutex_exit(&spa_namespace_lock); 2913168404Spjd} 2914168404Spjd 2915185029Spjd/* 2916185029Spjd * Add spares device information to the nvlist. 
2917185029Spjd */ 2918168404Spjdstatic void 2919168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 2920168404Spjd{ 2921168404Spjd nvlist_t **spares; 2922168404Spjd uint_t i, nspares; 2923168404Spjd nvlist_t *nvroot; 2924168404Spjd uint64_t guid; 2925168404Spjd vdev_stat_t *vs; 2926168404Spjd uint_t vsc; 2927168404Spjd uint64_t pool; 2928168404Spjd 2929209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2930209962Smm 2931185029Spjd if (spa->spa_spares.sav_count == 0) 2932168404Spjd return; 2933168404Spjd 2934168404Spjd VERIFY(nvlist_lookup_nvlist(config, 2935168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2936185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2937168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2938168404Spjd if (nspares != 0) { 2939168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2940168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2941168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2942168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2943168404Spjd 2944168404Spjd /* 2945168404Spjd * Go through and find any spares which have since been 2946168404Spjd * repurposed as an active spare. If this is the case, update 2947168404Spjd * their status appropriately. 2948168404Spjd */ 2949168404Spjd for (i = 0; i < nspares; i++) { 2950168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2951168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2952185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 2953185029Spjd pool != 0ULL) { 2954168404Spjd VERIFY(nvlist_lookup_uint64_array( 2955219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 2956168404Spjd (uint64_t **)&vs, &vsc) == 0); 2957168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 2958168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 2959168404Spjd } 2960168404Spjd } 2961168404Spjd } 2962168404Spjd} 2963168404Spjd 2964185029Spjd/* 2965185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
2966185029Spjd */ 2967185029Spjdstatic void 2968185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 2969185029Spjd{ 2970185029Spjd nvlist_t **l2cache; 2971185029Spjd uint_t i, j, nl2cache; 2972185029Spjd nvlist_t *nvroot; 2973185029Spjd uint64_t guid; 2974185029Spjd vdev_t *vd; 2975185029Spjd vdev_stat_t *vs; 2976185029Spjd uint_t vsc; 2977185029Spjd 2978209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2979209962Smm 2980185029Spjd if (spa->spa_l2cache.sav_count == 0) 2981185029Spjd return; 2982185029Spjd 2983185029Spjd VERIFY(nvlist_lookup_nvlist(config, 2984185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2985185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2986185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2987185029Spjd if (nl2cache != 0) { 2988185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2989185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2990185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2991185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2992185029Spjd 2993185029Spjd /* 2994185029Spjd * Update level 2 cache device stats. 
2995185029Spjd */ 2996185029Spjd 2997185029Spjd for (i = 0; i < nl2cache; i++) { 2998185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 2999185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3000185029Spjd 3001185029Spjd vd = NULL; 3002185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3003185029Spjd if (guid == 3004185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3005185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3006185029Spjd break; 3007185029Spjd } 3008185029Spjd } 3009185029Spjd ASSERT(vd != NULL); 3010185029Spjd 3011185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3012219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3013219089Spjd == 0); 3014185029Spjd vdev_get_stats(vd, vs); 3015185029Spjd } 3016185029Spjd } 3017185029Spjd} 3018185029Spjd 3019236884Smmstatic void 3020236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3021236884Smm{ 3022236884Smm nvlist_t *features; 3023236884Smm zap_cursor_t zc; 3024236884Smm zap_attribute_t za; 3025236884Smm 3026236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3027236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3028236884Smm 3029236884Smm if (spa->spa_feat_for_read_obj != 0) { 3030236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3031236884Smm spa->spa_feat_for_read_obj); 3032236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3033236884Smm zap_cursor_advance(&zc)) { 3034236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3035236884Smm za.za_num_integers == 1); 3036236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3037236884Smm za.za_first_integer)); 3038236884Smm } 3039236884Smm zap_cursor_fini(&zc); 3040236884Smm } 3041236884Smm 3042236884Smm if (spa->spa_feat_for_write_obj != 0) { 3043236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3044236884Smm spa->spa_feat_for_write_obj); 3045236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3046236884Smm zap_cursor_advance(&zc)) { 3047236884Smm ASSERT(za.za_integer_length 
== sizeof (uint64_t) && 3048236884Smm za.za_num_integers == 1); 3049236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3050236884Smm za.za_first_integer)); 3051236884Smm } 3052236884Smm zap_cursor_fini(&zc); 3053236884Smm } 3054236884Smm 3055236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3056236884Smm features) == 0); 3057236884Smm nvlist_free(features); 3058236884Smm} 3059236884Smm 3060168404Spjdint 3061236884Smmspa_get_stats(const char *name, nvlist_t **config, 3062236884Smm char *altroot, size_t buflen) 3063168404Spjd{ 3064168404Spjd int error; 3065168404Spjd spa_t *spa; 3066168404Spjd 3067168404Spjd *config = NULL; 3068219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3069168404Spjd 3070209962Smm if (spa != NULL) { 3071209962Smm /* 3072209962Smm * This still leaves a window of inconsistency where the spares 3073209962Smm * or l2cache devices could change and the config would be 3074209962Smm * self-inconsistent. 3075209962Smm */ 3076209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3077168404Spjd 3078209962Smm if (*config != NULL) { 3079219089Spjd uint64_t loadtimes[2]; 3080219089Spjd 3081219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3082219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3083219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3084219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3085219089Spjd 3086185029Spjd VERIFY(nvlist_add_uint64(*config, 3087209962Smm ZPOOL_CONFIG_ERRCOUNT, 3088209962Smm spa_get_errlog_size(spa)) == 0); 3089185029Spjd 3090209962Smm if (spa_suspended(spa)) 3091209962Smm VERIFY(nvlist_add_uint64(*config, 3092209962Smm ZPOOL_CONFIG_SUSPENDED, 3093209962Smm spa->spa_failmode) == 0); 3094209962Smm 3095209962Smm spa_add_spares(spa, *config); 3096209962Smm spa_add_l2cache(spa, *config); 3097236884Smm spa_add_feature_stats(spa, *config); 3098209962Smm } 3099168404Spjd } 3100168404Spjd 3101168404Spjd /* 3102168404Spjd * We want to get the alternate 
root even for faulted pools, so we cheat 3103168404Spjd * and call spa_lookup() directly. 3104168404Spjd */ 3105168404Spjd if (altroot) { 3106168404Spjd if (spa == NULL) { 3107168404Spjd mutex_enter(&spa_namespace_lock); 3108168404Spjd spa = spa_lookup(name); 3109168404Spjd if (spa) 3110168404Spjd spa_altroot(spa, altroot, buflen); 3111168404Spjd else 3112168404Spjd altroot[0] = '\0'; 3113168404Spjd spa = NULL; 3114168404Spjd mutex_exit(&spa_namespace_lock); 3115168404Spjd } else { 3116168404Spjd spa_altroot(spa, altroot, buflen); 3117168404Spjd } 3118168404Spjd } 3119168404Spjd 3120209962Smm if (spa != NULL) { 3121209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3122168404Spjd spa_close(spa, FTAG); 3123209962Smm } 3124168404Spjd 3125168404Spjd return (error); 3126168404Spjd} 3127168404Spjd 3128168404Spjd/* 3129185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3130185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3131185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3132185029Spjd * specified, as long as they are well-formed. 3133168404Spjd */ 3134168404Spjdstatic int 3135185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3136185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3137185029Spjd vdev_labeltype_t label) 3138168404Spjd{ 3139185029Spjd nvlist_t **dev; 3140185029Spjd uint_t i, ndev; 3141168404Spjd vdev_t *vd; 3142168404Spjd int error; 3143168404Spjd 3144185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3145185029Spjd 3146168404Spjd /* 3147185029Spjd * It's acceptable to have no devs specified. 
3148168404Spjd */ 3149185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3150168404Spjd return (0); 3151168404Spjd 3152185029Spjd if (ndev == 0) 3153168404Spjd return (EINVAL); 3154168404Spjd 3155168404Spjd /* 3156185029Spjd * Make sure the pool is formatted with a version that supports this 3157185029Spjd * device type. 3158168404Spjd */ 3159185029Spjd if (spa_version(spa) < version) 3160168404Spjd return (ENOTSUP); 3161168404Spjd 3162168404Spjd /* 3163185029Spjd * Set the pending device list so we correctly handle device in-use 3164168404Spjd * checking. 3165168404Spjd */ 3166185029Spjd sav->sav_pending = dev; 3167185029Spjd sav->sav_npending = ndev; 3168168404Spjd 3169185029Spjd for (i = 0; i < ndev; i++) { 3170185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3171168404Spjd mode)) != 0) 3172168404Spjd goto out; 3173168404Spjd 3174168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3175168404Spjd vdev_free(vd); 3176168404Spjd error = EINVAL; 3177168404Spjd goto out; 3178168404Spjd } 3179168404Spjd 3180185029Spjd /* 3181185029Spjd * The L2ARC currently only supports disk devices in 3182185029Spjd * kernel context. For user-level testing, we allow it. 
3183185029Spjd */ 3184185029Spjd#ifdef _KERNEL 3185185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3186185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3187185029Spjd error = ENOTBLK; 3188230514Smm vdev_free(vd); 3189185029Spjd goto out; 3190185029Spjd } 3191185029Spjd#endif 3192168404Spjd vd->vdev_top = vd; 3193168404Spjd 3194168404Spjd if ((error = vdev_open(vd)) == 0 && 3195185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3196185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3197168404Spjd vd->vdev_guid) == 0); 3198168404Spjd } 3199168404Spjd 3200168404Spjd vdev_free(vd); 3201168404Spjd 3202185029Spjd if (error && 3203185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3204168404Spjd goto out; 3205168404Spjd else 3206168404Spjd error = 0; 3207168404Spjd } 3208168404Spjd 3209168404Spjdout: 3210185029Spjd sav->sav_pending = NULL; 3211185029Spjd sav->sav_npending = 0; 3212168404Spjd return (error); 3213168404Spjd} 3214168404Spjd 3215185029Spjdstatic int 3216185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3217185029Spjd{ 3218185029Spjd int error; 3219185029Spjd 3220185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3221185029Spjd 3222185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3223185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3224185029Spjd VDEV_LABEL_SPARE)) != 0) { 3225185029Spjd return (error); 3226185029Spjd } 3227185029Spjd 3228185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3229185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3230185029Spjd VDEV_LABEL_L2CACHE)); 3231185029Spjd} 3232185029Spjd 3233185029Spjdstatic void 3234185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3235185029Spjd const char *config) 3236185029Spjd{ 3237185029Spjd int i; 3238185029Spjd 3239185029Spjd if (sav->sav_config != NULL) { 
3240185029Spjd nvlist_t **olddevs; 3241185029Spjd uint_t oldndevs; 3242185029Spjd nvlist_t **newdevs; 3243185029Spjd 3244185029Spjd /* 3245185029Spjd * Generate new dev list by concatentating with the 3246185029Spjd * current dev list. 3247185029Spjd */ 3248185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3249185029Spjd &olddevs, &oldndevs) == 0); 3250185029Spjd 3251185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3252185029Spjd (ndevs + oldndevs), KM_SLEEP); 3253185029Spjd for (i = 0; i < oldndevs; i++) 3254185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3255185029Spjd KM_SLEEP) == 0); 3256185029Spjd for (i = 0; i < ndevs; i++) 3257185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3258185029Spjd KM_SLEEP) == 0); 3259185029Spjd 3260185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3261185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3262185029Spjd 3263185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3264185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3265185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3266185029Spjd nvlist_free(newdevs[i]); 3267185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3268185029Spjd } else { 3269185029Spjd /* 3270185029Spjd * Generate a new dev list. 
 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}

/*
 * Stop and drop level 2 ARC devices
 */
void
spa_l2cache_drop(spa_t *spa)
{
	vdev_t *vd;
	int i;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	for (i = 0; i < sav->sav_count; i++) {
		uint64_t pool;

		vd = sav->sav_vdevs[i];
		ASSERT(vd != NULL);

		/*
		 * Only remove the vdev from the ARC if it is still known to
		 * the l2cache subsystem and actually present in the l2arc.
		 */
		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
		    pool != 0ULL && l2arc_vdev_present(vd))
			l2arc_remove_vdev(vd);
	}
}

/*
 * Pool Creation
 *
 * Create a new pool named 'pool' from the vdev tree in 'nvroot', applying
 * pool properties from 'props' and root-dataset properties from 'zplprops'.
 * Returns 0 on success or an errno value (EEXIST if the name is already in
 * use).  Acquires and releases spa_namespace_lock.
 */
int
spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
    const char *history_str, nvlist_t *zplprops)
{
	spa_t *spa;
	char *altroot = NULL;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;
	uint64_t version, obj;
	boolean_t has_features;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, NULL, altroot);
	spa_activate(spa, spa_mode_global);

	if (props && (error = spa_prop_validate(spa, props))) {
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Scan the property list for feature@... properties; their presence
	 * forces the pool to be created at the current SPA_VERSION.
	 */
	has_features = B_FALSE;
	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
		if (zpool_prop_feature(nvpair_name(elem)))
			has_features = B_TRUE;
	}

	if (has_features || nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
		version = SPA_VERSION;
	}
	ASSERT(SPA_VERSION_IS_SUPPORTED(version));

	spa->spa_first_txg = txg;
	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = version;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && !zfs_allocatable_devs(nvroot))
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_aux(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_metaslab_set_size(rvd->vdev_child[c]);
			vdev_expand(rvd->vdev_child[c], txg);
		}
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	/*
	 * Get the list of level 2 cache devices, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	spa->spa_is_initializing = B_TRUE;
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;
	spa->spa_is_initializing = B_FALSE;

	/*
	 * Create DDTs (dedup tables).
	 */
	ddt_create(spa);

	spa_update_dspace(spa);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	if (spa_version(spa) >= SPA_VERSION_FEATURES)
		spa_feature_create_zap_objects(spa, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
	    sizeof (uint64_t), 1, &version, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool version");
	}

	/* Newly created pools with the right version are always deflated. */
	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
		spa->spa_deflate = TRUE;
		if (zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
			cmn_err(CE_PANIC, "failed to add deflate");
		}
	}

	/*
	 * Create the deferred-free bpobj.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, obj,
	    ZIO_COMPRESS_OFF, tx);
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
	    sizeof (uint64_t), 1, &obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bpobj");
	}
	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
	    spa->spa_meta_objset, obj));

	/*
	 * Create the pool's history object.
	 */
	if (version >= SPA_VERSION_ZPOOL_HISTORY)
		spa_history_create_obj(spa, tx);

	/*
	 * Set pool properties.
	 */
	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);

	if (props != NULL) {
		spa_configfile_set(spa, props, B_FALSE);
		spa_sync_props(spa, props, tx);
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync(spa, B_FALSE, B_TRUE);

	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
	spa_history_log_version(spa, LOG_POOL_CREATE);

	spa->spa_minref = refcount_count(&spa->spa_refcount);

	mutex_exit(&spa_namespace_lock);

	return (0);
}

#if defined(sun)
#ifdef _KERNEL
/*
 * Get the root pool information from the root disk, then import the root pool
 * during the system boot up time.
 */
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);

static nvlist_t *
spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
{
	nvlist_t *config;
	nvlist_t *nvtop, *nvroot;
	uint64_t pgid;

	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
		return (NULL);

	/*
	 * Add this top-level vdev to the child array.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pgid) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);

	/*
	 * Put this pool's top-level vdevs into a root vdev.
	 */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &nvtop, 1) == 0);

	/*
	 * Replace the existing vdev_tree with the new root vdev in
	 * this pool's configuration (remove the old, add the new).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
	nvlist_free(nvroot);
	return (config);
}

/*
 * Walk the vdev tree and see if we can find a device with "better"
 * configuration. A configuration is "better" if the label on that
 * device has a more recent txg.
 */
static void
spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
{
	/* Recurse into all children before examining this vdev. */
	for (int c = 0; c < vd->vdev_children; c++)
		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);

	if (vd->vdev_ops->vdev_op_leaf) {
		nvlist_t *label;
		uint64_t label_txg;

		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
		    &label) != 0)
			return;

		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
		    &label_txg) == 0);

		/*
		 * Do we have a better boot device?
		 */
		if (label_txg > *txg) {
			*txg = label_txg;
			*avd = vd;
		}
		nvlist_free(label);
	}
}

/*
 * Import a root pool.
 *
 * For x86. devpath_list will consist of devid and/or physpath name of
 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
 * The GRUB "findroot" command will return the vdev we should boot.
 *
 * For Sparc, devpath_list consists the physpath name of the booting device
 * no matter the rootpool is a single device pool or a mirrored pool.
 * e.g.
 *	"/pci@1f,0/ide@d/disk@0,0:a"
 */
int
spa_import_rootpool(char *devpath, char *devid)
{
	spa_t *spa;
	vdev_t *rvd, *bvd, *avd = NULL;
	nvlist_t *config, *nvtop;
	uint64_t guid, txg;
	char *pname;
	int error;

	/*
	 * Read the label from the boot device and generate a configuration.
	 */
	config = spa_generate_rootconf(devpath, devid, &guid);
#if defined(_OBP) && defined(_KERNEL)
	if (config == NULL) {
		if (strstr(devpath, "/iscsi/ssd") != NULL) {
			/* iscsi boot */
			get_iscsi_bootpath_phy(devpath);
			config = spa_generate_rootconf(devpath, devid, &guid);
		}
	}
#endif
	if (config == NULL) {
		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
		    devpath);
		return (EIO);
	}

	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &pname) == 0);
	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pname)) != NULL) {
		/*
		 * Remove the existing root pool from the namespace so that we
		 * can replace it with the correct config we just read in.
		 */
		spa_remove(spa);
	}

	spa = spa_add(pname, config, NULL);
	spa->spa_is_root = B_TRUE;
	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;

	/*
	 * Build up a vdev tree based on the boot device's label config.
	 */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvtop) == 0);
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
	    VDEV_ALLOC_ROOTPOOL);
	spa_config_exit(spa, SCL_ALL, FTAG);
	if (error) {
		mutex_exit(&spa_namespace_lock);
		nvlist_free(config);
		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
		    pname);
		return (error);
	}

	/*
	 * Get the boot vdev.
	 */
	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
		    (u_longlong_t)guid);
		error = ENOENT;
		goto out;
	}

	/*
	 * Determine if there is a better boot device.
	 */
	avd = bvd;
	spa_alt_rootvdev(rvd, &avd, &txg);
	if (avd != bvd) {
		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
		    "try booting from '%s'", avd->vdev_path);
		error = EINVAL;
		goto out;
	}

	/*
	 * If the boot device is part of a spare vdev then ensure that
	 * we're booting off the active spare.
	 */
	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    !bvd->vdev_isspare) {
		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
		    "try booting from '%s'",
		    bvd->vdev_parent->
		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
		error = EINVAL;
		goto out;
	}

	error = 0;
	spa_history_log_version(spa, LOG_POOL_IMPORT);
out:
	/* Common exit path: free the parsed vdev tree and drop locks. */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_free(rvd);
	spa_config_exit(spa, SCL_ALL, FTAG);
	mutex_exit(&spa_namespace_lock);

	nvlist_free(config);
	return (error);
}

#endif
#endif	/* sun */

/*
 * Import a non-root pool into the system.
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	uint64_t mode = spa_mode_global;
	uint64_t readonly = B_FALSE;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	(void) nvlist_lookup_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
	if (readonly)
		mode = FREAD;
	spa = spa_add(pool, config, altroot);
	spa->spa_import_flags = flags;

	/*
	 * Verbatim import - Take a pool and insert it into the namespace
	 * as if it had been loaded at boot.
	 */
	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
		if (props != NULL)
			spa_configfile_set(spa, props, B_FALSE);

		spa_config_sync(spa, B_FALSE, B_TRUE);

		mutex_exit(&spa_namespace_lock);
		spa_history_log_version(spa, LOG_POOL_IMPORT);

		return (0);
	}

	spa_activate(spa, mode);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned while loading the pool and pass it
	 * back to caller (i.e. rewind info, missing devices, etc).
	 */
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
	    spa->spa_load_info) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

#ifdef __FreeBSD__
#ifdef _KERNEL
	zvol_create_minors(pool);
#endif
#endif
	return (0);
}

/*
 * Attempt to load a pool from the supplied config without committing it to
 * the namespace; returns the discovered config (caller frees) or NULL.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/*
				 * Swap the temporary pool-name prefix for the
				 * real pool name before exposing the dataset.
				 */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	/* Exporting/destroying requires write access to the pool namespace. */
	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	/* Hand the caller a copy of the final config, if requested. */
	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	/* POOL_STATE_UNINITIALIZED: unload/deactivate but keep the spa_t. */
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 *
 * 'nvroot' may describe new top-level vdevs, hot spares, and/or L2ARC
 * cache devices.  Returns 0 on success or an errno; all error paths go
 * through spa_vdev_exit(), which also frees the partially built vdev tree.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* Missing spare/l2cache arrays simply mean "none requested". */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Reject a request that adds nothing at all. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists.
		 * (If no hole is found, 'id' ends up one past the last
		 * child, i.e. a fresh slot at the end.)
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* 'guid' identifies the existing device to attach next to. */
	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ATTACH)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* Exactly one new leaf device must be supplied. */
	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops &&
		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		} else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		}

		if (newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 *
	 * NOTE(review): this strcmp() assumes both vdev_path values are
	 * non-NULL -- confirm for device types that may lack a path.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		/* +5 covers "/", "old", and the terminating NUL. */
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilvering = B_TRUE;

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	/* Copy the paths now; the vdevs may be gone after spa_vdev_exit(). */
	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Restart the resilver
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	if (spa->spa_bootfs)
		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 *
 * 'pguid' (when non-zero) must match the current parent's guid; this
 * guards against racing with a completing replacement (see below).
 * Returns 0, or ENODEV/ENOTSUP/EBUSY via spa_vdev_exit().
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0 &&
	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
		cvd->vdev_resilvering = B_FALSE;
	}


	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 *
	 * NOTE(review): spa_strdup() here assumes vd->vdev_path is non-NULL
	 * -- confirm for pathless leaf vdevs.
	 */
	vdpath = spa_strdup(vd->vdev_path);
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		mutex_enter(&spa_namespace_lock);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			/* Keep altspa alive across the namespace-lock drop. */
			spa_open_ref(altspa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(altspa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 *
 * 'newname' names the new pool; 'config' supplies its vdev tree (one child
 * per top-level mirror of the original pool); 'props' (optional) are
 * properties for the new pool; if 'exp' is set, the new pool is exported
 * once the split completes.  Returns 0 or an errno; on failure all
 * offlined disks are brought back online.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_offline_log(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || vd->vdev_ishole) {
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	/* 'lastlog' is the index of the first trailing log/hole run. */
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		if (is_hole != 0) {
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = EINVAL;
				break;
			}
		}

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = EINVAL;
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = ENODEV;
			break;
		}

		/* make sure there's nothing stopping the split */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    vml[c]->vdev_ishole ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = EINVAL;
			break;
		}

		/* the disk must not hold the only valid copy of any data */
		if (vdev_dtl_required(vml[c])) {
			error = EBUSY;
			break;
		}

		/* we need certain info from the top level */
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift) == 0);
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	    nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL)) == 0);
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

#ifndef sun
	/* mark that we are creating new spa by splitting */
	newspa->spa_splitting_newspa = B_TRUE;
#endif
	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
#ifndef sun
	newspa->spa_splitting_newspa = B_FALSE;
#endif
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL) {
			vdev_split(vml[c]);
			if (error == 0)
				spa_history_log_internal(LOG_POOL_VDEV_DETACH,
				    spa, tx, "vdev=%s",
				    vml[c]->vdev_path);
			vdev_free(vml[c]);
		}
	}
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
		dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
	    "split new pool %s from pool %s", newname, spa_name(spa));

	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
		    B_FALSE, B_FALSE);

	return (error);

out:
	/* Failure: tear down the half-built pool and undo the offlining. */
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_FALSE;
	}
	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
}

/*
 * Return the nvlist in 'nvpp' whose ZPOOL_CONFIG_GUID equals 'target_guid',
 * or NULL if none matches.
 */
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	for (int i = 0; i < count; i++) {
		uint64_t guid;

		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}

/*
 * Rewrite the nvlist array 'name' in 'config' with 'dev_to_remove' omitted.
 */
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;
5030185029Spjd 5031185029Spjd if (count > 1) 5032185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5033185029Spjd 5034185029Spjd for (int i = 0, j = 0; i < count; i++) { 5035185029Spjd if (dev[i] == dev_to_remove) 5036185029Spjd continue; 5037185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5038185029Spjd } 5039185029Spjd 5040185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5041185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5042185029Spjd 5043185029Spjd for (int i = 0; i < count - 1; i++) 5044185029Spjd nvlist_free(newdev[i]); 5045185029Spjd 5046185029Spjd if (count > 1) 5047185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5048185029Spjd} 5049185029Spjd 5050168404Spjd/* 5051219089Spjd * Evacuate the device. 5052219089Spjd */ 5053219089Spjdstatic int 5054219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5055219089Spjd{ 5056219089Spjd uint64_t txg; 5057219089Spjd int error = 0; 5058219089Spjd 5059219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5060219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5061219089Spjd ASSERT(vd == vd->vdev_top); 5062219089Spjd 5063219089Spjd /* 5064219089Spjd * Evacuate the device. We don't hold the config lock as writer 5065219089Spjd * since we need to do I/O but we do keep the 5066219089Spjd * spa_namespace_lock held. Once this completes the device 5067219089Spjd * should no longer have any blocks allocated on it. 5068219089Spjd */ 5069219089Spjd if (vd->vdev_islog) { 5070219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5071219089Spjd error = spa_offline_log(spa); 5072219089Spjd } else { 5073219089Spjd error = ENOTSUP; 5074219089Spjd } 5075219089Spjd 5076219089Spjd if (error) 5077219089Spjd return (error); 5078219089Spjd 5079219089Spjd /* 5080219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5081219089Spjd * associated with this vdev, and wait for these changes to sync. 
5082219089Spjd */ 5083240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5084219089Spjd txg = spa_vdev_config_enter(spa); 5085219089Spjd vd->vdev_removing = B_TRUE; 5086219089Spjd vdev_dirty(vd, 0, NULL, txg); 5087219089Spjd vdev_config_dirty(vd); 5088219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5089219089Spjd 5090219089Spjd return (0); 5091219089Spjd} 5092219089Spjd 5093219089Spjd/* 5094219089Spjd * Complete the removal by cleaning up the namespace. 5095219089Spjd */ 5096219089Spjdstatic void 5097219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5098219089Spjd{ 5099219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5100219089Spjd uint64_t id = vd->vdev_id; 5101219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5102219089Spjd 5103219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5104219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5105219089Spjd ASSERT(vd == vd->vdev_top); 5106219089Spjd 5107219089Spjd /* 5108219089Spjd * Only remove any devices which are empty. 5109219089Spjd */ 5110219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5111219089Spjd return; 5112219089Spjd 5113219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5114219089Spjd 5115219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5116219089Spjd vdev_state_clean(vd); 5117219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5118219089Spjd vdev_config_clean(vd); 5119219089Spjd 5120219089Spjd vdev_free(vd); 5121219089Spjd 5122219089Spjd if (last_vdev) { 5123219089Spjd vdev_compact_children(rvd); 5124219089Spjd } else { 5125219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5126219089Spjd vdev_add_child(rvd, vd); 5127219089Spjd } 5128219089Spjd vdev_config_dirty(rvd); 5129219089Spjd 5130219089Spjd /* 5131219089Spjd * Reassess the health of our root vdev. 
5132219089Spjd */ 5133219089Spjd vdev_reopen(rvd); 5134219089Spjd} 5135219089Spjd 5136219089Spjd/* 5137219089Spjd * Remove a device from the pool - 5138219089Spjd * 5139219089Spjd * Removing a device from the vdev namespace requires several steps 5140219089Spjd * and can take a significant amount of time. As a result we use 5141219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5142219089Spjd * grab and release the spa_config_lock while still holding the namespace 5143219089Spjd * lock. During each step the configuration is synced out. 5144219089Spjd */ 5145219089Spjd 5146219089Spjd/* 5147168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 5148219089Spjd * spares, slogs, and level 2 ARC devices. 5149168404Spjd */ 5150168404Spjdint 5151168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5152168404Spjd{ 5153168404Spjd vdev_t *vd; 5154219089Spjd metaslab_group_t *mg; 5155185029Spjd nvlist_t **spares, **l2cache, *nv; 5156219089Spjd uint64_t txg = 0; 5157185029Spjd uint_t nspares, nl2cache; 5158185029Spjd int error = 0; 5159209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5160168404Spjd 5161219089Spjd ASSERT(spa_writeable(spa)); 5162219089Spjd 5163209962Smm if (!locked) 5164209962Smm txg = spa_vdev_enter(spa); 5165168404Spjd 5166185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5167168404Spjd 5168185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5169185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5170185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5171185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5172185029Spjd /* 5173185029Spjd * Only remove the hot spare if it's not currently in use 5174185029Spjd * in this pool. 
5175185029Spjd */ 5176185029Spjd if (vd == NULL || unspare) { 5177185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5178185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5179185029Spjd spa_load_spares(spa); 5180185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5181185029Spjd } else { 5182185029Spjd error = EBUSY; 5183168404Spjd } 5184185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5185185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5186185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5187185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5188185029Spjd /* 5189185029Spjd * Cache devices can always be removed. 5190185029Spjd */ 5191185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5192185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5193185029Spjd spa_load_l2cache(spa); 5194185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5195219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5196219089Spjd ASSERT(!locked); 5197219089Spjd ASSERT(vd == vd->vdev_top); 5198219089Spjd 5199219089Spjd /* 5200219089Spjd * XXX - Once we have bp-rewrite this should 5201219089Spjd * become the common case. 5202219089Spjd */ 5203219089Spjd 5204219089Spjd mg = vd->vdev_mg; 5205219089Spjd 5206219089Spjd /* 5207219089Spjd * Stop allocating from this vdev. 5208219089Spjd */ 5209219089Spjd metaslab_group_passivate(mg); 5210219089Spjd 5211219089Spjd /* 5212219089Spjd * Wait for the youngest allocations and frees to sync, 5213219089Spjd * and then wait for the deferral of those frees to finish. 5214219089Spjd */ 5215219089Spjd spa_vdev_config_exit(spa, NULL, 5216219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5217219089Spjd 5218219089Spjd /* 5219219089Spjd * Attempt to evacuate the vdev. 
5220219089Spjd */ 5221219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5222219089Spjd 5223219089Spjd txg = spa_vdev_config_enter(spa); 5224219089Spjd 5225219089Spjd /* 5226219089Spjd * If we couldn't evacuate the vdev, unwind. 5227219089Spjd */ 5228219089Spjd if (error) { 5229219089Spjd metaslab_group_activate(mg); 5230219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5231219089Spjd } 5232219089Spjd 5233219089Spjd /* 5234219089Spjd * Clean up the vdev namespace. 5235219089Spjd */ 5236219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5237219089Spjd 5238185029Spjd } else if (vd != NULL) { 5239185029Spjd /* 5240185029Spjd * Normal vdevs cannot be removed (yet). 5241185029Spjd */ 5242185029Spjd error = ENOTSUP; 5243168404Spjd } else { 5244185029Spjd /* 5245185029Spjd * There is no vdev of any kind with the specified guid. 5246185029Spjd */ 5247185029Spjd error = ENOENT; 5248168404Spjd } 5249168404Spjd 5250209962Smm if (!locked) 5251209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5252209962Smm 5253209962Smm return (error); 5254168404Spjd} 5255168404Spjd 5256168404Spjd/* 5257185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5258185029Spjd * current spared, so we can detach it. 5259168404Spjd */ 5260168404Spjdstatic vdev_t * 5261185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5262168404Spjd{ 5263168404Spjd vdev_t *newvd, *oldvd; 5264168404Spjd 5265219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5266185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5267168404Spjd if (oldvd != NULL) 5268168404Spjd return (oldvd); 5269168404Spjd } 5270168404Spjd 5271185029Spjd /* 5272219089Spjd * Check for a completed replacement. We always consider the first 5273219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5274219089Spjd * the newest (see spa_vdev_attach() for how that works). 
In 5275219089Spjd * the case where the newest vdev is faulted, we will not automatically 5276219089Spjd * remove it after a resilver completes. This is OK as it will require 5277219089Spjd * user intervention to determine which disk the admin wishes to keep. 5278185029Spjd */ 5279219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5280219089Spjd ASSERT(vd->vdev_children > 1); 5281219089Spjd 5282219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5283168404Spjd oldvd = vd->vdev_child[0]; 5284168404Spjd 5285209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5286219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5287209962Smm !vdev_dtl_required(oldvd)) 5288168404Spjd return (oldvd); 5289168404Spjd } 5290168404Spjd 5291185029Spjd /* 5292185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5293185029Spjd */ 5294219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5295219089Spjd vdev_t *first = vd->vdev_child[0]; 5296219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5297185029Spjd 5298219089Spjd if (last->vdev_unspare) { 5299219089Spjd oldvd = first; 5300219089Spjd newvd = last; 5301219089Spjd } else if (first->vdev_unspare) { 5302219089Spjd oldvd = last; 5303219089Spjd newvd = first; 5304219089Spjd } else { 5305219089Spjd oldvd = NULL; 5306219089Spjd } 5307219089Spjd 5308219089Spjd if (oldvd != NULL && 5309209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5310219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5311219089Spjd !vdev_dtl_required(oldvd)) 5312185029Spjd return (oldvd); 5313219089Spjd 5314219089Spjd /* 5315219089Spjd * If there are more than two spares attached to a disk, 5316219089Spjd * and those spares are not required, then we want to 5317219089Spjd * attempt to free them up now so that they can be used 5318219089Spjd * by other pools. Once we're back down to a single 5319219089Spjd * disk+spare, we stop removing them. 
5320219089Spjd */ 5321219089Spjd if (vd->vdev_children > 2) { 5322219089Spjd newvd = vd->vdev_child[1]; 5323219089Spjd 5324219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5325219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5326219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5327219089Spjd !vdev_dtl_required(newvd)) 5328219089Spjd return (newvd); 5329185029Spjd } 5330185029Spjd } 5331185029Spjd 5332168404Spjd return (NULL); 5333168404Spjd} 5334168404Spjd 5335168404Spjdstatic void 5336185029Spjdspa_vdev_resilver_done(spa_t *spa) 5337168404Spjd{ 5338209962Smm vdev_t *vd, *pvd, *ppvd; 5339209962Smm uint64_t guid, sguid, pguid, ppguid; 5340168404Spjd 5341209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5342168404Spjd 5343185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5344209962Smm pvd = vd->vdev_parent; 5345209962Smm ppvd = pvd->vdev_parent; 5346168404Spjd guid = vd->vdev_guid; 5347209962Smm pguid = pvd->vdev_guid; 5348209962Smm ppguid = ppvd->vdev_guid; 5349209962Smm sguid = 0; 5350168404Spjd /* 5351168404Spjd * If we have just finished replacing a hot spared device, then 5352168404Spjd * we need to detach the parent's first child (the original hot 5353168404Spjd * spare) as well. 
5354168404Spjd */ 5355219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5356219089Spjd ppvd->vdev_children == 2) { 5357168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5358209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5359168404Spjd } 5360209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5361209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5362168404Spjd return; 5363209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5364168404Spjd return; 5365209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5366168404Spjd } 5367168404Spjd 5368209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5369168404Spjd} 5370168404Spjd 5371168404Spjd/* 5372219089Spjd * Update the stored path or FRU for this vdev. 5373168404Spjd */ 5374168404Spjdint 5375209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5376209962Smm boolean_t ispath) 5377168404Spjd{ 5378185029Spjd vdev_t *vd; 5379219089Spjd boolean_t sync = B_FALSE; 5380168404Spjd 5381219089Spjd ASSERT(spa_writeable(spa)); 5382168404Spjd 5383219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5384219089Spjd 5385209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5386219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5387168404Spjd 5388168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5389219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5390168404Spjd 5391209962Smm if (ispath) { 5392219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5393219089Spjd spa_strfree(vd->vdev_path); 5394219089Spjd vd->vdev_path = spa_strdup(value); 5395219089Spjd sync = B_TRUE; 5396219089Spjd } 5397209962Smm } else { 5398219089Spjd if (vd->vdev_fru == NULL) { 5399219089Spjd vd->vdev_fru = spa_strdup(value); 5400219089Spjd sync = B_TRUE; 5401219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5402209962Smm spa_strfree(vd->vdev_fru); 5403219089Spjd vd->vdev_fru = spa_strdup(value); 5404219089Spjd sync = B_TRUE; 5405219089Spjd } 5406209962Smm } 
5407168404Spjd 5408219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5409168404Spjd} 5410168404Spjd 5411209962Smmint 5412209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5413209962Smm{ 5414209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5415209962Smm} 5416209962Smm 5417209962Smmint 5418209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5419209962Smm{ 5420209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5421209962Smm} 5422209962Smm 5423168404Spjd/* 5424168404Spjd * ========================================================================== 5425219089Spjd * SPA Scanning 5426168404Spjd * ========================================================================== 5427168404Spjd */ 5428168404Spjd 5429168404Spjdint 5430219089Spjdspa_scan_stop(spa_t *spa) 5431168404Spjd{ 5432185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5433219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5434219089Spjd return (EBUSY); 5435219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5436219089Spjd} 5437168404Spjd 5438219089Spjdint 5439219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5440219089Spjd{ 5441219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5442219089Spjd 5443219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5444168404Spjd return (ENOTSUP); 5445168404Spjd 5446168404Spjd /* 5447185029Spjd * If a resilver was requested, but there is no DTL on a 5448185029Spjd * writeable leaf device, we have nothing to do. 
5449168404Spjd */ 5450219089Spjd if (func == POOL_SCAN_RESILVER && 5451185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5452185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5453168404Spjd return (0); 5454168404Spjd } 5455168404Spjd 5456219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5457168404Spjd} 5458168404Spjd 5459168404Spjd/* 5460168404Spjd * ========================================================================== 5461168404Spjd * SPA async task processing 5462168404Spjd * ========================================================================== 5463168404Spjd */ 5464168404Spjd 5465168404Spjdstatic void 5466185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5467168404Spjd{ 5468185029Spjd if (vd->vdev_remove_wanted) { 5469219089Spjd vd->vdev_remove_wanted = B_FALSE; 5470219089Spjd vd->vdev_delayed_close = B_FALSE; 5471185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5472209962Smm 5473209962Smm /* 5474209962Smm * We want to clear the stats, but we don't want to do a full 5475209962Smm * vdev_clear() as that will cause us to throw away 5476209962Smm * degraded/faulted state as well as attempt to reopen the 5477209962Smm * device, all of which is a waste. 
5478209962Smm */ 5479209962Smm vd->vdev_stat.vs_read_errors = 0; 5480209962Smm vd->vdev_stat.vs_write_errors = 0; 5481209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5482209962Smm 5483185029Spjd vdev_state_dirty(vd->vdev_top); 5484185029Spjd } 5485168404Spjd 5486185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5487185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5488185029Spjd} 5489168404Spjd 5490185029Spjdstatic void 5491185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5492185029Spjd{ 5493185029Spjd if (vd->vdev_probe_wanted) { 5494219089Spjd vd->vdev_probe_wanted = B_FALSE; 5495185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5496168404Spjd } 5497168404Spjd 5498185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5499185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5500168404Spjd} 5501168404Spjd 5502168404Spjdstatic void 5503219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5504219089Spjd{ 5505219089Spjd sysevent_id_t eid; 5506219089Spjd nvlist_t *attr; 5507219089Spjd char *physpath; 5508219089Spjd 5509219089Spjd if (!spa->spa_autoexpand) 5510219089Spjd return; 5511219089Spjd 5512219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5513219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5514219089Spjd spa_async_autoexpand(spa, cvd); 5515219089Spjd } 5516219089Spjd 5517219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5518219089Spjd return; 5519219089Spjd 5520219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5521219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5522219089Spjd 5523219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5524219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5525219089Spjd 5526219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5527219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5528219089Spjd 5529219089Spjd nvlist_free(attr); 5530219089Spjd kmem_free(physpath, 
MAXPATHLEN); 5531219089Spjd} 5532219089Spjd 5533219089Spjdstatic void 5534168404Spjdspa_async_thread(void *arg) 5535168404Spjd{ 5536168404Spjd spa_t *spa = arg; 5537168404Spjd int tasks; 5538168404Spjd 5539168404Spjd ASSERT(spa->spa_sync_on); 5540168404Spjd 5541168404Spjd mutex_enter(&spa->spa_async_lock); 5542168404Spjd tasks = spa->spa_async_tasks; 5543168404Spjd spa->spa_async_tasks = 0; 5544168404Spjd mutex_exit(&spa->spa_async_lock); 5545168404Spjd 5546168404Spjd /* 5547168404Spjd * See if the config needs to be updated. 5548168404Spjd */ 5549168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5550219089Spjd uint64_t old_space, new_space; 5551219089Spjd 5552168404Spjd mutex_enter(&spa_namespace_lock); 5553219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5554168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5555219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5556168404Spjd mutex_exit(&spa_namespace_lock); 5557219089Spjd 5558219089Spjd /* 5559219089Spjd * If the pool grew as a result of the config update, 5560219089Spjd * then log an internal history event. 5561219089Spjd */ 5562219089Spjd if (new_space != old_space) { 5563219089Spjd spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5564219089Spjd spa, NULL, 5565219089Spjd "pool '%s' size: %llu(+%llu)", 5566219089Spjd spa_name(spa), new_space, new_space - old_space); 5567219089Spjd } 5568168404Spjd } 5569168404Spjd 5570168404Spjd /* 5571185029Spjd * See if any devices need to be marked REMOVED. 
5572168404Spjd */ 5573185029Spjd if (tasks & SPA_ASYNC_REMOVE) { 5574219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5575185029Spjd spa_async_remove(spa, spa->spa_root_vdev); 5576185029Spjd for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5577185029Spjd spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5578185029Spjd for (int i = 0; i < spa->spa_spares.sav_count; i++) 5579185029Spjd spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5580185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5581185029Spjd } 5582168404Spjd 5583219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5584219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5585219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5586219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5587219089Spjd } 5588219089Spjd 5589168404Spjd /* 5590185029Spjd * See if any devices need to be probed. 5591168404Spjd */ 5592185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5593219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5594185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5595185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5596185029Spjd } 5597168404Spjd 5598168404Spjd /* 5599185029Spjd * If any devices are done replacing, detach them. 5600168404Spjd */ 5601185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5602185029Spjd spa_vdev_resilver_done(spa); 5603168404Spjd 5604168404Spjd /* 5605168404Spjd * Kick off a resilver. 5606168404Spjd */ 5607168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5608219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5609168404Spjd 5610168404Spjd /* 5611168404Spjd * Let the world know that we're done. 
5612168404Spjd */ 5613168404Spjd mutex_enter(&spa->spa_async_lock); 5614168404Spjd spa->spa_async_thread = NULL; 5615168404Spjd cv_broadcast(&spa->spa_async_cv); 5616168404Spjd mutex_exit(&spa->spa_async_lock); 5617168404Spjd thread_exit(); 5618168404Spjd} 5619168404Spjd 5620168404Spjdvoid 5621168404Spjdspa_async_suspend(spa_t *spa) 5622168404Spjd{ 5623168404Spjd mutex_enter(&spa->spa_async_lock); 5624168404Spjd spa->spa_async_suspended++; 5625168404Spjd while (spa->spa_async_thread != NULL) 5626168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5627168404Spjd mutex_exit(&spa->spa_async_lock); 5628168404Spjd} 5629168404Spjd 5630168404Spjdvoid 5631168404Spjdspa_async_resume(spa_t *spa) 5632168404Spjd{ 5633168404Spjd mutex_enter(&spa->spa_async_lock); 5634168404Spjd ASSERT(spa->spa_async_suspended != 0); 5635168404Spjd spa->spa_async_suspended--; 5636168404Spjd mutex_exit(&spa->spa_async_lock); 5637168404Spjd} 5638168404Spjd 5639168404Spjdstatic void 5640168404Spjdspa_async_dispatch(spa_t *spa) 5641168404Spjd{ 5642168404Spjd mutex_enter(&spa->spa_async_lock); 5643168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 5644168404Spjd spa->spa_async_thread == NULL && 5645168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 5646168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 5647168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5648168404Spjd mutex_exit(&spa->spa_async_lock); 5649168404Spjd} 5650168404Spjd 5651168404Spjdvoid 5652168404Spjdspa_async_request(spa_t *spa, int task) 5653168404Spjd{ 5654219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5655168404Spjd mutex_enter(&spa->spa_async_lock); 5656168404Spjd spa->spa_async_tasks |= task; 5657168404Spjd mutex_exit(&spa->spa_async_lock); 5658168404Spjd} 5659168404Spjd 5660168404Spjd/* 5661168404Spjd * ========================================================================== 5662168404Spjd * SPA syncing routines 5663168404Spjd * 
========================================================================== 5664168404Spjd */ 5665168404Spjd 5666219089Spjdstatic int 5667219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5668168404Spjd{ 5669219089Spjd bpobj_t *bpo = arg; 5670219089Spjd bpobj_enqueue(bpo, bp, tx); 5671219089Spjd return (0); 5672219089Spjd} 5673168404Spjd 5674219089Spjdstatic int 5675219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5676219089Spjd{ 5677219089Spjd zio_t *zio = arg; 5678168404Spjd 5679219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5680219089Spjd zio->io_flags)); 5681219089Spjd return (0); 5682168404Spjd} 5683168404Spjd 5684168404Spjdstatic void 5685168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5686168404Spjd{ 5687168404Spjd char *packed = NULL; 5688185029Spjd size_t bufsize; 5689168404Spjd size_t nvsize = 0; 5690168404Spjd dmu_buf_t *db; 5691168404Spjd 5692168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5693168404Spjd 5694185029Spjd /* 5695185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5696185029Spjd * information. This avoids the dbuf_will_dirty() path and 5697185029Spjd * saves us a pre-read to get data we don't actually care about. 
5698185029Spjd */ 5699236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5700185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 5701168404Spjd 5702168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5703168404Spjd KM_SLEEP) == 0); 5704185029Spjd bzero(packed + nvsize, bufsize - nvsize); 5705168404Spjd 5706185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5707168404Spjd 5708185029Spjd kmem_free(packed, bufsize); 5709168404Spjd 5710168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5711168404Spjd dmu_buf_will_dirty(db, tx); 5712168404Spjd *(uint64_t *)db->db_data = nvsize; 5713168404Spjd dmu_buf_rele(db, FTAG); 5714168404Spjd} 5715168404Spjd 5716168404Spjdstatic void 5717185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5718185029Spjd const char *config, const char *entry) 5719168404Spjd{ 5720168404Spjd nvlist_t *nvroot; 5721185029Spjd nvlist_t **list; 5722168404Spjd int i; 5723168404Spjd 5724185029Spjd if (!sav->sav_sync) 5725168404Spjd return; 5726168404Spjd 5727168404Spjd /* 5728185029Spjd * Update the MOS nvlist describing the list of available devices. 5729185029Spjd * spa_validate_aux() will have already made sure this nvlist is 5730185029Spjd * valid and the vdevs are labeled appropriately. 
 */
	if (sav->sav_object == 0) {
		/* First sync of this aux list: allocate its packed nvlist
		 * object and record it in the MOS pool directory. */
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (sav->sav_count == 0) {
		/* Empty list: store a zero-length nvlist array. */
		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
	} else {
		/* Generate one config nvlist per aux vdev, attach the array,
		 * then free the temporaries (nvlist_add copies them). */
		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
		    sav->sav_count) == 0);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	/* Pack nvroot into the on-disk object allocated above. */
	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

/*
 * Sync the dirty pool config out to the MOS config object.  The newly
 * generated config is parked in spa_config_syncing until the uberblock
 * that references it is safely on disk (see spa_sync()).
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_config_dirty_list))
		return;

	/* Hold SCL_STATE so the vdev tree is stable while we snapshot it. */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	spa_config_exit(spa, SCL_STATE, FTAG);

	/* Replace any config from an earlier pass of this txg. */
	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Sync task: commit a pool version upgrade to the in-core uberblock and
 * dirty the config so the new version reaches the vdev labels.
 */
static void
spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	uint64_t version = *(uint64_t *)arg2;

	/*
	 * Setting the version is special cased when first creating the pool.
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(version <= SPA_VERSION);
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * Set zpool properties.
 *
 * Sync task: walks the nvlist of properties validated earlier by
 * spa_prop_validate() and persists each one — feature@ properties enable
 * the feature, non-persistent properties are skipped, and everything else
 * lands in the pool-props ZAP object in the MOS.
 */
static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		zprop_type_t proptype;
		zfeature_info_t *feature;

		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPROP_INVAL:
			/*
			 * We checked this earlier in spa_prop_validate().
			 * ZPROP_INVAL here can only mean a "feature@..."
			 * property; enable the named feature.
			 */
			ASSERT(zpool_prop_feature(nvpair_name(elem)));

			fname = strchr(nvpair_name(elem), '@') + 1;
			VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));

			spa_feature_enable(spa, feature, tx);
			break;

		case ZPOOL_PROP_VERSION:
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			VERIFY(nvpair_value_string(elem, &strval) == 0);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  It's unnecessary
			 * to do this for pool creation since the vdev's
			 * configuration has already been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL)
				vdev_config_dirty(spa->spa_root_vdev);
			break;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					/* Index props must map to a string. */
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			/* Mirror selected properties into in-core spa state. */
			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			case ZPOOL_PROP_AUTOEXPAND:
				spa->spa_autoexpand = intval;
				if (tx->tx_txg != TXG_INITIAL)
					spa_async_request(spa,
					    SPA_ASYNC_AUTOEXPAND);
				break;
			case ZPOOL_PROP_DEDUPDITTO:
				spa->spa_dedup_ditto = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			/* NOTE(review): intval is logged even for string
			 * properties, where it is stale/unset — confirm. */
			spa_history_log_internal(LOG_POOL_PROPSET,
			    spa, tx, "%s %lld %s",
			    nvpair_name(elem), intval, spa_name(spa));
		}
	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
5953219089Spjd */ 5954219089Spjdstatic void 5955219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5956219089Spjd{ 5957219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 5958219089Spjd 5959219089Spjd ASSERT(spa->spa_sync_pass == 1); 5960219089Spjd 5961219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5962219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5963219089Spjd dsl_pool_create_origin(dp, tx); 5964219089Spjd 5965219089Spjd /* Keeping the origin open increases spa_minref */ 5966219089Spjd spa->spa_minref += 3; 5967219089Spjd } 5968219089Spjd 5969219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5970219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5971219089Spjd dsl_pool_upgrade_clones(dp, tx); 5972219089Spjd } 5973219089Spjd 5974219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5975219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5976219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 5977219089Spjd 5978219089Spjd /* Keeping the freedir open increases spa_minref */ 5979219089Spjd spa->spa_minref += 3; 5980219089Spjd } 5981236884Smm 5982236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 5983236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 5984236884Smm spa_feature_create_zap_objects(spa, tx); 5985236884Smm } 5986219089Spjd} 5987219089Spjd 5988219089Spjd/* 5989168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 5990168404Spjd * part of the process, so we iterate until it converges. 
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		/* Upgrade reader -> writer; re-check the list after the
		 * drop/reacquire window since it may have changed. */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		/* i == children means no raid-z vdev was found. */
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (eg, spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/* Early passes free blocks immediately; later passes defer
		 * frees to the next txg so the pass count converges. */
		if (pass <= SYNC_PASS_DEFERRED_FREE) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

		/* One-time upgrades run after dsl_pool_sync() (see above). */
		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			/* Pick up to SPA_DVAS_PER_BP healthy non-log
			 * top-level vdevs, starting at a random child. */
			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			/* Retry once with B_TRUE to tolerate label I/O
			 * failures on a subset of the vdevs. */
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/* Label write failed even with retries: suspend all I/O
		 * and wait for resume before trying the commit again. */
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.
We don't want to hold the namespace lock across these 6218168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6219168404Spjd * sync. 6220168404Spjd */ 6221168404Spjdvoid 6222168404Spjdspa_sync_allpools(void) 6223168404Spjd{ 6224168404Spjd spa_t *spa = NULL; 6225168404Spjd mutex_enter(&spa_namespace_lock); 6226168404Spjd while ((spa = spa_next(spa)) != NULL) { 6227219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6228219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6229168404Spjd continue; 6230168404Spjd spa_open_ref(spa, FTAG); 6231168404Spjd mutex_exit(&spa_namespace_lock); 6232168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6233168404Spjd mutex_enter(&spa_namespace_lock); 6234168404Spjd spa_close(spa, FTAG); 6235168404Spjd } 6236168404Spjd mutex_exit(&spa_namespace_lock); 6237168404Spjd} 6238168404Spjd 6239168404Spjd/* 6240168404Spjd * ========================================================================== 6241168404Spjd * Miscellaneous routines 6242168404Spjd * ========================================================================== 6243168404Spjd */ 6244168404Spjd 6245168404Spjd/* 6246168404Spjd * Remove all pools in the system. 6247168404Spjd */ 6248168404Spjdvoid 6249168404Spjdspa_evict_all(void) 6250168404Spjd{ 6251168404Spjd spa_t *spa; 6252168404Spjd 6253168404Spjd /* 6254168404Spjd * Remove all cached state. All pools should be closed now, 6255168404Spjd * so every spa in the AVL tree should be unreferenced. 6256168404Spjd */ 6257168404Spjd mutex_enter(&spa_namespace_lock); 6258168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6259168404Spjd /* 6260168404Spjd * Stop async tasks. The async thread may need to detach 6261168404Spjd * a device that's been replaced, which requires grabbing 6262168404Spjd * spa_namespace_lock, so we must drop it here. 
6263168404Spjd */ 6264168404Spjd spa_open_ref(spa, FTAG); 6265168404Spjd mutex_exit(&spa_namespace_lock); 6266168404Spjd spa_async_suspend(spa); 6267168404Spjd mutex_enter(&spa_namespace_lock); 6268168404Spjd spa_close(spa, FTAG); 6269168404Spjd 6270168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6271168404Spjd spa_unload(spa); 6272168404Spjd spa_deactivate(spa); 6273168404Spjd } 6274168404Spjd spa_remove(spa); 6275168404Spjd } 6276168404Spjd mutex_exit(&spa_namespace_lock); 6277168404Spjd} 6278168404Spjd 6279168404Spjdvdev_t * 6280209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6281168404Spjd{ 6282185029Spjd vdev_t *vd; 6283185029Spjd int i; 6284185029Spjd 6285185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6286185029Spjd return (vd); 6287185029Spjd 6288209962Smm if (aux) { 6289185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6290185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6291185029Spjd if (vd->vdev_guid == guid) 6292185029Spjd return (vd); 6293185029Spjd } 6294209962Smm 6295209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6296209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6297209962Smm if (vd->vdev_guid == guid) 6298209962Smm return (vd); 6299209962Smm } 6300185029Spjd } 6301185029Spjd 6302185029Spjd return (NULL); 6303168404Spjd} 6304168404Spjd 6305168404Spjdvoid 6306185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6307168404Spjd{ 6308219089Spjd ASSERT(spa_writeable(spa)); 6309219089Spjd 6310185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6311168404Spjd 6312168404Spjd /* 6313168404Spjd * This should only be called for a non-faulted pool, and since a 6314168404Spjd * future version would result in an unopenable pool, this shouldn't be 6315168404Spjd * possible. 
6316168404Spjd */ 6317185029Spjd ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 6318185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6319168404Spjd 6320185029Spjd spa->spa_uberblock.ub_version = version; 6321168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6322168404Spjd 6323185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6324168404Spjd 6325168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6326168404Spjd} 6327168404Spjd 6328168404Spjdboolean_t 6329168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6330168404Spjd{ 6331168404Spjd int i; 6332168404Spjd uint64_t spareguid; 6333185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6334168404Spjd 6335185029Spjd for (i = 0; i < sav->sav_count; i++) 6336185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6337168404Spjd return (B_TRUE); 6338168404Spjd 6339185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6340185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6341185029Spjd &spareguid) == 0 && spareguid == guid) 6342168404Spjd return (B_TRUE); 6343168404Spjd } 6344168404Spjd 6345168404Spjd return (B_FALSE); 6346168404Spjd} 6347168404Spjd 6348185029Spjd/* 6349185029Spjd * Check if a pool has an active shared spare device. 6350185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6351185029Spjd */ 6352185029Spjdstatic boolean_t 6353185029Spjdspa_has_active_shared_spare(spa_t *spa) 6354168404Spjd{ 6355185029Spjd int i, refcnt; 6356185029Spjd uint64_t pool; 6357185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6358185029Spjd 6359185029Spjd for (i = 0; i < sav->sav_count; i++) { 6360185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6361185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6362185029Spjd refcnt > 2) 6363185029Spjd return (B_TRUE); 6364185029Spjd } 6365185029Spjd 6366185029Spjd return (B_FALSE); 6367168404Spjd} 6368168404Spjd 6369185029Spjd/* 6370185029Spjd * Post a sysevent corresponding to the given event. 
 The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	/* Pool name attribute. */
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	/* Pool guid attribute. */
	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	/* Optional vdev guid/path attributes. */
	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	/* Ownership of the attribute list passed to 'ev'; don't free it. */
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	/* On early failure 'attr' is still ours and must be freed. */
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}