spa.c revision 246666
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24236155Smm * Copyright (c) 2012 by Delphix. All rights reserved. 25168404Spjd */ 26168404Spjd 27168404Spjd/* 28168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 29168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 30168404Spjd * pool. 
31168404Spjd */ 32168404Spjd 33168404Spjd#include <sys/zfs_context.h> 34168404Spjd#include <sys/fm/fs/zfs.h> 35168404Spjd#include <sys/spa_impl.h> 36168404Spjd#include <sys/zio.h> 37168404Spjd#include <sys/zio_checksum.h> 38168404Spjd#include <sys/dmu.h> 39168404Spjd#include <sys/dmu_tx.h> 40168404Spjd#include <sys/zap.h> 41168404Spjd#include <sys/zil.h> 42219089Spjd#include <sys/ddt.h> 43168404Spjd#include <sys/vdev_impl.h> 44168404Spjd#include <sys/metaslab.h> 45219089Spjd#include <sys/metaslab_impl.h> 46168404Spjd#include <sys/uberblock_impl.h> 47168404Spjd#include <sys/txg.h> 48168404Spjd#include <sys/avl.h> 49168404Spjd#include <sys/dmu_traverse.h> 50168404Spjd#include <sys/dmu_objset.h> 51168404Spjd#include <sys/unique.h> 52168404Spjd#include <sys/dsl_pool.h> 53168404Spjd#include <sys/dsl_dataset.h> 54168404Spjd#include <sys/dsl_dir.h> 55168404Spjd#include <sys/dsl_prop.h> 56168404Spjd#include <sys/dsl_synctask.h> 57168404Spjd#include <sys/fs/zfs.h> 58185029Spjd#include <sys/arc.h> 59168404Spjd#include <sys/callb.h> 60185029Spjd#include <sys/spa_boot.h> 61219089Spjd#include <sys/zfs_ioctl.h> 62219089Spjd#include <sys/dsl_scan.h> 63236884Smm#include <sys/zfeature.h> 64219089Spjd#include <sys/zvol.h> 65240868Spjd#include <sys/trim_map.h> 66168404Spjd 67219089Spjd#ifdef _KERNEL 68219089Spjd#include <sys/callb.h> 69219089Spjd#include <sys/cpupart.h> 70219089Spjd#include <sys/zone.h> 71219089Spjd#endif /* _KERNEL */ 72219089Spjd 73185029Spjd#include "zfs_prop.h" 74185029Spjd#include "zfs_comutil.h" 75168404Spjd 76204073Spjd/* Check hostid on import? 
*/ 77204073Spjdstatic int check_hostid = 1; 78204073Spjd 79204073SpjdSYSCTL_DECL(_vfs_zfs); 80204073SpjdTUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 81204073SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 82204073Spjd "Check hostid on import?"); 83204073Spjd 84219089Spjdtypedef enum zti_modes { 85209962Smm zti_mode_fixed, /* value is # of threads (min 1) */ 86209962Smm zti_mode_online_percent, /* value is % of online CPUs */ 87219089Spjd zti_mode_batch, /* cpu-intensive; value is ignored */ 88211931Smm zti_mode_null, /* don't create a taskq */ 89209962Smm zti_nmodes 90219089Spjd} zti_modes_t; 91168712Spjd 92211931Smm#define ZTI_FIX(n) { zti_mode_fixed, (n) } 93211931Smm#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 94219089Spjd#define ZTI_BATCH { zti_mode_batch, 0 } 95211931Smm#define ZTI_NULL { zti_mode_null, 0 } 96209962Smm 97211931Smm#define ZTI_ONE ZTI_FIX(1) 98209962Smm 99209962Smmtypedef struct zio_taskq_info { 100211931Smm enum zti_modes zti_mode; 101211931Smm uint_t zti_value; 102209962Smm} zio_taskq_info_t; 103209962Smm 104209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 105219089Spjd "issue", "issue_high", "intr", "intr_high" 106209962Smm}; 107209962Smm 108211931Smm/* 109211931Smm * Define the taskq threads for the following I/O types: 110211931Smm * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 111211931Smm */ 112211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 113211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 114211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 115219089Spjd { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 116219089Spjd { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 117219089Spjd { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 118211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 119211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 120209962Smm}; 121209962Smm 122236884Smmstatic dsl_syncfunc_t spa_sync_version; 123219089Spjdstatic dsl_syncfunc_t 
spa_sync_props; 124239620Smmstatic dsl_checkfunc_t spa_change_guid_check; 125239620Smmstatic dsl_syncfunc_t spa_change_guid_sync; 126185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 127219089Spjdstatic int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 128219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 129219089Spjd char **ereport); 130219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 131185029Spjd 132219089Spjduint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 133219089Spjd#ifdef PSRSET_BIND 134219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 135219089Spjd#endif 136219089Spjd#ifdef SYSDC 137219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 138219089Spjd#endif 139219089Spjduint_t zio_taskq_basedc = 80; /* base duty cycle */ 140219089Spjd 141219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 142243503Smmextern int zfs_sync_pass_deferred_free; 143219089Spjd 144168404Spjd/* 145219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 146219089Spjd * to get the vdev stats associated with the imported devices. 147219089Spjd */ 148219089Spjd#define TRYIMPORT_NAME "$import" 149219089Spjd 150219089Spjd/* 151168404Spjd * ========================================================================== 152185029Spjd * SPA properties routines 153185029Spjd * ========================================================================== 154185029Spjd */ 155185029Spjd 156185029Spjd/* 157185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 
158185029Spjd */ 159185029Spjdstatic void 160185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 161185029Spjd uint64_t intval, zprop_source_t src) 162185029Spjd{ 163185029Spjd const char *propname = zpool_prop_to_name(prop); 164185029Spjd nvlist_t *propval; 165185029Spjd 166185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 167185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 168185029Spjd 169185029Spjd if (strval != NULL) 170185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 171185029Spjd else 172185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 173185029Spjd 174185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 175185029Spjd nvlist_free(propval); 176185029Spjd} 177185029Spjd 178185029Spjd/* 179185029Spjd * Get property values from the spa configuration. 180185029Spjd */ 181185029Spjdstatic void 182185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 183185029Spjd{ 184236155Smm vdev_t *rvd = spa->spa_root_vdev; 185236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 186209962Smm uint64_t size; 187219089Spjd uint64_t alloc; 188236155Smm uint64_t space; 189185029Spjd uint64_t cap, version; 190185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 191185029Spjd spa_config_dirent_t *dp; 192185029Spjd 193185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 194185029Spjd 195236155Smm if (rvd != NULL) { 196219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 197219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 198209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 199209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 200219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 201219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 202219089Spjd size - alloc, src); 203236155Smm 204236155Smm space = 0; 205236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 206236155Smm vdev_t *tvd = rvd->vdev_child[c]; 207236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 208236155Smm } 209236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 210236155Smm src); 211236155Smm 212219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 213219089Spjd (spa_mode(spa) == FREAD), src); 214185029Spjd 215219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 216209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 217185029Spjd 218219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 219219089Spjd ddt_get_pool_dedup_ratio(spa), src); 220219089Spjd 221209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 222236155Smm rvd->vdev_state, src); 223209962Smm 224209962Smm version = spa_version(spa); 225209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 226209962Smm src = ZPROP_SRC_DEFAULT; 227209962Smm else 228209962Smm src = ZPROP_SRC_LOCAL; 229209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 230209962Smm } 231209962Smm 232236884Smm if (pool != NULL) { 233236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 234236884Smm 235236884Smm /* 236236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 237236884Smm * when opening pools before this version freedir will be NULL. 
238236884Smm */ 239236884Smm if (freedir != NULL) { 240236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 241236884Smm freedir->dd_phys->dd_used_bytes, src); 242236884Smm } else { 243236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 244236884Smm NULL, 0, src); 245236884Smm } 246236884Smm } 247236884Smm 248185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 249185029Spjd 250228103Smm if (spa->spa_comment != NULL) { 251228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 252228103Smm 0, ZPROP_SRC_LOCAL); 253228103Smm } 254228103Smm 255185029Spjd if (spa->spa_root != NULL) 256185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 257185029Spjd 0, ZPROP_SRC_LOCAL); 258185029Spjd 259185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 260185029Spjd if (dp->scd_path == NULL) { 261185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 262185029Spjd "none", 0, ZPROP_SRC_LOCAL); 263185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 264185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 265185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 266185029Spjd } 267185029Spjd } 268185029Spjd} 269185029Spjd 270185029Spjd/* 271185029Spjd * Get zpool property values. 272185029Spjd */ 273185029Spjdint 274185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 275185029Spjd{ 276219089Spjd objset_t *mos = spa->spa_meta_objset; 277185029Spjd zap_cursor_t zc; 278185029Spjd zap_attribute_t za; 279185029Spjd int err; 280185029Spjd 281185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 282185029Spjd 283185029Spjd mutex_enter(&spa->spa_props_lock); 284185029Spjd 285185029Spjd /* 286185029Spjd * Get properties from the spa config. 287185029Spjd */ 288185029Spjd spa_prop_get_config(spa, nvp); 289185029Spjd 290185029Spjd /* If no pool property object, no more prop to get. 
*/ 291219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 292185029Spjd mutex_exit(&spa->spa_props_lock); 293185029Spjd return (0); 294185029Spjd } 295185029Spjd 296185029Spjd /* 297185029Spjd * Get properties from the MOS pool property object. 298185029Spjd */ 299185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 300185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 301185029Spjd zap_cursor_advance(&zc)) { 302185029Spjd uint64_t intval = 0; 303185029Spjd char *strval = NULL; 304185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 305185029Spjd zpool_prop_t prop; 306185029Spjd 307185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 308185029Spjd continue; 309185029Spjd 310185029Spjd switch (za.za_integer_length) { 311185029Spjd case 8: 312185029Spjd /* integer property */ 313185029Spjd if (za.za_first_integer != 314185029Spjd zpool_prop_default_numeric(prop)) 315185029Spjd src = ZPROP_SRC_LOCAL; 316185029Spjd 317185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 318185029Spjd dsl_pool_t *dp; 319185029Spjd dsl_dataset_t *ds = NULL; 320185029Spjd 321185029Spjd dp = spa_get_dsl(spa); 322185029Spjd rw_enter(&dp->dp_config_rwlock, RW_READER); 323185029Spjd if (err = dsl_dataset_hold_obj(dp, 324185029Spjd za.za_first_integer, FTAG, &ds)) { 325185029Spjd rw_exit(&dp->dp_config_rwlock); 326185029Spjd break; 327185029Spjd } 328185029Spjd 329185029Spjd strval = kmem_alloc( 330185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 331185029Spjd KM_SLEEP); 332185029Spjd dsl_dataset_name(ds, strval); 333185029Spjd dsl_dataset_rele(ds, FTAG); 334185029Spjd rw_exit(&dp->dp_config_rwlock); 335185029Spjd } else { 336185029Spjd strval = NULL; 337185029Spjd intval = za.za_first_integer; 338185029Spjd } 339185029Spjd 340185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 341185029Spjd 342185029Spjd if (strval != NULL) 343185029Spjd kmem_free(strval, 344185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 345185029Spjd 346185029Spjd 
break; 347185029Spjd 348185029Spjd case 1: 349185029Spjd /* string property */ 350185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 351185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 352185029Spjd za.za_name, 1, za.za_num_integers, strval); 353185029Spjd if (err) { 354185029Spjd kmem_free(strval, za.za_num_integers); 355185029Spjd break; 356185029Spjd } 357185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 358185029Spjd kmem_free(strval, za.za_num_integers); 359185029Spjd break; 360185029Spjd 361185029Spjd default: 362185029Spjd break; 363185029Spjd } 364185029Spjd } 365185029Spjd zap_cursor_fini(&zc); 366185029Spjd mutex_exit(&spa->spa_props_lock); 367185029Spjdout: 368185029Spjd if (err && err != ENOENT) { 369185029Spjd nvlist_free(*nvp); 370185029Spjd *nvp = NULL; 371185029Spjd return (err); 372185029Spjd } 373185029Spjd 374185029Spjd return (0); 375185029Spjd} 376185029Spjd 377185029Spjd/* 378185029Spjd * Validate the given pool properties nvlist and modify the list 379185029Spjd * for the property values to be set. 380185029Spjd */ 381185029Spjdstatic int 382185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 383185029Spjd{ 384185029Spjd nvpair_t *elem; 385185029Spjd int error = 0, reset_bootfs = 0; 386185029Spjd uint64_t objnum; 387236884Smm boolean_t has_feature = B_FALSE; 388185029Spjd 389185029Spjd elem = NULL; 390185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 391185029Spjd uint64_t intval; 392236884Smm char *strval, *slash, *check, *fname; 393236884Smm const char *propname = nvpair_name(elem); 394236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 395185029Spjd 396236884Smm switch (prop) { 397236884Smm case ZPROP_INVAL: 398236884Smm if (!zpool_prop_feature(propname)) { 399236884Smm error = EINVAL; 400236884Smm break; 401236884Smm } 402185029Spjd 403236884Smm /* 404236884Smm * Sanitize the input. 
405236884Smm */ 406236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 407236884Smm error = EINVAL; 408236884Smm break; 409236884Smm } 410185029Spjd 411236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 412236884Smm error = EINVAL; 413236884Smm break; 414236884Smm } 415236884Smm 416236884Smm if (intval != 0) { 417236884Smm error = EINVAL; 418236884Smm break; 419236884Smm } 420236884Smm 421236884Smm fname = strchr(propname, '@') + 1; 422236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 423236884Smm error = EINVAL; 424236884Smm break; 425236884Smm } 426236884Smm 427236884Smm has_feature = B_TRUE; 428236884Smm break; 429236884Smm 430185029Spjd case ZPOOL_PROP_VERSION: 431185029Spjd error = nvpair_value_uint64(elem, &intval); 432185029Spjd if (!error && 433236884Smm (intval < spa_version(spa) || 434236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 435236884Smm has_feature)) 436185029Spjd error = EINVAL; 437185029Spjd break; 438185029Spjd 439185029Spjd case ZPOOL_PROP_DELEGATION: 440185029Spjd case ZPOOL_PROP_AUTOREPLACE: 441185029Spjd case ZPOOL_PROP_LISTSNAPS: 442219089Spjd case ZPOOL_PROP_AUTOEXPAND: 443185029Spjd error = nvpair_value_uint64(elem, &intval); 444185029Spjd if (!error && intval > 1) 445185029Spjd error = EINVAL; 446185029Spjd break; 447185029Spjd 448185029Spjd case ZPOOL_PROP_BOOTFS: 449209962Smm /* 450209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 451209962Smm * or the pool is still being created (version == 0), 452209962Smm * the bootfs property cannot be set. 
453209962Smm */ 454185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 455185029Spjd error = ENOTSUP; 456185029Spjd break; 457185029Spjd } 458185029Spjd 459185029Spjd /* 460185029Spjd * Make sure the vdev config is bootable 461185029Spjd */ 462185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 463185029Spjd error = ENOTSUP; 464185029Spjd break; 465185029Spjd } 466185029Spjd 467185029Spjd reset_bootfs = 1; 468185029Spjd 469185029Spjd error = nvpair_value_string(elem, &strval); 470185029Spjd 471185029Spjd if (!error) { 472236884Smm objset_t *os; 473185029Spjd uint64_t compress; 474185029Spjd 475185029Spjd if (strval == NULL || strval[0] == '\0') { 476185029Spjd objnum = zpool_prop_default_numeric( 477185029Spjd ZPOOL_PROP_BOOTFS); 478185029Spjd break; 479185029Spjd } 480185029Spjd 481219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 482185029Spjd break; 483185029Spjd 484219089Spjd /* Must be ZPL and not gzip compressed. */ 485219089Spjd 486219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 487219089Spjd error = ENOTSUP; 488219089Spjd } else if ((error = dsl_prop_get_integer(strval, 489185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 490185029Spjd &compress, NULL)) == 0 && 491185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 492185029Spjd error = ENOTSUP; 493185029Spjd } else { 494185029Spjd objnum = dmu_objset_id(os); 495185029Spjd } 496219089Spjd dmu_objset_rele(os, FTAG); 497185029Spjd } 498185029Spjd break; 499185029Spjd 500185029Spjd case ZPOOL_PROP_FAILUREMODE: 501185029Spjd error = nvpair_value_uint64(elem, &intval); 502185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 503185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 504185029Spjd error = EINVAL; 505185029Spjd 506185029Spjd /* 507185029Spjd * This is a special case which only occurs when 508185029Spjd * the pool has completely failed. 
This allows 509185029Spjd * the user to change the in-core failmode property 510185029Spjd * without syncing it out to disk (I/Os might 511185029Spjd * currently be blocked). We do this by returning 512185029Spjd * EIO to the caller (spa_prop_set) to trick it 513185029Spjd * into thinking we encountered a property validation 514185029Spjd * error. 515185029Spjd */ 516185029Spjd if (!error && spa_suspended(spa)) { 517185029Spjd spa->spa_failmode = intval; 518185029Spjd error = EIO; 519185029Spjd } 520185029Spjd break; 521185029Spjd 522185029Spjd case ZPOOL_PROP_CACHEFILE: 523185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 524185029Spjd break; 525185029Spjd 526185029Spjd if (strval[0] == '\0') 527185029Spjd break; 528185029Spjd 529185029Spjd if (strcmp(strval, "none") == 0) 530185029Spjd break; 531185029Spjd 532185029Spjd if (strval[0] != '/') { 533185029Spjd error = EINVAL; 534185029Spjd break; 535185029Spjd } 536185029Spjd 537185029Spjd slash = strrchr(strval, '/'); 538185029Spjd ASSERT(slash != NULL); 539185029Spjd 540185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 541185029Spjd strcmp(slash, "/..") == 0) 542185029Spjd error = EINVAL; 543185029Spjd break; 544219089Spjd 545228103Smm case ZPOOL_PROP_COMMENT: 546228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 547228103Smm break; 548228103Smm for (check = strval; *check != '\0'; check++) { 549228103Smm /* 550228103Smm * The kernel doesn't have an easy isprint() 551228103Smm * check. For this kernel check, we merely 552228103Smm * check ASCII apart from DEL. Fix this if 553228103Smm * there is an easy-to-use kernel isprint(). 
554228103Smm */ 555228103Smm if (*check >= 0x7f) { 556228103Smm error = EINVAL; 557228103Smm break; 558228103Smm } 559228103Smm check++; 560228103Smm } 561228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 562228103Smm error = E2BIG; 563228103Smm break; 564228103Smm 565219089Spjd case ZPOOL_PROP_DEDUPDITTO: 566219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 567219089Spjd error = ENOTSUP; 568219089Spjd else 569219089Spjd error = nvpair_value_uint64(elem, &intval); 570219089Spjd if (error == 0 && 571219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 572219089Spjd error = EINVAL; 573219089Spjd break; 574185029Spjd } 575185029Spjd 576185029Spjd if (error) 577185029Spjd break; 578185029Spjd } 579185029Spjd 580185029Spjd if (!error && reset_bootfs) { 581185029Spjd error = nvlist_remove(props, 582185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 583185029Spjd 584185029Spjd if (!error) { 585185029Spjd error = nvlist_add_uint64(props, 586185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 587185029Spjd } 588185029Spjd } 589185029Spjd 590185029Spjd return (error); 591185029Spjd} 592185029Spjd 593209962Smmvoid 594209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 595209962Smm{ 596209962Smm char *cachefile; 597209962Smm spa_config_dirent_t *dp; 598209962Smm 599209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 600209962Smm &cachefile) != 0) 601209962Smm return; 602209962Smm 603209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 604209962Smm KM_SLEEP); 605209962Smm 606209962Smm if (cachefile[0] == '\0') 607209962Smm dp->scd_path = spa_strdup(spa_config_path); 608209962Smm else if (strcmp(cachefile, "none") == 0) 609209962Smm dp->scd_path = NULL; 610209962Smm else 611209962Smm dp->scd_path = spa_strdup(cachefile); 612209962Smm 613209962Smm list_insert_head(&spa->spa_config_list, dp); 614209962Smm if (need_sync) 615209962Smm spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 616209962Smm} 
617209962Smm 618185029Spjdint 619185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 620185029Spjd{ 621185029Spjd int error; 622236884Smm nvpair_t *elem = NULL; 623209962Smm boolean_t need_sync = B_FALSE; 624185029Spjd 625185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 626185029Spjd return (error); 627185029Spjd 628209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 629236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 630209962Smm 631219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 632219089Spjd prop == ZPOOL_PROP_ALTROOT || 633219089Spjd prop == ZPOOL_PROP_READONLY) 634209962Smm continue; 635209962Smm 636236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 637236884Smm uint64_t ver; 638236884Smm 639236884Smm if (prop == ZPOOL_PROP_VERSION) { 640236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 641236884Smm } else { 642236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 643236884Smm ver = SPA_VERSION_FEATURES; 644236884Smm need_sync = B_TRUE; 645236884Smm } 646236884Smm 647236884Smm /* Save time if the version is already set. */ 648236884Smm if (ver == spa_version(spa)) 649236884Smm continue; 650236884Smm 651236884Smm /* 652236884Smm * In addition to the pool directory object, we might 653236884Smm * create the pool properties object, the features for 654236884Smm * read object, the features for write object, or the 655236884Smm * feature descriptions object. 
656236884Smm */ 657236884Smm error = dsl_sync_task_do(spa_get_dsl(spa), NULL, 658236884Smm spa_sync_version, spa, &ver, 6); 659236884Smm if (error) 660236884Smm return (error); 661236884Smm continue; 662236884Smm } 663236884Smm 664209962Smm need_sync = B_TRUE; 665209962Smm break; 666209962Smm } 667209962Smm 668236884Smm if (need_sync) { 669209962Smm return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 670236884Smm spa, nvp, 6)); 671236884Smm } 672236884Smm 673236884Smm return (0); 674185029Spjd} 675185029Spjd 676185029Spjd/* 677185029Spjd * If the bootfs property value is dsobj, clear it. 678185029Spjd */ 679185029Spjdvoid 680185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 681185029Spjd{ 682185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 683185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 684185029Spjd spa->spa_pool_props_object, 685185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 686185029Spjd spa->spa_bootfs = 0; 687185029Spjd } 688185029Spjd} 689185029Spjd 690239620Smm/*ARGSUSED*/ 691239620Smmstatic int 692239620Smmspa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) 693239620Smm{ 694239620Smm spa_t *spa = arg1; 695239620Smm uint64_t *newguid = arg2; 696239620Smm vdev_t *rvd = spa->spa_root_vdev; 697239620Smm uint64_t vdev_state; 698239620Smm 699239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 700239620Smm vdev_state = rvd->vdev_state; 701239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 702239620Smm 703239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 704239620Smm return (ENXIO); 705239620Smm 706239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 707239620Smm 708239620Smm return (0); 709239620Smm} 710239620Smm 711239620Smmstatic void 712239620Smmspa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) 713239620Smm{ 714239620Smm spa_t *spa = arg1; 715239620Smm uint64_t *newguid = arg2; 716239620Smm uint64_t oldguid; 717239620Smm vdev_t *rvd = spa->spa_root_vdev; 
718239620Smm 719239620Smm oldguid = spa_guid(spa); 720239620Smm 721239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 722239620Smm rvd->vdev_guid = *newguid; 723239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 724239620Smm vdev_config_dirty(rvd); 725239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 726239620Smm 727239620Smm#ifdef __FreeBSD__ 728239620Smm /* 729239620Smm * TODO: until recent illumos logging changes are merged 730239620Smm * log reguid as pool property change 731239620Smm */ 732239620Smm spa_history_log_internal(LOG_POOL_PROPSET, spa, tx, 733239620Smm "guid change old=%llu new=%llu", oldguid, *newguid); 734239620Smm#else 735239620Smm spa_history_log_internal(spa, "guid change", tx, "old=%lld new=%lld", 736239620Smm oldguid, *newguid); 737239620Smm#endif 738239620Smm} 739239620Smm 740185029Spjd/* 741228103Smm * Change the GUID for the pool. This is done so that we can later 742228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 743228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 744228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 745228103Smm * online when we do this, or else any vdevs that weren't present 746228103Smm * would be orphaned from our pool. We are also going to issue a 747228103Smm * sysevent to update any watchers. 
748228103Smm */ 749228103Smmint 750228103Smmspa_change_guid(spa_t *spa) 751228103Smm{ 752239620Smm int error; 753239620Smm uint64_t guid; 754228103Smm 755239620Smm mutex_enter(&spa_namespace_lock); 756239620Smm guid = spa_generate_guid(NULL); 757228103Smm 758239620Smm error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, 759239620Smm spa_change_guid_sync, spa, &guid, 5); 760228103Smm 761239620Smm if (error == 0) { 762239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 763239620Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 764239620Smm } 765228103Smm 766239620Smm mutex_exit(&spa_namespace_lock); 767228103Smm 768239620Smm return (error); 769228103Smm} 770228103Smm 771228103Smm/* 772185029Spjd * ========================================================================== 773168404Spjd * SPA state manipulation (open/create/destroy/import/export) 774168404Spjd * ========================================================================== 775168404Spjd */ 776168404Spjd 777168404Spjdstatic int 778168404Spjdspa_error_entry_compare(const void *a, const void *b) 779168404Spjd{ 780168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 781168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 782168404Spjd int ret; 783168404Spjd 784168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 785168404Spjd sizeof (zbookmark_t)); 786168404Spjd 787168404Spjd if (ret < 0) 788168404Spjd return (-1); 789168404Spjd else if (ret > 0) 790168404Spjd return (1); 791168404Spjd else 792168404Spjd return (0); 793168404Spjd} 794168404Spjd 795168404Spjd/* 796168404Spjd * Utility function which retrieves copies of the current logs and 797168404Spjd * re-initializes them in the process. 
798168404Spjd */ 799168404Spjdvoid 800168404Spjdspa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 801168404Spjd{ 802168404Spjd ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 803168404Spjd 804168404Spjd bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 805168404Spjd bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 806168404Spjd 807168404Spjd avl_create(&spa->spa_errlist_scrub, 808168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 809168404Spjd offsetof(spa_error_entry_t, se_avl)); 810168404Spjd avl_create(&spa->spa_errlist_last, 811168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 812168404Spjd offsetof(spa_error_entry_t, se_avl)); 813168404Spjd} 814168404Spjd 815219089Spjdstatic taskq_t * 816219089Spjdspa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 817219089Spjd uint_t value) 818168404Spjd{ 819219089Spjd uint_t flags = TASKQ_PREPOPULATE; 820219089Spjd boolean_t batch = B_FALSE; 821168404Spjd 822219089Spjd switch (mode) { 823219089Spjd case zti_mode_null: 824219089Spjd return (NULL); /* no taskq needed */ 825168404Spjd 826219089Spjd case zti_mode_fixed: 827219089Spjd ASSERT3U(value, >=, 1); 828219089Spjd value = MAX(value, 1); 829219089Spjd break; 830168404Spjd 831219089Spjd case zti_mode_batch: 832219089Spjd batch = B_TRUE; 833219089Spjd flags |= TASKQ_THREADS_CPU_PCT; 834219089Spjd value = zio_taskq_batch_pct; 835219089Spjd break; 836219089Spjd 837219089Spjd case zti_mode_online_percent: 838219089Spjd flags |= TASKQ_THREADS_CPU_PCT; 839219089Spjd break; 840219089Spjd 841219089Spjd default: 842219089Spjd panic("unrecognized mode for %s taskq (%u:%u) in " 843219089Spjd "spa_activate()", 844219089Spjd name, mode, value); 845219089Spjd break; 846219089Spjd } 847219089Spjd 848219089Spjd#ifdef SYSDC 849219089Spjd if (zio_taskq_sysdc && spa->spa_proc != &p0) { 850219089Spjd if (batch) 851219089Spjd flags |= TASKQ_DC_BATCH; 852219089Spjd 853219089Spjd return (taskq_create_sysdc(name, value, 50, 
INT_MAX, 854219089Spjd spa->spa_proc, zio_taskq_basedc, flags)); 855219089Spjd } 856219089Spjd#endif 857219089Spjd return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 858219089Spjd spa->spa_proc, flags)); 859219089Spjd} 860219089Spjd 861219089Spjdstatic void 862219089Spjdspa_create_zio_taskqs(spa_t *spa) 863219089Spjd{ 864185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 865185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 866211931Smm const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 867211931Smm enum zti_modes mode = ztip->zti_mode; 868211931Smm uint_t value = ztip->zti_value; 869209962Smm char name[32]; 870209962Smm 871209962Smm (void) snprintf(name, sizeof (name), 872211931Smm "%s_%s", zio_type_name[t], zio_taskq_types[q]); 873209962Smm 874219089Spjd spa->spa_zio_taskq[t][q] = 875219089Spjd spa_taskq_create(spa, name, mode, value); 876219089Spjd } 877219089Spjd } 878219089Spjd} 879209962Smm 880219089Spjd#ifdef _KERNEL 881219089Spjd#ifdef SPA_PROCESS 882219089Spjdstatic void 883219089Spjdspa_thread(void *arg) 884219089Spjd{ 885219089Spjd callb_cpr_t cprinfo; 886209962Smm 887219089Spjd spa_t *spa = arg; 888219089Spjd user_t *pu = PTOU(curproc); 889209962Smm 890219089Spjd CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 891219089Spjd spa->spa_name); 892209962Smm 893219089Spjd ASSERT(curproc != &p0); 894219089Spjd (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 895219089Spjd "zpool-%s", spa->spa_name); 896219089Spjd (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 897211931Smm 898219089Spjd#ifdef PSRSET_BIND 899219089Spjd /* bind this thread to the requested psrset */ 900219089Spjd if (zio_taskq_psrset_bind != PS_NONE) { 901219089Spjd pool_lock(); 902219089Spjd mutex_enter(&cpu_lock); 903219089Spjd mutex_enter(&pidlock); 904219089Spjd mutex_enter(&curproc->p_lock); 905219089Spjd 906219089Spjd if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 907219089Spjd 0, NULL, NULL) == 0) { 908219089Spjd 
curthread->t_bind_pset = zio_taskq_psrset_bind; 909219089Spjd } else { 910219089Spjd cmn_err(CE_WARN, 911219089Spjd "Couldn't bind process for zfs pool \"%s\" to " 912219089Spjd "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 913219089Spjd } 914219089Spjd 915219089Spjd mutex_exit(&curproc->p_lock); 916219089Spjd mutex_exit(&pidlock); 917219089Spjd mutex_exit(&cpu_lock); 918219089Spjd pool_unlock(); 919219089Spjd } 920219089Spjd#endif 921219089Spjd 922219089Spjd#ifdef SYSDC 923219089Spjd if (zio_taskq_sysdc) { 924219089Spjd sysdc_thread_enter(curthread, 100, 0); 925219089Spjd } 926219089Spjd#endif 927219089Spjd 928219089Spjd spa->spa_proc = curproc; 929219089Spjd spa->spa_did = curthread->t_did; 930219089Spjd 931219089Spjd spa_create_zio_taskqs(spa); 932219089Spjd 933219089Spjd mutex_enter(&spa->spa_proc_lock); 934219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 935219089Spjd 936219089Spjd spa->spa_proc_state = SPA_PROC_ACTIVE; 937219089Spjd cv_broadcast(&spa->spa_proc_cv); 938219089Spjd 939219089Spjd CALLB_CPR_SAFE_BEGIN(&cprinfo); 940219089Spjd while (spa->spa_proc_state == SPA_PROC_ACTIVE) 941219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 942219089Spjd CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 943219089Spjd 944219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 945219089Spjd spa->spa_proc_state = SPA_PROC_GONE; 946219089Spjd spa->spa_proc = &p0; 947219089Spjd cv_broadcast(&spa->spa_proc_cv); 948219089Spjd CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 949219089Spjd 950219089Spjd mutex_enter(&curproc->p_lock); 951219089Spjd lwp_exit(); 952219089Spjd} 953219089Spjd#endif /* SPA_PROCESS */ 954219089Spjd#endif 955219089Spjd 956219089Spjd/* 957219089Spjd * Activate an uninitialized pool. 
958219089Spjd */ 959219089Spjdstatic void 960219089Spjdspa_activate(spa_t *spa, int mode) 961219089Spjd{ 962219089Spjd ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 963219089Spjd 964219089Spjd spa->spa_state = POOL_STATE_ACTIVE; 965219089Spjd spa->spa_mode = mode; 966219089Spjd 967219089Spjd spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 968219089Spjd spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 969219089Spjd 970219089Spjd /* Try to create a covering process */ 971219089Spjd mutex_enter(&spa->spa_proc_lock); 972219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 973219089Spjd ASSERT(spa->spa_proc == &p0); 974219089Spjd spa->spa_did = 0; 975219089Spjd 976219089Spjd#ifdef SPA_PROCESS 977219089Spjd /* Only create a process if we're going to be around a while. */ 978219089Spjd if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 979219089Spjd if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 980219089Spjd NULL, 0) == 0) { 981219089Spjd spa->spa_proc_state = SPA_PROC_CREATED; 982219089Spjd while (spa->spa_proc_state == SPA_PROC_CREATED) { 983219089Spjd cv_wait(&spa->spa_proc_cv, 984219089Spjd &spa->spa_proc_lock); 985209962Smm } 986219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 987219089Spjd ASSERT(spa->spa_proc != &p0); 988219089Spjd ASSERT(spa->spa_did != 0); 989219089Spjd } else { 990219089Spjd#ifdef _KERNEL 991219089Spjd cmn_err(CE_WARN, 992219089Spjd "Couldn't create process for zfs pool \"%s\"\n", 993219089Spjd spa->spa_name); 994219089Spjd#endif 995185029Spjd } 996168404Spjd } 997219089Spjd#endif /* SPA_PROCESS */ 998219089Spjd mutex_exit(&spa->spa_proc_lock); 999168404Spjd 1000219089Spjd /* If we didn't create a process, we need to create our taskqs. */ 1001219089Spjd ASSERT(spa->spa_proc == &p0); 1002219089Spjd if (spa->spa_proc == &p0) { 1003219089Spjd spa_create_zio_taskqs(spa); 1004219089Spjd } 1005219089Spjd 1006240868Spjd /* 1007240868Spjd * Start TRIM thread. 
1008240868Spjd */ 1009240868Spjd trim_thread_create(spa); 1010240868Spjd 1011185029Spjd list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1012185029Spjd offsetof(vdev_t, vdev_config_dirty_node)); 1013185029Spjd list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1014185029Spjd offsetof(vdev_t, vdev_state_dirty_node)); 1015168404Spjd 1016168404Spjd txg_list_create(&spa->spa_vdev_txg_list, 1017168404Spjd offsetof(struct vdev, vdev_txg_node)); 1018168404Spjd 1019168404Spjd avl_create(&spa->spa_errlist_scrub, 1020168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1021168404Spjd offsetof(spa_error_entry_t, se_avl)); 1022168404Spjd avl_create(&spa->spa_errlist_last, 1023168404Spjd spa_error_entry_compare, sizeof (spa_error_entry_t), 1024168404Spjd offsetof(spa_error_entry_t, se_avl)); 1025168404Spjd} 1026168404Spjd 1027168404Spjd/* 1028168404Spjd * Opposite of spa_activate(). 1029168404Spjd */ 1030168404Spjdstatic void 1031168404Spjdspa_deactivate(spa_t *spa) 1032168404Spjd{ 1033168404Spjd ASSERT(spa->spa_sync_on == B_FALSE); 1034168404Spjd ASSERT(spa->spa_dsl_pool == NULL); 1035168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1036209962Smm ASSERT(spa->spa_async_zio_root == NULL); 1037168404Spjd ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1038168404Spjd 1039240868Spjd /* 1040240868Spjd * Stop TRIM thread in case spa_unload() wasn't called directly 1041240868Spjd * before spa_deactivate(). 
1042240868Spjd */ 1043240868Spjd trim_thread_destroy(spa); 1044240868Spjd 1045168404Spjd txg_list_destroy(&spa->spa_vdev_txg_list); 1046168404Spjd 1047185029Spjd list_destroy(&spa->spa_config_dirty_list); 1048185029Spjd list_destroy(&spa->spa_state_dirty_list); 1049168404Spjd 1050185029Spjd for (int t = 0; t < ZIO_TYPES; t++) { 1051185029Spjd for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1052211931Smm if (spa->spa_zio_taskq[t][q] != NULL) 1053211931Smm taskq_destroy(spa->spa_zio_taskq[t][q]); 1054185029Spjd spa->spa_zio_taskq[t][q] = NULL; 1055185029Spjd } 1056168404Spjd } 1057168404Spjd 1058168404Spjd metaslab_class_destroy(spa->spa_normal_class); 1059168404Spjd spa->spa_normal_class = NULL; 1060168404Spjd 1061185029Spjd metaslab_class_destroy(spa->spa_log_class); 1062185029Spjd spa->spa_log_class = NULL; 1063185029Spjd 1064168404Spjd /* 1065168404Spjd * If this was part of an import or the open otherwise failed, we may 1066168404Spjd * still have errors left in the queues. Empty them just in case. 
1067168404Spjd */ 1068168404Spjd spa_errlog_drain(spa); 1069168404Spjd 1070168404Spjd avl_destroy(&spa->spa_errlist_scrub); 1071168404Spjd avl_destroy(&spa->spa_errlist_last); 1072168404Spjd 1073168404Spjd spa->spa_state = POOL_STATE_UNINITIALIZED; 1074219089Spjd 1075219089Spjd mutex_enter(&spa->spa_proc_lock); 1076219089Spjd if (spa->spa_proc_state != SPA_PROC_NONE) { 1077219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1078219089Spjd spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1079219089Spjd cv_broadcast(&spa->spa_proc_cv); 1080219089Spjd while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1081219089Spjd ASSERT(spa->spa_proc != &p0); 1082219089Spjd cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1083219089Spjd } 1084219089Spjd ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1085219089Spjd spa->spa_proc_state = SPA_PROC_NONE; 1086219089Spjd } 1087219089Spjd ASSERT(spa->spa_proc == &p0); 1088219089Spjd mutex_exit(&spa->spa_proc_lock); 1089219089Spjd 1090219089Spjd#ifdef SPA_PROCESS 1091219089Spjd /* 1092219089Spjd * We want to make sure spa_thread() has actually exited the ZFS 1093219089Spjd * module, so that the module can't be unloaded out from underneath 1094219089Spjd * it. 1095219089Spjd */ 1096219089Spjd if (spa->spa_did != 0) { 1097219089Spjd thread_join(spa->spa_did); 1098219089Spjd spa->spa_did = 0; 1099219089Spjd } 1100219089Spjd#endif /* SPA_PROCESS */ 1101168404Spjd} 1102168404Spjd 1103168404Spjd/* 1104168404Spjd * Verify a pool configuration, and construct the vdev tree appropriately. This 1105168404Spjd * will create all the necessary vdevs in the appropriate layout, with each vdev 1106168404Spjd * in the CLOSED state. This will prep the pool before open/creation/import. 1107168404Spjd * All vdev validation is done by the vdev_alloc() routine. 
1108168404Spjd */ 1109168404Spjdstatic int 1110168404Spjdspa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1111168404Spjd uint_t id, int atype) 1112168404Spjd{ 1113168404Spjd nvlist_t **child; 1114219089Spjd uint_t children; 1115168404Spjd int error; 1116168404Spjd 1117168404Spjd if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1118168404Spjd return (error); 1119168404Spjd 1120168404Spjd if ((*vdp)->vdev_ops->vdev_op_leaf) 1121168404Spjd return (0); 1122168404Spjd 1123185029Spjd error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1124185029Spjd &child, &children); 1125185029Spjd 1126185029Spjd if (error == ENOENT) 1127185029Spjd return (0); 1128185029Spjd 1129185029Spjd if (error) { 1130168404Spjd vdev_free(*vdp); 1131168404Spjd *vdp = NULL; 1132168404Spjd return (EINVAL); 1133168404Spjd } 1134168404Spjd 1135219089Spjd for (int c = 0; c < children; c++) { 1136168404Spjd vdev_t *vd; 1137168404Spjd if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1138168404Spjd atype)) != 0) { 1139168404Spjd vdev_free(*vdp); 1140168404Spjd *vdp = NULL; 1141168404Spjd return (error); 1142168404Spjd } 1143168404Spjd } 1144168404Spjd 1145168404Spjd ASSERT(*vdp != NULL); 1146168404Spjd 1147168404Spjd return (0); 1148168404Spjd} 1149168404Spjd 1150168404Spjd/* 1151168404Spjd * Opposite of spa_load(). 1152168404Spjd */ 1153168404Spjdstatic void 1154168404Spjdspa_unload(spa_t *spa) 1155168404Spjd{ 1156168404Spjd int i; 1157168404Spjd 1158185029Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1159185029Spjd 1160168404Spjd /* 1161240868Spjd * Stop TRIM thread. 1162240868Spjd */ 1163240868Spjd trim_thread_destroy(spa); 1164240868Spjd 1165240868Spjd /* 1166168404Spjd * Stop async tasks. 1167168404Spjd */ 1168168404Spjd spa_async_suspend(spa); 1169168404Spjd 1170168404Spjd /* 1171168404Spjd * Stop syncing. 
1172168404Spjd */ 1173168404Spjd if (spa->spa_sync_on) { 1174168404Spjd txg_sync_stop(spa->spa_dsl_pool); 1175168404Spjd spa->spa_sync_on = B_FALSE; 1176168404Spjd } 1177168404Spjd 1178168404Spjd /* 1179185029Spjd * Wait for any outstanding async I/O to complete. 1180168404Spjd */ 1181209962Smm if (spa->spa_async_zio_root != NULL) { 1182209962Smm (void) zio_wait(spa->spa_async_zio_root); 1183209962Smm spa->spa_async_zio_root = NULL; 1184209962Smm } 1185168404Spjd 1186219089Spjd bpobj_close(&spa->spa_deferred_bpobj); 1187219089Spjd 1188168404Spjd /* 1189168404Spjd * Close the dsl pool. 1190168404Spjd */ 1191168404Spjd if (spa->spa_dsl_pool) { 1192168404Spjd dsl_pool_close(spa->spa_dsl_pool); 1193168404Spjd spa->spa_dsl_pool = NULL; 1194219089Spjd spa->spa_meta_objset = NULL; 1195168404Spjd } 1196168404Spjd 1197219089Spjd ddt_unload(spa); 1198219089Spjd 1199209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1200209962Smm 1201168404Spjd /* 1202209962Smm * Drop and purge level 2 cache 1203209962Smm */ 1204209962Smm spa_l2cache_drop(spa); 1205209962Smm 1206209962Smm /* 1207168404Spjd * Close all vdevs. 
1208168404Spjd */ 1209168404Spjd if (spa->spa_root_vdev) 1210168404Spjd vdev_free(spa->spa_root_vdev); 1211168404Spjd ASSERT(spa->spa_root_vdev == NULL); 1212168404Spjd 1213185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1214185029Spjd vdev_free(spa->spa_spares.sav_vdevs[i]); 1215185029Spjd if (spa->spa_spares.sav_vdevs) { 1216185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1217185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1218185029Spjd spa->spa_spares.sav_vdevs = NULL; 1219168404Spjd } 1220185029Spjd if (spa->spa_spares.sav_config) { 1221185029Spjd nvlist_free(spa->spa_spares.sav_config); 1222185029Spjd spa->spa_spares.sav_config = NULL; 1223168404Spjd } 1224185029Spjd spa->spa_spares.sav_count = 0; 1225168404Spjd 1226230514Smm for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 1227230514Smm vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1228185029Spjd vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1229230514Smm } 1230185029Spjd if (spa->spa_l2cache.sav_vdevs) { 1231185029Spjd kmem_free(spa->spa_l2cache.sav_vdevs, 1232185029Spjd spa->spa_l2cache.sav_count * sizeof (void *)); 1233185029Spjd spa->spa_l2cache.sav_vdevs = NULL; 1234185029Spjd } 1235185029Spjd if (spa->spa_l2cache.sav_config) { 1236185029Spjd nvlist_free(spa->spa_l2cache.sav_config); 1237185029Spjd spa->spa_l2cache.sav_config = NULL; 1238185029Spjd } 1239185029Spjd spa->spa_l2cache.sav_count = 0; 1240185029Spjd 1241168404Spjd spa->spa_async_suspended = 0; 1242209962Smm 1243228103Smm if (spa->spa_comment != NULL) { 1244228103Smm spa_strfree(spa->spa_comment); 1245228103Smm spa->spa_comment = NULL; 1246228103Smm } 1247228103Smm 1248209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 1249168404Spjd} 1250168404Spjd 1251168404Spjd/* 1252168404Spjd * Load (or re-load) the current list of vdevs describing the active spares for 1253168404Spjd * this pool. When this is called, we have some form of basic information in 1254185029Spjd * 'spa_spares.sav_config'. 
We parse this into vdevs, try to open them, and 1255185029Spjd * then re-generate a more complete list including status information. 1256168404Spjd */ 1257168404Spjdstatic void 1258168404Spjdspa_load_spares(spa_t *spa) 1259168404Spjd{ 1260168404Spjd nvlist_t **spares; 1261168404Spjd uint_t nspares; 1262168404Spjd int i; 1263168404Spjd vdev_t *vd, *tvd; 1264168404Spjd 1265185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1266185029Spjd 1267168404Spjd /* 1268168404Spjd * First, close and free any existing spare vdevs. 1269168404Spjd */ 1270185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1271185029Spjd vd = spa->spa_spares.sav_vdevs[i]; 1272168404Spjd 1273168404Spjd /* Undo the call to spa_activate() below */ 1274185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1275185029Spjd B_FALSE)) != NULL && tvd->vdev_isspare) 1276168404Spjd spa_spare_remove(tvd); 1277168404Spjd vdev_close(vd); 1278168404Spjd vdev_free(vd); 1279168404Spjd } 1280168404Spjd 1281185029Spjd if (spa->spa_spares.sav_vdevs) 1282185029Spjd kmem_free(spa->spa_spares.sav_vdevs, 1283185029Spjd spa->spa_spares.sav_count * sizeof (void *)); 1284168404Spjd 1285185029Spjd if (spa->spa_spares.sav_config == NULL) 1286168404Spjd nspares = 0; 1287168404Spjd else 1288185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1289168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1290168404Spjd 1291185029Spjd spa->spa_spares.sav_count = (int)nspares; 1292185029Spjd spa->spa_spares.sav_vdevs = NULL; 1293168404Spjd 1294168404Spjd if (nspares == 0) 1295168404Spjd return; 1296168404Spjd 1297168404Spjd /* 1298168404Spjd * Construct the array of vdevs, opening them to get status in the 1299168404Spjd * process. 
For each spare, there is potentially two different vdev_t 1300168404Spjd * structures associated with it: one in the list of spares (used only 1301168404Spjd * for basic validation purposes) and one in the active vdev 1302168404Spjd * configuration (if it's spared in). During this phase we open and 1303168404Spjd * validate each vdev on the spare list. If the vdev also exists in the 1304168404Spjd * active configuration, then we also mark this vdev as an active spare. 1305168404Spjd */ 1306185029Spjd spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1307185029Spjd KM_SLEEP); 1308185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) { 1309168404Spjd VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1310168404Spjd VDEV_ALLOC_SPARE) == 0); 1311168404Spjd ASSERT(vd != NULL); 1312168404Spjd 1313185029Spjd spa->spa_spares.sav_vdevs[i] = vd; 1314168404Spjd 1315185029Spjd if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1316185029Spjd B_FALSE)) != NULL) { 1317168404Spjd if (!tvd->vdev_isspare) 1318168404Spjd spa_spare_add(tvd); 1319168404Spjd 1320168404Spjd /* 1321168404Spjd * We only mark the spare active if we were successfully 1322168404Spjd * able to load the vdev. Otherwise, importing a pool 1323168404Spjd * with a bad active spare would result in strange 1324168404Spjd * behavior, because multiple pool would think the spare 1325168404Spjd * is actively in use. 1326168404Spjd * 1327168404Spjd * There is a vulnerability here to an equally bizarre 1328168404Spjd * circumstance, where a dead active spare is later 1329168404Spjd * brought back to life (onlined or otherwise). Given 1330168404Spjd * the rarity of this scenario, and the extra complexity 1331168404Spjd * it adds, we ignore the possibility. 
1332168404Spjd */ 1333168404Spjd if (!vdev_is_dead(tvd)) 1334168404Spjd spa_spare_activate(tvd); 1335168404Spjd } 1336168404Spjd 1337185029Spjd vd->vdev_top = vd; 1338209962Smm vd->vdev_aux = &spa->spa_spares; 1339185029Spjd 1340168404Spjd if (vdev_open(vd) != 0) 1341168404Spjd continue; 1342168404Spjd 1343185029Spjd if (vdev_validate_aux(vd) == 0) 1344185029Spjd spa_spare_add(vd); 1345168404Spjd } 1346168404Spjd 1347168404Spjd /* 1348168404Spjd * Recompute the stashed list of spares, with status information 1349168404Spjd * this time. 1350168404Spjd */ 1351185029Spjd VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1352168404Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1353168404Spjd 1354185029Spjd spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1355185029Spjd KM_SLEEP); 1356185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1357185029Spjd spares[i] = vdev_config_generate(spa, 1358219089Spjd spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1359185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1360185029Spjd ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1361185029Spjd for (i = 0; i < spa->spa_spares.sav_count; i++) 1362168404Spjd nvlist_free(spares[i]); 1363185029Spjd kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1364168404Spjd} 1365168404Spjd 1366185029Spjd/* 1367185029Spjd * Load (or re-load) the current list of vdevs describing the active l2cache for 1368185029Spjd * this pool. When this is called, we have some form of basic information in 1369185029Spjd * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1370185029Spjd * then re-generate a more complete list including status information. 1371185029Spjd * Devices which are already active have their details maintained, and are 1372185029Spjd * not re-opened. 
1373185029Spjd */ 1374185029Spjdstatic void 1375185029Spjdspa_load_l2cache(spa_t *spa) 1376185029Spjd{ 1377185029Spjd nvlist_t **l2cache; 1378185029Spjd uint_t nl2cache; 1379185029Spjd int i, j, oldnvdevs; 1380219089Spjd uint64_t guid; 1381185029Spjd vdev_t *vd, **oldvdevs, **newvdevs; 1382185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 1383185029Spjd 1384185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1385185029Spjd 1386185029Spjd if (sav->sav_config != NULL) { 1387185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1388185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1389185029Spjd newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1390185029Spjd } else { 1391185029Spjd nl2cache = 0; 1392185029Spjd } 1393185029Spjd 1394185029Spjd oldvdevs = sav->sav_vdevs; 1395185029Spjd oldnvdevs = sav->sav_count; 1396185029Spjd sav->sav_vdevs = NULL; 1397185029Spjd sav->sav_count = 0; 1398185029Spjd 1399185029Spjd /* 1400185029Spjd * Process new nvlist of vdevs. 1401185029Spjd */ 1402185029Spjd for (i = 0; i < nl2cache; i++) { 1403185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1404185029Spjd &guid) == 0); 1405185029Spjd 1406185029Spjd newvdevs[i] = NULL; 1407185029Spjd for (j = 0; j < oldnvdevs; j++) { 1408185029Spjd vd = oldvdevs[j]; 1409185029Spjd if (vd != NULL && guid == vd->vdev_guid) { 1410185029Spjd /* 1411185029Spjd * Retain previous vdev for add/remove ops. 
1412185029Spjd */ 1413185029Spjd newvdevs[i] = vd; 1414185029Spjd oldvdevs[j] = NULL; 1415185029Spjd break; 1416185029Spjd } 1417185029Spjd } 1418185029Spjd 1419185029Spjd if (newvdevs[i] == NULL) { 1420185029Spjd /* 1421185029Spjd * Create new vdev 1422185029Spjd */ 1423185029Spjd VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1424185029Spjd VDEV_ALLOC_L2CACHE) == 0); 1425185029Spjd ASSERT(vd != NULL); 1426185029Spjd newvdevs[i] = vd; 1427185029Spjd 1428185029Spjd /* 1429185029Spjd * Commit this vdev as an l2cache device, 1430185029Spjd * even if it fails to open. 1431185029Spjd */ 1432185029Spjd spa_l2cache_add(vd); 1433185029Spjd 1434185029Spjd vd->vdev_top = vd; 1435185029Spjd vd->vdev_aux = sav; 1436185029Spjd 1437185029Spjd spa_l2cache_activate(vd); 1438185029Spjd 1439185029Spjd if (vdev_open(vd) != 0) 1440185029Spjd continue; 1441185029Spjd 1442185029Spjd (void) vdev_validate_aux(vd); 1443185029Spjd 1444219089Spjd if (!vdev_is_dead(vd)) 1445219089Spjd l2arc_add_vdev(spa, vd); 1446185029Spjd } 1447185029Spjd } 1448185029Spjd 1449185029Spjd /* 1450185029Spjd * Purge vdevs that were dropped 1451185029Spjd */ 1452185029Spjd for (i = 0; i < oldnvdevs; i++) { 1453185029Spjd uint64_t pool; 1454185029Spjd 1455185029Spjd vd = oldvdevs[i]; 1456185029Spjd if (vd != NULL) { 1457230514Smm ASSERT(vd->vdev_isl2cache); 1458230514Smm 1459209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1460209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 1461185029Spjd l2arc_remove_vdev(vd); 1462230514Smm vdev_clear_stats(vd); 1463230514Smm vdev_free(vd); 1464185029Spjd } 1465185029Spjd } 1466185029Spjd 1467185029Spjd if (oldvdevs) 1468185029Spjd kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1469185029Spjd 1470185029Spjd if (sav->sav_config == NULL) 1471185029Spjd goto out; 1472185029Spjd 1473185029Spjd sav->sav_vdevs = newvdevs; 1474185029Spjd sav->sav_count = (int)nl2cache; 1475185029Spjd 1476185029Spjd /* 1477185029Spjd * Recompute the stashed list of l2cache 
devices, with status 1478185029Spjd * information this time. 1479185029Spjd */ 1480185029Spjd VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1481185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 1482185029Spjd 1483185029Spjd l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1484185029Spjd for (i = 0; i < sav->sav_count; i++) 1485185029Spjd l2cache[i] = vdev_config_generate(spa, 1486219089Spjd sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1487185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1488185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1489185029Spjdout: 1490185029Spjd for (i = 0; i < sav->sav_count; i++) 1491185029Spjd nvlist_free(l2cache[i]); 1492185029Spjd if (sav->sav_count) 1493185029Spjd kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1494185029Spjd} 1495185029Spjd 1496168404Spjdstatic int 1497168404Spjdload_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1498168404Spjd{ 1499168404Spjd dmu_buf_t *db; 1500168404Spjd char *packed = NULL; 1501168404Spjd size_t nvsize = 0; 1502168404Spjd int error; 1503168404Spjd *value = NULL; 1504168404Spjd 1505168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1506168404Spjd nvsize = *(uint64_t *)db->db_data; 1507168404Spjd dmu_buf_rele(db, FTAG); 1508168404Spjd 1509168404Spjd packed = kmem_alloc(nvsize, KM_SLEEP); 1510209962Smm error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1511209962Smm DMU_READ_PREFETCH); 1512168404Spjd if (error == 0) 1513168404Spjd error = nvlist_unpack(packed, nvsize, value, 0); 1514168404Spjd kmem_free(packed, nvsize); 1515168404Spjd 1516168404Spjd return (error); 1517168404Spjd} 1518168404Spjd 1519168404Spjd/* 1520185029Spjd * Checks to see if the given vdev could not be opened, in which case we post a 1521185029Spjd * sysevent to notify the autoreplace code that the device has been removed. 
1522185029Spjd */ 1523185029Spjdstatic void 1524185029Spjdspa_check_removed(vdev_t *vd) 1525185029Spjd{ 1526219089Spjd for (int c = 0; c < vd->vdev_children; c++) 1527185029Spjd spa_check_removed(vd->vdev_child[c]); 1528185029Spjd 1529185029Spjd if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1530185029Spjd zfs_post_autoreplace(vd->vdev_spa, vd); 1531185029Spjd spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1532185029Spjd } 1533185029Spjd} 1534185029Spjd 1535185029Spjd/* 1536219089Spjd * Validate the current config against the MOS config 1537213197Smm */ 1538219089Spjdstatic boolean_t 1539219089Spjdspa_config_valid(spa_t *spa, nvlist_t *config) 1540213197Smm{ 1541219089Spjd vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1542219089Spjd nvlist_t *nv; 1543213197Smm 1544219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1545213197Smm 1546219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1547219089Spjd VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1548219089Spjd 1549219089Spjd ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1550219089Spjd 1551219089Spjd /* 1552219089Spjd * If we're doing a normal import, then build up any additional 1553219089Spjd * diagnostic information about missing devices in this config. 1554219089Spjd * We'll pass this up to the user for further processing. 
1555219089Spjd */ 1556219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1557219089Spjd nvlist_t **child, *nv; 1558219089Spjd uint64_t idx = 0; 1559219089Spjd 1560219089Spjd child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1561219089Spjd KM_SLEEP); 1562219089Spjd VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1563219089Spjd 1564219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1565219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1566219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1567219089Spjd 1568219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1569219089Spjd mtvd->vdev_ops != &vdev_missing_ops && 1570219089Spjd mtvd->vdev_islog) 1571219089Spjd child[idx++] = vdev_config_generate(spa, mtvd, 1572219089Spjd B_FALSE, 0); 1573219089Spjd } 1574219089Spjd 1575219089Spjd if (idx) { 1576219089Spjd VERIFY(nvlist_add_nvlist_array(nv, 1577219089Spjd ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1578219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1579219089Spjd ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1580219089Spjd 1581219089Spjd for (int i = 0; i < idx; i++) 1582219089Spjd nvlist_free(child[i]); 1583219089Spjd } 1584219089Spjd nvlist_free(nv); 1585219089Spjd kmem_free(child, rvd->vdev_children * sizeof (char **)); 1586219089Spjd } 1587219089Spjd 1588219089Spjd /* 1589219089Spjd * Compare the root vdev tree with the information we have 1590219089Spjd * from the MOS config (mrvd). Check each top-level vdev 1591219089Spjd * with the corresponding MOS config top-level (mtvd). 1592219089Spjd */ 1593219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1594213197Smm vdev_t *tvd = rvd->vdev_child[c]; 1595219089Spjd vdev_t *mtvd = mrvd->vdev_child[c]; 1596213197Smm 1597219089Spjd /* 1598219089Spjd * Resolve any "missing" vdevs in the current configuration. 1599219089Spjd * If we find that the MOS config has more accurate information 1600219089Spjd * about the top-level vdev then use that vdev instead. 
1601219089Spjd */ 1602219089Spjd if (tvd->vdev_ops == &vdev_missing_ops && 1603219089Spjd mtvd->vdev_ops != &vdev_missing_ops) { 1604219089Spjd 1605219089Spjd if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1606219089Spjd continue; 1607219089Spjd 1608219089Spjd /* 1609219089Spjd * Device specific actions. 1610219089Spjd */ 1611219089Spjd if (mtvd->vdev_islog) { 1612219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 1613219089Spjd } else { 1614219089Spjd /* 1615219089Spjd * XXX - once we have 'readonly' pool 1616219089Spjd * support we should be able to handle 1617219089Spjd * missing data devices by transitioning 1618219089Spjd * the pool to readonly. 1619219089Spjd */ 1620219089Spjd continue; 1621219089Spjd } 1622219089Spjd 1623219089Spjd /* 1624219089Spjd * Swap the missing vdev with the data we were 1625219089Spjd * able to obtain from the MOS config. 1626219089Spjd */ 1627219089Spjd vdev_remove_child(rvd, tvd); 1628219089Spjd vdev_remove_child(mrvd, mtvd); 1629219089Spjd 1630219089Spjd vdev_add_child(rvd, mtvd); 1631219089Spjd vdev_add_child(mrvd, tvd); 1632219089Spjd 1633219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1634219089Spjd vdev_load(mtvd); 1635219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1636219089Spjd 1637219089Spjd vdev_reopen(rvd); 1638219089Spjd } else if (mtvd->vdev_islog) { 1639219089Spjd /* 1640219089Spjd * Load the slog device's state from the MOS config 1641219089Spjd * since it's possible that the label does not 1642219089Spjd * contain the most up-to-date information. 1643219089Spjd */ 1644219089Spjd vdev_load_log_state(tvd, mtvd); 1645219089Spjd vdev_reopen(tvd); 1646219089Spjd } 1647213197Smm } 1648219089Spjd vdev_free(mrvd); 1649219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 1650219089Spjd 1651219089Spjd /* 1652219089Spjd * Ensure we were able to validate the config. 
1653219089Spjd */ 1654219089Spjd return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1655213197Smm} 1656213197Smm 1657213197Smm/* 1658185029Spjd * Check for missing log devices 1659185029Spjd */ 1660219089Spjdstatic int 1661185029Spjdspa_check_logs(spa_t *spa) 1662185029Spjd{ 1663185029Spjd switch (spa->spa_log_state) { 1664185029Spjd case SPA_LOG_MISSING: 1665185029Spjd /* need to recheck in case slog has been restored */ 1666185029Spjd case SPA_LOG_UNKNOWN: 1667185029Spjd if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1668185029Spjd DS_FIND_CHILDREN)) { 1669219089Spjd spa_set_log_state(spa, SPA_LOG_MISSING); 1670185029Spjd return (1); 1671185029Spjd } 1672185029Spjd break; 1673185029Spjd } 1674185029Spjd return (0); 1675185029Spjd} 1676185029Spjd 1677219089Spjdstatic boolean_t 1678219089Spjdspa_passivate_log(spa_t *spa) 1679219089Spjd{ 1680219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1681219089Spjd boolean_t slog_found = B_FALSE; 1682219089Spjd 1683219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1684219089Spjd 1685219089Spjd if (!spa_has_slogs(spa)) 1686219089Spjd return (B_FALSE); 1687219089Spjd 1688219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1689219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1690219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1691219089Spjd 1692219089Spjd if (tvd->vdev_islog) { 1693219089Spjd metaslab_group_passivate(mg); 1694219089Spjd slog_found = B_TRUE; 1695219089Spjd } 1696219089Spjd } 1697219089Spjd 1698219089Spjd return (slog_found); 1699219089Spjd} 1700219089Spjd 1701219089Spjdstatic void 1702219089Spjdspa_activate_log(spa_t *spa) 1703219089Spjd{ 1704219089Spjd vdev_t *rvd = spa->spa_root_vdev; 1705219089Spjd 1706219089Spjd ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1707219089Spjd 1708219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 1709219089Spjd vdev_t *tvd = rvd->vdev_child[c]; 1710219089Spjd metaslab_group_t *mg = tvd->vdev_mg; 1711219089Spjd 1712219089Spjd if 
(tvd->vdev_islog) 1713219089Spjd metaslab_group_activate(mg); 1714219089Spjd } 1715219089Spjd} 1716219089Spjd 1717219089Spjdint 1718219089Spjdspa_offline_log(spa_t *spa) 1719219089Spjd{ 1720219089Spjd int error = 0; 1721219089Spjd 1722219089Spjd if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1723219089Spjd NULL, DS_FIND_CHILDREN)) == 0) { 1724219089Spjd 1725219089Spjd /* 1726219089Spjd * We successfully offlined the log device, sync out the 1727219089Spjd * current txg so that the "stubby" block can be removed 1728219089Spjd * by zil_sync(). 1729219089Spjd */ 1730219089Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 1731219089Spjd } 1732219089Spjd return (error); 1733219089Spjd} 1734219089Spjd 1735219089Spjdstatic void 1736219089Spjdspa_aux_check_removed(spa_aux_vdev_t *sav) 1737219089Spjd{ 1738219089Spjd int i; 1739219089Spjd 1740219089Spjd for (i = 0; i < sav->sav_count; i++) 1741219089Spjd spa_check_removed(sav->sav_vdevs[i]); 1742219089Spjd} 1743219089Spjd 1744219089Spjdvoid 1745219089Spjdspa_claim_notify(zio_t *zio) 1746219089Spjd{ 1747219089Spjd spa_t *spa = zio->io_spa; 1748219089Spjd 1749219089Spjd if (zio->io_error) 1750219089Spjd return; 1751219089Spjd 1752219089Spjd mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1753219089Spjd if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1754219089Spjd spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1755219089Spjd mutex_exit(&spa->spa_props_lock); 1756219089Spjd} 1757219089Spjd 1758219089Spjdtypedef struct spa_load_error { 1759219089Spjd uint64_t sle_meta_count; 1760219089Spjd uint64_t sle_data_count; 1761219089Spjd} spa_load_error_t; 1762219089Spjd 1763219089Spjdstatic void 1764219089Spjdspa_load_verify_done(zio_t *zio) 1765219089Spjd{ 1766219089Spjd blkptr_t *bp = zio->io_bp; 1767219089Spjd spa_load_error_t *sle = zio->io_private; 1768219089Spjd dmu_object_type_t type = BP_GET_TYPE(bp); 1769219089Spjd int error = zio->io_error; 1770219089Spjd 1771219089Spjd if (error) { 1772236884Smm if 
((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 1773219089Spjd type != DMU_OT_INTENT_LOG) 1774219089Spjd atomic_add_64(&sle->sle_meta_count, 1); 1775219089Spjd else 1776219089Spjd atomic_add_64(&sle->sle_data_count, 1); 1777219089Spjd } 1778219089Spjd zio_data_buf_free(zio->io_data, zio->io_size); 1779219089Spjd} 1780219089Spjd 1781219089Spjd/*ARGSUSED*/ 1782219089Spjdstatic int 1783219089Spjdspa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1784246666Smm const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1785219089Spjd{ 1786219089Spjd if (bp != NULL) { 1787219089Spjd zio_t *rio = arg; 1788219089Spjd size_t size = BP_GET_PSIZE(bp); 1789219089Spjd void *data = zio_data_buf_alloc(size); 1790219089Spjd 1791219089Spjd zio_nowait(zio_read(rio, spa, bp, data, size, 1792219089Spjd spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1793219089Spjd ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1794219089Spjd ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1795219089Spjd } 1796219089Spjd return (0); 1797219089Spjd} 1798219089Spjd 1799219089Spjdstatic int 1800219089Spjdspa_load_verify(spa_t *spa) 1801219089Spjd{ 1802219089Spjd zio_t *rio; 1803219089Spjd spa_load_error_t sle = { 0 }; 1804219089Spjd zpool_rewind_policy_t policy; 1805219089Spjd boolean_t verify_ok = B_FALSE; 1806219089Spjd int error; 1807219089Spjd 1808219089Spjd zpool_get_rewind_policy(spa->spa_config, &policy); 1809219089Spjd 1810219089Spjd if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1811219089Spjd return (0); 1812219089Spjd 1813219089Spjd rio = zio_root(spa, NULL, &sle, 1814219089Spjd ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1815219089Spjd 1816219089Spjd error = traverse_pool(spa, spa->spa_verify_min_txg, 1817219089Spjd TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1818219089Spjd 1819219089Spjd (void) zio_wait(rio); 1820219089Spjd 1821219089Spjd spa->spa_load_meta_errors = sle.sle_meta_count; 1822219089Spjd spa->spa_load_data_errors = sle.sle_data_count; 1823219089Spjd 
1824219089Spjd if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1825219089Spjd sle.sle_data_count <= policy.zrp_maxdata) { 1826219089Spjd int64_t loss = 0; 1827219089Spjd 1828219089Spjd verify_ok = B_TRUE; 1829219089Spjd spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1830219089Spjd spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1831219089Spjd 1832219089Spjd loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1833219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1834219089Spjd ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1835219089Spjd VERIFY(nvlist_add_int64(spa->spa_load_info, 1836219089Spjd ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1837219089Spjd VERIFY(nvlist_add_uint64(spa->spa_load_info, 1838219089Spjd ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1839219089Spjd } else { 1840219089Spjd spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1841219089Spjd } 1842219089Spjd 1843219089Spjd if (error) { 1844219089Spjd if (error != ENXIO && error != EIO) 1845219089Spjd error = EIO; 1846219089Spjd return (error); 1847219089Spjd } 1848219089Spjd 1849219089Spjd return (verify_ok ? 0 : EIO); 1850219089Spjd} 1851219089Spjd 1852185029Spjd/* 1853219089Spjd * Find a value in the pool props object. 1854168404Spjd */ 1855219089Spjdstatic void 1856219089Spjdspa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1857219089Spjd{ 1858219089Spjd (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1859219089Spjd zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1860219089Spjd} 1861219089Spjd 1862219089Spjd/* 1863219089Spjd * Find a value in the pool directory object. 
1864219089Spjd */ 1865168404Spjdstatic int 1866219089Spjdspa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1867168404Spjd{ 1868219089Spjd return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1869219089Spjd name, sizeof (uint64_t), 1, val)); 1870219089Spjd} 1871168404Spjd 1872219089Spjdstatic int 1873219089Spjdspa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1874219089Spjd{ 1875219089Spjd vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1876219089Spjd return (err); 1877219089Spjd} 1878219089Spjd 1879219089Spjd/* 1880219089Spjd * Fix up config after a partly-completed split. This is done with the 1881219089Spjd * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1882219089Spjd * pool have that entry in their config, but only the splitting one contains 1883219089Spjd * a list of all the guids of the vdevs that are being split off. 1884219089Spjd * 1885219089Spjd * This function determines what to do with that list: either rejoin 1886219089Spjd * all the disks to the pool, or complete the splitting process. To attempt 1887219089Spjd * the rejoin, each disk that is offlined is marked online again, and 1888219089Spjd * we do a reopen() call. If the vdev label for every disk that was 1889219089Spjd * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1890219089Spjd * then we call vdev_split() on each disk, and complete the split. 1891219089Spjd * 1892219089Spjd * Otherwise we leave the config alone, with all the vdevs in place in 1893219089Spjd * the original pool. 
1894219089Spjd */ 1895219089Spjdstatic void 1896219089Spjdspa_try_repair(spa_t *spa, nvlist_t *config) 1897219089Spjd{ 1898219089Spjd uint_t extracted; 1899219089Spjd uint64_t *glist; 1900219089Spjd uint_t i, gcount; 1901219089Spjd nvlist_t *nvl; 1902219089Spjd vdev_t **vd; 1903219089Spjd boolean_t attempt_reopen; 1904219089Spjd 1905219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1906219089Spjd return; 1907219089Spjd 1908219089Spjd /* check that the config is complete */ 1909219089Spjd if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1910219089Spjd &glist, &gcount) != 0) 1911219089Spjd return; 1912219089Spjd 1913219089Spjd vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1914219089Spjd 1915219089Spjd /* attempt to online all the vdevs & validate */ 1916219089Spjd attempt_reopen = B_TRUE; 1917219089Spjd for (i = 0; i < gcount; i++) { 1918219089Spjd if (glist[i] == 0) /* vdev is hole */ 1919219089Spjd continue; 1920219089Spjd 1921219089Spjd vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1922219089Spjd if (vd[i] == NULL) { 1923219089Spjd /* 1924219089Spjd * Don't bother attempting to reopen the disks; 1925219089Spjd * just do the split. 
1926219089Spjd */ 1927219089Spjd attempt_reopen = B_FALSE; 1928219089Spjd } else { 1929219089Spjd /* attempt to re-online it */ 1930219089Spjd vd[i]->vdev_offline = B_FALSE; 1931219089Spjd } 1932219089Spjd } 1933219089Spjd 1934219089Spjd if (attempt_reopen) { 1935219089Spjd vdev_reopen(spa->spa_root_vdev); 1936219089Spjd 1937219089Spjd /* check each device to see what state it's in */ 1938219089Spjd for (extracted = 0, i = 0; i < gcount; i++) { 1939219089Spjd if (vd[i] != NULL && 1940219089Spjd vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1941219089Spjd break; 1942219089Spjd ++extracted; 1943219089Spjd } 1944219089Spjd } 1945219089Spjd 1946209962Smm /* 1947219089Spjd * If every disk has been moved to the new pool, or if we never 1948219089Spjd * even attempted to look at them, then we split them off for 1949219089Spjd * good. 1950209962Smm */ 1951219089Spjd if (!attempt_reopen || gcount == extracted) { 1952219089Spjd for (i = 0; i < gcount; i++) 1953219089Spjd if (vd[i] != NULL) 1954219089Spjd vdev_split(vd[i]); 1955219089Spjd vdev_reopen(spa->spa_root_vdev); 1956219089Spjd } 1957209962Smm 1958219089Spjd kmem_free(vd, gcount * sizeof (vdev_t *)); 1959219089Spjd} 1960185029Spjd 1961219089Spjdstatic int 1962219089Spjdspa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1963219089Spjd boolean_t mosconfig) 1964219089Spjd{ 1965219089Spjd nvlist_t *config = spa->spa_config; 1966219089Spjd char *ereport = FM_EREPORT_ZFS_POOL; 1967228103Smm char *comment; 1968219089Spjd int error; 1969219089Spjd uint64_t pool_guid; 1970219089Spjd nvlist_t *nvl; 1971168404Spjd 1972219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1973219089Spjd return (EINVAL); 1974168404Spjd 1975228103Smm ASSERT(spa->spa_comment == NULL); 1976228103Smm if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1977228103Smm spa->spa_comment = spa_strdup(comment); 1978228103Smm 1979168404Spjd /* 1980168404Spjd * Versioning wasn't explicitly added 
to the label until later, so if 1981168404Spjd * it's not present treat it as the initial version. 1982168404Spjd */ 1983219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1984219089Spjd &spa->spa_ubsync.ub_version) != 0) 1985219089Spjd spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1986168404Spjd 1987168404Spjd (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1988168404Spjd &spa->spa_config_txg); 1989168404Spjd 1990168404Spjd if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1991168404Spjd spa_guid_exists(pool_guid, 0)) { 1992168404Spjd error = EEXIST; 1993219089Spjd } else { 1994228103Smm spa->spa_config_guid = pool_guid; 1995219089Spjd 1996219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1997219089Spjd &nvl) == 0) { 1998219089Spjd VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1999219089Spjd KM_SLEEP) == 0); 2000219089Spjd } 2001219089Spjd 2002236884Smm nvlist_free(spa->spa_load_info); 2003236884Smm spa->spa_load_info = fnvlist_alloc(); 2004236884Smm 2005219089Spjd gethrestime(&spa->spa_loaded_ts); 2006219089Spjd error = spa_load_impl(spa, pool_guid, config, state, type, 2007219089Spjd mosconfig, &ereport); 2008168404Spjd } 2009168404Spjd 2010219089Spjd spa->spa_minref = refcount_count(&spa->spa_refcount); 2011219089Spjd if (error) { 2012219089Spjd if (error != EEXIST) { 2013219089Spjd spa->spa_loaded_ts.tv_sec = 0; 2014219089Spjd spa->spa_loaded_ts.tv_nsec = 0; 2015219089Spjd } 2016219089Spjd if (error != EBADF) { 2017219089Spjd zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2018219089Spjd } 2019219089Spjd } 2020219089Spjd spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2021219089Spjd spa->spa_ena = 0; 2022168404Spjd 2023219089Spjd return (error); 2024219089Spjd} 2025219089Spjd 2026219089Spjd/* 2027219089Spjd * Load an existing storage pool, using the pool's builtin spa_config as a 2028219089Spjd * source of configuration information. 
2029219089Spjd */ 2030219089Spjdstatic int 2031219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2032219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2033219089Spjd char **ereport) 2034219089Spjd{ 2035219089Spjd int error = 0; 2036219089Spjd nvlist_t *nvroot = NULL; 2037236884Smm nvlist_t *label; 2038219089Spjd vdev_t *rvd; 2039219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2040219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2041219089Spjd int orig_mode = spa->spa_mode; 2042219089Spjd int parse; 2043219089Spjd uint64_t obj; 2044236884Smm boolean_t missing_feat_write = B_FALSE; 2045219089Spjd 2046168404Spjd /* 2047219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2048219089Spjd * This prevents things like resilvering recently removed devices. 2049219089Spjd */ 2050219089Spjd if (!mosconfig) 2051219089Spjd spa->spa_mode = FREAD; 2052219089Spjd 2053219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2054219089Spjd 2055219089Spjd spa->spa_load_state = state; 2056219089Spjd 2057219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2058219089Spjd return (EINVAL); 2059219089Spjd 2060219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2061219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2062219089Spjd 2063219089Spjd /* 2064209962Smm * Create "The Godfather" zio to hold all async IOs 2065209962Smm */ 2066209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2067209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2068209962Smm 2069209962Smm /* 2070168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2071168404Spjd * value that will be returned by spa_version() since parsing the 2072168404Spjd * configuration requires knowing the version number. 
2073168404Spjd */ 2074185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2075219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2076185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2077168404Spjd 2078168404Spjd if (error != 0) 2079219089Spjd return (error); 2080168404Spjd 2081168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2082168404Spjd 2083219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2084219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2085219089Spjd } 2086219089Spjd 2087168404Spjd /* 2088168404Spjd * Try to open all vdevs, loading each label in the process. 2089168404Spjd */ 2090185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2091168926Spjd error = vdev_open(rvd); 2092185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2093168926Spjd if (error != 0) 2094219089Spjd return (error); 2095168404Spjd 2096168404Spjd /* 2097209962Smm * We need to validate the vdev labels against the configuration that 2098209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2099209962Smm * mosconfig is true then we're validating the vdev labels based on 2100219089Spjd * that config. Otherwise, we're validating against the cached config 2101209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2102209962Smm * later we will recursively call spa_load() and validate against 2103209962Smm * the vdev config. 2104219089Spjd * 2105219089Spjd * If we're assembling a new pool that's been split off from an 2106219089Spjd * existing pool, the labels haven't yet been updated so we skip 2107219089Spjd * validation for now. 
2108168404Spjd */ 2109219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2110219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2111230514Smm error = vdev_validate(rvd, mosconfig); 2112219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2113168404Spjd 2114219089Spjd if (error != 0) 2115219089Spjd return (error); 2116219089Spjd 2117219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2118219089Spjd return (ENXIO); 2119168404Spjd } 2120168404Spjd 2121168404Spjd /* 2122168404Spjd * Find the best uberblock. 2123168404Spjd */ 2124236884Smm vdev_uberblock_load(rvd, ub, &label); 2125168404Spjd 2126168404Spjd /* 2127168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2128168404Spjd */ 2129236884Smm if (ub->ub_txg == 0) { 2130236884Smm nvlist_free(label); 2131219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2132236884Smm } 2133168404Spjd 2134168404Spjd /* 2135236884Smm * If the pool has an unsupported version we can't open it. 2136168404Spjd */ 2137236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2138236884Smm nvlist_free(label); 2139219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2140236884Smm } 2141168404Spjd 2142236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2143236884Smm nvlist_t *features; 2144236884Smm 2145236884Smm /* 2146236884Smm * If we weren't able to find what's necessary for reading the 2147236884Smm * MOS in the label, return failure. 2148236884Smm */ 2149236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2150236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2151236884Smm nvlist_free(label); 2152236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2153236884Smm ENXIO)); 2154236884Smm } 2155236884Smm 2156236884Smm /* 2157236884Smm * Update our in-core representation with the definitive values 2158236884Smm * from the label. 
2159236884Smm */ 2160236884Smm nvlist_free(spa->spa_label_features); 2161236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2162236884Smm } 2163236884Smm 2164236884Smm nvlist_free(label); 2165236884Smm 2166168404Spjd /* 2167236884Smm * Look through entries in the label nvlist's features_for_read. If 2168236884Smm * there is a feature listed there which we don't understand then we 2169236884Smm * cannot open a pool. 2170236884Smm */ 2171236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2172236884Smm nvlist_t *unsup_feat; 2173236884Smm 2174236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2175236884Smm 0); 2176236884Smm 2177236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2178236884Smm NULL); nvp != NULL; 2179236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2180236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2181236884Smm VERIFY(nvlist_add_string(unsup_feat, 2182236884Smm nvpair_name(nvp), "") == 0); 2183236884Smm } 2184236884Smm } 2185236884Smm 2186236884Smm if (!nvlist_empty(unsup_feat)) { 2187236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2188236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2189236884Smm nvlist_free(unsup_feat); 2190236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2191236884Smm ENOTSUP)); 2192236884Smm } 2193236884Smm 2194236884Smm nvlist_free(unsup_feat); 2195236884Smm } 2196236884Smm 2197236884Smm /* 2198168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2199219089Spjd * incomplete configuration. We first check to see if the pool 2200219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2201219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2202219089Spjd * can handle missing vdevs. 
2203168404Spjd */ 2204219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2205219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2206219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2207219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2208219089Spjd 2209219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2210219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2211219089Spjd spa_try_repair(spa, config); 2212219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2213219089Spjd nvlist_free(spa->spa_config_splitting); 2214219089Spjd spa->spa_config_splitting = NULL; 2215168404Spjd } 2216168404Spjd 2217168404Spjd /* 2218168404Spjd * Initialize internal SPA structures. 2219168404Spjd */ 2220168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2221168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2222219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2223219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2224219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2225219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2226219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2227219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2228219089Spjd 2229236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2230219089Spjd if (error) 2231219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2232168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2233168404Spjd 2234219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2235219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2236168404Spjd 2237236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2238236884Smm boolean_t missing_feat_read = B_FALSE; 2239238926Smm nvlist_t *unsup_feat, *enabled_feat; 2240236884Smm 2241236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2242236884Smm &spa->spa_feat_for_read_obj) != 0) { 2243236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2244236884Smm } 2245236884Smm 2246236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2247236884Smm &spa->spa_feat_for_write_obj) != 0) { 2248236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2249236884Smm } 2250236884Smm 2251236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2252236884Smm &spa->spa_feat_desc_obj) != 0) { 2253236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2254236884Smm } 2255236884Smm 2256238926Smm enabled_feat = fnvlist_alloc(); 2257238926Smm unsup_feat = fnvlist_alloc(); 2258236884Smm 2259236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2260236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2261238926Smm unsup_feat, enabled_feat)) 2262236884Smm missing_feat_read = B_TRUE; 2263236884Smm 2264236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2265236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2266236884Smm spa->spa_feat_for_write_obj, 
spa->spa_feat_desc_obj, 2267238926Smm unsup_feat, enabled_feat)) { 2268236884Smm missing_feat_write = B_TRUE; 2269238926Smm } 2270236884Smm } 2271236884Smm 2272238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2273238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2274238926Smm 2275236884Smm if (!nvlist_empty(unsup_feat)) { 2276238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2277238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2278236884Smm } 2279236884Smm 2280238926Smm fnvlist_free(enabled_feat); 2281238926Smm fnvlist_free(unsup_feat); 2282236884Smm 2283236884Smm if (!missing_feat_read) { 2284236884Smm fnvlist_add_boolean(spa->spa_load_info, 2285236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2286236884Smm } 2287236884Smm 2288236884Smm /* 2289236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2290236884Smm * twofold: to determine whether the pool is available for 2291236884Smm * import in read-write mode and (if it is not) whether the 2292236884Smm * pool is available for import in read-only mode. If the pool 2293236884Smm * is available for import in read-write mode, it is displayed 2294236884Smm * as available in userland; if it is not available for import 2295236884Smm * in read-only mode, it is displayed as unavailable in 2296236884Smm * userland. If the pool is available for import in read-only 2297236884Smm * mode but not read-write mode, it is displayed as unavailable 2298236884Smm * in userland with a special note that the pool is actually 2299236884Smm * available for open in read-only mode. 2300236884Smm * 2301236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2302236884Smm * missing a feature for write, we must first determine whether 2303236884Smm * the pool can be opened read-only before returning to 2304236884Smm * userland in order to know whether to display the 2305236884Smm * abovementioned note. 
2306236884Smm */ 2307236884Smm if (missing_feat_read || (missing_feat_write && 2308236884Smm spa_writeable(spa))) { 2309236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2310236884Smm ENOTSUP)); 2311236884Smm } 2312236884Smm } 2313236884Smm 2314236884Smm spa->spa_is_initializing = B_TRUE; 2315236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2316236884Smm spa->spa_is_initializing = B_FALSE; 2317236884Smm if (error != 0) 2318236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2319236884Smm 2320168404Spjd if (!mosconfig) { 2321168498Spjd uint64_t hostid; 2322219089Spjd nvlist_t *policy = NULL, *nvconfig; 2323168404Spjd 2324219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2325219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2326168404Spjd 2327219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2328185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2329168498Spjd char *hostname; 2330168498Spjd unsigned long myhostid = 0; 2331168498Spjd 2332219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2333168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2334168498Spjd 2335219089Spjd#ifdef _KERNEL 2336219089Spjd myhostid = zone_get_hostid(NULL); 2337219089Spjd#else /* _KERNEL */ 2338219089Spjd /* 2339219089Spjd * We're emulating the system's hostid in userland, so 2340219089Spjd * we can't use zone_get_hostid(). 2341219089Spjd */ 2342168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2343219089Spjd#endif /* _KERNEL */ 2344204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2345219089Spjd hostid != myhostid) { 2346219089Spjd nvlist_free(nvconfig); 2347168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2348168498Spjd "loaded as it was last accessed by " 2349185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2350236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2351185029Spjd spa_name(spa), hostname, 2352168498Spjd (unsigned long)hostid); 2353219089Spjd return (EBADF); 2354168498Spjd } 2355168498Spjd } 2356219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2357219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2358219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2359219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2360168498Spjd 2361219089Spjd spa_config_set(spa, nvconfig); 2362168404Spjd spa_unload(spa); 2363168404Spjd spa_deactivate(spa); 2364209962Smm spa_activate(spa, orig_mode); 2365168404Spjd 2366219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2367168404Spjd } 2368168404Spjd 2369219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2370219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2371219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2372219089Spjd if (error != 0) 2373219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2374168404Spjd 2375168404Spjd /* 2376168404Spjd * Load the bit that tells us to use the new accounting function 2377168404Spjd * (raid-z deflation). If we have an older pool, this will not 2378168404Spjd * be present. 2379168404Spjd */ 2380219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2381219089Spjd if (error != 0 && error != ENOENT) 2382219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2383168404Spjd 2384219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2385219089Spjd &spa->spa_creation_version); 2386219089Spjd if (error != 0 && error != ENOENT) 2387219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2388219089Spjd 2389168404Spjd /* 2390168404Spjd * Load the persistent error log. If we have an older pool, this will 2391168404Spjd * not be present. 
2392168404Spjd */ 2393219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2394219089Spjd if (error != 0 && error != ENOENT) 2395219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2396168404Spjd 2397219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2398219089Spjd &spa->spa_errlog_scrub); 2399219089Spjd if (error != 0 && error != ENOENT) 2400219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2401168404Spjd 2402168404Spjd /* 2403168404Spjd * Load the history object. If we have an older pool, this 2404168404Spjd * will not be present. 2405168404Spjd */ 2406219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2407219089Spjd if (error != 0 && error != ENOENT) 2408219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2409168404Spjd 2410168404Spjd /* 2411219089Spjd * If we're assembling the pool from the split-off vdevs of 2412219089Spjd * an existing pool, we don't want to attach the spares & cache 2413219089Spjd * devices. 2414219089Spjd */ 2415219089Spjd 2416219089Spjd /* 2417168404Spjd * Load any hot spares for this pool. 
2418168404Spjd */ 2419219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2420219089Spjd if (error != 0 && error != ENOENT) 2421219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2422219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2423185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2424185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2425219089Spjd &spa->spa_spares.sav_config) != 0) 2426219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2427168404Spjd 2428185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2429168404Spjd spa_load_spares(spa); 2430185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2431219089Spjd } else if (error == 0) { 2432219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2433168404Spjd } 2434168404Spjd 2435185029Spjd /* 2436185029Spjd * Load any level 2 ARC devices for this pool. 2437185029Spjd */ 2438219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2439185029Spjd &spa->spa_l2cache.sav_object); 2440219089Spjd if (error != 0 && error != ENOENT) 2441219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2442219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2443185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2444185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2445219089Spjd &spa->spa_l2cache.sav_config) != 0) 2446219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2447185029Spjd 2448185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2449185029Spjd spa_load_l2cache(spa); 2450185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2451219089Spjd } else if (error == 0) { 2452219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2453185029Spjd } 2454185029Spjd 2455219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2456213197Smm 2457219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2458219089Spjd if (error && error != ENOENT) 
2459219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2460185029Spjd 2461219089Spjd if (error == 0) { 2462219089Spjd uint64_t autoreplace; 2463185029Spjd 2464219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2465219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2466219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2467219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2468219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2469219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2470219089Spjd &spa->spa_dedup_ditto); 2471185029Spjd 2472219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2473168404Spjd } 2474168404Spjd 2475168404Spjd /* 2476185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2477185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2478185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2479185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2480185029Spjd * over. 2481185029Spjd */ 2482219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2483185029Spjd spa_check_removed(spa->spa_root_vdev); 2484219089Spjd /* 2485219089Spjd * For the import case, this is done in spa_import(), because 2486219089Spjd * at this point we're using the spare definitions from 2487219089Spjd * the MOS config, not necessarily from the userland config. 2488219089Spjd */ 2489219089Spjd if (state != SPA_LOAD_IMPORT) { 2490219089Spjd spa_aux_check_removed(&spa->spa_spares); 2491219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2492219089Spjd } 2493219089Spjd } 2494185029Spjd 2495185029Spjd /* 2496168404Spjd * Load the vdev state for all toplevel vdevs. 2497168404Spjd */ 2498168404Spjd vdev_load(rvd); 2499168404Spjd 2500168404Spjd /* 2501168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2502168404Spjd */ 2503185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2504168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2505185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2506168404Spjd 2507168404Spjd /* 2508219089Spjd * Load the DDTs (dedup tables). 2509168404Spjd */ 2510219089Spjd error = ddt_load(spa); 2511219089Spjd if (error != 0) 2512219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2513219089Spjd 2514219089Spjd spa_update_dspace(spa); 2515219089Spjd 2516219089Spjd /* 2517219089Spjd * Validate the config, using the MOS config to fill in any 2518219089Spjd * information which might be missing. If we fail to validate 2519219089Spjd * the config then declare the pool unfit for use. If we're 2520219089Spjd * assembling a pool from a split, the log is not transferred 2521219089Spjd * over. 2522219089Spjd */ 2523219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2524219089Spjd nvlist_t *nvconfig; 2525219089Spjd 2526219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2527219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2528219089Spjd 2529219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2530219089Spjd nvlist_free(nvconfig); 2531219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2532219089Spjd ENXIO)); 2533219089Spjd } 2534219089Spjd nvlist_free(nvconfig); 2535219089Spjd 2536219089Spjd /* 2537236884Smm * Now that we've validated the config, check the state of the 2538219089Spjd * root vdev. If it can't be opened, it indicates one or 2539219089Spjd * more toplevel vdevs are faulted. 
2540219089Spjd */ 2541219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2542219089Spjd return (ENXIO); 2543219089Spjd 2544219089Spjd if (spa_check_logs(spa)) { 2545219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2546219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2547219089Spjd } 2548168404Spjd } 2549168404Spjd 2550236884Smm if (missing_feat_write) { 2551236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2552236884Smm 2553236884Smm /* 2554236884Smm * At this point, we know that we can open the pool in 2555236884Smm * read-only mode but not read-write mode. We now have enough 2556236884Smm * information and can return to userland. 2557236884Smm */ 2558236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2559236884Smm } 2560236884Smm 2561219089Spjd /* 2562219089Spjd * We've successfully opened the pool, verify that we're ready 2563219089Spjd * to start pushing transactions. 2564219089Spjd */ 2565219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2566219089Spjd if (error = spa_load_verify(spa)) 2567219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2568219089Spjd error)); 2569219089Spjd } 2570219089Spjd 2571219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2572219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2573168404Spjd dmu_tx_t *tx; 2574168404Spjd int need_update = B_FALSE; 2575168404Spjd 2576209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2577209962Smm 2578168404Spjd /* 2579168404Spjd * Claim log blocks that haven't been committed yet. 2580168404Spjd * This must all happen in a single txg. 2581219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2582219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2583219089Spjd * Price of rollback is that we abandon the log. 
2584168404Spjd */ 2585219089Spjd spa->spa_claiming = B_TRUE; 2586219089Spjd 2587168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2588168404Spjd spa_first_txg(spa)); 2589185029Spjd (void) dmu_objset_find(spa_name(spa), 2590168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2591168404Spjd dmu_tx_commit(tx); 2592168404Spjd 2593219089Spjd spa->spa_claiming = B_FALSE; 2594219089Spjd 2595219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2596168404Spjd spa->spa_sync_on = B_TRUE; 2597168404Spjd txg_sync_start(spa->spa_dsl_pool); 2598168404Spjd 2599168404Spjd /* 2600219089Spjd * Wait for all claims to sync. We sync up to the highest 2601219089Spjd * claimed log block birth time so that claimed log blocks 2602219089Spjd * don't appear to be from the future. spa_claim_max_txg 2603219089Spjd * will have been set for us by either zil_check_log_chain() 2604219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2605168404Spjd */ 2606219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2607168404Spjd 2608168404Spjd /* 2609168404Spjd * If the config cache is stale, or we have uninitialized 2610168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2611209962Smm * 2612219089Spjd * If this is a verbatim import, trust the current 2613209962Smm * in-core spa_config and update the disk labels. 2614168404Spjd */ 2615168404Spjd if (config_cache_txg != spa->spa_config_txg || 2616219089Spjd state == SPA_LOAD_IMPORT || 2617219089Spjd state == SPA_LOAD_RECOVER || 2618219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2619168404Spjd need_update = B_TRUE; 2620168404Spjd 2621209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2622168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2623168404Spjd need_update = B_TRUE; 2624168404Spjd 2625168404Spjd /* 2626168404Spjd * Update the config cache asychronously in case we're the 2627168404Spjd * root pool, in which case the config cache isn't writable yet. 
2628168404Spjd */ 2629168404Spjd if (need_update) 2630168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2631208683Spjd 2632208683Spjd /* 2633208683Spjd * Check all DTLs to see if anything needs resilvering. 2634208683Spjd */ 2635219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2636219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2637208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2638219089Spjd 2639219089Spjd /* 2640219089Spjd * Delete any inconsistent datasets. 2641219089Spjd */ 2642219089Spjd (void) dmu_objset_find(spa_name(spa), 2643219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2644219089Spjd 2645219089Spjd /* 2646219089Spjd * Clean up any stale temporary dataset userrefs. 2647219089Spjd */ 2648219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2649168404Spjd } 2650168404Spjd 2651219089Spjd return (0); 2652219089Spjd} 2653168404Spjd 2654219089Spjdstatic int 2655219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2656219089Spjd{ 2657219089Spjd int mode = spa->spa_mode; 2658219089Spjd 2659219089Spjd spa_unload(spa); 2660219089Spjd spa_deactivate(spa); 2661219089Spjd 2662219089Spjd spa->spa_load_max_txg--; 2663219089Spjd 2664219089Spjd spa_activate(spa, mode); 2665219089Spjd spa_async_suspend(spa); 2666219089Spjd 2667219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2668168404Spjd} 2669168404Spjd 2670236884Smm/* 2671236884Smm * If spa_load() fails this function will try loading prior txg's. If 2672236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2673236884Smm * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2674236884Smm * function will not rewind the pool and will return the same error as 2675236884Smm * spa_load(). 
2676236884Smm */ 2677219089Spjdstatic int 2678219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2679219089Spjd uint64_t max_request, int rewind_flags) 2680219089Spjd{ 2681236884Smm nvlist_t *loadinfo = NULL; 2682219089Spjd nvlist_t *config = NULL; 2683219089Spjd int load_error, rewind_error; 2684219089Spjd uint64_t safe_rewind_txg; 2685219089Spjd uint64_t min_txg; 2686219089Spjd 2687219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2688219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2689219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2690219089Spjd } else { 2691219089Spjd spa->spa_load_max_txg = max_request; 2692219089Spjd } 2693219089Spjd 2694219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2695219089Spjd mosconfig); 2696219089Spjd if (load_error == 0) 2697219089Spjd return (0); 2698219089Spjd 2699219089Spjd if (spa->spa_root_vdev != NULL) 2700219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2701219089Spjd 2702219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2703219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2704219089Spjd 2705219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2706219089Spjd nvlist_free(config); 2707219089Spjd return (load_error); 2708219089Spjd } 2709219089Spjd 2710236884Smm if (state == SPA_LOAD_RECOVER) { 2711236884Smm /* Price of rolling back is discarding txgs, including log */ 2712219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2713236884Smm } else { 2714236884Smm /* 2715236884Smm * If we aren't rolling back save the load info from our first 2716236884Smm * import attempt so that we can restore it after attempting 2717236884Smm * to rewind. 
2718236884Smm */ 2719236884Smm loadinfo = spa->spa_load_info; 2720236884Smm spa->spa_load_info = fnvlist_alloc(); 2721236884Smm } 2722219089Spjd 2723219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2724219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2725219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2726219089Spjd TXG_INITIAL : safe_rewind_txg; 2727219089Spjd 2728219089Spjd /* 2729219089Spjd * Continue as long as we're finding errors, we're still within 2730219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2731219089Spjd */ 2732219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2733219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2734219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2735219089Spjd spa->spa_extreme_rewind = B_TRUE; 2736219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2737219089Spjd } 2738219089Spjd 2739219089Spjd spa->spa_extreme_rewind = B_FALSE; 2740219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2741219089Spjd 2742219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2743219089Spjd spa_config_set(spa, config); 2744219089Spjd 2745236884Smm if (state == SPA_LOAD_RECOVER) { 2746236884Smm ASSERT3P(loadinfo, ==, NULL); 2747236884Smm return (rewind_error); 2748236884Smm } else { 2749236884Smm /* Store the rewind info as part of the initial load info */ 2750236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2751236884Smm spa->spa_load_info); 2752236884Smm 2753236884Smm /* Restore the initial load info */ 2754236884Smm fnvlist_free(spa->spa_load_info); 2755236884Smm spa->spa_load_info = loadinfo; 2756236884Smm 2757236884Smm return (load_error); 2758236884Smm } 2759219089Spjd} 2760219089Spjd 2761168404Spjd/* 2762168404Spjd * Pool Open/Import 2763168404Spjd * 2764168404Spjd * The import case is identical to an open except that the configuration is sent 2765168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2766168404Spjd * case of an open, the pool configuration will exist in the 2767185029Spjd * POOL_STATE_UNINITIALIZED state. 2768168404Spjd * 2769168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2770168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2771168404Spjd * ambiguous state. 2772168404Spjd */ 2773168404Spjdstatic int 2774219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2775219089Spjd nvlist_t **config) 2776168404Spjd{ 2777168404Spjd spa_t *spa; 2778219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2779168404Spjd int error; 2780168404Spjd int locked = B_FALSE; 2781219089Spjd int firstopen = B_FALSE; 2782168404Spjd 2783168404Spjd *spapp = NULL; 2784168404Spjd 2785168404Spjd /* 2786168404Spjd * As disgusting as this is, we need to support recursive calls to this 2787168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2788168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2789168404Spjd * avoid dsl_dir_open() calling this in the first place. 2790168404Spjd */ 2791168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2792168404Spjd mutex_enter(&spa_namespace_lock); 2793168404Spjd locked = B_TRUE; 2794168404Spjd } 2795168404Spjd 2796168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2797168404Spjd if (locked) 2798168404Spjd mutex_exit(&spa_namespace_lock); 2799168404Spjd return (ENOENT); 2800168404Spjd } 2801219089Spjd 2802168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2803219089Spjd zpool_rewind_policy_t policy; 2804168404Spjd 2805219089Spjd firstopen = B_TRUE; 2806219089Spjd 2807219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2808219089Spjd &policy); 2809219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2810219089Spjd state = SPA_LOAD_RECOVER; 2811219089Spjd 2812209962Smm spa_activate(spa, spa_mode_global); 2813168404Spjd 2814219089Spjd if (state != SPA_LOAD_RECOVER) 2815219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2816168404Spjd 2817219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2818219089Spjd policy.zrp_request); 2819219089Spjd 2820168404Spjd if (error == EBADF) { 2821168404Spjd /* 2822168404Spjd * If vdev_validate() returns failure (indicated by 2823168404Spjd * EBADF), it indicates that one of the vdevs indicates 2824168404Spjd * that the pool has been exported or destroyed. If 2825168404Spjd * this is the case, the config cache is out of sync and 2826168404Spjd * we should remove the pool from the namespace. 2827168404Spjd */ 2828168404Spjd spa_unload(spa); 2829168404Spjd spa_deactivate(spa); 2830185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2831168404Spjd spa_remove(spa); 2832168404Spjd if (locked) 2833168404Spjd mutex_exit(&spa_namespace_lock); 2834168404Spjd return (ENOENT); 2835168404Spjd } 2836168404Spjd 2837168404Spjd if (error) { 2838168404Spjd /* 2839168404Spjd * We can't open the pool, but we still have useful 2840168404Spjd * information: the state of each vdev after the 2841168404Spjd * attempted vdev_open(). Return this to the user. 
2842168404Spjd */ 2843219089Spjd if (config != NULL && spa->spa_config) { 2844219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2845219089Spjd KM_SLEEP) == 0); 2846219089Spjd VERIFY(nvlist_add_nvlist(*config, 2847219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2848219089Spjd spa->spa_load_info) == 0); 2849219089Spjd } 2850168404Spjd spa_unload(spa); 2851168404Spjd spa_deactivate(spa); 2852219089Spjd spa->spa_last_open_failed = error; 2853168404Spjd if (locked) 2854168404Spjd mutex_exit(&spa_namespace_lock); 2855168404Spjd *spapp = NULL; 2856168404Spjd return (error); 2857168404Spjd } 2858168404Spjd } 2859168404Spjd 2860168404Spjd spa_open_ref(spa, tag); 2861185029Spjd 2862219089Spjd if (config != NULL) 2863219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2864219089Spjd 2865219089Spjd /* 2866219089Spjd * If we've recovered the pool, pass back any information we 2867219089Spjd * gathered while doing the load. 2868219089Spjd */ 2869219089Spjd if (state == SPA_LOAD_RECOVER) { 2870219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2871219089Spjd spa->spa_load_info) == 0); 2872219089Spjd } 2873219089Spjd 2874219089Spjd if (locked) { 2875219089Spjd spa->spa_last_open_failed = 0; 2876219089Spjd spa->spa_last_ubsync_txg = 0; 2877219089Spjd spa->spa_load_txg = 0; 2878168404Spjd mutex_exit(&spa_namespace_lock); 2879219089Spjd#ifdef __FreeBSD__ 2880219089Spjd#ifdef _KERNEL 2881219089Spjd if (firstopen) 2882219089Spjd zvol_create_minors(pool); 2883219089Spjd#endif 2884219089Spjd#endif 2885219089Spjd } 2886168404Spjd 2887168404Spjd *spapp = spa; 2888168404Spjd 2889168404Spjd return (0); 2890168404Spjd} 2891168404Spjd 2892168404Spjdint 2893219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2894219089Spjd nvlist_t **config) 2895219089Spjd{ 2896219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2897219089Spjd} 2898219089Spjd 2899219089Spjdint 2900168404Spjdspa_open(const char *name, spa_t **spapp, 
void *tag) 2901168404Spjd{ 2902219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2903168404Spjd} 2904168404Spjd 2905168404Spjd/* 2906168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 2907168404Spjd * preventing it from being exported or destroyed. 2908168404Spjd */ 2909168404Spjdspa_t * 2910168404Spjdspa_inject_addref(char *name) 2911168404Spjd{ 2912168404Spjd spa_t *spa; 2913168404Spjd 2914168404Spjd mutex_enter(&spa_namespace_lock); 2915168404Spjd if ((spa = spa_lookup(name)) == NULL) { 2916168404Spjd mutex_exit(&spa_namespace_lock); 2917168404Spjd return (NULL); 2918168404Spjd } 2919168404Spjd spa->spa_inject_ref++; 2920168404Spjd mutex_exit(&spa_namespace_lock); 2921168404Spjd 2922168404Spjd return (spa); 2923168404Spjd} 2924168404Spjd 2925168404Spjdvoid 2926168404Spjdspa_inject_delref(spa_t *spa) 2927168404Spjd{ 2928168404Spjd mutex_enter(&spa_namespace_lock); 2929168404Spjd spa->spa_inject_ref--; 2930168404Spjd mutex_exit(&spa_namespace_lock); 2931168404Spjd} 2932168404Spjd 2933185029Spjd/* 2934185029Spjd * Add spares device information to the nvlist. 
2935185029Spjd */ 2936168404Spjdstatic void 2937168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 2938168404Spjd{ 2939168404Spjd nvlist_t **spares; 2940168404Spjd uint_t i, nspares; 2941168404Spjd nvlist_t *nvroot; 2942168404Spjd uint64_t guid; 2943168404Spjd vdev_stat_t *vs; 2944168404Spjd uint_t vsc; 2945168404Spjd uint64_t pool; 2946168404Spjd 2947209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2948209962Smm 2949185029Spjd if (spa->spa_spares.sav_count == 0) 2950168404Spjd return; 2951168404Spjd 2952168404Spjd VERIFY(nvlist_lookup_nvlist(config, 2953168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2954185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2955168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2956168404Spjd if (nspares != 0) { 2957168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2958168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2959168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2960168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2961168404Spjd 2962168404Spjd /* 2963168404Spjd * Go through and find any spares which have since been 2964168404Spjd * repurposed as an active spare. If this is the case, update 2965168404Spjd * their status appropriately. 2966168404Spjd */ 2967168404Spjd for (i = 0; i < nspares; i++) { 2968168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2969168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2970185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 2971185029Spjd pool != 0ULL) { 2972168404Spjd VERIFY(nvlist_lookup_uint64_array( 2973219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 2974168404Spjd (uint64_t **)&vs, &vsc) == 0); 2975168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 2976168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 2977168404Spjd } 2978168404Spjd } 2979168404Spjd } 2980168404Spjd} 2981168404Spjd 2982185029Spjd/* 2983185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
2984185029Spjd */ 2985185029Spjdstatic void 2986185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 2987185029Spjd{ 2988185029Spjd nvlist_t **l2cache; 2989185029Spjd uint_t i, j, nl2cache; 2990185029Spjd nvlist_t *nvroot; 2991185029Spjd uint64_t guid; 2992185029Spjd vdev_t *vd; 2993185029Spjd vdev_stat_t *vs; 2994185029Spjd uint_t vsc; 2995185029Spjd 2996209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2997209962Smm 2998185029Spjd if (spa->spa_l2cache.sav_count == 0) 2999185029Spjd return; 3000185029Spjd 3001185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3002185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3003185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3004185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3005185029Spjd if (nl2cache != 0) { 3006185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3007185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3008185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3009185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3010185029Spjd 3011185029Spjd /* 3012185029Spjd * Update level 2 cache device stats. 
3013185029Spjd */ 3014185029Spjd 3015185029Spjd for (i = 0; i < nl2cache; i++) { 3016185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3017185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3018185029Spjd 3019185029Spjd vd = NULL; 3020185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3021185029Spjd if (guid == 3022185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3023185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3024185029Spjd break; 3025185029Spjd } 3026185029Spjd } 3027185029Spjd ASSERT(vd != NULL); 3028185029Spjd 3029185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3030219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3031219089Spjd == 0); 3032185029Spjd vdev_get_stats(vd, vs); 3033185029Spjd } 3034185029Spjd } 3035185029Spjd} 3036185029Spjd 3037236884Smmstatic void 3038236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3039236884Smm{ 3040236884Smm nvlist_t *features; 3041236884Smm zap_cursor_t zc; 3042236884Smm zap_attribute_t za; 3043236884Smm 3044236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3045236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3046236884Smm 3047236884Smm if (spa->spa_feat_for_read_obj != 0) { 3048236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3049236884Smm spa->spa_feat_for_read_obj); 3050236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3051236884Smm zap_cursor_advance(&zc)) { 3052236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3053236884Smm za.za_num_integers == 1); 3054236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3055236884Smm za.za_first_integer)); 3056236884Smm } 3057236884Smm zap_cursor_fini(&zc); 3058236884Smm } 3059236884Smm 3060236884Smm if (spa->spa_feat_for_write_obj != 0) { 3061236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3062236884Smm spa->spa_feat_for_write_obj); 3063236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3064236884Smm zap_cursor_advance(&zc)) { 3065236884Smm ASSERT(za.za_integer_length 
== sizeof (uint64_t) && 3066236884Smm za.za_num_integers == 1); 3067236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3068236884Smm za.za_first_integer)); 3069236884Smm } 3070236884Smm zap_cursor_fini(&zc); 3071236884Smm } 3072236884Smm 3073236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3074236884Smm features) == 0); 3075236884Smm nvlist_free(features); 3076236884Smm} 3077236884Smm 3078168404Spjdint 3079236884Smmspa_get_stats(const char *name, nvlist_t **config, 3080236884Smm char *altroot, size_t buflen) 3081168404Spjd{ 3082168404Spjd int error; 3083168404Spjd spa_t *spa; 3084168404Spjd 3085168404Spjd *config = NULL; 3086219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3087168404Spjd 3088209962Smm if (spa != NULL) { 3089209962Smm /* 3090209962Smm * This still leaves a window of inconsistency where the spares 3091209962Smm * or l2cache devices could change and the config would be 3092209962Smm * self-inconsistent. 3093209962Smm */ 3094209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3095168404Spjd 3096209962Smm if (*config != NULL) { 3097219089Spjd uint64_t loadtimes[2]; 3098219089Spjd 3099219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3100219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3101219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3102219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3103219089Spjd 3104185029Spjd VERIFY(nvlist_add_uint64(*config, 3105209962Smm ZPOOL_CONFIG_ERRCOUNT, 3106209962Smm spa_get_errlog_size(spa)) == 0); 3107185029Spjd 3108209962Smm if (spa_suspended(spa)) 3109209962Smm VERIFY(nvlist_add_uint64(*config, 3110209962Smm ZPOOL_CONFIG_SUSPENDED, 3111209962Smm spa->spa_failmode) == 0); 3112209962Smm 3113209962Smm spa_add_spares(spa, *config); 3114209962Smm spa_add_l2cache(spa, *config); 3115236884Smm spa_add_feature_stats(spa, *config); 3116209962Smm } 3117168404Spjd } 3118168404Spjd 3119168404Spjd /* 3120168404Spjd * We want to get the alternate 
root even for faulted pools, so we cheat 3121168404Spjd * and call spa_lookup() directly. 3122168404Spjd */ 3123168404Spjd if (altroot) { 3124168404Spjd if (spa == NULL) { 3125168404Spjd mutex_enter(&spa_namespace_lock); 3126168404Spjd spa = spa_lookup(name); 3127168404Spjd if (spa) 3128168404Spjd spa_altroot(spa, altroot, buflen); 3129168404Spjd else 3130168404Spjd altroot[0] = '\0'; 3131168404Spjd spa = NULL; 3132168404Spjd mutex_exit(&spa_namespace_lock); 3133168404Spjd } else { 3134168404Spjd spa_altroot(spa, altroot, buflen); 3135168404Spjd } 3136168404Spjd } 3137168404Spjd 3138209962Smm if (spa != NULL) { 3139209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3140168404Spjd spa_close(spa, FTAG); 3141209962Smm } 3142168404Spjd 3143168404Spjd return (error); 3144168404Spjd} 3145168404Spjd 3146168404Spjd/* 3147185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3148185029Spjd * array of nvlists, each which describes a valid leaf vdev. If this is an 3149185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3150185029Spjd * specified, as long as they are well-formed. 3151168404Spjd */ 3152168404Spjdstatic int 3153185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3154185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3155185029Spjd vdev_labeltype_t label) 3156168404Spjd{ 3157185029Spjd nvlist_t **dev; 3158185029Spjd uint_t i, ndev; 3159168404Spjd vdev_t *vd; 3160168404Spjd int error; 3161168404Spjd 3162185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3163185029Spjd 3164168404Spjd /* 3165185029Spjd * It's acceptable to have no devs specified. 
3166168404Spjd */ 3167185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3168168404Spjd return (0); 3169168404Spjd 3170185029Spjd if (ndev == 0) 3171168404Spjd return (EINVAL); 3172168404Spjd 3173168404Spjd /* 3174185029Spjd * Make sure the pool is formatted with a version that supports this 3175185029Spjd * device type. 3176168404Spjd */ 3177185029Spjd if (spa_version(spa) < version) 3178168404Spjd return (ENOTSUP); 3179168404Spjd 3180168404Spjd /* 3181185029Spjd * Set the pending device list so we correctly handle device in-use 3182168404Spjd * checking. 3183168404Spjd */ 3184185029Spjd sav->sav_pending = dev; 3185185029Spjd sav->sav_npending = ndev; 3186168404Spjd 3187185029Spjd for (i = 0; i < ndev; i++) { 3188185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3189168404Spjd mode)) != 0) 3190168404Spjd goto out; 3191168404Spjd 3192168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3193168404Spjd vdev_free(vd); 3194168404Spjd error = EINVAL; 3195168404Spjd goto out; 3196168404Spjd } 3197168404Spjd 3198185029Spjd /* 3199185029Spjd * The L2ARC currently only supports disk devices in 3200185029Spjd * kernel context. For user-level testing, we allow it. 
3201185029Spjd */ 3202185029Spjd#ifdef _KERNEL 3203185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3204185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3205185029Spjd error = ENOTBLK; 3206230514Smm vdev_free(vd); 3207185029Spjd goto out; 3208185029Spjd } 3209185029Spjd#endif 3210168404Spjd vd->vdev_top = vd; 3211168404Spjd 3212168404Spjd if ((error = vdev_open(vd)) == 0 && 3213185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3214185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3215168404Spjd vd->vdev_guid) == 0); 3216168404Spjd } 3217168404Spjd 3218168404Spjd vdev_free(vd); 3219168404Spjd 3220185029Spjd if (error && 3221185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3222168404Spjd goto out; 3223168404Spjd else 3224168404Spjd error = 0; 3225168404Spjd } 3226168404Spjd 3227168404Spjdout: 3228185029Spjd sav->sav_pending = NULL; 3229185029Spjd sav->sav_npending = 0; 3230168404Spjd return (error); 3231168404Spjd} 3232168404Spjd 3233185029Spjdstatic int 3234185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3235185029Spjd{ 3236185029Spjd int error; 3237185029Spjd 3238185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3239185029Spjd 3240185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3241185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3242185029Spjd VDEV_LABEL_SPARE)) != 0) { 3243185029Spjd return (error); 3244185029Spjd } 3245185029Spjd 3246185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3247185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3248185029Spjd VDEV_LABEL_L2CACHE)); 3249185029Spjd} 3250185029Spjd 3251185029Spjdstatic void 3252185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3253185029Spjd const char *config) 3254185029Spjd{ 3255185029Spjd int i; 3256185029Spjd 3257185029Spjd if (sav->sav_config != NULL) { 
3258185029Spjd nvlist_t **olddevs; 3259185029Spjd uint_t oldndevs; 3260185029Spjd nvlist_t **newdevs; 3261185029Spjd 3262185029Spjd /* 3263185029Spjd * Generate new dev list by concatentating with the 3264185029Spjd * current dev list. 3265185029Spjd */ 3266185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3267185029Spjd &olddevs, &oldndevs) == 0); 3268185029Spjd 3269185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3270185029Spjd (ndevs + oldndevs), KM_SLEEP); 3271185029Spjd for (i = 0; i < oldndevs; i++) 3272185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3273185029Spjd KM_SLEEP) == 0); 3274185029Spjd for (i = 0; i < ndevs; i++) 3275185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3276185029Spjd KM_SLEEP) == 0); 3277185029Spjd 3278185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3279185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3280185029Spjd 3281185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3282185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3283185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3284185029Spjd nvlist_free(newdevs[i]); 3285185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3286185029Spjd } else { 3287185029Spjd /* 3288185029Spjd * Generate a new dev list. 
3289185029Spjd */ 3290185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3291185029Spjd KM_SLEEP) == 0); 3292185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3293185029Spjd devs, ndevs) == 0); 3294185029Spjd } 3295185029Spjd} 3296185029Spjd 3297168404Spjd/* 3298185029Spjd * Stop and drop level 2 ARC devices 3299185029Spjd */ 3300185029Spjdvoid 3301185029Spjdspa_l2cache_drop(spa_t *spa) 3302185029Spjd{ 3303185029Spjd vdev_t *vd; 3304185029Spjd int i; 3305185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3306185029Spjd 3307185029Spjd for (i = 0; i < sav->sav_count; i++) { 3308185029Spjd uint64_t pool; 3309185029Spjd 3310185029Spjd vd = sav->sav_vdevs[i]; 3311185029Spjd ASSERT(vd != NULL); 3312185029Spjd 3313209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3314209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3315185029Spjd l2arc_remove_vdev(vd); 3316185029Spjd } 3317185029Spjd} 3318185029Spjd 3319185029Spjd/* 3320168404Spjd * Pool Creation 3321168404Spjd */ 3322168404Spjdint 3323185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3324185029Spjd const char *history_str, nvlist_t *zplprops) 3325168404Spjd{ 3326168404Spjd spa_t *spa; 3327185029Spjd char *altroot = NULL; 3328168404Spjd vdev_t *rvd; 3329168404Spjd dsl_pool_t *dp; 3330168404Spjd dmu_tx_t *tx; 3331219089Spjd int error = 0; 3332168404Spjd uint64_t txg = TXG_INITIAL; 3333185029Spjd nvlist_t **spares, **l2cache; 3334185029Spjd uint_t nspares, nl2cache; 3335219089Spjd uint64_t version, obj; 3336236884Smm boolean_t has_features; 3337168404Spjd 3338168404Spjd /* 3339168404Spjd * If this pool already exists, return failure. 3340168404Spjd */ 3341168404Spjd mutex_enter(&spa_namespace_lock); 3342168404Spjd if (spa_lookup(pool) != NULL) { 3343168404Spjd mutex_exit(&spa_namespace_lock); 3344168404Spjd return (EEXIST); 3345168404Spjd } 3346168404Spjd 3347168404Spjd /* 3348168404Spjd * Allocate a new spa_t structure. 
3349168404Spjd */ 3350185029Spjd (void) nvlist_lookup_string(props, 3351185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3352219089Spjd spa = spa_add(pool, NULL, altroot); 3353209962Smm spa_activate(spa, spa_mode_global); 3354168404Spjd 3355185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3356185029Spjd spa_deactivate(spa); 3357185029Spjd spa_remove(spa); 3358185029Spjd mutex_exit(&spa_namespace_lock); 3359185029Spjd return (error); 3360185029Spjd } 3361185029Spjd 3362236884Smm has_features = B_FALSE; 3363236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3364236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3365236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3366236884Smm has_features = B_TRUE; 3367236884Smm } 3368236884Smm 3369236884Smm if (has_features || nvlist_lookup_uint64(props, 3370236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3371185029Spjd version = SPA_VERSION; 3372236884Smm } 3373236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3374219089Spjd 3375219089Spjd spa->spa_first_txg = txg; 3376219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3377185029Spjd spa->spa_uberblock.ub_version = version; 3378168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3379168404Spjd 3380168404Spjd /* 3381209962Smm * Create "The Godfather" zio to hold all async IOs 3382209962Smm */ 3383209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3384209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3385209962Smm 3386209962Smm /* 3387168404Spjd * Create the root vdev. 
3388168404Spjd */ 3389185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3390168404Spjd 3391168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3392168404Spjd 3393168404Spjd ASSERT(error != 0 || rvd != NULL); 3394168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3395168404Spjd 3396185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3397168404Spjd error = EINVAL; 3398168404Spjd 3399168404Spjd if (error == 0 && 3400168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3401185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3402168404Spjd VDEV_ALLOC_ADD)) == 0) { 3403219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3404219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3405219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3406219089Spjd } 3407168404Spjd } 3408168404Spjd 3409185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3410168404Spjd 3411168404Spjd if (error != 0) { 3412168404Spjd spa_unload(spa); 3413168404Spjd spa_deactivate(spa); 3414168404Spjd spa_remove(spa); 3415168404Spjd mutex_exit(&spa_namespace_lock); 3416168404Spjd return (error); 3417168404Spjd } 3418168404Spjd 3419168404Spjd /* 3420168404Spjd * Get the list of spares, if specified. 3421168404Spjd */ 3422168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3423168404Spjd &spares, &nspares) == 0) { 3424185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3425168404Spjd KM_SLEEP) == 0); 3426185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3427168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3428185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3429168404Spjd spa_load_spares(spa); 3430185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3431185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3432168404Spjd } 3433168404Spjd 3434185029Spjd /* 3435185029Spjd * Get the list of level 2 cache devices, if specified. 
3436185029Spjd */ 3437185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3438185029Spjd &l2cache, &nl2cache) == 0) { 3439185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3440185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3441185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3442185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3443185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3444185029Spjd spa_load_l2cache(spa); 3445185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3446185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3447185029Spjd } 3448185029Spjd 3449236884Smm spa->spa_is_initializing = B_TRUE; 3450185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3451168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3452236884Smm spa->spa_is_initializing = B_FALSE; 3453168404Spjd 3454219089Spjd /* 3455219089Spjd * Create DDTs (dedup tables). 3456219089Spjd */ 3457219089Spjd ddt_create(spa); 3458219089Spjd 3459219089Spjd spa_update_dspace(spa); 3460219089Spjd 3461168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3462168404Spjd 3463168404Spjd /* 3464168404Spjd * Create the pool config object. 
3465168404Spjd */ 3466168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3467185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3468168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3469168404Spjd 3470168404Spjd if (zap_add(spa->spa_meta_objset, 3471168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3472168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3473168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3474168404Spjd } 3475168404Spjd 3476236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3477236884Smm spa_feature_create_zap_objects(spa, tx); 3478236884Smm 3479219089Spjd if (zap_add(spa->spa_meta_objset, 3480219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3481219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3482219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3483219089Spjd } 3484219089Spjd 3485185029Spjd /* Newly created pools with the right version are always deflated. */ 3486185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3487185029Spjd spa->spa_deflate = TRUE; 3488185029Spjd if (zap_add(spa->spa_meta_objset, 3489185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3490185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3491185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3492185029Spjd } 3493168404Spjd } 3494168404Spjd 3495168404Spjd /* 3496219089Spjd * Create the deferred-free bpobj. Turn off compression 3497168404Spjd * because sync-to-convergence takes longer if the blocksize 3498168404Spjd * keeps changing. 
3499168404Spjd */ 3500219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3501219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3502168404Spjd ZIO_COMPRESS_OFF, tx); 3503168404Spjd if (zap_add(spa->spa_meta_objset, 3504219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3505219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3506219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3507168404Spjd } 3508219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3509219089Spjd spa->spa_meta_objset, obj)); 3510168404Spjd 3511168404Spjd /* 3512168404Spjd * Create the pool's history object. 3513168404Spjd */ 3514185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3515185029Spjd spa_history_create_obj(spa, tx); 3516168404Spjd 3517185029Spjd /* 3518185029Spjd * Set pool properties. 3519185029Spjd */ 3520185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3521185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3522185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3523219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3524219089Spjd 3525209962Smm if (props != NULL) { 3526209962Smm spa_configfile_set(spa, props, B_FALSE); 3527219089Spjd spa_sync_props(spa, props, tx); 3528209962Smm } 3529185029Spjd 3530168404Spjd dmu_tx_commit(tx); 3531168404Spjd 3532168404Spjd spa->spa_sync_on = B_TRUE; 3533168404Spjd txg_sync_start(spa->spa_dsl_pool); 3534168404Spjd 3535168404Spjd /* 3536168404Spjd * We explicitly wait for the first transaction to complete so that our 3537168404Spjd * bean counters are appropriately updated. 
3538168404Spjd */ 3539168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3540168404Spjd 3541185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3542168404Spjd 3543185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3544185029Spjd (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3545219089Spjd spa_history_log_version(spa, LOG_POOL_CREATE); 3546185029Spjd 3547208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3548208442Smm 3549168404Spjd mutex_exit(&spa_namespace_lock); 3550168404Spjd 3551168404Spjd return (0); 3552168404Spjd} 3553168404Spjd 3554241286Savg#ifdef _KERNEL 3555219089Spjd#if defined(sun) 3556185029Spjd/* 3557219089Spjd * Get the root pool information from the root disk, then import the root pool 3558219089Spjd * during the system boot up time. 3559185029Spjd */ 3560219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3561219089Spjd 3562219089Spjdstatic nvlist_t * 3563219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3564185029Spjd{ 3565219089Spjd nvlist_t *config; 3566185029Spjd nvlist_t *nvtop, *nvroot; 3567185029Spjd uint64_t pgid; 3568185029Spjd 3569219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3570219089Spjd return (NULL); 3571219089Spjd 3572168404Spjd /* 3573185029Spjd * Add this top-level vdev to the child array. 3574168404Spjd */ 3575219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3576219089Spjd &nvtop) == 0); 3577219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3578219089Spjd &pgid) == 0); 3579219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3580168404Spjd 3581185029Spjd /* 3582185029Spjd * Put this pool's top-level vdevs into a root vdev. 
3583185029Spjd */ 3584185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3585219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3586219089Spjd VDEV_TYPE_ROOT) == 0); 3587185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3588185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3589185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3590185029Spjd &nvtop, 1) == 0); 3591168404Spjd 3592168404Spjd /* 3593185029Spjd * Replace the existing vdev_tree with the new root vdev in 3594185029Spjd * this pool's configuration (remove the old, add the new). 3595168404Spjd */ 3596185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3597185029Spjd nvlist_free(nvroot); 3598219089Spjd return (config); 3599185029Spjd} 3600168404Spjd 3601185029Spjd/* 3602219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3603219089Spjd * configuration. A configuration is "better" if the label on that 3604219089Spjd * device has a more recent txg. 3605185029Spjd */ 3606219089Spjdstatic void 3607219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3608185029Spjd{ 3609219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3610219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3611185029Spjd 3612219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3613219089Spjd nvlist_t *label; 3614219089Spjd uint64_t label_txg; 3615185029Spjd 3616219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3617219089Spjd &label) != 0) 3618219089Spjd return; 3619185029Spjd 3620219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3621219089Spjd &label_txg) == 0); 3622168404Spjd 3623219089Spjd /* 3624219089Spjd * Do we have a better boot device? 
3625219089Spjd */ 3626219089Spjd if (label_txg > *txg) { 3627219089Spjd *txg = label_txg; 3628219089Spjd *avd = vd; 3629185029Spjd } 3630219089Spjd nvlist_free(label); 3631185029Spjd } 3632185029Spjd} 3633185029Spjd 3634185029Spjd/* 3635185029Spjd * Import a root pool. 3636185029Spjd * 3637185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3638185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3639185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3640185029Spjd * 3641185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3642185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3643185029Spjd * e.g. 3644185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3645185029Spjd */ 3646185029Spjdint 3647185029Spjdspa_import_rootpool(char *devpath, char *devid) 3648185029Spjd{ 3649219089Spjd spa_t *spa; 3650219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3651219089Spjd nvlist_t *config, *nvtop; 3652219089Spjd uint64_t guid, txg; 3653185029Spjd char *pname; 3654185029Spjd int error; 3655185029Spjd 3656185029Spjd /* 3657219089Spjd * Read the label from the boot device and generate a configuration. 
3658185029Spjd */ 3659219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3660219089Spjd#if defined(_OBP) && defined(_KERNEL) 3661219089Spjd if (config == NULL) { 3662219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3663219089Spjd /* iscsi boot */ 3664219089Spjd get_iscsi_bootpath_phy(devpath); 3665219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3666219089Spjd } 3667219089Spjd } 3668219089Spjd#endif 3669219089Spjd if (config == NULL) { 3670236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3671219089Spjd devpath); 3672219089Spjd return (EIO); 3673219089Spjd } 3674185029Spjd 3675219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3676219089Spjd &pname) == 0); 3677219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3678185029Spjd 3679209962Smm mutex_enter(&spa_namespace_lock); 3680209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3681209962Smm /* 3682209962Smm * Remove the existing root pool from the namespace so that we 3683209962Smm * can replace it with the correct config we just read in. 3684209962Smm */ 3685209962Smm spa_remove(spa); 3686209962Smm } 3687185029Spjd 3688219089Spjd spa = spa_add(pname, config, NULL); 3689209962Smm spa->spa_is_root = B_TRUE; 3690219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3691209962Smm 3692219089Spjd /* 3693219089Spjd * Build up a vdev tree based on the boot device's label config. 
3694219089Spjd */ 3695219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3696219089Spjd &nvtop) == 0); 3697219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3698219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3699219089Spjd VDEV_ALLOC_ROOTPOOL); 3700219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3701219089Spjd if (error) { 3702209962Smm mutex_exit(&spa_namespace_lock); 3703219089Spjd nvlist_free(config); 3704219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3705219089Spjd pname); 3706219089Spjd return (error); 3707209962Smm } 3708209962Smm 3709219089Spjd /* 3710219089Spjd * Get the boot vdev. 3711219089Spjd */ 3712219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3713219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3714219089Spjd (u_longlong_t)guid); 3715219089Spjd error = ENOENT; 3716219089Spjd goto out; 3717219089Spjd } 3718209962Smm 3719219089Spjd /* 3720219089Spjd * Determine if there is a better boot device. 3721219089Spjd */ 3722219089Spjd avd = bvd; 3723219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3724219089Spjd if (avd != bvd) { 3725219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3726219089Spjd "try booting from '%s'", avd->vdev_path); 3727219089Spjd error = EINVAL; 3728219089Spjd goto out; 3729219089Spjd } 3730209962Smm 3731219089Spjd /* 3732219089Spjd * If the boot device is part of a spare vdev then ensure that 3733219089Spjd * we're booting off the active spare. 3734219089Spjd */ 3735219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3736219089Spjd !bvd->vdev_isspare) { 3737219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3738219089Spjd "try booting from '%s'", 3739219089Spjd bvd->vdev_parent-> 3740219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3741219089Spjd error = EINVAL; 3742219089Spjd goto out; 3743219089Spjd } 3744209962Smm 3745219089Spjd error = 0; 3746219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 3747219089Spjdout: 3748219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3749219089Spjd vdev_free(rvd); 3750219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3751209962Smm mutex_exit(&spa_namespace_lock); 3752209962Smm 3753219089Spjd nvlist_free(config); 3754219089Spjd return (error); 3755185029Spjd} 3756185029Spjd 3757241286Savg#else 3758241286Savg 3759243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3760243502Savg uint64_t *count); 3761241286Savg 3762241286Savgstatic nvlist_t * 3763241286Savgspa_generate_rootconf(const char *name) 3764241286Savg{ 3765243502Savg nvlist_t **configs, **tops; 3766241286Savg nvlist_t *config; 3767243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 3768243502Savg uint64_t *holes; 3769243502Savg uint64_t best_txg; 3770243213Savg uint64_t nchildren; 3771241286Savg uint64_t pgid; 3772243502Savg uint64_t count; 3773243502Savg uint64_t i; 3774243502Savg uint_t nholes; 3775241286Savg 3776243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3777241286Savg return (NULL); 3778241286Savg 3779243502Savg ASSERT3U(count, !=, 0); 3780243502Savg best_txg = 0; 3781243502Savg for (i = 0; i < count; i++) { 3782243502Savg uint64_t txg; 3783243502Savg 3784243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3785243502Savg &txg) == 0); 3786243502Savg if (txg > best_txg) { 3787243502Savg best_txg = txg; 3788243502Savg best_cfg = configs[i]; 3789243502Savg } 3790243502Savg } 3791243502Savg 3792241286Savg /* 3793243213Savg * Multi-vdev root pool configuration discovery is not supported yet. 
3794243213Savg */ 3795245945Savg nchildren = 1; 3796245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3797243502Savg holes = NULL; 3798243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3799243502Savg &holes, &nholes); 3800243502Savg 3801244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3802243502Savg for (i = 0; i < nchildren; i++) { 3803243502Savg if (i >= count) 3804243502Savg break; 3805243502Savg if (configs[i] == NULL) 3806243502Savg continue; 3807243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3808243502Savg &nvtop) == 0); 3809243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3810243213Savg } 3811243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3812243502Savg if (i >= nchildren) 3813243502Savg continue; 3814243502Savg if (tops[holes[i]] != NULL) 3815243502Savg continue; 3816243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3817243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3818243502Savg VDEV_TYPE_HOLE) == 0); 3819243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3820243502Savg holes[i]) == 0); 3821243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3822243502Savg 0) == 0); 3823243502Savg } 3824243502Savg for (i = 0; i < nchildren; i++) { 3825243502Savg if (tops[i] != NULL) 3826243502Savg continue; 3827243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3828243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3829243502Savg VDEV_TYPE_MISSING) == 0); 3830243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3831243502Savg i) == 0); 3832243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3833243502Savg 0) == 0); 3834243502Savg } 3835243213Savg 3836243213Savg /* 3837243502Savg * Create pool config based on the best vdev config. 
3838241286Savg */ 3839243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3840241286Savg 3841241286Savg /* 3842241286Savg * Put this pool's top-level vdevs into a root vdev. 3843241286Savg */ 3844243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3845243502Savg &pgid) == 0); 3846241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3847241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3848241286Savg VDEV_TYPE_ROOT) == 0); 3849241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3850241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3851241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3852243502Savg tops, nchildren) == 0); 3853241286Savg 3854241286Savg /* 3855241286Savg * Replace the existing vdev_tree with the new root vdev in 3856241286Savg * this pool's configuration (remove the old, add the new). 3857241286Savg */ 3858241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3859243502Savg 3860243502Savg /* 3861243502Savg * Drop vdev config elements that should not be present at pool level. 
3862243502Savg */ 3863243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3864243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3865243502Savg 3866243502Savg for (i = 0; i < count; i++) 3867243502Savg nvlist_free(configs[i]); 3868243502Savg kmem_free(configs, count * sizeof(void *)); 3869243502Savg for (i = 0; i < nchildren; i++) 3870243502Savg nvlist_free(tops[i]); 3871243502Savg kmem_free(tops, nchildren * sizeof(void *)); 3872241286Savg nvlist_free(nvroot); 3873241286Savg return (config); 3874241286Savg} 3875241286Savg 3876241286Savgint 3877241286Savgspa_import_rootpool(const char *name) 3878241286Savg{ 3879241286Savg spa_t *spa; 3880241286Savg vdev_t *rvd, *bvd, *avd = NULL; 3881241286Savg nvlist_t *config, *nvtop; 3882241286Savg uint64_t txg; 3883241286Savg char *pname; 3884241286Savg int error; 3885241286Savg 3886241286Savg /* 3887241286Savg * Read the label from the boot device and generate a configuration. 3888241286Savg */ 3889241286Savg config = spa_generate_rootconf(name); 3890243213Savg 3891243213Savg mutex_enter(&spa_namespace_lock); 3892243213Savg if (config != NULL) { 3893243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3894243213Savg &pname) == 0 && strcmp(name, pname) == 0); 3895243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3896243213Savg == 0); 3897243213Savg 3898243213Savg if ((spa = spa_lookup(pname)) != NULL) { 3899243213Savg /* 3900243213Savg * Remove the existing root pool from the namespace so 3901243213Savg * that we can replace it with the correct config 3902243213Savg * we just read in. 3903243213Savg */ 3904243213Savg spa_remove(spa); 3905243213Savg } 3906243213Savg spa = spa_add(pname, config, NULL); 3907243501Savg 3908243501Savg /* 3909243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 3910243501Savg * via spa_version(). 
3911243501Savg */ 3912243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3913243501Savg &spa->spa_ubsync.ub_version) != 0) 3914243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3915243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 3916241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 3917241286Savg name); 3918241286Savg return (EIO); 3919243213Savg } else { 3920243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 3921241286Savg } 3922241286Savg spa->spa_is_root = B_TRUE; 3923241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3924241286Savg 3925241286Savg /* 3926241286Savg * Build up a vdev tree based on the boot device's label config. 3927241286Savg */ 3928241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3929241286Savg &nvtop) == 0); 3930241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3931241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3932241286Savg VDEV_ALLOC_ROOTPOOL); 3933241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3934241286Savg if (error) { 3935241286Savg mutex_exit(&spa_namespace_lock); 3936241286Savg nvlist_free(config); 3937241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3938241286Savg pname); 3939241286Savg return (error); 3940241286Savg } 3941241286Savg 3942241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3943241286Savg vdev_free(rvd); 3944241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3945241286Savg mutex_exit(&spa_namespace_lock); 3946241286Savg 3947243213Savg nvlist_free(config); 3948243213Savg return (0); 3949241286Savg} 3950241286Savg 3951241286Savg#endif /* sun */ 3952219089Spjd#endif 3953219089Spjd 3954209962Smm/* 3955209962Smm * Import a non-root pool into the system. 
3956209962Smm */ 3957185029Spjdint 3958219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3959185029Spjd{ 3960209962Smm spa_t *spa; 3961209962Smm char *altroot = NULL; 3962219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 3963219089Spjd zpool_rewind_policy_t policy; 3964219089Spjd uint64_t mode = spa_mode_global; 3965219089Spjd uint64_t readonly = B_FALSE; 3966209962Smm int error; 3967209962Smm nvlist_t *nvroot; 3968209962Smm nvlist_t **spares, **l2cache; 3969209962Smm uint_t nspares, nl2cache; 3970209962Smm 3971209962Smm /* 3972209962Smm * If a pool with this name exists, return failure. 3973209962Smm */ 3974209962Smm mutex_enter(&spa_namespace_lock); 3975219089Spjd if (spa_lookup(pool) != NULL) { 3976209962Smm mutex_exit(&spa_namespace_lock); 3977209962Smm return (EEXIST); 3978209962Smm } 3979209962Smm 3980209962Smm /* 3981209962Smm * Create and initialize the spa structure. 3982209962Smm */ 3983209962Smm (void) nvlist_lookup_string(props, 3984209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3985219089Spjd (void) nvlist_lookup_uint64(props, 3986219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3987219089Spjd if (readonly) 3988219089Spjd mode = FREAD; 3989219089Spjd spa = spa_add(pool, config, altroot); 3990219089Spjd spa->spa_import_flags = flags; 3991209962Smm 3992209962Smm /* 3993219089Spjd * Verbatim import - Take a pool and insert it into the namespace 3994219089Spjd * as if it had been loaded at boot. 
3995219089Spjd */ 3996219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3997219089Spjd if (props != NULL) 3998219089Spjd spa_configfile_set(spa, props, B_FALSE); 3999219089Spjd 4000219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4001219089Spjd 4002219089Spjd mutex_exit(&spa_namespace_lock); 4003219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 4004219089Spjd 4005219089Spjd return (0); 4006219089Spjd } 4007219089Spjd 4008219089Spjd spa_activate(spa, mode); 4009219089Spjd 4010219089Spjd /* 4011209962Smm * Don't start async tasks until we know everything is healthy. 4012209962Smm */ 4013209962Smm spa_async_suspend(spa); 4014209962Smm 4015219089Spjd zpool_get_rewind_policy(config, &policy); 4016219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4017219089Spjd state = SPA_LOAD_RECOVER; 4018219089Spjd 4019209962Smm /* 4020209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4021209962Smm * because the user-supplied config is actually the one to trust when 4022209962Smm * doing an import. 4023209962Smm */ 4024219089Spjd if (state != SPA_LOAD_RECOVER) 4025219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4026209962Smm 4027219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4028219089Spjd policy.zrp_request); 4029219089Spjd 4030219089Spjd /* 4031219089Spjd * Propagate anything learned while loading the pool and pass it 4032219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4033219089Spjd */ 4034219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4035219089Spjd spa->spa_load_info) == 0); 4036219089Spjd 4037209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4038209962Smm /* 4039209962Smm * Toss any existing sparelist, as it doesn't have any validity 4040209962Smm * anymore, and conflicts with spa_has_spare(). 
4041209962Smm */ 4042209962Smm if (spa->spa_spares.sav_config) { 4043209962Smm nvlist_free(spa->spa_spares.sav_config); 4044209962Smm spa->spa_spares.sav_config = NULL; 4045209962Smm spa_load_spares(spa); 4046209962Smm } 4047209962Smm if (spa->spa_l2cache.sav_config) { 4048209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4049209962Smm spa->spa_l2cache.sav_config = NULL; 4050209962Smm spa_load_l2cache(spa); 4051209962Smm } 4052209962Smm 4053209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4054209962Smm &nvroot) == 0); 4055209962Smm if (error == 0) 4056209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4057209962Smm VDEV_ALLOC_SPARE); 4058209962Smm if (error == 0) 4059209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4060209962Smm VDEV_ALLOC_L2CACHE); 4061209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4062209962Smm 4063209962Smm if (props != NULL) 4064209962Smm spa_configfile_set(spa, props, B_FALSE); 4065209962Smm 4066209962Smm if (error != 0 || (props && spa_writeable(spa) && 4067209962Smm (error = spa_prop_set(spa, props)))) { 4068209962Smm spa_unload(spa); 4069209962Smm spa_deactivate(spa); 4070209962Smm spa_remove(spa); 4071209962Smm mutex_exit(&spa_namespace_lock); 4072209962Smm return (error); 4073209962Smm } 4074209962Smm 4075209962Smm spa_async_resume(spa); 4076209962Smm 4077209962Smm /* 4078209962Smm * Override any spares and level 2 cache devices as specified by 4079209962Smm * the user, as these may have correct device names/devids, etc. 
4080209962Smm */ 4081209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4082209962Smm &spares, &nspares) == 0) { 4083209962Smm if (spa->spa_spares.sav_config) 4084209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4085209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4086209962Smm else 4087209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4088209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4089209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4090209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4091209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4092209962Smm spa_load_spares(spa); 4093209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4094209962Smm spa->spa_spares.sav_sync = B_TRUE; 4095209962Smm } 4096209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4097209962Smm &l2cache, &nl2cache) == 0) { 4098209962Smm if (spa->spa_l2cache.sav_config) 4099209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4100209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4101209962Smm else 4102209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4103209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4104209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4105209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4106209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4107209962Smm spa_load_l2cache(spa); 4108209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4109209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4110209962Smm } 4111209962Smm 4112219089Spjd /* 4113219089Spjd * Check for any removed devices. 4114219089Spjd */ 4115219089Spjd if (spa->spa_autoreplace) { 4116219089Spjd spa_aux_check_removed(&spa->spa_spares); 4117219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4118219089Spjd } 4119219089Spjd 4120209962Smm if (spa_writeable(spa)) { 4121209962Smm /* 4122209962Smm * Update the config cache to include the newly-imported pool. 
4123209962Smm */ 4124209962Smm spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4125209962Smm } 4126209962Smm 4127219089Spjd /* 4128219089Spjd * It's possible that the pool was expanded while it was exported. 4129219089Spjd * We kick off an async task to handle this for us. 4130219089Spjd */ 4131219089Spjd spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4132219089Spjd 4133209962Smm mutex_exit(&spa_namespace_lock); 4134219089Spjd spa_history_log_version(spa, LOG_POOL_IMPORT); 4135209962Smm 4136219089Spjd#ifdef __FreeBSD__ 4137219089Spjd#ifdef _KERNEL 4138219089Spjd zvol_create_minors(pool); 4139219089Spjd#endif 4140219089Spjd#endif 4141209962Smm return (0); 4142185029Spjd} 4143185029Spjd 4144168404Spjdnvlist_t * 4145168404Spjdspa_tryimport(nvlist_t *tryconfig) 4146168404Spjd{ 4147168404Spjd nvlist_t *config = NULL; 4148168404Spjd char *poolname; 4149168404Spjd spa_t *spa; 4150168404Spjd uint64_t state; 4151208443Smm int error; 4152168404Spjd 4153168404Spjd if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4154168404Spjd return (NULL); 4155168404Spjd 4156168404Spjd if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4157168404Spjd return (NULL); 4158168404Spjd 4159168404Spjd /* 4160168404Spjd * Create and initialize the spa structure. 4161168404Spjd */ 4162168404Spjd mutex_enter(&spa_namespace_lock); 4163219089Spjd spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4164209962Smm spa_activate(spa, FREAD); 4165168404Spjd 4166168404Spjd /* 4167168404Spjd * Pass off the heavy lifting to spa_load(). 4168168404Spjd * Pass TRUE for mosconfig because the user-supplied config 4169168404Spjd * is actually the one to trust when doing an import. 4170168404Spjd */ 4171219089Spjd error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4172168404Spjd 4173168404Spjd /* 4174168404Spjd * If 'tryconfig' was at least parsable, return the current config. 
4175168404Spjd */ 4176168404Spjd if (spa->spa_root_vdev != NULL) { 4177168404Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4178168404Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4179168404Spjd poolname) == 0); 4180168404Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4181168404Spjd state) == 0); 4182168498Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4183168498Spjd spa->spa_uberblock.ub_timestamp) == 0); 4184236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4185236884Smm spa->spa_load_info) == 0); 4186168404Spjd 4187168404Spjd /* 4188185029Spjd * If the bootfs property exists on this pool then we 4189185029Spjd * copy it out so that external consumers can tell which 4190185029Spjd * pools are bootable. 4191168404Spjd */ 4192208443Smm if ((!error || error == EEXIST) && spa->spa_bootfs) { 4193185029Spjd char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4194185029Spjd 4195185029Spjd /* 4196185029Spjd * We have to play games with the name since the 4197185029Spjd * pool was opened as TRYIMPORT_NAME. 4198185029Spjd */ 4199185029Spjd if (dsl_dsobj_to_dsname(spa_name(spa), 4200185029Spjd spa->spa_bootfs, tmpname) == 0) { 4201185029Spjd char *cp; 4202185029Spjd char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4203185029Spjd 4204185029Spjd cp = strchr(tmpname, '/'); 4205185029Spjd if (cp == NULL) { 4206185029Spjd (void) strlcpy(dsname, tmpname, 4207185029Spjd MAXPATHLEN); 4208185029Spjd } else { 4209185029Spjd (void) snprintf(dsname, MAXPATHLEN, 4210185029Spjd "%s/%s", poolname, ++cp); 4211185029Spjd } 4212185029Spjd VERIFY(nvlist_add_string(config, 4213185029Spjd ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4214185029Spjd kmem_free(dsname, MAXPATHLEN); 4215185029Spjd } 4216185029Spjd kmem_free(tmpname, MAXPATHLEN); 4217185029Spjd } 4218185029Spjd 4219185029Spjd /* 4220185029Spjd * Add the list of hot spares and level 2 cache devices. 
4221185029Spjd */ 4222209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4223168404Spjd spa_add_spares(spa, config); 4224185029Spjd spa_add_l2cache(spa, config); 4225209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 4226168404Spjd } 4227168404Spjd 4228168404Spjd spa_unload(spa); 4229168404Spjd spa_deactivate(spa); 4230168404Spjd spa_remove(spa); 4231168404Spjd mutex_exit(&spa_namespace_lock); 4232168404Spjd 4233168404Spjd return (config); 4234168404Spjd} 4235168404Spjd 4236168404Spjd/* 4237168404Spjd * Pool export/destroy 4238168404Spjd * 4239168404Spjd * The act of destroying or exporting a pool is very simple. We make sure there 4240168404Spjd * is no more pending I/O and any references to the pool are gone. Then, we 4241168404Spjd * update the pool state and sync all the labels to disk, removing the 4242207670Smm * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4243207670Smm * we don't sync the labels or remove the configuration cache. 4244168404Spjd */ 4245168404Spjdstatic int 4246185029Spjdspa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4247207670Smm boolean_t force, boolean_t hardforce) 4248168404Spjd{ 4249168404Spjd spa_t *spa; 4250168404Spjd 4251168404Spjd if (oldconfig) 4252168404Spjd *oldconfig = NULL; 4253168404Spjd 4254209962Smm if (!(spa_mode_global & FWRITE)) 4255168404Spjd return (EROFS); 4256168404Spjd 4257168404Spjd mutex_enter(&spa_namespace_lock); 4258168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 4259168404Spjd mutex_exit(&spa_namespace_lock); 4260168404Spjd return (ENOENT); 4261168404Spjd } 4262168404Spjd 4263168404Spjd /* 4264168404Spjd * Put a hold on the pool, drop the namespace lock, stop async tasks, 4265168404Spjd * reacquire the namespace lock, and see if we can export. 
4266168404Spjd */ 4267168404Spjd spa_open_ref(spa, FTAG); 4268168404Spjd mutex_exit(&spa_namespace_lock); 4269168404Spjd spa_async_suspend(spa); 4270168404Spjd mutex_enter(&spa_namespace_lock); 4271168404Spjd spa_close(spa, FTAG); 4272168404Spjd 4273168404Spjd /* 4274168404Spjd * The pool will be in core if it's openable, 4275168404Spjd * in which case we can modify its state. 4276168404Spjd */ 4277168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4278168404Spjd /* 4279168404Spjd * Objsets may be open only because they're dirty, so we 4280168404Spjd * have to force it to sync before checking spa_refcnt. 4281168404Spjd */ 4282168404Spjd txg_wait_synced(spa->spa_dsl_pool, 0); 4283168404Spjd 4284168404Spjd /* 4285168404Spjd * A pool cannot be exported or destroyed if there are active 4286168404Spjd * references. If we are resetting a pool, allow references by 4287168404Spjd * fault injection handlers. 4288168404Spjd */ 4289168404Spjd if (!spa_refcount_zero(spa) || 4290168404Spjd (spa->spa_inject_ref != 0 && 4291168404Spjd new_state != POOL_STATE_UNINITIALIZED)) { 4292168404Spjd spa_async_resume(spa); 4293168404Spjd mutex_exit(&spa_namespace_lock); 4294168404Spjd return (EBUSY); 4295168404Spjd } 4296168404Spjd 4297185029Spjd /* 4298185029Spjd * A pool cannot be exported if it has an active shared spare. 4299185029Spjd * This is to prevent other pools stealing the active spare 4300185029Spjd * from an exported pool. At user's own will, such pool can 4301185029Spjd * be forcedly exported. 4302185029Spjd */ 4303185029Spjd if (!force && new_state == POOL_STATE_EXPORTED && 4304185029Spjd spa_has_active_shared_spare(spa)) { 4305185029Spjd spa_async_resume(spa); 4306185029Spjd mutex_exit(&spa_namespace_lock); 4307185029Spjd return (EXDEV); 4308185029Spjd } 4309168404Spjd 4310168404Spjd /* 4311168404Spjd * We want this to be reflected on every label, 4312168404Spjd * so mark them all dirty. 
spa_unload() will do the 4313168404Spjd * final sync that pushes these changes out. 4314168404Spjd */ 4315207670Smm if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4316185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4317168404Spjd spa->spa_state = new_state; 4318219089Spjd spa->spa_final_txg = spa_last_synced_txg(spa) + 4319219089Spjd TXG_DEFER_SIZE + 1; 4320168404Spjd vdev_config_dirty(spa->spa_root_vdev); 4321185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 4322168404Spjd } 4323168404Spjd } 4324168404Spjd 4325185029Spjd spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4326185029Spjd 4327168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4328168404Spjd spa_unload(spa); 4329168404Spjd spa_deactivate(spa); 4330168404Spjd } 4331168404Spjd 4332168404Spjd if (oldconfig && spa->spa_config) 4333168404Spjd VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4334168404Spjd 4335168404Spjd if (new_state != POOL_STATE_UNINITIALIZED) { 4336207670Smm if (!hardforce) 4337207670Smm spa_config_sync(spa, B_TRUE, B_TRUE); 4338168404Spjd spa_remove(spa); 4339168404Spjd } 4340168404Spjd mutex_exit(&spa_namespace_lock); 4341168404Spjd 4342168404Spjd return (0); 4343168404Spjd} 4344168404Spjd 4345168404Spjd/* 4346168404Spjd * Destroy a storage pool. 4347168404Spjd */ 4348168404Spjdint 4349168404Spjdspa_destroy(char *pool) 4350168404Spjd{ 4351207670Smm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4352207670Smm B_FALSE, B_FALSE)); 4353168404Spjd} 4354168404Spjd 4355168404Spjd/* 4356168404Spjd * Export a storage pool. 
4357168404Spjd */ 4358168404Spjdint 4359207670Smmspa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4360207670Smm boolean_t hardforce) 4361168404Spjd{ 4362207670Smm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4363207670Smm force, hardforce)); 4364168404Spjd} 4365168404Spjd 4366168404Spjd/* 4367168404Spjd * Similar to spa_export(), this unloads the spa_t without actually removing it 4368168404Spjd * from the namespace in any way. 4369168404Spjd */ 4370168404Spjdint 4371168404Spjdspa_reset(char *pool) 4372168404Spjd{ 4373185029Spjd return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4374207670Smm B_FALSE, B_FALSE)); 4375168404Spjd} 4376168404Spjd 4377168404Spjd/* 4378168404Spjd * ========================================================================== 4379168404Spjd * Device manipulation 4380168404Spjd * ========================================================================== 4381168404Spjd */ 4382168404Spjd 4383168404Spjd/* 4384185029Spjd * Add a device to a storage pool. 
4385168404Spjd */ 4386168404Spjdint 4387168404Spjdspa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4388168404Spjd{ 4389219089Spjd uint64_t txg, id; 4390209962Smm int error; 4391168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4392168404Spjd vdev_t *vd, *tvd; 4393185029Spjd nvlist_t **spares, **l2cache; 4394185029Spjd uint_t nspares, nl2cache; 4395168404Spjd 4396219089Spjd ASSERT(spa_writeable(spa)); 4397219089Spjd 4398168404Spjd txg = spa_vdev_enter(spa); 4399168404Spjd 4400168404Spjd if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4401168404Spjd VDEV_ALLOC_ADD)) != 0) 4402168404Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4403168404Spjd 4404185029Spjd spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4405168404Spjd 4406185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4407185029Spjd &nspares) != 0) 4408168404Spjd nspares = 0; 4409168404Spjd 4410185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4411185029Spjd &nl2cache) != 0) 4412185029Spjd nl2cache = 0; 4413185029Spjd 4414185029Spjd if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4415168404Spjd return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4416168404Spjd 4417185029Spjd if (vd->vdev_children != 0 && 4418185029Spjd (error = vdev_create(vd, txg, B_FALSE)) != 0) 4419185029Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4420168404Spjd 4421168404Spjd /* 4422185029Spjd * We must validate the spares and l2cache devices after checking the 4423185029Spjd * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4424168404Spjd */ 4425185029Spjd if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4426168404Spjd return (spa_vdev_exit(spa, vd, txg, error)); 4427168404Spjd 4428168404Spjd /* 4429168404Spjd * Transfer each new top-level vdev from vd to rvd. 
4430168404Spjd */ 4431209962Smm for (int c = 0; c < vd->vdev_children; c++) { 4432219089Spjd 4433219089Spjd /* 4434219089Spjd * Set the vdev id to the first hole, if one exists. 4435219089Spjd */ 4436219089Spjd for (id = 0; id < rvd->vdev_children; id++) { 4437219089Spjd if (rvd->vdev_child[id]->vdev_ishole) { 4438219089Spjd vdev_free(rvd->vdev_child[id]); 4439219089Spjd break; 4440219089Spjd } 4441219089Spjd } 4442168404Spjd tvd = vd->vdev_child[c]; 4443168404Spjd vdev_remove_child(vd, tvd); 4444219089Spjd tvd->vdev_id = id; 4445168404Spjd vdev_add_child(rvd, tvd); 4446168404Spjd vdev_config_dirty(tvd); 4447168404Spjd } 4448168404Spjd 4449168404Spjd if (nspares != 0) { 4450185029Spjd spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4451185029Spjd ZPOOL_CONFIG_SPARES); 4452168404Spjd spa_load_spares(spa); 4453185029Spjd spa->spa_spares.sav_sync = B_TRUE; 4454168404Spjd } 4455168404Spjd 4456185029Spjd if (nl2cache != 0) { 4457185029Spjd spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4458185029Spjd ZPOOL_CONFIG_L2CACHE); 4459185029Spjd spa_load_l2cache(spa); 4460185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 4461185029Spjd } 4462185029Spjd 4463168404Spjd /* 4464168404Spjd * We have to be careful when adding new vdevs to an existing pool. 4465168404Spjd * If other threads start allocating from these vdevs before we 4466168404Spjd * sync the config cache, and we lose power, then upon reboot we may 4467168404Spjd * fail to open the pool because there are DVAs that the config cache 4468168404Spjd * can't translate. Therefore, we first add the vdevs without 4469168404Spjd * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4470168404Spjd * and then let spa_config_update() initialize the new metaslabs. 
4471168404Spjd * 4472168404Spjd * spa_load() checks for added-but-not-initialized vdevs, so that 4473168404Spjd * if we lose power at any point in this sequence, the remaining 4474168404Spjd * steps will be completed the next time we load the pool. 4475168404Spjd */ 4476168404Spjd (void) spa_vdev_exit(spa, vd, txg, 0); 4477168404Spjd 4478168404Spjd mutex_enter(&spa_namespace_lock); 4479168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4480168404Spjd mutex_exit(&spa_namespace_lock); 4481168404Spjd 4482168404Spjd return (0); 4483168404Spjd} 4484168404Spjd 4485168404Spjd/* 4486168404Spjd * Attach a device to a mirror. The arguments are the path to any device 4487168404Spjd * in the mirror, and the nvroot for the new device. If the path specifies 4488168404Spjd * a device that is not mirrored, we automatically insert the mirror vdev. 4489168404Spjd * 4490168404Spjd * If 'replacing' is specified, the new device is intended to replace the 4491168404Spjd * existing device; in this case the two devices are made into their own 4492185029Spjd * mirror using the 'replacing' vdev, which is functionally identical to 4493168404Spjd * the mirror vdev (it actually reuses all the same ops) but has a few 4494168404Spjd * extra rules: you can't attach to it after it's been created, and upon 4495168404Spjd * completion of resilvering, the first disk (the one being replaced) 4496168404Spjd * is automatically detached. 
4497168404Spjd */ 4498168404Spjdint 4499168404Spjdspa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4500168404Spjd{ 4501219089Spjd uint64_t txg, dtl_max_txg; 4502168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4503168404Spjd vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4504168404Spjd vdev_ops_t *pvops; 4505185029Spjd char *oldvdpath, *newvdpath; 4506185029Spjd int newvd_isspare; 4507185029Spjd int error; 4508168404Spjd 4509219089Spjd ASSERT(spa_writeable(spa)); 4510219089Spjd 4511168404Spjd txg = spa_vdev_enter(spa); 4512168404Spjd 4513185029Spjd oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4514168404Spjd 4515168404Spjd if (oldvd == NULL) 4516168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4517168404Spjd 4518168404Spjd if (!oldvd->vdev_ops->vdev_op_leaf) 4519168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4520168404Spjd 4521168404Spjd pvd = oldvd->vdev_parent; 4522168404Spjd 4523168404Spjd if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4524230514Smm VDEV_ALLOC_ATTACH)) != 0) 4525185029Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4526185029Spjd 4527185029Spjd if (newrootvd->vdev_children != 1) 4528168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4529168404Spjd 4530168404Spjd newvd = newrootvd->vdev_child[0]; 4531168404Spjd 4532168404Spjd if (!newvd->vdev_ops->vdev_op_leaf) 4533168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4534168404Spjd 4535168404Spjd if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4536168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, error)); 4537168404Spjd 4538185029Spjd /* 4539185029Spjd * Spares can't replace logs 4540185029Spjd */ 4541185029Spjd if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4542185029Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4543185029Spjd 4544168404Spjd if (!replacing) { 4545168404Spjd /* 4546168404Spjd * For attach, the only allowable parent is a mirror or the root 
4547168404Spjd * vdev. 4548168404Spjd */ 4549168404Spjd if (pvd->vdev_ops != &vdev_mirror_ops && 4550168404Spjd pvd->vdev_ops != &vdev_root_ops) 4551168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4552168404Spjd 4553168404Spjd pvops = &vdev_mirror_ops; 4554168404Spjd } else { 4555168404Spjd /* 4556168404Spjd * Active hot spares can only be replaced by inactive hot 4557168404Spjd * spares. 4558168404Spjd */ 4559168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4560219089Spjd oldvd->vdev_isspare && 4561168404Spjd !spa_has_spare(spa, newvd->vdev_guid)) 4562168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4563168404Spjd 4564168404Spjd /* 4565168404Spjd * If the source is a hot spare, and the parent isn't already a 4566168404Spjd * spare, then we want to create a new hot spare. Otherwise, we 4567168404Spjd * want to create a replacing vdev. The user is not allowed to 4568168404Spjd * attach to a spared vdev child unless the 'isspare' state is 4569168404Spjd * the same (spare replaces spare, non-spare replaces 4570168404Spjd * non-spare). 4571168404Spjd */ 4572219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && 4573219089Spjd spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4574168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4575219089Spjd } else if (pvd->vdev_ops == &vdev_spare_ops && 4576219089Spjd newvd->vdev_isspare != oldvd->vdev_isspare) { 4577168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4578219089Spjd } 4579219089Spjd 4580219089Spjd if (newvd->vdev_isspare) 4581168404Spjd pvops = &vdev_spare_ops; 4582168404Spjd else 4583168404Spjd pvops = &vdev_replacing_ops; 4584168404Spjd } 4585168404Spjd 4586168404Spjd /* 4587219089Spjd * Make sure the new device is big enough. 
4588168404Spjd */ 4589219089Spjd if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4590168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4591168404Spjd 4592168404Spjd /* 4593168404Spjd * The new device cannot have a higher alignment requirement 4594168404Spjd * than the top-level vdev. 4595168404Spjd */ 4596168404Spjd if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4597168404Spjd return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4598168404Spjd 4599168404Spjd /* 4600168404Spjd * If this is an in-place replacement, update oldvd's path and devid 4601168404Spjd * to make it distinguishable from newvd, and unopenable from now on. 4602168404Spjd */ 4603168404Spjd if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4604168404Spjd spa_strfree(oldvd->vdev_path); 4605168404Spjd oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4606168404Spjd KM_SLEEP); 4607168404Spjd (void) sprintf(oldvd->vdev_path, "%s/%s", 4608168404Spjd newvd->vdev_path, "old"); 4609168404Spjd if (oldvd->vdev_devid != NULL) { 4610168404Spjd spa_strfree(oldvd->vdev_devid); 4611168404Spjd oldvd->vdev_devid = NULL; 4612168404Spjd } 4613168404Spjd } 4614168404Spjd 4615219089Spjd /* mark the device being resilvered */ 4616219089Spjd newvd->vdev_resilvering = B_TRUE; 4617219089Spjd 4618168404Spjd /* 4619168404Spjd * If the parent is not a mirror, or if we're replacing, insert the new 4620168404Spjd * mirror/replacing/spare vdev above oldvd. 4621168404Spjd */ 4622168404Spjd if (pvd->vdev_ops != pvops) 4623168404Spjd pvd = vdev_add_parent(oldvd, pvops); 4624168404Spjd 4625168404Spjd ASSERT(pvd->vdev_top->vdev_parent == rvd); 4626168404Spjd ASSERT(pvd->vdev_ops == pvops); 4627168404Spjd ASSERT(oldvd->vdev_parent == pvd); 4628168404Spjd 4629168404Spjd /* 4630168404Spjd * Extract the new device from its root and add it to pvd. 
4631168404Spjd */ 4632168404Spjd vdev_remove_child(newrootvd, newvd); 4633168404Spjd newvd->vdev_id = pvd->vdev_children; 4634219089Spjd newvd->vdev_crtxg = oldvd->vdev_crtxg; 4635168404Spjd vdev_add_child(pvd, newvd); 4636168404Spjd 4637168404Spjd tvd = newvd->vdev_top; 4638168404Spjd ASSERT(pvd->vdev_top == tvd); 4639168404Spjd ASSERT(tvd->vdev_parent == rvd); 4640168404Spjd 4641168404Spjd vdev_config_dirty(tvd); 4642168404Spjd 4643168404Spjd /* 4644219089Spjd * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4645219089Spjd * for any dmu_sync-ed blocks. It will propagate upward when 4646219089Spjd * spa_vdev_exit() calls vdev_dtl_reassess(). 4647168404Spjd */ 4648219089Spjd dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4649168404Spjd 4650219089Spjd vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4651219089Spjd dtl_max_txg - TXG_INITIAL); 4652168404Spjd 4653209962Smm if (newvd->vdev_isspare) { 4654168404Spjd spa_spare_activate(newvd); 4655209962Smm spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4656209962Smm } 4657209962Smm 4658185029Spjd oldvdpath = spa_strdup(oldvd->vdev_path); 4659185029Spjd newvdpath = spa_strdup(newvd->vdev_path); 4660185029Spjd newvd_isspare = newvd->vdev_isspare; 4661168404Spjd 4662168404Spjd /* 4663168404Spjd * Mark newvd's DTL dirty in this txg. 4664168404Spjd */ 4665168404Spjd vdev_dirty(tvd, VDD_DTL, newvd, txg); 4666168404Spjd 4667219089Spjd /* 4668219089Spjd * Restart the resilver 4669219089Spjd */ 4670219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4671168404Spjd 4672219089Spjd /* 4673219089Spjd * Commit the config 4674219089Spjd */ 4675219089Spjd (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4676185029Spjd 4677219089Spjd spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4678219089Spjd "%s vdev=%s %s vdev=%s", 4679219089Spjd replacing && newvd_isspare ? "spare in" : 4680219089Spjd replacing ? "replace" : "attach", newvdpath, 4681219089Spjd replacing ? 
"for" : "to", oldvdpath); 4682219089Spjd 4683185029Spjd spa_strfree(oldvdpath); 4684185029Spjd spa_strfree(newvdpath); 4685185029Spjd 4686219089Spjd if (spa->spa_bootfs) 4687219089Spjd spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4688168404Spjd 4689168404Spjd return (0); 4690168404Spjd} 4691168404Spjd 4692168404Spjd/* 4693168404Spjd * Detach a device from a mirror or replacing vdev. 4694168404Spjd * If 'replace_done' is specified, only detach if the parent 4695168404Spjd * is a replacing vdev. 4696168404Spjd */ 4697168404Spjdint 4698209962Smmspa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4699168404Spjd{ 4700168404Spjd uint64_t txg; 4701209962Smm int error; 4702168404Spjd vdev_t *rvd = spa->spa_root_vdev; 4703168404Spjd vdev_t *vd, *pvd, *cvd, *tvd; 4704168404Spjd boolean_t unspare = B_FALSE; 4705168404Spjd uint64_t unspare_guid; 4706219089Spjd char *vdpath; 4707168404Spjd 4708219089Spjd ASSERT(spa_writeable(spa)); 4709219089Spjd 4710168404Spjd txg = spa_vdev_enter(spa); 4711168404Spjd 4712185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4713168404Spjd 4714168404Spjd if (vd == NULL) 4715168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4716168404Spjd 4717168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 4718168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4719168404Spjd 4720168404Spjd pvd = vd->vdev_parent; 4721168404Spjd 4722168404Spjd /* 4723209962Smm * If the parent/child relationship is not as expected, don't do it. 4724209962Smm * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4725209962Smm * vdev that's replacing B with C. The user's intent in replacing 4726209962Smm * is to go from M(A,B) to M(A,C). If the user decides to cancel 4727209962Smm * the replace by detaching C, the expected behavior is to end up 4728209962Smm * M(A,B). But suppose that right after deciding to detach C, 4729209962Smm * the replacement of B completes. 
We would have M(A,C), and then 4730209962Smm * ask to detach C, which would leave us with just A -- not what 4731209962Smm * the user wanted. To prevent this, we make sure that the 4732209962Smm * parent/child relationship hasn't changed -- in this example, 4733209962Smm * that C's parent is still the replacing vdev R. 4734209962Smm */ 4735209962Smm if (pvd->vdev_guid != pguid && pguid != 0) 4736209962Smm return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4737209962Smm 4738209962Smm /* 4739219089Spjd * Only 'replacing' or 'spare' vdevs can be replaced. 4740168404Spjd */ 4741219089Spjd if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4742219089Spjd pvd->vdev_ops != &vdev_spare_ops) 4743219089Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4744168404Spjd 4745168404Spjd ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4746185029Spjd spa_version(spa) >= SPA_VERSION_SPARES); 4747168404Spjd 4748168404Spjd /* 4749168404Spjd * Only mirror, replacing, and spare vdevs support detach. 4750168404Spjd */ 4751168404Spjd if (pvd->vdev_ops != &vdev_replacing_ops && 4752168404Spjd pvd->vdev_ops != &vdev_mirror_ops && 4753168404Spjd pvd->vdev_ops != &vdev_spare_ops) 4754168404Spjd return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4755168404Spjd 4756168404Spjd /* 4757209962Smm * If this device has the only valid copy of some data, 4758209962Smm * we cannot safely detach it. 4759168404Spjd */ 4760209962Smm if (vdev_dtl_required(vd)) 4761168404Spjd return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4762168404Spjd 4763209962Smm ASSERT(pvd->vdev_children >= 2); 4764168404Spjd 4765168404Spjd /* 4766185029Spjd * If we are detaching the second disk from a replacing vdev, then 4767185029Spjd * check to see if we changed the original vdev's path to have "/old" 4768185029Spjd * at the end in spa_vdev_attach(). If so, undo that change now. 
4769168404Spjd */ 4770219089Spjd if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4771219089Spjd vd->vdev_path != NULL) { 4772219089Spjd size_t len = strlen(vd->vdev_path); 4773219089Spjd 4774219089Spjd for (int c = 0; c < pvd->vdev_children; c++) { 4775219089Spjd cvd = pvd->vdev_child[c]; 4776219089Spjd 4777219089Spjd if (cvd == vd || cvd->vdev_path == NULL) 4778219089Spjd continue; 4779219089Spjd 4780219089Spjd if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4781219089Spjd strcmp(cvd->vdev_path + len, "/old") == 0) { 4782219089Spjd spa_strfree(cvd->vdev_path); 4783219089Spjd cvd->vdev_path = spa_strdup(vd->vdev_path); 4784219089Spjd break; 4785219089Spjd } 4786185029Spjd } 4787185029Spjd } 4788168404Spjd 4789168404Spjd /* 4790168404Spjd * If we are detaching the original disk from a spare, then it implies 4791168404Spjd * that the spare should become a real disk, and be removed from the 4792168404Spjd * active spare list for the pool. 4793168404Spjd */ 4794168404Spjd if (pvd->vdev_ops == &vdev_spare_ops && 4795219089Spjd vd->vdev_id == 0 && 4796219089Spjd pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4797168404Spjd unspare = B_TRUE; 4798168404Spjd 4799168404Spjd /* 4800168404Spjd * Erase the disk labels so the disk can be used for other things. 4801168404Spjd * This must be done after all other error cases are handled, 4802168404Spjd * but before we disembowel vd (so we can still do I/O to it). 4803168404Spjd * But if we can't do it, don't treat the error as fatal -- 4804168404Spjd * it may be that the unwritability of the disk is the reason 4805168404Spjd * it's being detached! 4806168404Spjd */ 4807168404Spjd error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4808168404Spjd 4809168404Spjd /* 4810168404Spjd * Remove vd from its parent and compact the parent's children. 
4811168404Spjd */ 4812168404Spjd vdev_remove_child(pvd, vd); 4813168404Spjd vdev_compact_children(pvd); 4814168404Spjd 4815168404Spjd /* 4816168404Spjd * Remember one of the remaining children so we can get tvd below. 4817168404Spjd */ 4818219089Spjd cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4819168404Spjd 4820168404Spjd /* 4821168404Spjd * If we need to remove the remaining child from the list of hot spares, 4822209962Smm * do it now, marking the vdev as no longer a spare in the process. 4823209962Smm * We must do this before vdev_remove_parent(), because that can 4824209962Smm * change the GUID if it creates a new toplevel GUID. For a similar 4825209962Smm * reason, we must remove the spare now, in the same txg as the detach; 4826209962Smm * otherwise someone could attach a new sibling, change the GUID, and 4827209962Smm * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4828168404Spjd */ 4829168404Spjd if (unspare) { 4830168404Spjd ASSERT(cvd->vdev_isspare); 4831168404Spjd spa_spare_remove(cvd); 4832168404Spjd unspare_guid = cvd->vdev_guid; 4833209962Smm (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4834219089Spjd cvd->vdev_unspare = B_TRUE; 4835168404Spjd } 4836168404Spjd 4837168404Spjd /* 4838168404Spjd * If the parent mirror/replacing vdev only has one child, 4839168404Spjd * the parent is no longer needed. Remove it from the tree. 4840168404Spjd */ 4841219089Spjd if (pvd->vdev_children == 1) { 4842219089Spjd if (pvd->vdev_ops == &vdev_spare_ops) 4843219089Spjd cvd->vdev_unspare = B_FALSE; 4844168404Spjd vdev_remove_parent(cvd); 4845219089Spjd cvd->vdev_resilvering = B_FALSE; 4846219089Spjd } 4847168404Spjd 4848219089Spjd 4849168404Spjd /* 4850168404Spjd * We don't set tvd until now because the parent we just removed 4851168404Spjd * may have been the previous top-level vdev. 
4852168404Spjd */ 4853168404Spjd tvd = cvd->vdev_top; 4854168404Spjd ASSERT(tvd->vdev_parent == rvd); 4855168404Spjd 4856168404Spjd /* 4857168404Spjd * Reevaluate the parent vdev state. 4858168404Spjd */ 4859185029Spjd vdev_propagate_state(cvd); 4860168404Spjd 4861168404Spjd /* 4862219089Spjd * If the 'autoexpand' property is set on the pool then automatically 4863219089Spjd * try to expand the size of the pool. For example if the device we 4864219089Spjd * just detached was smaller than the others, it may be possible to 4865219089Spjd * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4866219089Spjd * first so that we can obtain the updated sizes of the leaf vdevs. 4867168404Spjd */ 4868219089Spjd if (spa->spa_autoexpand) { 4869219089Spjd vdev_reopen(tvd); 4870219089Spjd vdev_expand(tvd, txg); 4871219089Spjd } 4872168404Spjd 4873168404Spjd vdev_config_dirty(tvd); 4874168404Spjd 4875168404Spjd /* 4876168404Spjd * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4877168404Spjd * vd->vdev_detached is set and free vd's DTL object in syncing context. 4878168404Spjd * But first make sure we're not on any *other* txg's DTL list, to 4879168404Spjd * prevent vd from being accessed after it's freed. 
4880168404Spjd */ 4881219089Spjd vdpath = spa_strdup(vd->vdev_path); 4882209962Smm for (int t = 0; t < TXG_SIZE; t++) 4883168404Spjd (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4884168404Spjd vd->vdev_detached = B_TRUE; 4885168404Spjd vdev_dirty(tvd, VDD_DTL, vd, txg); 4886168404Spjd 4887185029Spjd spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4888185029Spjd 4889219089Spjd /* hang on to the spa before we release the lock */ 4890219089Spjd spa_open_ref(spa, FTAG); 4891219089Spjd 4892168404Spjd error = spa_vdev_exit(spa, vd, txg, 0); 4893168404Spjd 4894219089Spjd spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4895219089Spjd "vdev=%s", vdpath); 4896219089Spjd spa_strfree(vdpath); 4897219089Spjd 4898168404Spjd /* 4899168404Spjd * If this was the removal of the original device in a hot spare vdev, 4900168404Spjd * then we want to go through and remove the device from the hot spare 4901168404Spjd * list of every other pool. 4902168404Spjd */ 4903168404Spjd if (unspare) { 4904219089Spjd spa_t *altspa = NULL; 4905219089Spjd 4906168404Spjd mutex_enter(&spa_namespace_lock); 4907219089Spjd while ((altspa = spa_next(altspa)) != NULL) { 4908219089Spjd if (altspa->spa_state != POOL_STATE_ACTIVE || 4909219089Spjd altspa == spa) 4910168404Spjd continue; 4911219089Spjd 4912219089Spjd spa_open_ref(altspa, FTAG); 4913185029Spjd mutex_exit(&spa_namespace_lock); 4914219089Spjd (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4915185029Spjd mutex_enter(&spa_namespace_lock); 4916219089Spjd spa_close(altspa, FTAG); 4917168404Spjd } 4918168404Spjd mutex_exit(&spa_namespace_lock); 4919219089Spjd 4920219089Spjd /* search the rest of the vdevs for spares to remove */ 4921219089Spjd spa_vdev_resilver_done(spa); 4922168404Spjd } 4923168404Spjd 4924219089Spjd /* all done with the spa; OK to release */ 4925219089Spjd mutex_enter(&spa_namespace_lock); 4926219089Spjd spa_close(spa, FTAG); 4927219089Spjd mutex_exit(&spa_namespace_lock); 4928219089Spjd 4929168404Spjd 
return (error); 4930168404Spjd} 4931168404Spjd 4932219089Spjd/* 4933219089Spjd * Split a set of devices from their mirrors, and create a new pool from them. 4934219089Spjd */ 4935219089Spjdint 4936219089Spjdspa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4937219089Spjd nvlist_t *props, boolean_t exp) 4938219089Spjd{ 4939219089Spjd int error = 0; 4940219089Spjd uint64_t txg, *glist; 4941219089Spjd spa_t *newspa; 4942219089Spjd uint_t c, children, lastlog; 4943219089Spjd nvlist_t **child, *nvl, *tmp; 4944219089Spjd dmu_tx_t *tx; 4945219089Spjd char *altroot = NULL; 4946219089Spjd vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4947219089Spjd boolean_t activate_slog; 4948219089Spjd 4949219089Spjd ASSERT(spa_writeable(spa)); 4950219089Spjd 4951219089Spjd txg = spa_vdev_enter(spa); 4952219089Spjd 4953219089Spjd /* clear the log and flush everything up to now */ 4954219089Spjd activate_slog = spa_passivate_log(spa); 4955219089Spjd (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4956219089Spjd error = spa_offline_log(spa); 4957219089Spjd txg = spa_vdev_config_enter(spa); 4958219089Spjd 4959219089Spjd if (activate_slog) 4960219089Spjd spa_activate_log(spa); 4961219089Spjd 4962219089Spjd if (error != 0) 4963219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 4964219089Spjd 4965219089Spjd /* check new spa name before going any further */ 4966219089Spjd if (spa_lookup(newname) != NULL) 4967219089Spjd return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4968219089Spjd 4969219089Spjd /* 4970219089Spjd * scan through all the children to ensure they're all mirrors 4971219089Spjd */ 4972219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4973219089Spjd nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4974219089Spjd &children) != 0) 4975219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4976219089Spjd 4977219089Spjd /* first, check to ensure we've got the right child count */ 4978219089Spjd rvd = 
spa->spa_root_vdev; 4979219089Spjd lastlog = 0; 4980219089Spjd for (c = 0; c < rvd->vdev_children; c++) { 4981219089Spjd vdev_t *vd = rvd->vdev_child[c]; 4982219089Spjd 4983219089Spjd /* don't count the holes & logs as children */ 4984219089Spjd if (vd->vdev_islog || vd->vdev_ishole) { 4985219089Spjd if (lastlog == 0) 4986219089Spjd lastlog = c; 4987219089Spjd continue; 4988219089Spjd } 4989219089Spjd 4990219089Spjd lastlog = 0; 4991219089Spjd } 4992219089Spjd if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4993219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4994219089Spjd 4995219089Spjd /* next, ensure no spare or cache devices are part of the split */ 4996219089Spjd if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4997219089Spjd nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4998219089Spjd return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4999219089Spjd 5000219089Spjd vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5001219089Spjd glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5002219089Spjd 5003219089Spjd /* then, loop over each vdev and validate it */ 5004219089Spjd for (c = 0; c < children; c++) { 5005219089Spjd uint64_t is_hole = 0; 5006219089Spjd 5007219089Spjd (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5008219089Spjd &is_hole); 5009219089Spjd 5010219089Spjd if (is_hole != 0) { 5011219089Spjd if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5012219089Spjd spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5013219089Spjd continue; 5014219089Spjd } else { 5015219089Spjd error = EINVAL; 5016219089Spjd break; 5017219089Spjd } 5018219089Spjd } 5019219089Spjd 5020219089Spjd /* which disk is going to be split? 
*/ 5021219089Spjd if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5022219089Spjd &glist[c]) != 0) { 5023219089Spjd error = EINVAL; 5024219089Spjd break; 5025219089Spjd } 5026219089Spjd 5027219089Spjd /* look it up in the spa */ 5028219089Spjd vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5029219089Spjd if (vml[c] == NULL) { 5030219089Spjd error = ENODEV; 5031219089Spjd break; 5032219089Spjd } 5033219089Spjd 5034219089Spjd /* make sure there's nothing stopping the split */ 5035219089Spjd if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5036219089Spjd vml[c]->vdev_islog || 5037219089Spjd vml[c]->vdev_ishole || 5038219089Spjd vml[c]->vdev_isspare || 5039219089Spjd vml[c]->vdev_isl2cache || 5040219089Spjd !vdev_writeable(vml[c]) || 5041219089Spjd vml[c]->vdev_children != 0 || 5042219089Spjd vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5043219089Spjd c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5044219089Spjd error = EINVAL; 5045219089Spjd break; 5046219089Spjd } 5047219089Spjd 5048219089Spjd if (vdev_dtl_required(vml[c])) { 5049219089Spjd error = EBUSY; 5050219089Spjd break; 5051219089Spjd } 5052219089Spjd 5053219089Spjd /* we need certain info from the top level */ 5054219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5055219089Spjd vml[c]->vdev_top->vdev_ms_array) == 0); 5056219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5057219089Spjd vml[c]->vdev_top->vdev_ms_shift) == 0); 5058219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5059219089Spjd vml[c]->vdev_top->vdev_asize) == 0); 5060219089Spjd VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5061219089Spjd vml[c]->vdev_top->vdev_ashift) == 0); 5062219089Spjd } 5063219089Spjd 5064219089Spjd if (error != 0) { 5065219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5066219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5067219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5068219089Spjd } 
5069219089Spjd 5070219089Spjd /* stop writers from using the disks */ 5071219089Spjd for (c = 0; c < children; c++) { 5072219089Spjd if (vml[c] != NULL) 5073219089Spjd vml[c]->vdev_offline = B_TRUE; 5074219089Spjd } 5075219089Spjd vdev_reopen(spa->spa_root_vdev); 5076219089Spjd 5077219089Spjd /* 5078219089Spjd * Temporarily record the splitting vdevs in the spa config. This 5079219089Spjd * will disappear once the config is regenerated. 5080219089Spjd */ 5081219089Spjd VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5082219089Spjd VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5083219089Spjd glist, children) == 0); 5084219089Spjd kmem_free(glist, children * sizeof (uint64_t)); 5085219089Spjd 5086219089Spjd mutex_enter(&spa->spa_props_lock); 5087219089Spjd VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5088219089Spjd nvl) == 0); 5089219089Spjd mutex_exit(&spa->spa_props_lock); 5090219089Spjd spa->spa_config_splitting = nvl; 5091219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5092219089Spjd 5093219089Spjd /* configure and create the new pool */ 5094219089Spjd VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5095219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5096219089Spjd exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5097219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5098219089Spjd spa_version(spa)) == 0); 5099219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5100219089Spjd spa->spa_config_txg) == 0); 5101219089Spjd VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5102219089Spjd spa_generate_guid(NULL)) == 0); 5103219089Spjd (void) nvlist_lookup_string(props, 5104219089Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5105219089Spjd 5106219089Spjd /* add the new pool to the namespace */ 5107219089Spjd newspa = spa_add(newname, config, altroot); 5108219089Spjd newspa->spa_config_txg = spa->spa_config_txg; 5109219089Spjd spa_set_log_state(newspa, SPA_LOG_CLEAR); 5110219089Spjd 5111219089Spjd /* release the spa config lock, retaining the namespace lock */ 5112219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5113219089Spjd 5114219089Spjd if (zio_injection_enabled) 5115219089Spjd zio_handle_panic_injection(spa, FTAG, 1); 5116219089Spjd 5117219089Spjd spa_activate(newspa, spa_mode_global); 5118219089Spjd spa_async_suspend(newspa); 5119219089Spjd 5120219089Spjd#ifndef sun 5121219089Spjd /* mark that we are creating new spa by splitting */ 5122219089Spjd newspa->spa_splitting_newspa = B_TRUE; 5123219089Spjd#endif 5124219089Spjd /* create the new pool from the disks of the original pool */ 5125219089Spjd error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5126219089Spjd#ifndef sun 5127219089Spjd newspa->spa_splitting_newspa = B_FALSE; 5128219089Spjd#endif 5129219089Spjd if (error) 5130219089Spjd goto out; 5131219089Spjd 5132219089Spjd /* if that worked, generate a real config for the new pool */ 5133219089Spjd if (newspa->spa_root_vdev != NULL) { 5134219089Spjd VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5135219089Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 5136219089Spjd VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5137219089Spjd 
ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5138219089Spjd spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5139219089Spjd B_TRUE)); 5140219089Spjd } 5141219089Spjd 5142219089Spjd /* set the props */ 5143219089Spjd if (props != NULL) { 5144219089Spjd spa_configfile_set(newspa, props, B_FALSE); 5145219089Spjd error = spa_prop_set(newspa, props); 5146219089Spjd if (error) 5147219089Spjd goto out; 5148219089Spjd } 5149219089Spjd 5150219089Spjd /* flush everything */ 5151219089Spjd txg = spa_vdev_config_enter(newspa); 5152219089Spjd vdev_config_dirty(newspa->spa_root_vdev); 5153219089Spjd (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5154219089Spjd 5155219089Spjd if (zio_injection_enabled) 5156219089Spjd zio_handle_panic_injection(spa, FTAG, 2); 5157219089Spjd 5158219089Spjd spa_async_resume(newspa); 5159219089Spjd 5160219089Spjd /* finally, update the original pool's config */ 5161219089Spjd txg = spa_vdev_config_enter(spa); 5162219089Spjd tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5163219089Spjd error = dmu_tx_assign(tx, TXG_WAIT); 5164219089Spjd if (error != 0) 5165219089Spjd dmu_tx_abort(tx); 5166219089Spjd for (c = 0; c < children; c++) { 5167219089Spjd if (vml[c] != NULL) { 5168219089Spjd vdev_split(vml[c]); 5169219089Spjd if (error == 0) 5170219089Spjd spa_history_log_internal(LOG_POOL_VDEV_DETACH, 5171219089Spjd spa, tx, "vdev=%s", 5172219089Spjd vml[c]->vdev_path); 5173219089Spjd vdev_free(vml[c]); 5174219089Spjd } 5175219089Spjd } 5176219089Spjd vdev_config_dirty(spa->spa_root_vdev); 5177219089Spjd spa->spa_config_splitting = NULL; 5178219089Spjd nvlist_free(nvl); 5179219089Spjd if (error == 0) 5180219089Spjd dmu_tx_commit(tx); 5181219089Spjd (void) spa_vdev_exit(spa, NULL, txg, 0); 5182219089Spjd 5183219089Spjd if (zio_injection_enabled) 5184219089Spjd zio_handle_panic_injection(spa, FTAG, 3); 5185219089Spjd 5186219089Spjd /* split is complete; log a history record */ 5187219089Spjd 
spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 5188219089Spjd "split new pool %s from pool %s", newname, spa_name(spa)); 5189219089Spjd 5190219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5191219089Spjd 5192219089Spjd /* if we're not going to mount the filesystems in userland, export */ 5193219089Spjd if (exp) 5194219089Spjd error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5195219089Spjd B_FALSE, B_FALSE); 5196219089Spjd 5197219089Spjd return (error); 5198219089Spjd 5199219089Spjdout: 5200219089Spjd spa_unload(newspa); 5201219089Spjd spa_deactivate(newspa); 5202219089Spjd spa_remove(newspa); 5203219089Spjd 5204219089Spjd txg = spa_vdev_config_enter(spa); 5205219089Spjd 5206219089Spjd /* re-online all offlined disks */ 5207219089Spjd for (c = 0; c < children; c++) { 5208219089Spjd if (vml[c] != NULL) 5209219089Spjd vml[c]->vdev_offline = B_FALSE; 5210219089Spjd } 5211219089Spjd vdev_reopen(spa->spa_root_vdev); 5212219089Spjd 5213219089Spjd nvlist_free(spa->spa_config_splitting); 5214219089Spjd spa->spa_config_splitting = NULL; 5215219089Spjd (void) spa_vdev_exit(spa, NULL, txg, error); 5216219089Spjd 5217219089Spjd kmem_free(vml, children * sizeof (vdev_t *)); 5218219089Spjd return (error); 5219219089Spjd} 5220219089Spjd 5221185029Spjdstatic nvlist_t * 5222185029Spjdspa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5223185029Spjd{ 5224185029Spjd for (int i = 0; i < count; i++) { 5225185029Spjd uint64_t guid; 5226185029Spjd 5227185029Spjd VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5228185029Spjd &guid) == 0); 5229185029Spjd 5230185029Spjd if (guid == target_guid) 5231185029Spjd return (nvpp[i]); 5232185029Spjd } 5233185029Spjd 5234185029Spjd return (NULL); 5235185029Spjd} 5236185029Spjd 5237185029Spjdstatic void 5238185029Spjdspa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5239185029Spjd nvlist_t *dev_to_remove) 5240185029Spjd{ 5241185029Spjd nvlist_t **newdev = NULL; 
5242185029Spjd 5243185029Spjd if (count > 1) 5244185029Spjd newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5245185029Spjd 5246185029Spjd for (int i = 0, j = 0; i < count; i++) { 5247185029Spjd if (dev[i] == dev_to_remove) 5248185029Spjd continue; 5249185029Spjd VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5250185029Spjd } 5251185029Spjd 5252185029Spjd VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5253185029Spjd VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5254185029Spjd 5255185029Spjd for (int i = 0; i < count - 1; i++) 5256185029Spjd nvlist_free(newdev[i]); 5257185029Spjd 5258185029Spjd if (count > 1) 5259185029Spjd kmem_free(newdev, (count - 1) * sizeof (void *)); 5260185029Spjd} 5261185029Spjd 5262168404Spjd/* 5263219089Spjd * Evacuate the device. 5264219089Spjd */ 5265219089Spjdstatic int 5266219089Spjdspa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5267219089Spjd{ 5268219089Spjd uint64_t txg; 5269219089Spjd int error = 0; 5270219089Spjd 5271219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5272219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5273219089Spjd ASSERT(vd == vd->vdev_top); 5274219089Spjd 5275219089Spjd /* 5276219089Spjd * Evacuate the device. We don't hold the config lock as writer 5277219089Spjd * since we need to do I/O but we do keep the 5278219089Spjd * spa_namespace_lock held. Once this completes the device 5279219089Spjd * should no longer have any blocks allocated on it. 5280219089Spjd */ 5281219089Spjd if (vd->vdev_islog) { 5282219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5283219089Spjd error = spa_offline_log(spa); 5284219089Spjd } else { 5285219089Spjd error = ENOTSUP; 5286219089Spjd } 5287219089Spjd 5288219089Spjd if (error) 5289219089Spjd return (error); 5290219089Spjd 5291219089Spjd /* 5292219089Spjd * The evacuation succeeded. Remove any remaining MOS metadata 5293219089Spjd * associated with this vdev, and wait for these changes to sync. 
5294219089Spjd */ 5295240415Smm ASSERT0(vd->vdev_stat.vs_alloc); 5296219089Spjd txg = spa_vdev_config_enter(spa); 5297219089Spjd vd->vdev_removing = B_TRUE; 5298219089Spjd vdev_dirty(vd, 0, NULL, txg); 5299219089Spjd vdev_config_dirty(vd); 5300219089Spjd spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5301219089Spjd 5302219089Spjd return (0); 5303219089Spjd} 5304219089Spjd 5305219089Spjd/* 5306219089Spjd * Complete the removal by cleaning up the namespace. 5307219089Spjd */ 5308219089Spjdstatic void 5309219089Spjdspa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5310219089Spjd{ 5311219089Spjd vdev_t *rvd = spa->spa_root_vdev; 5312219089Spjd uint64_t id = vd->vdev_id; 5313219089Spjd boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5314219089Spjd 5315219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5316219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5317219089Spjd ASSERT(vd == vd->vdev_top); 5318219089Spjd 5319219089Spjd /* 5320219089Spjd * Only remove any devices which are empty. 5321219089Spjd */ 5322219089Spjd if (vd->vdev_stat.vs_alloc != 0) 5323219089Spjd return; 5324219089Spjd 5325219089Spjd (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5326219089Spjd 5327219089Spjd if (list_link_active(&vd->vdev_state_dirty_node)) 5328219089Spjd vdev_state_clean(vd); 5329219089Spjd if (list_link_active(&vd->vdev_config_dirty_node)) 5330219089Spjd vdev_config_clean(vd); 5331219089Spjd 5332219089Spjd vdev_free(vd); 5333219089Spjd 5334219089Spjd if (last_vdev) { 5335219089Spjd vdev_compact_children(rvd); 5336219089Spjd } else { 5337219089Spjd vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5338219089Spjd vdev_add_child(rvd, vd); 5339219089Spjd } 5340219089Spjd vdev_config_dirty(rvd); 5341219089Spjd 5342219089Spjd /* 5343219089Spjd * Reassess the health of our root vdev. 
5344219089Spjd */ 5345219089Spjd vdev_reopen(rvd); 5346219089Spjd} 5347219089Spjd 5348219089Spjd/* 5349219089Spjd * Remove a device from the pool - 5350219089Spjd * 5351219089Spjd * Removing a device from the vdev namespace requires several steps 5352219089Spjd * and can take a significant amount of time. As a result we use 5353219089Spjd * the spa_vdev_config_[enter/exit] functions which allow us to 5354219089Spjd * grab and release the spa_config_lock while still holding the namespace 5355219089Spjd * lock. During each step the configuration is synced out. 5356219089Spjd */ 5357219089Spjd 5358219089Spjd/* 5359168404Spjd * Remove a device from the pool. Currently, this supports removing only hot 5360219089Spjd * spares, slogs, and level 2 ARC devices. 5361168404Spjd */ 5362168404Spjdint 5363168404Spjdspa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5364168404Spjd{ 5365168404Spjd vdev_t *vd; 5366219089Spjd metaslab_group_t *mg; 5367185029Spjd nvlist_t **spares, **l2cache, *nv; 5368219089Spjd uint64_t txg = 0; 5369185029Spjd uint_t nspares, nl2cache; 5370185029Spjd int error = 0; 5371209962Smm boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5372168404Spjd 5373219089Spjd ASSERT(spa_writeable(spa)); 5374219089Spjd 5375209962Smm if (!locked) 5376209962Smm txg = spa_vdev_enter(spa); 5377168404Spjd 5378185029Spjd vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5379168404Spjd 5380185029Spjd if (spa->spa_spares.sav_vdevs != NULL && 5381185029Spjd nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5382185029Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5383185029Spjd (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5384185029Spjd /* 5385185029Spjd * Only remove the hot spare if it's not currently in use 5386185029Spjd * in this pool. 
5387185029Spjd */ 5388185029Spjd if (vd == NULL || unspare) { 5389185029Spjd spa_vdev_remove_aux(spa->spa_spares.sav_config, 5390185029Spjd ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5391185029Spjd spa_load_spares(spa); 5392185029Spjd spa->spa_spares.sav_sync = B_TRUE; 5393185029Spjd } else { 5394185029Spjd error = EBUSY; 5395168404Spjd } 5396185029Spjd } else if (spa->spa_l2cache.sav_vdevs != NULL && 5397185029Spjd nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5398185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5399185029Spjd (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5400185029Spjd /* 5401185029Spjd * Cache devices can always be removed. 5402185029Spjd */ 5403185029Spjd spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5404185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5405185029Spjd spa_load_l2cache(spa); 5406185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 5407219089Spjd } else if (vd != NULL && vd->vdev_islog) { 5408219089Spjd ASSERT(!locked); 5409219089Spjd ASSERT(vd == vd->vdev_top); 5410219089Spjd 5411219089Spjd /* 5412219089Spjd * XXX - Once we have bp-rewrite this should 5413219089Spjd * become the common case. 5414219089Spjd */ 5415219089Spjd 5416219089Spjd mg = vd->vdev_mg; 5417219089Spjd 5418219089Spjd /* 5419219089Spjd * Stop allocating from this vdev. 5420219089Spjd */ 5421219089Spjd metaslab_group_passivate(mg); 5422219089Spjd 5423219089Spjd /* 5424219089Spjd * Wait for the youngest allocations and frees to sync, 5425219089Spjd * and then wait for the deferral of those frees to finish. 5426219089Spjd */ 5427219089Spjd spa_vdev_config_exit(spa, NULL, 5428219089Spjd txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5429219089Spjd 5430219089Spjd /* 5431219089Spjd * Attempt to evacuate the vdev. 
5432219089Spjd */ 5433219089Spjd error = spa_vdev_remove_evacuate(spa, vd); 5434219089Spjd 5435219089Spjd txg = spa_vdev_config_enter(spa); 5436219089Spjd 5437219089Spjd /* 5438219089Spjd * If we couldn't evacuate the vdev, unwind. 5439219089Spjd */ 5440219089Spjd if (error) { 5441219089Spjd metaslab_group_activate(mg); 5442219089Spjd return (spa_vdev_exit(spa, NULL, txg, error)); 5443219089Spjd } 5444219089Spjd 5445219089Spjd /* 5446219089Spjd * Clean up the vdev namespace. 5447219089Spjd */ 5448219089Spjd spa_vdev_remove_from_namespace(spa, vd); 5449219089Spjd 5450185029Spjd } else if (vd != NULL) { 5451185029Spjd /* 5452185029Spjd * Normal vdevs cannot be removed (yet). 5453185029Spjd */ 5454185029Spjd error = ENOTSUP; 5455168404Spjd } else { 5456185029Spjd /* 5457185029Spjd * There is no vdev of any kind with the specified guid. 5458185029Spjd */ 5459185029Spjd error = ENOENT; 5460168404Spjd } 5461168404Spjd 5462209962Smm if (!locked) 5463209962Smm return (spa_vdev_exit(spa, NULL, txg, error)); 5464209962Smm 5465209962Smm return (error); 5466168404Spjd} 5467168404Spjd 5468168404Spjd/* 5469185029Spjd * Find any device that's done replacing, or a vdev marked 'unspare' that's 5470185029Spjd * current spared, so we can detach it. 5471168404Spjd */ 5472168404Spjdstatic vdev_t * 5473185029Spjdspa_vdev_resilver_done_hunt(vdev_t *vd) 5474168404Spjd{ 5475168404Spjd vdev_t *newvd, *oldvd; 5476168404Spjd 5477219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5478185029Spjd oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5479168404Spjd if (oldvd != NULL) 5480168404Spjd return (oldvd); 5481168404Spjd } 5482168404Spjd 5483185029Spjd /* 5484219089Spjd * Check for a completed replacement. We always consider the first 5485219089Spjd * vdev in the list to be the oldest vdev, and the last one to be 5486219089Spjd * the newest (see spa_vdev_attach() for how that works). 
In 5487219089Spjd * the case where the newest vdev is faulted, we will not automatically 5488219089Spjd * remove it after a resilver completes. This is OK as it will require 5489219089Spjd * user intervention to determine which disk the admin wishes to keep. 5490185029Spjd */ 5491219089Spjd if (vd->vdev_ops == &vdev_replacing_ops) { 5492219089Spjd ASSERT(vd->vdev_children > 1); 5493219089Spjd 5494219089Spjd newvd = vd->vdev_child[vd->vdev_children - 1]; 5495168404Spjd oldvd = vd->vdev_child[0]; 5496168404Spjd 5497209962Smm if (vdev_dtl_empty(newvd, DTL_MISSING) && 5498219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5499209962Smm !vdev_dtl_required(oldvd)) 5500168404Spjd return (oldvd); 5501168404Spjd } 5502168404Spjd 5503185029Spjd /* 5504185029Spjd * Check for a completed resilver with the 'unspare' flag set. 5505185029Spjd */ 5506219089Spjd if (vd->vdev_ops == &vdev_spare_ops) { 5507219089Spjd vdev_t *first = vd->vdev_child[0]; 5508219089Spjd vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5509185029Spjd 5510219089Spjd if (last->vdev_unspare) { 5511219089Spjd oldvd = first; 5512219089Spjd newvd = last; 5513219089Spjd } else if (first->vdev_unspare) { 5514219089Spjd oldvd = last; 5515219089Spjd newvd = first; 5516219089Spjd } else { 5517219089Spjd oldvd = NULL; 5518219089Spjd } 5519219089Spjd 5520219089Spjd if (oldvd != NULL && 5521209962Smm vdev_dtl_empty(newvd, DTL_MISSING) && 5522219089Spjd vdev_dtl_empty(newvd, DTL_OUTAGE) && 5523219089Spjd !vdev_dtl_required(oldvd)) 5524185029Spjd return (oldvd); 5525219089Spjd 5526219089Spjd /* 5527219089Spjd * If there are more than two spares attached to a disk, 5528219089Spjd * and those spares are not required, then we want to 5529219089Spjd * attempt to free them up now so that they can be used 5530219089Spjd * by other pools. Once we're back down to a single 5531219089Spjd * disk+spare, we stop removing them. 
5532219089Spjd */ 5533219089Spjd if (vd->vdev_children > 2) { 5534219089Spjd newvd = vd->vdev_child[1]; 5535219089Spjd 5536219089Spjd if (newvd->vdev_isspare && last->vdev_isspare && 5537219089Spjd vdev_dtl_empty(last, DTL_MISSING) && 5538219089Spjd vdev_dtl_empty(last, DTL_OUTAGE) && 5539219089Spjd !vdev_dtl_required(newvd)) 5540219089Spjd return (newvd); 5541185029Spjd } 5542185029Spjd } 5543185029Spjd 5544168404Spjd return (NULL); 5545168404Spjd} 5546168404Spjd 5547168404Spjdstatic void 5548185029Spjdspa_vdev_resilver_done(spa_t *spa) 5549168404Spjd{ 5550209962Smm vdev_t *vd, *pvd, *ppvd; 5551209962Smm uint64_t guid, sguid, pguid, ppguid; 5552168404Spjd 5553209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5554168404Spjd 5555185029Spjd while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5556209962Smm pvd = vd->vdev_parent; 5557209962Smm ppvd = pvd->vdev_parent; 5558168404Spjd guid = vd->vdev_guid; 5559209962Smm pguid = pvd->vdev_guid; 5560209962Smm ppguid = ppvd->vdev_guid; 5561209962Smm sguid = 0; 5562168404Spjd /* 5563168404Spjd * If we have just finished replacing a hot spared device, then 5564168404Spjd * we need to detach the parent's first child (the original hot 5565168404Spjd * spare) as well. 
5566168404Spjd */ 5567219089Spjd if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5568219089Spjd ppvd->vdev_children == 2) { 5569168404Spjd ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5570209962Smm sguid = ppvd->vdev_child[1]->vdev_guid; 5571168404Spjd } 5572209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5573209962Smm if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5574168404Spjd return; 5575209962Smm if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5576168404Spjd return; 5577209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5578168404Spjd } 5579168404Spjd 5580209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 5581168404Spjd} 5582168404Spjd 5583168404Spjd/* 5584219089Spjd * Update the stored path or FRU for this vdev. 5585168404Spjd */ 5586168404Spjdint 5587209962Smmspa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5588209962Smm boolean_t ispath) 5589168404Spjd{ 5590185029Spjd vdev_t *vd; 5591219089Spjd boolean_t sync = B_FALSE; 5592168404Spjd 5593219089Spjd ASSERT(spa_writeable(spa)); 5594168404Spjd 5595219089Spjd spa_vdev_state_enter(spa, SCL_ALL); 5596219089Spjd 5597209962Smm if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5598219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5599168404Spjd 5600168404Spjd if (!vd->vdev_ops->vdev_op_leaf) 5601219089Spjd return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5602168404Spjd 5603209962Smm if (ispath) { 5604219089Spjd if (strcmp(value, vd->vdev_path) != 0) { 5605219089Spjd spa_strfree(vd->vdev_path); 5606219089Spjd vd->vdev_path = spa_strdup(value); 5607219089Spjd sync = B_TRUE; 5608219089Spjd } 5609209962Smm } else { 5610219089Spjd if (vd->vdev_fru == NULL) { 5611219089Spjd vd->vdev_fru = spa_strdup(value); 5612219089Spjd sync = B_TRUE; 5613219089Spjd } else if (strcmp(value, vd->vdev_fru) != 0) { 5614209962Smm spa_strfree(vd->vdev_fru); 5615219089Spjd vd->vdev_fru = spa_strdup(value); 5616219089Spjd sync = B_TRUE; 5617219089Spjd } 5618209962Smm } 
5619168404Spjd 5620219089Spjd return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5621168404Spjd} 5622168404Spjd 5623209962Smmint 5624209962Smmspa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5625209962Smm{ 5626209962Smm return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5627209962Smm} 5628209962Smm 5629209962Smmint 5630209962Smmspa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5631209962Smm{ 5632209962Smm return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5633209962Smm} 5634209962Smm 5635168404Spjd/* 5636168404Spjd * ========================================================================== 5637219089Spjd * SPA Scanning 5638168404Spjd * ========================================================================== 5639168404Spjd */ 5640168404Spjd 5641168404Spjdint 5642219089Spjdspa_scan_stop(spa_t *spa) 5643168404Spjd{ 5644185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5645219089Spjd if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5646219089Spjd return (EBUSY); 5647219089Spjd return (dsl_scan_cancel(spa->spa_dsl_pool)); 5648219089Spjd} 5649168404Spjd 5650219089Spjdint 5651219089Spjdspa_scan(spa_t *spa, pool_scan_func_t func) 5652219089Spjd{ 5653219089Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5654219089Spjd 5655219089Spjd if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5656168404Spjd return (ENOTSUP); 5657168404Spjd 5658168404Spjd /* 5659185029Spjd * If a resilver was requested, but there is no DTL on a 5660185029Spjd * writeable leaf device, we have nothing to do. 
5661168404Spjd */ 5662219089Spjd if (func == POOL_SCAN_RESILVER && 5663185029Spjd !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5664185029Spjd spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5665168404Spjd return (0); 5666168404Spjd } 5667168404Spjd 5668219089Spjd return (dsl_scan(spa->spa_dsl_pool, func)); 5669168404Spjd} 5670168404Spjd 5671168404Spjd/* 5672168404Spjd * ========================================================================== 5673168404Spjd * SPA async task processing 5674168404Spjd * ========================================================================== 5675168404Spjd */ 5676168404Spjd 5677168404Spjdstatic void 5678185029Spjdspa_async_remove(spa_t *spa, vdev_t *vd) 5679168404Spjd{ 5680185029Spjd if (vd->vdev_remove_wanted) { 5681219089Spjd vd->vdev_remove_wanted = B_FALSE; 5682219089Spjd vd->vdev_delayed_close = B_FALSE; 5683185029Spjd vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5684209962Smm 5685209962Smm /* 5686209962Smm * We want to clear the stats, but we don't want to do a full 5687209962Smm * vdev_clear() as that will cause us to throw away 5688209962Smm * degraded/faulted state as well as attempt to reopen the 5689209962Smm * device, all of which is a waste. 
5690209962Smm */ 5691209962Smm vd->vdev_stat.vs_read_errors = 0; 5692209962Smm vd->vdev_stat.vs_write_errors = 0; 5693209962Smm vd->vdev_stat.vs_checksum_errors = 0; 5694209962Smm 5695185029Spjd vdev_state_dirty(vd->vdev_top); 5696185029Spjd } 5697168404Spjd 5698185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5699185029Spjd spa_async_remove(spa, vd->vdev_child[c]); 5700185029Spjd} 5701168404Spjd 5702185029Spjdstatic void 5703185029Spjdspa_async_probe(spa_t *spa, vdev_t *vd) 5704185029Spjd{ 5705185029Spjd if (vd->vdev_probe_wanted) { 5706219089Spjd vd->vdev_probe_wanted = B_FALSE; 5707185029Spjd vdev_reopen(vd); /* vdev_open() does the actual probe */ 5708168404Spjd } 5709168404Spjd 5710185029Spjd for (int c = 0; c < vd->vdev_children; c++) 5711185029Spjd spa_async_probe(spa, vd->vdev_child[c]); 5712168404Spjd} 5713168404Spjd 5714168404Spjdstatic void 5715219089Spjdspa_async_autoexpand(spa_t *spa, vdev_t *vd) 5716219089Spjd{ 5717219089Spjd sysevent_id_t eid; 5718219089Spjd nvlist_t *attr; 5719219089Spjd char *physpath; 5720219089Spjd 5721219089Spjd if (!spa->spa_autoexpand) 5722219089Spjd return; 5723219089Spjd 5724219089Spjd for (int c = 0; c < vd->vdev_children; c++) { 5725219089Spjd vdev_t *cvd = vd->vdev_child[c]; 5726219089Spjd spa_async_autoexpand(spa, cvd); 5727219089Spjd } 5728219089Spjd 5729219089Spjd if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5730219089Spjd return; 5731219089Spjd 5732219089Spjd physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5733219089Spjd (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5734219089Spjd 5735219089Spjd VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5736219089Spjd VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5737219089Spjd 5738219089Spjd (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5739219089Spjd ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5740219089Spjd 5741219089Spjd nvlist_free(attr); 5742219089Spjd kmem_free(physpath, 
MAXPATHLEN); 5743219089Spjd} 5744219089Spjd 5745219089Spjdstatic void 5746168404Spjdspa_async_thread(void *arg) 5747168404Spjd{ 5748168404Spjd spa_t *spa = arg; 5749168404Spjd int tasks; 5750168404Spjd 5751168404Spjd ASSERT(spa->spa_sync_on); 5752168404Spjd 5753168404Spjd mutex_enter(&spa->spa_async_lock); 5754168404Spjd tasks = spa->spa_async_tasks; 5755168404Spjd spa->spa_async_tasks = 0; 5756168404Spjd mutex_exit(&spa->spa_async_lock); 5757168404Spjd 5758168404Spjd /* 5759168404Spjd * See if the config needs to be updated. 5760168404Spjd */ 5761168404Spjd if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5762219089Spjd uint64_t old_space, new_space; 5763219089Spjd 5764168404Spjd mutex_enter(&spa_namespace_lock); 5765219089Spjd old_space = metaslab_class_get_space(spa_normal_class(spa)); 5766168404Spjd spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5767219089Spjd new_space = metaslab_class_get_space(spa_normal_class(spa)); 5768168404Spjd mutex_exit(&spa_namespace_lock); 5769219089Spjd 5770219089Spjd /* 5771219089Spjd * If the pool grew as a result of the config update, 5772219089Spjd * then log an internal history event. 5773219089Spjd */ 5774219089Spjd if (new_space != old_space) { 5775219089Spjd spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5776219089Spjd spa, NULL, 5777219089Spjd "pool '%s' size: %llu(+%llu)", 5778219089Spjd spa_name(spa), new_space, new_space - old_space); 5779219089Spjd } 5780168404Spjd } 5781168404Spjd 5782168404Spjd /* 5783185029Spjd * See if any devices need to be marked REMOVED. 
5784168404Spjd */ 5785185029Spjd if (tasks & SPA_ASYNC_REMOVE) { 5786219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5787185029Spjd spa_async_remove(spa, spa->spa_root_vdev); 5788185029Spjd for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5789185029Spjd spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5790185029Spjd for (int i = 0; i < spa->spa_spares.sav_count; i++) 5791185029Spjd spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5792185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5793185029Spjd } 5794168404Spjd 5795219089Spjd if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5796219089Spjd spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5797219089Spjd spa_async_autoexpand(spa, spa->spa_root_vdev); 5798219089Spjd spa_config_exit(spa, SCL_CONFIG, FTAG); 5799219089Spjd } 5800219089Spjd 5801168404Spjd /* 5802185029Spjd * See if any devices need to be probed. 5803168404Spjd */ 5804185029Spjd if (tasks & SPA_ASYNC_PROBE) { 5805219089Spjd spa_vdev_state_enter(spa, SCL_NONE); 5806185029Spjd spa_async_probe(spa, spa->spa_root_vdev); 5807185029Spjd (void) spa_vdev_state_exit(spa, NULL, 0); 5808185029Spjd } 5809168404Spjd 5810168404Spjd /* 5811185029Spjd * If any devices are done replacing, detach them. 5812168404Spjd */ 5813185029Spjd if (tasks & SPA_ASYNC_RESILVER_DONE) 5814185029Spjd spa_vdev_resilver_done(spa); 5815168404Spjd 5816168404Spjd /* 5817168404Spjd * Kick off a resilver. 5818168404Spjd */ 5819168404Spjd if (tasks & SPA_ASYNC_RESILVER) 5820219089Spjd dsl_resilver_restart(spa->spa_dsl_pool, 0); 5821168404Spjd 5822168404Spjd /* 5823168404Spjd * Let the world know that we're done. 
5824168404Spjd */ 5825168404Spjd mutex_enter(&spa->spa_async_lock); 5826168404Spjd spa->spa_async_thread = NULL; 5827168404Spjd cv_broadcast(&spa->spa_async_cv); 5828168404Spjd mutex_exit(&spa->spa_async_lock); 5829168404Spjd thread_exit(); 5830168404Spjd} 5831168404Spjd 5832168404Spjdvoid 5833168404Spjdspa_async_suspend(spa_t *spa) 5834168404Spjd{ 5835168404Spjd mutex_enter(&spa->spa_async_lock); 5836168404Spjd spa->spa_async_suspended++; 5837168404Spjd while (spa->spa_async_thread != NULL) 5838168404Spjd cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5839168404Spjd mutex_exit(&spa->spa_async_lock); 5840168404Spjd} 5841168404Spjd 5842168404Spjdvoid 5843168404Spjdspa_async_resume(spa_t *spa) 5844168404Spjd{ 5845168404Spjd mutex_enter(&spa->spa_async_lock); 5846168404Spjd ASSERT(spa->spa_async_suspended != 0); 5847168404Spjd spa->spa_async_suspended--; 5848168404Spjd mutex_exit(&spa->spa_async_lock); 5849168404Spjd} 5850168404Spjd 5851168404Spjdstatic void 5852168404Spjdspa_async_dispatch(spa_t *spa) 5853168404Spjd{ 5854168404Spjd mutex_enter(&spa->spa_async_lock); 5855168404Spjd if (spa->spa_async_tasks && !spa->spa_async_suspended && 5856168404Spjd spa->spa_async_thread == NULL && 5857168404Spjd rootdir != NULL && !vn_is_readonly(rootdir)) 5858168404Spjd spa->spa_async_thread = thread_create(NULL, 0, 5859168404Spjd spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5860168404Spjd mutex_exit(&spa->spa_async_lock); 5861168404Spjd} 5862168404Spjd 5863168404Spjdvoid 5864168404Spjdspa_async_request(spa_t *spa, int task) 5865168404Spjd{ 5866219089Spjd zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5867168404Spjd mutex_enter(&spa->spa_async_lock); 5868168404Spjd spa->spa_async_tasks |= task; 5869168404Spjd mutex_exit(&spa->spa_async_lock); 5870168404Spjd} 5871168404Spjd 5872168404Spjd/* 5873168404Spjd * ========================================================================== 5874168404Spjd * SPA syncing routines 5875168404Spjd * 
========================================================================== 5876168404Spjd */ 5877168404Spjd 5878219089Spjdstatic int 5879219089Spjdbpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5880168404Spjd{ 5881219089Spjd bpobj_t *bpo = arg; 5882219089Spjd bpobj_enqueue(bpo, bp, tx); 5883219089Spjd return (0); 5884219089Spjd} 5885168404Spjd 5886219089Spjdstatic int 5887219089Spjdspa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5888219089Spjd{ 5889219089Spjd zio_t *zio = arg; 5890168404Spjd 5891219089Spjd zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5892240868Spjd BP_GET_PSIZE(bp), zio->io_flags)); 5893219089Spjd return (0); 5894168404Spjd} 5895168404Spjd 5896168404Spjdstatic void 5897168404Spjdspa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5898168404Spjd{ 5899168404Spjd char *packed = NULL; 5900185029Spjd size_t bufsize; 5901168404Spjd size_t nvsize = 0; 5902168404Spjd dmu_buf_t *db; 5903168404Spjd 5904168404Spjd VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5905168404Spjd 5906185029Spjd /* 5907185029Spjd * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5908185029Spjd * information. This avoids the dbuf_will_dirty() path and 5909185029Spjd * saves us a pre-read to get data we don't actually care about. 
5910185029Spjd */ 5911236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5912185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 5913168404Spjd 5914168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5915168404Spjd KM_SLEEP) == 0); 5916185029Spjd bzero(packed + nvsize, bufsize - nvsize); 5917168404Spjd 5918185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5919168404Spjd 5920185029Spjd kmem_free(packed, bufsize); 5921168404Spjd 5922168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5923168404Spjd dmu_buf_will_dirty(db, tx); 5924168404Spjd *(uint64_t *)db->db_data = nvsize; 5925168404Spjd dmu_buf_rele(db, FTAG); 5926168404Spjd} 5927168404Spjd 5928168404Spjdstatic void 5929185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5930185029Spjd const char *config, const char *entry) 5931168404Spjd{ 5932168404Spjd nvlist_t *nvroot; 5933185029Spjd nvlist_t **list; 5934168404Spjd int i; 5935168404Spjd 5936185029Spjd if (!sav->sav_sync) 5937168404Spjd return; 5938168404Spjd 5939168404Spjd /* 5940185029Spjd * Update the MOS nvlist describing the list of available devices. 5941185029Spjd * spa_validate_aux() will have already made sure this nvlist is 5942185029Spjd * valid and the vdevs are labeled appropriately. 
5943168404Spjd */ 5944185029Spjd if (sav->sav_object == 0) { 5945185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5946185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5947185029Spjd sizeof (uint64_t), tx); 5948168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 5949185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5950185029Spjd &sav->sav_object, tx) == 0); 5951168404Spjd } 5952168404Spjd 5953168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5954185029Spjd if (sav->sav_count == 0) { 5955185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5956168404Spjd } else { 5957185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5958185029Spjd for (i = 0; i < sav->sav_count; i++) 5959185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5960219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 5961185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5962185029Spjd sav->sav_count) == 0); 5963185029Spjd for (i = 0; i < sav->sav_count; i++) 5964185029Spjd nvlist_free(list[i]); 5965185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 5966168404Spjd } 5967168404Spjd 5968185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5969168404Spjd nvlist_free(nvroot); 5970168404Spjd 5971185029Spjd sav->sav_sync = B_FALSE; 5972168404Spjd} 5973168404Spjd 5974168404Spjdstatic void 5975168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5976168404Spjd{ 5977168404Spjd nvlist_t *config; 5978168404Spjd 5979185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 5980168404Spjd return; 5981168404Spjd 5982185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5983168404Spjd 5984185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 5985185029Spjd dmu_tx_get_txg(tx), B_FALSE); 5986185029Spjd 5987243505Smm /* 5988243505Smm * If we're upgrading the spa version then make sure that 5989243505Smm * the config object gets updated with 
the correct version. 5990243505Smm */ 5991243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 5992243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5993243505Smm spa->spa_uberblock.ub_version); 5994243505Smm 5995185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 5996185029Spjd 5997168404Spjd if (spa->spa_config_syncing) 5998168404Spjd nvlist_free(spa->spa_config_syncing); 5999168404Spjd spa->spa_config_syncing = config; 6000168404Spjd 6001168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6002168404Spjd} 6003168404Spjd 6004236884Smmstatic void 6005236884Smmspa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) 6006236884Smm{ 6007236884Smm spa_t *spa = arg1; 6008236884Smm uint64_t version = *(uint64_t *)arg2; 6009236884Smm 6010236884Smm /* 6011236884Smm * Setting the version is special cased when first creating the pool. 6012236884Smm */ 6013236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6014236884Smm 6015236884Smm ASSERT(version <= SPA_VERSION); 6016236884Smm ASSERT(version >= spa_version(spa)); 6017236884Smm 6018236884Smm spa->spa_uberblock.ub_version = version; 6019236884Smm vdev_config_dirty(spa->spa_root_vdev); 6020236884Smm} 6021236884Smm 6022185029Spjd/* 6023185029Spjd * Set zpool properties. 
6024185029Spjd */ 6025168404Spjdstatic void 6026219089Spjdspa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 6027168404Spjd{ 6028168404Spjd spa_t *spa = arg1; 6029185029Spjd objset_t *mos = spa->spa_meta_objset; 6030168404Spjd nvlist_t *nvp = arg2; 6031236884Smm nvpair_t *elem = NULL; 6032168404Spjd 6033168404Spjd mutex_enter(&spa->spa_props_lock); 6034168404Spjd 6035185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6036236884Smm uint64_t intval; 6037236884Smm char *strval, *fname; 6038236884Smm zpool_prop_t prop; 6039236884Smm const char *propname; 6040236884Smm zprop_type_t proptype; 6041236884Smm zfeature_info_t *feature; 6042236884Smm 6043185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6044236884Smm case ZPROP_INVAL: 6045236884Smm /* 6046236884Smm * We checked this earlier in spa_prop_validate(). 6047236884Smm */ 6048236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6049236884Smm 6050236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6051236884Smm VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); 6052236884Smm 6053236884Smm spa_feature_enable(spa, feature, tx); 6054236884Smm break; 6055236884Smm 6056185029Spjd case ZPOOL_PROP_VERSION: 6057236884Smm VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6058185029Spjd /* 6059236884Smm * The version is synced seperatly before other 6060236884Smm * properties and should be correct by now. 6061185029Spjd */ 6062236884Smm ASSERT3U(spa_version(spa), >=, intval); 6063185029Spjd break; 6064168404Spjd 6065185029Spjd case ZPOOL_PROP_ALTROOT: 6066185029Spjd /* 6067185029Spjd * 'altroot' is a non-persistent property. It should 6068185029Spjd * have been set temporarily at creation or import time. 
6069185029Spjd */ 6070185029Spjd ASSERT(spa->spa_root != NULL); 6071185029Spjd break; 6072168404Spjd 6073219089Spjd case ZPOOL_PROP_READONLY: 6074185029Spjd case ZPOOL_PROP_CACHEFILE: 6075185029Spjd /* 6076219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 6077219089Spjd * properties. 6078185029Spjd */ 6079168404Spjd break; 6080228103Smm case ZPOOL_PROP_COMMENT: 6081228103Smm VERIFY(nvpair_value_string(elem, &strval) == 0); 6082228103Smm if (spa->spa_comment != NULL) 6083228103Smm spa_strfree(spa->spa_comment); 6084228103Smm spa->spa_comment = spa_strdup(strval); 6085228103Smm /* 6086228103Smm * We need to dirty the configuration on all the vdevs 6087228103Smm * so that their labels get updated. It's unnecessary 6088228103Smm * to do this for pool creation since the vdev's 6089228103Smm * configuratoin has already been dirtied. 6090228103Smm */ 6091228103Smm if (tx->tx_txg != TXG_INITIAL) 6092228103Smm vdev_config_dirty(spa->spa_root_vdev); 6093228103Smm break; 6094185029Spjd default: 6095185029Spjd /* 6096185029Spjd * Set pool property values in the poolprops mos object. 
6097185029Spjd */ 6098185029Spjd if (spa->spa_pool_props_object == 0) { 6099236884Smm spa->spa_pool_props_object = 6100236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6101185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6102236884Smm tx); 6103185029Spjd } 6104185029Spjd 6105185029Spjd /* normalize the property name */ 6106185029Spjd propname = zpool_prop_to_name(prop); 6107185029Spjd proptype = zpool_prop_get_type(prop); 6108185029Spjd 6109185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6110185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6111185029Spjd VERIFY(nvpair_value_string(elem, &strval) == 0); 6112185029Spjd VERIFY(zap_update(mos, 6113185029Spjd spa->spa_pool_props_object, propname, 6114185029Spjd 1, strlen(strval) + 1, strval, tx) == 0); 6115185029Spjd 6116185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6117185029Spjd VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6118185029Spjd 6119185029Spjd if (proptype == PROP_TYPE_INDEX) { 6120185029Spjd const char *unused; 6121185029Spjd VERIFY(zpool_prop_index_to_string( 6122185029Spjd prop, intval, &unused) == 0); 6123185029Spjd } 6124185029Spjd VERIFY(zap_update(mos, 6125185029Spjd spa->spa_pool_props_object, propname, 6126185029Spjd 8, 1, &intval, tx) == 0); 6127185029Spjd } else { 6128185029Spjd ASSERT(0); /* not allowed */ 6129185029Spjd } 6130185029Spjd 6131185029Spjd switch (prop) { 6132185029Spjd case ZPOOL_PROP_DELEGATION: 6133185029Spjd spa->spa_delegation = intval; 6134185029Spjd break; 6135185029Spjd case ZPOOL_PROP_BOOTFS: 6136185029Spjd spa->spa_bootfs = intval; 6137185029Spjd break; 6138185029Spjd case ZPOOL_PROP_FAILUREMODE: 6139185029Spjd spa->spa_failmode = intval; 6140185029Spjd break; 6141219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6142219089Spjd spa->spa_autoexpand = intval; 6143219089Spjd if (tx->tx_txg != TXG_INITIAL) 6144219089Spjd spa_async_request(spa, 6145219089Spjd SPA_ASYNC_AUTOEXPAND); 6146219089Spjd break; 6147219089Spjd case ZPOOL_PROP_DEDUPDITTO: 
6148219089Spjd spa->spa_dedup_ditto = intval; 6149219089Spjd break; 6150185029Spjd default: 6151185029Spjd break; 6152185029Spjd } 6153168404Spjd } 6154185029Spjd 6155185029Spjd /* log internal history if this is not a zpool create */ 6156185029Spjd if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 6157185029Spjd tx->tx_txg != TXG_INITIAL) { 6158219089Spjd spa_history_log_internal(LOG_POOL_PROPSET, 6159219089Spjd spa, tx, "%s %lld %s", 6160185029Spjd nvpair_name(elem), intval, spa_name(spa)); 6161185029Spjd } 6162168404Spjd } 6163185029Spjd 6164185029Spjd mutex_exit(&spa->spa_props_lock); 6165168404Spjd} 6166168404Spjd 6167168404Spjd/* 6168219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 6169219089Spjd * reflect the new version this txg, so there must be no changes this 6170219089Spjd * txg to anything that the upgrade code depends on after it executes. 6171219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6172219089Spjd * tasks. 
6173219089Spjd */ 6174219089Spjdstatic void 6175219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6176219089Spjd{ 6177219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6178219089Spjd 6179219089Spjd ASSERT(spa->spa_sync_pass == 1); 6180219089Spjd 6181219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6182219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6183219089Spjd dsl_pool_create_origin(dp, tx); 6184219089Spjd 6185219089Spjd /* Keeping the origin open increases spa_minref */ 6186219089Spjd spa->spa_minref += 3; 6187219089Spjd } 6188219089Spjd 6189219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6190219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6191219089Spjd dsl_pool_upgrade_clones(dp, tx); 6192219089Spjd } 6193219089Spjd 6194219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6195219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6196219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6197219089Spjd 6198219089Spjd /* Keeping the freedir open increases spa_minref */ 6199219089Spjd spa->spa_minref += 3; 6200219089Spjd } 6201236884Smm 6202236884Smm if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6203236884Smm spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6204236884Smm spa_feature_create_zap_objects(spa, tx); 6205236884Smm } 6206219089Spjd} 6207219089Spjd 6208219089Spjd/* 6209168404Spjd * Sync the specified transaction group. New blocks may be dirtied as 6210168404Spjd * part of the process, so we iterate until it converges. 
 *
 * spa is the pool to sync and txg the transaction group being written
 * out.  The pool must be writeable (this is VERIFY'd).  SCL_CONFIG is
 * held as reader for almost the whole function; once the txg has been
 * committed, any requested async tasks are dispatched.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	/* Per-txg list of blocks freed in this txg. */
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		/*
		 * Go back to reader.  The locks are dropped in between,
		 * which is why the outer loop re-checks the list.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		/* Stop at the first child with a non-minimum deflate ratio. */
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		/* The loop ran to completion: every child qualified. */
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (eg, spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/*
		 * In passes before zfs_sync_pass_deferred_free the blocks
		 * on this txg's free list are freed right away; from that
		 * pass on they are queued on the deferred bpobj instead.
		 */
		if (pass < zfs_sync_pass_deferred_free) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		/* Sync every vdev queued for this txg. */
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

		/* One-time upgrade work happens in the first pass only. */
		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			/*
			 * Starting at a random child, collect up to
			 * SPA_DVAS_PER_BP top-level vdevs, skipping log
			 * devices and those with no vdev_ms_array.
			 */
			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			/*
			 * On failure, retry once with the last argument
			 * set to B_TRUE.
			 */
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			/* Same retry scheme, but across all children. */
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/*
		 * Both attempts failed: suspend the pool, then try the
		 * label write again after zio_resume_wait() returns.
		 */
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * From here on spa_ubsync matches spa_uberblock, so the
	 * "ubsync version < uberblock version" upgrade checks no longer
	 * fire for this upgrade.
	 */
	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.
We don't want to hold the namespace lock across these 6438168404Spjd * operations, so we take a reference on the spa_t and drop the lock during the 6439168404Spjd * sync. 6440168404Spjd */ 6441168404Spjdvoid 6442168404Spjdspa_sync_allpools(void) 6443168404Spjd{ 6444168404Spjd spa_t *spa = NULL; 6445168404Spjd mutex_enter(&spa_namespace_lock); 6446168404Spjd while ((spa = spa_next(spa)) != NULL) { 6447219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6448219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6449168404Spjd continue; 6450168404Spjd spa_open_ref(spa, FTAG); 6451168404Spjd mutex_exit(&spa_namespace_lock); 6452168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6453168404Spjd mutex_enter(&spa_namespace_lock); 6454168404Spjd spa_close(spa, FTAG); 6455168404Spjd } 6456168404Spjd mutex_exit(&spa_namespace_lock); 6457168404Spjd} 6458168404Spjd 6459168404Spjd/* 6460168404Spjd * ========================================================================== 6461168404Spjd * Miscellaneous routines 6462168404Spjd * ========================================================================== 6463168404Spjd */ 6464168404Spjd 6465168404Spjd/* 6466168404Spjd * Remove all pools in the system. 6467168404Spjd */ 6468168404Spjdvoid 6469168404Spjdspa_evict_all(void) 6470168404Spjd{ 6471168404Spjd spa_t *spa; 6472168404Spjd 6473168404Spjd /* 6474168404Spjd * Remove all cached state. All pools should be closed now, 6475168404Spjd * so every spa in the AVL tree should be unreferenced. 6476168404Spjd */ 6477168404Spjd mutex_enter(&spa_namespace_lock); 6478168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6479168404Spjd /* 6480168404Spjd * Stop async tasks. The async thread may need to detach 6481168404Spjd * a device that's been replaced, which requires grabbing 6482168404Spjd * spa_namespace_lock, so we must drop it here. 
6483168404Spjd */ 6484168404Spjd spa_open_ref(spa, FTAG); 6485168404Spjd mutex_exit(&spa_namespace_lock); 6486168404Spjd spa_async_suspend(spa); 6487168404Spjd mutex_enter(&spa_namespace_lock); 6488168404Spjd spa_close(spa, FTAG); 6489168404Spjd 6490168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6491168404Spjd spa_unload(spa); 6492168404Spjd spa_deactivate(spa); 6493168404Spjd } 6494168404Spjd spa_remove(spa); 6495168404Spjd } 6496168404Spjd mutex_exit(&spa_namespace_lock); 6497168404Spjd} 6498168404Spjd 6499168404Spjdvdev_t * 6500209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6501168404Spjd{ 6502185029Spjd vdev_t *vd; 6503185029Spjd int i; 6504185029Spjd 6505185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6506185029Spjd return (vd); 6507185029Spjd 6508209962Smm if (aux) { 6509185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6510185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6511185029Spjd if (vd->vdev_guid == guid) 6512185029Spjd return (vd); 6513185029Spjd } 6514209962Smm 6515209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6516209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6517209962Smm if (vd->vdev_guid == guid) 6518209962Smm return (vd); 6519209962Smm } 6520185029Spjd } 6521185029Spjd 6522185029Spjd return (NULL); 6523168404Spjd} 6524168404Spjd 6525168404Spjdvoid 6526185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6527168404Spjd{ 6528219089Spjd ASSERT(spa_writeable(spa)); 6529219089Spjd 6530185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6531168404Spjd 6532168404Spjd /* 6533168404Spjd * This should only be called for a non-faulted pool, and since a 6534168404Spjd * future version would result in an unopenable pool, this shouldn't be 6535168404Spjd * possible. 
6536168404Spjd */ 6537185029Spjd ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 6538185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6539168404Spjd 6540185029Spjd spa->spa_uberblock.ub_version = version; 6541168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6542168404Spjd 6543185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6544168404Spjd 6545168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6546168404Spjd} 6547168404Spjd 6548168404Spjdboolean_t 6549168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6550168404Spjd{ 6551168404Spjd int i; 6552168404Spjd uint64_t spareguid; 6553185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6554168404Spjd 6555185029Spjd for (i = 0; i < sav->sav_count; i++) 6556185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6557168404Spjd return (B_TRUE); 6558168404Spjd 6559185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6560185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6561185029Spjd &spareguid) == 0 && spareguid == guid) 6562168404Spjd return (B_TRUE); 6563168404Spjd } 6564168404Spjd 6565168404Spjd return (B_FALSE); 6566168404Spjd} 6567168404Spjd 6568185029Spjd/* 6569185029Spjd * Check if a pool has an active shared spare device. 6570185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6571185029Spjd */ 6572185029Spjdstatic boolean_t 6573185029Spjdspa_has_active_shared_spare(spa_t *spa) 6574168404Spjd{ 6575185029Spjd int i, refcnt; 6576185029Spjd uint64_t pool; 6577185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6578185029Spjd 6579185029Spjd for (i = 0; i < sav->sav_count; i++) { 6580185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6581185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6582185029Spjd refcnt > 2) 6583185029Spjd return (B_TRUE); 6584185029Spjd } 6585185029Spjd 6586185029Spjd return (B_FALSE); 6587168404Spjd} 6588168404Spjd 6589185029Spjd/* 6590185029Spjd * Post a sysevent corresponding to the given event. 
The 'name' must be one of 6591185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6592185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6593185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6594185029Spjd * or zdb as real changes. 6595185029Spjd */ 6596185029Spjdvoid 6597185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6598168404Spjd{ 6599185029Spjd#ifdef _KERNEL 6600185029Spjd sysevent_t *ev; 6601185029Spjd sysevent_attr_list_t *attr = NULL; 6602185029Spjd sysevent_value_t value; 6603185029Spjd sysevent_id_t eid; 6604168404Spjd 6605185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6606185029Spjd SE_SLEEP); 6607168404Spjd 6608185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6609185029Spjd value.value.sv_string = spa_name(spa); 6610185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6611185029Spjd goto done; 6612168404Spjd 6613185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6614185029Spjd value.value.sv_uint64 = spa_guid(spa); 6615185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6616185029Spjd goto done; 6617168404Spjd 6618185029Spjd if (vd) { 6619185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6620185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6621185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6622185029Spjd SE_SLEEP) != 0) 6623185029Spjd goto done; 6624168404Spjd 6625185029Spjd if (vd->vdev_path) { 6626185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6627185029Spjd value.value.sv_string = vd->vdev_path; 6628185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6629185029Spjd &value, SE_SLEEP) != 0) 6630185029Spjd goto done; 6631168404Spjd } 6632168404Spjd } 6633168404Spjd 6634185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6635185029Spjd goto done; 6636185029Spjd attr = NULL; 6637168404Spjd 6638185029Spjd 
(void) log_sysevent(ev, SE_SLEEP, &eid); 6639185029Spjd 6640185029Spjddone: 6641185029Spjd if (attr) 6642185029Spjd sysevent_free_attr(attr); 6643185029Spjd sysevent_free(ev); 6644185029Spjd#endif 6645168404Spjd} 6646